Last active
January 14, 2020 10:41
-
-
Save aurbano/9a04cb4e5a2976f9f5c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sift4 - extended version | |
// online algorithm to compute the distance between two strings in O(n) | |
// maxOffset is the number of positions to search for matching tokens | |
// options: the options for the function, allowing for customization of the scope and algorithm: | |
// maxDistance: the distance at which the algorithm should stop computing the value and just exit (the strings are too different anyway) | |
// tokenizer: a function to transform strings into vectors of tokens | |
// tokenMatcher: a function to determine if two tokens are matching (equal) | |
// matchingEvaluator: a function to determine the way a token match should be added to the local_cs. For example a fuzzy match could be implemented. | |
// localLengthEvaluator: a function to determine the way the local_cs value is added to the lcss. For example longer continuous substrings could be awarded. | |
// transpositionCostEvaluator: a function to determine the value of an individual transposition. For example longer transpositions should have a higher cost. | |
// transpositionsEvaluator: a function to determine the way the total cost of transpositions affects the final result | |
// the options can and should be implemented at a class level, but this is the demo algorithm | |
function sift4(s1, s2, maxOffset, options) { | |
options=extend(options,{ | |
maxDistance:null, | |
tokenizer: function(s) { return s?s.split(''):[]; }, | |
tokenMatcher: function(t1,t2) { return t1==t2; }, | |
matchingEvaluator: function(t1,t2) { return 1; }, | |
localLengthEvaluator: function(local_cs) { return local_cs; }, | |
transpositionCostEvaluator: function(c1,c2) { return 1; }, | |
transpositionsEvaluator: function(lcss,trans) { return lcss-trans; } | |
}); | |
var t1=options.tokenizer(s1); | |
var t2=options.tokenizer(s2); | |
var l1=t1.length; | |
var l2=t2.length; | |
if (l1==0) return l2; | |
if (l2==0) return l1; | |
var c1 = 0; //cursor for string 1 | |
var c2 = 0; //cursor for string 2 | |
var lcss = 0; //largest common subsequence | |
var local_cs = 0; //local common substring | |
var trans = 0; //number of transpositions ('ab' vs 'ba') | |
var offset_arr=[]; //offset pair array, for computing the transpositions | |
while ((c1 < l1) && (c2 < l2)) { | |
if (options.tokenMatcher(t1[c1],t2[c2])) { | |
local_cs+=options.matchingEvaluator(t1[c1],t2[c2]); | |
var isTrans=false; | |
//see if current match is a transposition | |
var i=0; | |
while (i<offset_arr.length) { | |
var ofs=offset_arr[i]; | |
if (c1<=ofs.c1 || c2 <= ofs.c2) { | |
// when two matches cross, the one considered a transposition is the one with the largest difference in offsets | |
isTrans=Math.abs(c2-c1)>=Math.abs(ofs.c2-ofs.c1); | |
if (isTrans) | |
{ | |
trans+=options.transpositionCostEvaluator(c1,c2); | |
} else | |
{ | |
if (!ofs.trans) { | |
ofs.trans=true; | |
trans+=options.transpositionCostEvaluator(ofs.c1,ofs.c2); | |
} | |
} | |
break; | |
} else { | |
if (c1>ofs.c2 && c2>ofs.c1) { | |
offset_arr.splice(i,1); | |
} else { | |
i++; | |
} | |
} | |
} | |
offset_arr.push({ | |
c1:c1, | |
c2:c2, | |
trans:isTrans | |
}); | |
} else { | |
lcss+=options.localLengthEvaluator(local_cs); | |
local_cs=0; | |
if (c1!=c2) { | |
c1=c2=Math.min(c1,c2); //using min allows the computation of transpositions | |
} | |
//if matching tokens are found, remove 1 from both cursors (they get incremented at the end of the loop) | |
//so that we can have only one code block handling matches | |
for (var i = 0; i < maxOffset && (c1+i<l1 || c2+i<l2); i++) { | |
if ((c1 + i < l1) && options.tokenMatcher(t1[c1+i],t2[c2])) { | |
c1+= i-1; | |
c2--; | |
break; | |
} | |
if ((c2 + i < l2) && options.tokenMatcher(t1[c1],t2[c2+i])) { | |
c1--; | |
c2+= i-1; | |
break; | |
} | |
} | |
} | |
c1++; | |
c2++; | |
if (options.maxDistance) | |
{ | |
var temporaryDistance=options.localLengthEvaluator(Math.max(c1,c2))-options.transpositionsEvaluator(lcss,trans); | |
if (temporaryDistance>=options.maxDistance) return Math.round(temporaryDistance); | |
} | |
// this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly | |
if ((c1 >= l1) || (c2 >= l2)) { | |
lcss+=options.localLengthEvaluator(local_cs); | |
local_cs=0; | |
c1=c2=Math.min(c1,c2); | |
} | |
} | |
lcss+=options.localLengthEvaluator(local_cs); | |
return Math.round(options.localLengthEvaluator(Math.max(l1,l2)) - options.transpositionsEvaluator(lcss,trans)); //add the cost of found transpositions | |
} | |
function extend(obj,def) { | |
var result={}; | |
for (var prop in def) { | |
if (!obj||!obj.hasOwnProperty(prop)) { | |
result[prop]=def[prop]; | |
} else { | |
result[prop]=obj[prop]; | |
} | |
} | |
return result; | |
} | |
// possible values for the options | |
// tokenizers: | |
function nGramTokenizer(s,n) { //tokenizer:function(s) { return nGramTokenizer(s,2); } | |
var result=[]; | |
if (!s) return result; | |
for (var i=0; i<=s.length-n; i++) { | |
result.push(s.substr(i,n)); | |
} | |
return result; | |
} | |
function wordSplitTokenizer(s) { //tokenizer:wordSplitTokenizer | |
if (!s) return []; | |
return s.split(/\s+/); | |
} | |
function characterFrequencyTokenizer(s) { //tokenizer:characterFrequencyTokenizer (letters only) | |
var result = []; | |
for (var i=0; i<=25; i++) { | |
var val=0; | |
if (s) { | |
for (j=0; j<s.length; j++) { | |
var code=s.charCodeAt(j); | |
if (code==i+65 || code==i+97) val++; | |
} | |
} | |
result.push(val); | |
} | |
return result; | |
} | |
//tokenMatchers: | |
function sift4TokenMatcher(t1,t2) { //tokenMatcher:sift4TokenMatcher | |
var similarity=1-sift4(t1,t2,5)/Math.max(t1.length,t2.length); | |
return similarity>0.7; | |
} | |
//matchingEvaluators: | |
function sift4MatchingEvaluator(t1,t2) { //matchingEvaluator:sift4MatchingEvaluator | |
var similarity=1-sift4(t1,t2,5)/Math.max(t1.length,t2.length); | |
return similarity; | |
} | |
//localLengthEvaluators: | |
function rewardLengthEvaluator(l) { | |
if (l<1) return l; //0 -> 0 | |
return l-1/(l+1); //1 -> 0.5, 2-> 0.66, 9 -> 0.9 | |
} | |
function rewardLengthEvaluator2(l) { | |
return Math.pow(l,1.5); // 0 -> 0, 1 -> 1, 2 -> 2.83, 10 -> 31.62 | |
} | |
//transpositionCostEvaluators: | |
function longerTranspositionsAreMoreCostly(c1,c2) { | |
return Math.abs(c2-c1)/9+1; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, please update the source URL to https://siderite.dev/blog/super-fast-and-accurate-string-distance.html. Thank you!