aurbano · January 14, 2020 10:41 · Siderite · Jan 14, 2020
diff --git a/sift4-extended.js b/sift4-extended.js
 // Sift4 - extended version
 // online algorithm to compute the distance between two strings in O(n)
 // maxOffset is the number of positions to search for matching tokens
 // options: the options for the function, allowing for customization of the scope and algorithm:
 //         maxDistance: the distance at which the algorithm should stop computing the value and just exit (the strings are too different anyway)
 //         tokenizer: a function to transform strings into vectors of tokens
 //        tokenMatcher: a function to determine if two tokens are matching (equal)
 //        matchingEvaluator: a function to determine the way a token match should be added to the local_cs. For example a fuzzy match could be implemented.
 //        localLengthEvaluator: a function to determine the way the local_cs value is added to the lcss. For example longer continuous substrings could be awarded.
 //        transpositionCostEvaluator: a function to determine the value of an individual transposition. For example longer transpositions should have a higher cost.
 //        transpositionsEvaluator: a function to determine the way the total cost of transpositions affects the final result
 // the options can and should be implemented at a class level, but this is the demo algorithm
 function sift4(s1, s2, maxOffset, options) {

    options=extend(options,{
        maxDistance:null,
        tokenizer: function(s) { return s?s.split(''):[]; },
        tokenMatcher: function(t1,t2) { return t1==t2; },
        matchingEvaluator: function(t1,t2) { return 1; },
        localLengthEvaluator: function(local_cs) { return local_cs; },
        transpositionCostEvaluator: function(c1,c2) { return 1; },
        transpositionsEvaluator: function(lcss,trans) { return lcss-trans; }
    });

    var t1=options.tokenizer(s1);
    var t2=options.tokenizer(s2);

    var l1=t1.length;
    var l2=t2.length;

    if (l1==0) return l2;
    if (l2==0) return l1;

    var c1 = 0;  //cursor for string 1
    var c2 = 0;  //cursor for string 2
    var lcss = 0;  //largest common subsequence
    var local_cs = 0; //local common substring
    var trans = 0;  //number of transpositions ('ab' vs 'ba')
    var offset_arr=[];  //offset pair array, for computing the transpositions

    while ((c1 < l1) && (c2 < l2)) {
        if (options.tokenMatcher(t1[c1],t2[c2])) {
            local_cs+=options.matchingEvaluator(t1[c1],t2[c2]);
            var isTrans=false;
            //see if current match is a transposition
            var i=0;
            while (i<offset_arr.length) {
                var ofs=offset_arr[i];
                if (c1<=ofs.c1 || c2 <= ofs.c2) {
                    // when two matches cross, the one considered a transposition is the one with the largest difference in offsets
                    isTrans=Math.abs(c2-c1)>=Math.abs(ofs.c2-ofs.c1);
                    if (isTrans)
                    {
                        trans+=options.transpositionCostEvaluator(c1,c2);
                    } else
                    {
                        if (!ofs.trans) {
                            ofs.trans=true;
                            trans+=options.transpositionCostEvaluator(ofs.c1,ofs.c2);
                        }
                    }
                    break;
                } else {
                    if (c1>ofs.c2 && c2>ofs.c1) {
                        offset_arr.splice(i,1);
                    } else {
                        i++;
                    }
                }
            }
            offset_arr.push({
                c1:c1,
                c2:c2,
                trans:isTrans
            });
        } else {
            lcss+=options.localLengthEvaluator(local_cs);
            local_cs=0;
            if (c1!=c2) {
                c1=c2=Math.min(c1,c2);  //using min allows the computation of transpositions
            }
            //if matching tokens are found, remove 1 from both cursors (they get incremented at the end of the loop)
            //so that we can have only one code block handling matches 
            for (var i = 0; i < maxOffset && (c1+i<l1 || c2+i<l2); i++) {
                if ((c1 + i < l1) && options.tokenMatcher(t1[c1+i],t2[c2])) {
                    c1+= i-1; 
                    c2--;
                    break;
                }
                if ((c2 + i < l2) && options.tokenMatcher(t1[c1],t2[c2+i])) {
                    c1--;
                    c2+= i-1;
                    break;
                }
            }
        }
           c1++;
          c2++;
        if (options.maxDistance)
        {
            var temporaryDistance=options.localLengthEvaluator(Math.max(c1,c2))-options.transpositionsEvaluator(lcss,trans);
            if (temporaryDistance>=options.maxDistance) return Math.round(temporaryDistance);
        }
        // this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly
        if ((c1 >= l1) || (c2 >= l2)) {
            lcss+=options.localLengthEvaluator(local_cs);
            local_cs=0;
            c1=c2=Math.min(c1,c2);
        }
    }
    lcss+=options.localLengthEvaluator(local_cs);
    return Math.round(options.localLengthEvaluator(Math.max(l1,l2)) - options.transpositionsEvaluator(lcss,trans)); //add the cost of found transpositions
 }

 function extend(obj,def) {
    var result={};
    for (var prop in def) {
        if (!obj||!obj.hasOwnProperty(prop)) {
            result[prop]=def[prop];
        } else {
            result[prop]=obj[prop];
        }
    }
    return result;
 }

 // possible values for the options
 // tokenizers:
 function nGramTokenizer(s,n) { //tokenizer:function(s) { return nGramTokenizer(s,2); }
    var result=[];
    if (!s) return result;
    for (var i=0; i<=s.length-n; i++) {
        result.push(s.substr(i,n));
    }
    return result;
 }

 function wordSplitTokenizer(s) { //tokenizer:wordSplitTokenizer
    if (!s) return [];
    return s.split(/\s+/);
 }

 function characterFrequencyTokenizer(s) { //tokenizer:characterFrequencyTokenizer (letters only)
    var result = [];
    for (var i=0; i<=25; i++) {
        var val=0;
        if (s) {
            for (j=0; j<s.length; j++) {
                var code=s.charCodeAt(j);
                if (code==i+65 || code==i+97) val++;
            }
        }
        result.push(val);
    }
    return result;
 }

 //tokenMatchers:
 function sift4TokenMatcher(t1,t2) { //tokenMatcher:sift4TokenMatcher
    var similarity=1-sift4(t1,t2,5)/Math.max(t1.length,t2.length);
    return similarity>0.7;
 }

 //matchingEvaluators:
 function sift4MatchingEvaluator(t1,t2) { //matchingEvaluator:sift4MatchingEvaluator
    var similarity=1-sift4(t1,t2,5)/Math.max(t1.length,t2.length);
    return similarity;
 }

 //localLengthEvaluators:
 function rewardLengthEvaluator(l) {
    if (l<1) return l; //0 -> 0
    return l-1/(l+1);  //1 -> 0.5, 2-> 0.66, 9 -> 0.9
 }

 function rewardLengthEvaluator2(l) {
    return Math.pow(l,1.5); // 0 -> 0, 1 -> 1, 2 -> 2.83, 10 -> 31.62
 }

 //transpositionCostEvaluators:
 function longerTranspositionsAreMoreCostly(c1,c2) {
    return Math.abs(c2-c1)/9+1;
 }
	// Sift4 - extended version
	// online algorithm to compute the distance between two strings in O(n)
	// maxOffset is the number of positions to search for matching tokens
	// options: the options for the function, allowing for customization of the scope and algorithm:
	// maxDistance: the distance at which the algorithm should stop computing the value and just exit (the strings are too different anyway)
	// tokenizer: a function to transform strings into vectors of tokens
	// tokenMatcher: a function to determine if two tokens are matching (equal)
	// matchingEvaluator: a function to determine the way a token match should be added to the local_cs. For example a fuzzy match could be implemented.
	// localLengthEvaluator: a function to determine the way the local_cs value is added to the lcss. For example longer continuous substrings could be awarded.
	// transpositionCostEvaluator: a function to determine the value of an individual transposition. For example longer transpositions should have a higher cost.
	// transpositionsEvaluator: a function to determine the way the total cost of transpositions affects the final result
	// the options can and should be implemented at a class level, but this is the demo algorithm
	function sift4(s1, s2, maxOffset, options) {

	options=extend(options,{
	maxDistance:null,
	tokenizer: function(s) { return s?s.split(''):[]; },
	tokenMatcher: function(t1,t2) { return t1==t2; },
	matchingEvaluator: function(t1,t2) { return 1; },
	localLengthEvaluator: function(local_cs) { return local_cs; },
	transpositionCostEvaluator: function(c1,c2) { return 1; },
	transpositionsEvaluator: function(lcss,trans) { return lcss-trans; }
	});

	var t1=options.tokenizer(s1);
	var t2=options.tokenizer(s2);

	var l1=t1.length;
	var l2=t2.length;

	if (l1==0) return l2;
	if (l2==0) return l1;

	var c1 = 0; //cursor for string 1
	var c2 = 0; //cursor for string 2
	var lcss = 0; //largest common subsequence
	var local_cs = 0; //local common substring
	var trans = 0; //number of transpositions ('ab' vs 'ba')
	var offset_arr=[]; //offset pair array, for computing the transpositions

	while ((c1 < l1) && (c2 < l2)) {
	if (options.tokenMatcher(t1[c1],t2[c2])) {
	local_cs+=options.matchingEvaluator(t1[c1],t2[c2]);
	var isTrans=false;
	//see if current match is a transposition
	var i=0;
	while (i<offset_arr.length) {
	var ofs=offset_arr[i];
	if (c1<=ofs.c1 \|\| c2 <= ofs.c2) {
	// when two matches cross, the one considered a transposition is the one with the largest difference in offsets
	isTrans=Math.abs(c2-c1)>=Math.abs(ofs.c2-ofs.c1);
	if (isTrans)
	{
	trans+=options.transpositionCostEvaluator(c1,c2);
	} else
	{
	if (!ofs.trans) {
	ofs.trans=true;
	trans+=options.transpositionCostEvaluator(ofs.c1,ofs.c2);
	}
	}
	break;
	} else {
	if (c1>ofs.c2 && c2>ofs.c1) {
	offset_arr.splice(i,1);
	} else {
	i++;
	}
	}
	}
	offset_arr.push({
	c1:c1,
	c2:c2,
	trans:isTrans
	});
	} else {
	lcss+=options.localLengthEvaluator(local_cs);
	local_cs=0;
	if (c1!=c2) {
	c1=c2=Math.min(c1,c2); //using min allows the computation of transpositions
	}
	//if matching tokens are found, remove 1 from both cursors (they get incremented at the end of the loop)
	//so that we can have only one code block handling matches
	for (var i = 0; i < maxOffset && (c1+i<l1 \|\| c2+i<l2); i++) {
	if ((c1 + i < l1) && options.tokenMatcher(t1[c1+i],t2[c2])) {
	c1+= i-1;
	c2--;
	break;
	}
	if ((c2 + i < l2) && options.tokenMatcher(t1[c1],t2[c2+i])) {
	c1--;
	c2+= i-1;
	break;
	}
	}
	}
	c1++;
	c2++;
	if (options.maxDistance)
	{
	var temporaryDistance=options.localLengthEvaluator(Math.max(c1,c2))-options.transpositionsEvaluator(lcss,trans);
	if (temporaryDistance>=options.maxDistance) return Math.round(temporaryDistance);
	}
	// this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly
	if ((c1 >= l1) \|\| (c2 >= l2)) {
	lcss+=options.localLengthEvaluator(local_cs);
	local_cs=0;
	c1=c2=Math.min(c1,c2);
	}
	}
	lcss+=options.localLengthEvaluator(local_cs);
	return Math.round(options.localLengthEvaluator(Math.max(l1,l2)) - options.transpositionsEvaluator(lcss,trans)); //add the cost of found transpositions
	}

	function extend(obj,def) {
	var result={};
	for (var prop in def) {
	if (!obj\|\|!obj.hasOwnProperty(prop)) {
	result[prop]=def[prop];
	} else {
	result[prop]=obj[prop];
	}
	}
	return result;
	}

	// possible values for the options
	// tokenizers:
	function nGramTokenizer(s,n) { //tokenizer:function(s) { return nGramTokenizer(s,2); }
	var result=[];
	if (!s) return result;
	for (var i=0; i<=s.length-n; i++) {
	result.push(s.substr(i,n));
	}
	return result;
	}

	function wordSplitTokenizer(s) { //tokenizer:wordSplitTokenizer
	if (!s) return [];
	return s.split(/\s+/);
	}

	function characterFrequencyTokenizer(s) { //tokenizer:characterFrequencyTokenizer (letters only)
	var result = [];
	for (var i=0; i<=25; i++) {
	var val=0;
	if (s) {
	for (j=0; j<s.length; j++) {
	var code=s.charCodeAt(j);
	if (code==i+65 \|\| code==i+97) val++;
	}
	}
	result.push(val);
	}
	return result;
	}

	//tokenMatchers:
	function sift4TokenMatcher(t1,t2) { //tokenMatcher:sift4TokenMatcher
	var similarity=1-sift4(t1,t2,5)/Math.max(t1.length,t2.length);
	return similarity>0.7;
	}

	//matchingEvaluators:
	function sift4MatchingEvaluator(t1,t2) { //matchingEvaluator:sift4MatchingEvaluator
	var similarity=1-sift4(t1,t2,5)/Math.max(t1.length,t2.length);
	return similarity;
	}

	//localLengthEvaluators:
	function rewardLengthEvaluator(l) {
	if (l<1) return l; //0 -> 0
	return l-1/(l+1); //1 -> 0.5, 2-> 0.66, 9 -> 0.9
	}

	function rewardLengthEvaluator2(l) {
	return Math.pow(l,1.5); // 0 -> 0, 1 -> 1, 2 -> 2.83, 10 -> 31.62
	}

	//transpositionCostEvaluators:
	function longerTranspositionsAreMoreCostly(c1,c2) {
	return Math.abs(c2-c1)/9+1;
	}