Created
September 18, 2012 16:10
-
-
Save srideepprasad/3743993 to your computer and use it in GitHub Desktop.
Fuzzy String Search based on a modified Dice Coeffecient alogorithm and conditional application of Levenstein Distance formula
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function FuzzyComparator (keywords){ | |
var adaptForComparison = function(str){ | |
return str.replace(/\s+/,' ').toLowerCase(); | |
} | |
var bigramize = function(keyword){ | |
var iter = 0; | |
var bigrams = new Array(); | |
for (iter=0;iter<keyword.length; iter=iter+2){ | |
if (iter+1<keyword.length){ | |
bigrams.push((keyword.charCodeAt(iter) << 8) | keyword.charCodeAt(iter+1)); | |
} | |
else{ | |
bigrams.push((keyword.charCodeAt(iter - 1) << 8) | keyword.charCodeAt(iter)); | |
} | |
} | |
return bigrams; | |
}; | |
var bigramizeCandidate = function(candidate){ | |
var tokens = candidate.split(' '); | |
var tokensAndBigramsForCandidate = []; | |
for (tokenIndex in tokens){ | |
if (!Array.isArray(tokensAndBigramsForCandidate[tokens[tokenIndex]])){ | |
tokensAndBigramsForCandidate[tokens[tokenIndex]] = new Array(); | |
} | |
tokensAndBigramsForCandidate[tokens[tokenIndex]].push(bigramize(tokens[tokenIndex])); | |
}; | |
return tokensAndBigramsForCandidate; | |
}; | |
var compareInternal = function(candidateBigrams, compareToBigrams){ | |
var score = 0.0; | |
var matches = 0; | |
for (iter in candidateBigrams){ | |
//console.log(candidateBigrams[iter] + ":" + compareToBigrams[iter]); | |
if (candidateBigrams[iter] == compareToBigrams[iter]){ | |
matches++; | |
} | |
} | |
return (2 * matches) / (candidateBigrams.length + compareToBigrams.length); | |
}; | |
var calcBigramsetSize = function(bigramSet){ | |
var retSize = 0; | |
for (bigramIndex in bigramSet){ | |
retSize = retSize + bigramSet[bigramIndex].length; | |
}; | |
return retSize; | |
}; | |
this.fuzzyCompare = function(userInput, candidateInput){ | |
var inputBigramsAndTokenSet = bigramizeCandidate(adaptForComparison(userInput)); | |
var candidateBigramsAndTokenSet = bigramizeCandidate(adaptForComparison(candidateInput)); | |
//var inputTokenSize = Object.keys(inputBigramsAndTokenSet).length; | |
//var candidateTokenSize = Object.keys(candidateBigramsAndTokenSet).length; | |
var inputTokenSize = calcBigramsetSize(inputBigramsAndTokenSet); | |
var candidateTokenSize = calcBigramsetSize(candidateBigramsAndTokenSet); | |
var bestPossibleScore = (inputTokenSize > candidateTokenSize) ? inputTokenSize : candidateTokenSize; | |
var totalScore = 0.0; | |
var bestCandidateIndex = null; | |
var bestInputIndex = null; | |
do{ | |
var bestScore = 0.0; | |
var bestCandidateIndex = null; | |
var bestInputIndex = null; | |
for (inputIndex in inputBigramsAndTokenSet){ | |
if (inputBigramsAndTokenSet[inputIndex].length == 0) continue; | |
var bigramsForInputToken = inputBigramsAndTokenSet[inputIndex][0]; | |
var score = 0.0; | |
for (candidateIndex in candidateBigramsAndTokenSet){ | |
if (candidateBigramsAndTokenSet[candidateIndex].length == 0) continue; | |
var candidateTokenBigrams = candidateBigramsAndTokenSet[candidateIndex][0]; | |
score = compareInternal(candidateTokenBigrams,bigramsForInputToken); | |
if (score >= bestScore){ | |
bestScore = score; | |
bestCandidateIndex = candidateIndex; | |
bestInputIndex = inputIndex; | |
} | |
} | |
} | |
if (bestScore !=0){ | |
inputBigramsAndTokenSet[bestInputIndex].pop(); | |
candidateBigramsAndTokenSet[bestCandidateIndex].pop(); | |
} | |
totalScore = totalScore + bestScore; | |
}while(bestScore!=0); | |
return (totalScore / bestPossibleScore) * 100; | |
}; | |
//Optimized Levenstein distance algorithm by akidee - https://gist.github.com/527658 | |
var levenshtein = function(){ | |
var min = Math.min; | |
try { split = !('0')[0]; } catch(i) { split = true; } | |
return function (a, b) { | |
if (a == b) return 0; | |
if (!a.length || !b.length) return b.length || a.length; | |
if (split) { | |
a = a.split(''); | |
b = b.split(''); | |
}; | |
var l_a = a.length + 1, | |
l_b = b.length + 1, | |
I = 0, | |
i = 0, | |
d = [0], | |
c, j = 0, J, | |
ld; | |
while(++j < l_b) d[j] = j; | |
i = 0; | |
while(++i < l_a){ | |
J = j = 0; | |
c = a[I]; | |
ld = d.slice(0); | |
d = [i]; | |
while(++j < l_b){ | |
d[j] = min( | |
ld[j] + 1, | |
d[J] + 1, | |
ld[J] + (c != b[J]) | |
); | |
++J; | |
} | |
++I; | |
} | |
return d[l_b - 1]; | |
} | |
}(); | |
//End of levenstein implementation | |
this.fuzzyCompareAndRank = function(userInput){ | |
var results = new Array(); | |
var score =0.0; | |
for (keywordIndex in keywords){ | |
score = this.fuzzyCompare(keywords[keywordIndex], userInput); | |
if (score > 0){ | |
var result = new Object(); | |
result.text = keywords[keywordIndex]; | |
result.weightage = score; | |
results.push(result); | |
} | |
} | |
results.sort(function sortfunc(result1,result2){ | |
return result2.weightage - result1.weightage ; | |
}); | |
results.sort(function conditionalLevensteinSort(result1,result2){ | |
if (result2.weightage == result1.weightage) { | |
return levenshtein(result1.text,userInput) - levenshtein(result2.text,userInput); | |
} | |
return 0; | |
}); | |
return results; | |
}; | |
this.fuzzyCompareAndRankObjects = function(userInput, evaluationClosure){ | |
var results = new Array(); | |
var score =0.0; | |
for (keywordIndex in keywords){ | |
score = this.fuzzyCompare(evaluationClosure(keywords[keywordIndex]), userInput); | |
if (score > 0){ | |
var result = new Object(); | |
result.text = keywords[keywordIndex]; | |
result.weightage = score; | |
results.push(result); | |
} | |
} | |
results.sort(function sortfunc(result1,result2){ | |
return result2.weightage - result1.weightage ; | |
}); | |
results.sort(function conditionalLevensteinSort(result1,result2){ | |
if (result2.weightage == result1.weightage) { | |
return levenshtein(result1.text,userInput) - levenshtein(result2.text,userInput); | |
} | |
return 0; | |
}); | |
return results; | |
}; | |
} |
//Example usage:
var fuzzy = new FuzzyComparator(['Open New', 'Open a File','Open Another File', 'Open File','File Open','File Save','Exit','Delete File']);
console.log(fuzzy.fuzzyCompareAndRank('file open'));
/*
Prints following :
[ { text: 'File Open', weightage: 100 },
{ text: 'Open File', weightage: 100 },
{ text: 'Open a File', weightage: 66.66666666666666 },
{ text: 'Open Another File', weightage: 66.66666666666666 },
{ text: 'File Save', weightage: 50 },
{ text: 'Open New', weightage: 50 },
{ text: 'Delete File', weightage: 50 } ]
*/
test comment
test comment
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example usage:
Prints following :