Created
August 15, 2012 15:44
-
-
Save jimfleming/3361107 to your computer and use it in GitHub Desktop.
Fuzzy string comparison
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on: http://seatgeek.com/blog/dev/fuzzywuzzy-fuzzy-string-matching-in-python | |
_ = require 'underscore' | |
token_regex = /\w+/g | |
trim_regex = /^\s+|\s+$/g | |
levenshtein_dist = (a, b) -> | |
throw new TypeError('a is empty') unless _.isString(a) | |
throw new TypeError('b is empty') unless _.isString(b) | |
lensum = a.length + b.length | |
d = [e = 0] | |
while a[e] | |
c = [f = 0] | |
while b[++f] | |
g = d[f] = (if e then 1 + Math.min(d[--f], c[f] - (a[e - 1] is b[f]), c[++f] = d[f]) else f) | |
e++ | |
return (lensum - g) / lensum | |
token_set_ratio = (s1, s2) -> | |
throw new TypeError('s1 is empty') unless _.isString(s1) | |
throw new TypeError('s2 is empty') unless _.isString(s2) | |
tokens1 = s1.match token_regex | |
tokens2 = s2.match token_regex | |
intersection = _.intersection tokens1, tokens2 | |
diff_1to2 = _.difference tokens1, tokens2 | |
diff_2to1 = _.difference tokens2, tokens1 | |
sorted_sect = _.sortBy intersection, (word) -> | |
return word | |
.join ' ' | |
.replace trim_regex, '' | |
sorted_1to2 = _.sortBy diff_1to2, (word) -> | |
return word | |
.join ' ' | |
.replace trim_regex, '' | |
sorted_2to1 = _.sortBy diff_2to1, (word) -> | |
return word | |
.join ' ' | |
.replace trim_regex, '' | |
combined_1to2 = "#{sorted_sect} #{sorted_1to2}".replace trim_regex, '' | |
combined_2to1 = "#{sorted_sect} #{sorted_2to1}".replace trim_regex, '' | |
pairwise = [ | |
levenshtein_dist sorted_sect, combined_1to2 | |
levenshtein_dist sorted_sect, combined_2to1 | |
levenshtein_dist combined_1to2, combined_2to1 | |
] | |
return _.max pairwise |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment