-
-
Save hakunin/5568313 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# adapted from https://gist.github.com/johdax/771943 | |
# modified to work without creating new String methods | |
class StringDistance | |
def self.damerau_levenshtein(str1, str2) | |
d = Array.new(str1.size+1){Array.new(str2.size+1)} | |
for i in (0..str1.size) | |
d[i][0] = i | |
end | |
for j in (0..str2.size) | |
d[0][j] = j | |
end | |
each_char_with_index(str1) do |i,c1| | |
each_char_with_index(str2) do |j,c2| | |
c = (c1 == c2 ? 0 : 1) | |
d[i+1][j+1] = [ | |
d[i][j+1] + 1, #deletion | |
d[i+1][j] + 1, #insertion | |
d[i][j] + c].min #substitution | |
if (i>0) and (j>0) and (str1[i]==str2[j-1]) and (str1[i-1]==str2[j]) | |
d[i+1][j+1] = [ | |
d[i+1][j+1], | |
d[i-1][j-1] + c].min #transposition | |
end | |
end | |
end | |
d[str1.size][str2.size] | |
end | |
def self.levenshtein(str1, str2) | |
return str1.length if 0 == str2.length | |
return str2.length if 0 == str1.length | |
c = Array.new | |
c << (str1[0] == str2[0] ? 0 : 1) + (levenshtein str1[1..-1], str2[1..-1]) | |
c << 1 + levenshtein(str1[1..-1], str2) | |
c << 1 + levenshtein(str1, str2[1..-1]) | |
return c.min | |
end | |
def self.dice_coefficient(str1, str2) | |
str1_2grams = ngrams(str1, 2) | |
str2_2grams = ngrams(str2, 2) | |
intersection = (str1_2grams & str2_2grams).length | |
total = str1_2grams.length + str2_2grams.length | |
dice = 2.0 * intersection / total | |
end | |
def self.hamming_distance(str1, str2) | |
str1.split(//).zip(str2.split(//)).inject(0) { |h, e| e[0]==e[1] ? h+0 : h+1 } | |
end | |
protected | |
# get ngrams of string | |
def self.ngrams(string, len = 1) | |
ngrams = [] | |
len = string.size if len > string.size | |
(0..string.size - len).each do |n| | |
ng = string[n...(n + len)] | |
ngrams.push(ng) | |
end | |
ngrams | |
end | |
# return character array of string with indices | |
def self.each_char_with_index(string) | |
i = 0 | |
string.split(//).each do |c| | |
yield i, c | |
i += 1 | |
end | |
end | |
end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is how I use it in Rspec to test my scraper | |
# assertion that compares two strings | |
def compare a, b, required_sameness = 0.9 | |
if a == nil | |
fail "extracted string should not be nil" | |
end | |
if a == nil | |
fail "control string should not be nil" | |
end | |
a = a.gsub(/[\n\r\t]/, ' ') | |
a = a.gsub(/\s{2,}/, ' ') | |
b = b.gsub(/[\n\r\t]/, ' ') | |
b = b.gsub(/\s{2,}/, ' ') | |
len = [a.length, b.length].min | |
a = a[0..len] | |
b = b[0..len] | |
diff_len = StringDistance.damerau_levenshtein(a, b) | |
sameness = 1 - (diff_len * 1.0 / len) | |
if sameness > required_sameness | |
assert true | |
else | |
puts "extracted sring:", a | |
puts "control string:", b | |
fail "exrtacted string sameness was too low (#{sameness})" | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment