Created
          January 9, 2011 19:42 
        
      - 
      
- 
        Save dingsdax/771943 to your computer and use it in GitHub Desktop. 
    string similarity metrics in ruby
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # extension for string class | |
| class String | |
| # return character array of string with indices | |
| def each_char_with_index | |
| i = 0 | |
| split(//).each do |c| | |
| yield i, c | |
| i += 1 | |
| end | |
| end | |
| end | |
| def damerau_levenshtein(str1, str2) | |
| d = Array.new(str1.size+1){Array.new(str2.size+1)} | |
| for i in (0..str1.size) | |
| d[i][0] = i | |
| end | |
| for j in (0..str2.size) | |
| d[0][j] = j | |
| end | |
| str1.each_char_with_index do |i,c1| | |
| str2.each_char_with_index do |j,c2| | |
| c = (c1 == c2 ? 0 : 1) | |
| d[i+1][j+1] = [ | |
| d[i][j+1] + 1, #deletion | |
| d[i+1][j] + 1, #insertion | |
| d[i][j] + c].min #substitution | |
| if (i>0) and (j>0) and (str1[i]==str2[j-1]) and (str1[i-1]==str2[j]) | |
| d[i+1][j+1] = [ | |
| d[i+1][j+1], | |
| d[i-1][j-1] + c].min #transposition | |
| end | |
| end | |
| end | |
| d[str1.size][str2.size] | |
| end | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # extension for string class | |
| class String | |
| # get ngrams of string | |
| def ngrams(len = 1) | |
| ngrams = [] | |
| len = size if len > size | |
| (0..size - len).each do |n| | |
| ng = self[n...(n + len)] | |
| ngrams.push(ng) | |
| end | |
| ngrams | |
| end | |
| end | |
| def dice_coefficient(str1, str2) | |
| str1_2grams = str1.ngrams(2) | |
| str2_2grams = str2.ngrams(2) | |
| intersection = (str1_2grams & str2_2grams).length | |
| total = str1_2grams.length + str2_2grams.length | |
| dice = 2.0 * intersection / total | |
| end | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | def hamming_distance(str1, str2) | |
| str1.split(//).zip(str2.split(//)).inject(0) { |h, e| e[0]==e[1] ? h+0 : h+1 } | |
| end | 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | def levenshtein(str1, str2) | |
| return str1.length if 0 == str2.length | |
| return str2.length if 0 == str1.length | |
| c = Array.new | |
| c << (str1[0] == str2[0] ? 0 : 1) + (levenshtein str1[1..-1], str2[1..-1]) | |
| c << 1 + levenshtein(str1[1..-1], str2) | |
| c << 1 + levenshtein(str1, str2[1..-1]) | |
| return c.min | |
| end | 
@Mori agreed. For those who don't want to invade String class here is my adaptation + usage https://gist.github.com/hakunin/5568313
@dingsdax Thanks for sharing the two implementations, helped me a lot!
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
Nice. But odd that these are instance methods that take two args. Shouldn't they be class methods that take two args, or instance methods that take one arg?