Created
March 5, 2012 05:03
-
-
Save cfcosta/1976697 to your computer and use it in GitHub Desktop.
Trigram similarity index for words.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'minitest/autorun' | |
# Extends String with a rough similarity score based on three-character
# chunks of the normalized (downcased, bracket-free) text.
class String
  # Scores how similar this string is to +other+.
  #
  # Only the FIRST three-character chunk of each string is compared. The score
  # is 2 * |shared characters| / (|chunk a| + |chunk b|).
  #
  # @param other [String] the string to compare against
  # @return [Float] the score; 0.0 when either string has no characters left
  #   after clean-up (there is no chunk to compare)
  def match_index_to(other)
    mine = to_trigrams.first
    theirs = other.to_trigrams.first
    # An empty or bracket-only string yields no chunk at all; without this
    # guard the intersection below raises on nil.
    return 0.0 unless mine && theirs

    # Array#& de-duplicates, so a character repeated inside a chunk counts
    # only once in the numerator while the denominator uses full chunk sizes.
    2.0 * (mine & theirs).size / (mine.size + theirs.size)
  end

  # Splits the normalized string into consecutive, non-overlapping groups of
  # up to three characters (the last group may be shorter).
  #
  # @return [Array<Array<String>>] e.g. "carro" => [["c","a","r"], ["r","o"]]
  def to_trigrams
    downcase.clean_up.chars.each_slice(3).to_a
  end

  # Returns a copy of the string with round, square and curly brackets removed.
  #
  # @return [String]
  def clean_up
    delete("()[]{}")
  end
end
# Tests for the String chunking and similarity helpers.
#
# Minitest 5 renamed the test base class to Minitest::Test; the older
# MiniTest::Unit::TestCase name is not guaranteed to be defined there. Use the
# new name when it exists and fall back to the legacy one otherwise.
class TrigramTest < (defined?(Minitest::Test) ? Minitest::Test : MiniTest::Unit::TestCase)
  # Expected three-character chunks for the two sample words.
  def setup
    @car_trigrams = [["c", "a", "r"], ["r", "o"]]
    @planet_trigrams = [["p", "l", "a"], ["n", "e", "t"], ["a"]]
  end

  # Plain words are split into groups of up to three characters.
  def test_word_to_trigrams
    assert_equal @car_trigrams, "carro".to_trigrams
    assert_equal @planet_trigrams, "planeta".to_trigrams
  end

  # Round, square and curly brackets are ignored when chunking.
  def test_remove_garbage
    ["(carro)", "[carro]", "{carro}"].each do |wrapped|
      assert_equal @car_trigrams, wrapped.to_trigrams
    end
  end

  # Chunking is case-insensitive.
  def test_downcase_string
    assert_equal @car_trigrams, "CARRO".to_trigrams
  end

  # NOTE(review): match_index_to compares only the first chunk of each word.
  # By that rule "arthur"/"artur" scores 1.0, "otorrinolaringologista"/"barro"
  # scores 0.0, and the last two pairs score about 0.667, so the expectations
  # from the third assertion onward do not match the implementation. Confirm
  # whether these values or the implementation describe the intended metric.
  def test_match_index
    assert_in_delta 0.66, "carro".match_index_to("barro"), 0.01
    assert_in_delta 1.0, "alexandre".match_index_to("aleksander"), 0.01
    assert_in_delta 0.66, "arthur".match_index_to("artur"), 0.01
    assert_in_delta 0.16, "otorrinolaringologista".match_index_to("barro"), 0.01
    assert_in_delta 0.75, "porra".match_index_to("borra"), 0.01
    assert_in_delta 0.75, "caralho".match_index_to("kralho"), 0.01
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment