Forked from dhruvasagar/damerau_levenshtein_distance.rb
Created
March 3, 2019 21:34
-
-
Save mAlishera/c4e6aa5194aa366714f2dbd48d2513be to your computer and use it in GitHub Desktop.
Damerau-Levenshtein distance for ruby in C
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby1.9 | |
# encoding: UTF-8 | |
require 'rubygems' | |
require 'inline' | |
require 'time' | |
class DamerauLevenshtein | |
def distance(str1, str2, block_size=2, max_distance=10) | |
res = distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance) | |
(res > max_distance) ? nil : res | |
end | |
inline do |builder| | |
builder.c " | |
static VALUE distance_utf(VALUE _s, VALUE _t, int block_size, int max_distance){ | |
int min, i,j, sl, tl, cost, *d, distance, del, ins, subs, transp, block, current_distance; | |
int stop_execution = 0; | |
VALUE *sv = RARRAY_PTR(_s); | |
VALUE *tv = RARRAY_PTR(_t); | |
sl = RARRAY_LEN(_s); | |
tl = RARRAY_LEN(_t); | |
int s[sl]; | |
int t[tl]; | |
for (i=0; i < sl; i++) s[i] = NUM2INT(sv[i]); | |
for (i=0; i < tl; i++) t[i] = NUM2INT(tv[i]); | |
sl++; | |
tl++; | |
//one-dimentional representation of 2 dimentional array len(s)+1 * len(t)+1 | |
d = malloc((sizeof(int))*(sl)*(tl)); | |
//populate 'horisonal' row | |
for(i = 0; i < sl; i++){ | |
d[i] = i; | |
} | |
//populate 'vertical' row starting from the 2nd position (first one is filled already) | |
for(i = 1; i < tl; i++){ | |
d[i*sl] = i; | |
} | |
//fill up array with scores | |
for(i = 1; i<sl; i++){ | |
if (stop_execution == 1) break; | |
current_distance = 10000; | |
for(j = 1; j<tl; j++){ | |
block = block_size < i ? block_size : i; | |
if (j < block) block = j; | |
cost = 1; | |
if(s[i-1] == t[j-1]) cost = 0; | |
del = d[j*sl + i - 1] + 1; | |
ins = d[(j-1)*sl + i] + 1; | |
subs = d[(j-1)*sl + i - 1] + cost; | |
min = del; | |
if (ins < min) min = ins; | |
if (subs < min) min = subs; | |
if(block > 1 && i > 1 && j > 1 && s[i-1] == t[j-2] && s[i-2] == t[j-1]){ | |
transp = d[(j-2)*sl + i - 2] + cost; | |
if(transp < min) min = transp; | |
} | |
if (current_distance > d[j*sl+i]) current_distance = d[j*sl+i]; | |
d[j*sl+i]=min; | |
} | |
if (current_distance > max_distance) { | |
stop_execution = 1; | |
} | |
} | |
distance=d[sl * tl - 1]; | |
if (stop_execution == 1) distance = current_distance; | |
free(d); | |
return INT2NUM(distance); | |
} | |
" | |
end | |
end | |
if __FILE__ == $0 | |
a=DamerauLevenshtein.new | |
s = 'Cedarinia scabra Sjöstedt 1921'.unpack('U*') | |
t = 'Cedarinia scabra Söjstedt 1921'.unpack('U*') | |
start = Time.now | |
(1..100000).each do | |
a.distance('Cedarinia scabra Sjöstedt 1921', 'Cedarinia scabra Söjstedt 1921') | |
end | |
puts "with unpack time: " + (Time.now - start).to_s + ' sec' | |
start = Time.now | |
(1..100000).each do | |
a.distance_utf(s, t, 2, 10) | |
end | |
puts 'utf time: ' + (Time.now - start).to_s + ' sec' | |
puts a.distance('Cedarinia scabra Sjöstedt 1921','Cedarinia scabra Söjstedt 1921') | |
puts a.distance_utf(s, t, 2, 10) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment