Last active
August 29, 2015 14:24
-
-
Save SegFaultAX/5344166cd5e937544270 to your computer and use it in GitHub Desktop.
Comparing and benchmarking Levenshtein distance implementations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import copy | |
import random | |
import functools | |
def memoize(fn): | |
memo = {} | |
@functools.wraps(fn) | |
def _wrapper(*args): | |
if args not in memo: | |
memo[args] = fn(*args) | |
return memo[args] | |
return _wrapper | |
def permutations(l): | |
if not l: | |
return [()] | |
return ( | |
(l[i],) + p | |
for i in range(len(l)) | |
for p in permutations(l[:i] + l[i+1:])) | |
def commutative_memoize(fn): | |
memo = {} | |
@functools.wraps(fn) | |
def _wrapper(*args): | |
for p in permutations(args): | |
if p in memo: | |
return memo[p] | |
memo[args] = fn(*args) | |
return memo[args] | |
return _wrapper | |
def levenshtein(s1, s2): | |
def _lev(s1, l1, s2, l2): | |
if l1 == 0: return l2 | |
if l2 == 0: return l1 | |
cost = 1 if s1[l1-1] == s2[l2-1] else 0 | |
return min( | |
_lev(s1, l1-1, s2, l2) + 1, | |
_lev(s1, l1, s2, l2-1) + 1, | |
_lev(s1, l1-1, s2, l2-1) + cost) | |
return _lev(s1, len(s1), s2, len(s2)) | |
def iter_levenshtein(s1, s2): | |
tab = [[0] * (len(s2)+1) for _ in range(len(s1)+1)] | |
for i in range(1, len(s1)+1): | |
tab[i][0] = i | |
for j in range(1, len(s2)+1): | |
tab[0][j] = j | |
for i in range(1, len(s1)+1): | |
for j in range(1, len(s2)+1): | |
if s1[i-1] == s2[j-1]: | |
tab[i][j] = tab[i-1][j-1] | |
else: | |
tab[i][j] = min( | |
tab[i-1][j] + 1, | |
tab[i][j-1] + 1, | |
tab[i-1][j-1] + 1) | |
return tab[-1][-1] | |
def shuffle(l): | |
x = copy.copy(l) | |
random.shuffle(x) | |
return x | |
mlevenshtein = memoize(levenshtein) | |
cmlevenshtein = commutative_memoize(levenshtein) | |
i_mlevenshtein = memoize(iter_levenshtein) | |
i_cmlevenshtein = commutative_memoize(iter_levenshtein) | |
def benchmark(): | |
import timeit | |
setup = """ | |
import random | |
from __main__ import * | |
raw_words = open("/usr/share/dict/words").readlines() | |
words = shuffle([e.strip().lower() for e in raw_words])[:100] | |
""" | |
test_iter = """ | |
w1 = random.choice(words) | |
w2 = random.choice(words) | |
iter_levenshtein(w1, w2) | |
""" | |
test_iter_memoize = """ | |
w1 = random.choice(words) | |
w2 = random.choice(words) | |
i_mlevenshtein(w1, w2) | |
""" | |
test_iter_commutative = """ | |
w1 = random.choice(words) | |
w2 = random.choice(words) | |
i_cmlevenshtein(w1, w2) | |
""" | |
test_rec_normal = """ | |
w1 = random.choice(words) | |
w2 = random.choice(words) | |
print "Trying", w1, w2 | |
levenshtein(w1, w2) | |
""" | |
test_rec_memoize = """ | |
w1 = random.choice(words) | |
w2 = random.choice(words) | |
print "Trying", w1, w2 | |
mlevenshtein(w1, w2) | |
""" | |
test_rec_commutative = """ | |
w1 = random.choice(words) | |
w2 = random.choice(words) | |
print "Trying", w1, w2 | |
cmlevenshtein(w1, w2) | |
""" | |
print timeit.timeit(test_iter, setup, number=100000) | |
print timeit.timeit(test_iter_memoize, setup, number=100000) | |
print timeit.timeit(test_iter_commutative, setup, number=100000) | |
# print timeit.timeit(test_rec_normal, setup, number=10) | |
# print timeit.timeit(test_rec_memoize, setup, number=10) | |
# print timeit.timeit(test_rec_commutative, setup, number=10) | |
if __name__ == "__main__": | |
benchmark() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment