Skip to content

Instantly share code, notes, and snippets.

@SegFaultAX
Last active August 29, 2015 14:24
Show Gist options
  • Save SegFaultAX/5344166cd5e937544270 to your computer and use it in GitHub Desktop.
Save SegFaultAX/5344166cd5e937544270 to your computer and use it in GitHub Desktop.
Comparing and benchmarking Levenshtein distance implementations
#!/usr/bin/env python
import copy
import random
import functools
def memoize(fn):
memo = {}
@functools.wraps(fn)
def _wrapper(*args):
if args not in memo:
memo[args] = fn(*args)
return memo[args]
return _wrapper
def permutations(l):
if not l:
return [()]
return (
(l[i],) + p
for i in range(len(l))
for p in permutations(l[:i] + l[i+1:]))
def commutative_memoize(fn):
memo = {}
@functools.wraps(fn)
def _wrapper(*args):
for p in permutations(args):
if p in memo:
return memo[p]
memo[args] = fn(*args)
return memo[args]
return _wrapper
def levenshtein(s1, s2):
def _lev(s1, l1, s2, l2):
if l1 == 0: return l2
if l2 == 0: return l1
cost = 1 if s1[l1-1] == s2[l2-1] else 0
return min(
_lev(s1, l1-1, s2, l2) + 1,
_lev(s1, l1, s2, l2-1) + 1,
_lev(s1, l1-1, s2, l2-1) + cost)
return _lev(s1, len(s1), s2, len(s2))
def iter_levenshtein(s1, s2):
tab = [[0] * (len(s2)+1) for _ in range(len(s1)+1)]
for i in range(1, len(s1)+1):
tab[i][0] = i
for j in range(1, len(s2)+1):
tab[0][j] = j
for i in range(1, len(s1)+1):
for j in range(1, len(s2)+1):
if s1[i-1] == s2[j-1]:
tab[i][j] = tab[i-1][j-1]
else:
tab[i][j] = min(
tab[i-1][j] + 1,
tab[i][j-1] + 1,
tab[i-1][j-1] + 1)
return tab[-1][-1]
def shuffle(l):
x = copy.copy(l)
random.shuffle(x)
return x
mlevenshtein = memoize(levenshtein)
cmlevenshtein = commutative_memoize(levenshtein)
i_mlevenshtein = memoize(iter_levenshtein)
i_cmlevenshtein = commutative_memoize(iter_levenshtein)
def benchmark():
import timeit
setup = """
import random
from __main__ import *
raw_words = open("/usr/share/dict/words").readlines()
words = shuffle([e.strip().lower() for e in raw_words])[:100]
"""
test_iter = """
w1 = random.choice(words)
w2 = random.choice(words)
iter_levenshtein(w1, w2)
"""
test_iter_memoize = """
w1 = random.choice(words)
w2 = random.choice(words)
i_mlevenshtein(w1, w2)
"""
test_iter_commutative = """
w1 = random.choice(words)
w2 = random.choice(words)
i_cmlevenshtein(w1, w2)
"""
test_rec_normal = """
w1 = random.choice(words)
w2 = random.choice(words)
print "Trying", w1, w2
levenshtein(w1, w2)
"""
test_rec_memoize = """
w1 = random.choice(words)
w2 = random.choice(words)
print "Trying", w1, w2
mlevenshtein(w1, w2)
"""
test_rec_commutative = """
w1 = random.choice(words)
w2 = random.choice(words)
print "Trying", w1, w2
cmlevenshtein(w1, w2)
"""
print timeit.timeit(test_iter, setup, number=100000)
print timeit.timeit(test_iter_memoize, setup, number=100000)
print timeit.timeit(test_iter_commutative, setup, number=100000)
# print timeit.timeit(test_rec_normal, setup, number=10)
# print timeit.timeit(test_rec_memoize, setup, number=10)
# print timeit.timeit(test_rec_commutative, setup, number=10)
if __name__ == "__main__":
benchmark()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment