Skip to content

Instantly share code, notes, and snippets.

@tuxedocat
Last active May 31, 2024 19:44
Show Gist options
  • Save tuxedocat/fb024dfa36648797084d to your computer and use it in GitHub Desktop.
Save tuxedocat/fb024dfa36648797084d to your computer and use it in GitHub Desktop.
compute levenshtein distance
import numpy as np
from numba import jit
def levenshtein(x, y):
""" levenshtein distance for iterable sequences
"""
# check type
if (np.all(map(type, x)) is str) and (np.all(map(type, y)) is str):
_x = np.array(x, dtype=np.str)
_y = np.array(y, dtype=np.str)
elif (np.all(map(type, x)) is int) and (np.all(map(type, y)) is int):
_x = np.array(x, dtype=np.int)
_y = np.array(y, dtype=np.int)
elif type(x) is str and type(y) is str:
_x = np.array(list(x), dtype=np.str)
_y = np.array(list(y), dtype=np.str)
else:
raise TypeError
d, D = _levenshtein(_x, _y)
return d, D
@jit
def _levenshtein(x, y):
""" Levenshtein distance
using Dynamic-Programming strategy
Parameters
----------
x, y : np.array of string
Returns
-------
int : distance
np.array : distance matrix
"""
# Initiallize DP-matrix
D = np.zeros((len(x) + 1, len(y) + 1), dtype=int)
D[0, 1:] = range(1, len(y) + 1)
D[1:, 0] = range(1, len(x) + 1)
for i in range(1, len(x) + 1):
for j in range(1, len(y) + 1):
delta = 2 if x[i - 1] != y[j - 1] else 0
D[i, j] = min(D[i - 1, j - 1] + delta, D[i - 1, j] + 1, D[i, j - 1] + 1)
return D[-1, -1], D
def backtrace(x, y, D):
""" Get alignment for given sequences and Distance-Matrix
Parameters
----------
x, y : np.array or array-like
D : np.array
Distance matrix computed by `levenshtein`
Returns
-------
tuple : np.array or array-like * 3
t[0], t[1] are annotated sequence corresponding to x, y
t[2] is possible edit
"""
edit = []
_x = []
_y = []
i = len(x)
j = len(y)
mincost = np.inf
prevcost = D[i, j]
while not (i == 0 and j == 0):
direction = np.argmin([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]])
mincost = np.min([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]])
if direction == 0:
if mincost == prevcost:
edit.append(" ") # match (x[i-1] == y[j-1])
else:
edit.append("*") # substitution
_x.append(x[i - 1])
_y.append(y[j - 1])
i -= 1
j -= 1
elif direction == 1:
edit.append("+") # insertion (to x)
_x.append(" ")
_y.append(y[j - 1])
j -= 1
elif direction == 2:
edit.append("-") # deletion (from x)
_x.append(x[i - 1])
_y.append(" ")
i -= 1
prevcost = mincost
edit.reverse()
_x.reverse()
_y.reverse()
return _x, _y, edit
def _test(s1, s2, expected_ed: int = None):
d, D = levenshtein(s1, s2)
print("S1 = {}".format(s1))
print("S2 = {}".format(s2))
print("Distance = {}\n".format(d))
_s1, _s2, edit = backtrace(s1, s2, D)
print("Alignment")
for a, b, op in zip(_s1, _s2, edit):
print(" {} {} {}".format(a, op, b))
print("\n")
if expected_ed:
assert d == expected_ed and edit is not None
else:
assert edit is not None
def test_basic():
x = "kitten"
y = "sitting"
_test(x, y, 5)
def test_CJK1():
x = "上海商業銀行有限公司"
y = "__上海商業銀行洧限公司"
_test(x, y)
def test_empty_s1():
x = ""
y = "sitting"
_test(x, y, 7)
def test_empty_s2():
x = "kitten"
y = ""
_test(x, y, 6)
def test_short_vs_long():
x = "kitten"
y = "the kitten is sitting on a mat"
_test(x, y)
def test_long_vs_short():
x = "the kitten is sitting on a mat"
y = "kitten"
_test(x, y)
if __name__ == '__main__':
tests = [test_basic, test_CJK1, test_empty_s1, test_empty_s2, test_short_vs_long, test_long_vs_short]
for testcase in tests:
try:
print("{}".format(testcase.__name__))
print("-" * len(testcase.__name__))
testcase()
except AssertionError:
print("FAILED: testcase= {}, cause=AssertionError".format(testcase.__name__))
except Exception as e:
print("FAILED: testcase= {}, cause={}".format(testcase.__name__, str(e)))
finally:
print("")
@VincentYZ
Copy link

Thanks for sharing this! Just one suggestion for the backtrace function: the current implementation seems to work fine until any of the two input strings is an empty string, causing index to go out of range. This happens because
direction = np.argmin([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]])
will always prioritzes substitution over insertion and deletion, even if they all have the same value. Unfortunately, when one input string is empty, insertion/deletion should take precedence.
My workaround for now is to do this instead:
`

  1.     if i == 0:
    
  2.         direction = 1 # source string is empty, only insertion is sensible
    
  3.     elif j == 0:
    
  4.         direction = 2 # target string is empty; only deletion is sensible
    
  5.     else:
    
  6.         direction = np.argmin([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]])
    

`
Just my two cents. You may have more elegent solutions than this.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment