Last active
May 31, 2024 19:44
-
-
Save tuxedocat/fb024dfa36648797084d to your computer and use it in GitHub Desktop.
compute levenshtein distance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from numba import jit | |
def levenshtein(x, y): | |
""" levenshtein distance for iterable sequences | |
""" | |
# check type | |
if (np.all(map(type, x)) is str) and (np.all(map(type, y)) is str): | |
_x = np.array(x, dtype=np.str) | |
_y = np.array(y, dtype=np.str) | |
elif (np.all(map(type, x)) is int) and (np.all(map(type, y)) is int): | |
_x = np.array(x, dtype=np.int) | |
_y = np.array(y, dtype=np.int) | |
elif type(x) is str and type(y) is str: | |
_x = np.array(list(x), dtype=np.str) | |
_y = np.array(list(y), dtype=np.str) | |
else: | |
raise TypeError | |
d, D = _levenshtein(_x, _y) | |
return d, D | |
@jit | |
def _levenshtein(x, y): | |
""" Levenshtein distance | |
using Dynamic-Programming strategy | |
Parameters | |
---------- | |
x, y : np.array of string | |
Returns | |
------- | |
int : distance | |
np.array : distance matrix | |
""" | |
# Initiallize DP-matrix | |
D = np.zeros((len(x) + 1, len(y) + 1), dtype=int) | |
D[0, 1:] = range(1, len(y) + 1) | |
D[1:, 0] = range(1, len(x) + 1) | |
for i in range(1, len(x) + 1): | |
for j in range(1, len(y) + 1): | |
delta = 2 if x[i - 1] != y[j - 1] else 0 | |
D[i, j] = min(D[i - 1, j - 1] + delta, D[i - 1, j] + 1, D[i, j - 1] + 1) | |
return D[-1, -1], D | |
def backtrace(x, y, D): | |
""" Get alignment for given sequences and Distance-Matrix | |
Parameters | |
---------- | |
x, y : np.array or array-like | |
D : np.array | |
Distance matrix computed by `levenshtein` | |
Returns | |
------- | |
tuple : np.array or array-like * 3 | |
t[0], t[1] are annotated sequence corresponding to x, y | |
t[2] is possible edit | |
""" | |
edit = [] | |
_x = [] | |
_y = [] | |
i = len(x) | |
j = len(y) | |
mincost = np.inf | |
prevcost = D[i, j] | |
while not (i == 0 and j == 0): | |
direction = np.argmin([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]]) | |
mincost = np.min([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]]) | |
if direction == 0: | |
if mincost == prevcost: | |
edit.append(" ") # match (x[i-1] == y[j-1]) | |
else: | |
edit.append("*") # substitution | |
_x.append(x[i - 1]) | |
_y.append(y[j - 1]) | |
i -= 1 | |
j -= 1 | |
elif direction == 1: | |
edit.append("+") # insertion (to x) | |
_x.append(" ") | |
_y.append(y[j - 1]) | |
j -= 1 | |
elif direction == 2: | |
edit.append("-") # deletion (from x) | |
_x.append(x[i - 1]) | |
_y.append(" ") | |
i -= 1 | |
prevcost = mincost | |
edit.reverse() | |
_x.reverse() | |
_y.reverse() | |
return _x, _y, edit | |
def _test(s1, s2, expected_ed: int = None): | |
d, D = levenshtein(s1, s2) | |
print("S1 = {}".format(s1)) | |
print("S2 = {}".format(s2)) | |
print("Distance = {}\n".format(d)) | |
_s1, _s2, edit = backtrace(s1, s2, D) | |
print("Alignment") | |
for a, b, op in zip(_s1, _s2, edit): | |
print(" {} {} {}".format(a, op, b)) | |
print("\n") | |
if expected_ed: | |
assert d == expected_ed and edit is not None | |
else: | |
assert edit is not None | |
def test_basic(): | |
x = "kitten" | |
y = "sitting" | |
_test(x, y, 5) | |
def test_CJK1(): | |
x = "上海商業銀行有限公司" | |
y = "__上海商業銀行洧限公司" | |
_test(x, y) | |
def test_empty_s1(): | |
x = "" | |
y = "sitting" | |
_test(x, y, 7) | |
def test_empty_s2(): | |
x = "kitten" | |
y = "" | |
_test(x, y, 6) | |
def test_short_vs_long(): | |
x = "kitten" | |
y = "the kitten is sitting on a mat" | |
_test(x, y) | |
def test_long_vs_short(): | |
x = "the kitten is sitting on a mat" | |
y = "kitten" | |
_test(x, y) | |
if __name__ == '__main__': | |
tests = [test_basic, test_CJK1, test_empty_s1, test_empty_s2, test_short_vs_long, test_long_vs_short] | |
for testcase in tests: | |
try: | |
print("{}".format(testcase.__name__)) | |
print("-" * len(testcase.__name__)) | |
testcase() | |
except AssertionError: | |
print("FAILED: testcase= {}, cause=AssertionError".format(testcase.__name__)) | |
except Exception as e: | |
print("FAILED: testcase= {}, cause={}".format(testcase.__name__, str(e))) | |
finally: | |
print("") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for sharing this! Just one suggestion for the backtrace function: the current implementation seems to work fine until any of the two input strings is an empty string, causing index to go out of range. This happens because
direction = np.argmin([D[i - 1, j - 1], D[i, j - 1], D[i - 1, j]])
will always prioritzes substitution over insertion and deletion, even if they all have the same value. Unfortunately, when one input string is empty, insertion/deletion should take precedence.
My workaround for now is to do this instead:
`
`
Just my two cents. You may have more elegent solutions than this.