Created
January 18, 2016 12:26
-
-
Save teh/dac63b8260b106f7f18e to your computer and use it in GitHub Desktop.
wagner-fisher in numba
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
import timeit | |
import numba | |
@numba.autojit(locals=dict( | |
best_k=numba.int64, | |
pos=numba.int64, | |
ins_del_count=numba.int64, | |
i=numba.int64, | |
j=numba.int64, | |
)) | |
def wagner_fisher(a, b, d): | |
""" | |
:param a: needle | |
:param b: string to search for the needle | |
:param d: a distance matrix of size at least (len(a), len(b)). | |
initialization not required. | |
:returns: start-offset, end-offset, best_k within b | |
""" | |
# a == needle | |
# b == data | |
len_a = len(a) | |
len_b = len(b) | |
for i in range(len_a + 1): | |
d[i, 0] = i | |
for j in range(len_b + 1): | |
d[0, j] = j | |
d[0, j] = 0 | |
# remember the best match we found | |
best_k = 2**20 | |
pos = -1 | |
for j in range(1, len_b + 1): | |
for i in range(1, len_a + 1): | |
x0 = d[i - 1, j -1] + (0 if a[i - 1] == b[j - 1] else 1) | |
x1 = d[i -1, j] + 1 | |
x2 = d[i, j - 1] + 1 | |
# optimized version of d[i, j] = min(x0, x1, x2) | |
if x0 < x1 and x0 < x2: | |
d[i, j] = x0 | |
elif x1 < x2: | |
d[i, j] = x1 | |
else: | |
d[i, j] = x2 | |
# Use <= to find the longest match | |
if d[len_a, j] < best_k: | |
best_k = d[len_a, j] | |
pos = j | |
# Discover number of insertions/deletions/substitutions by | |
# backtracking. Need those to determine the correct offset of the | |
# found needle (e.g. insertions push the end back). | |
i, j = len_a, pos | |
ins_del_count = 0 | |
while i > 0: | |
if d[i - 1, j] < d[i, j]: | |
i -= 1 | |
ins_del_count -= 1 | |
elif d[i, j - 1] < d[i, j]: | |
j -= 1 | |
ins_del_count += 1 | |
else: | |
i -= 1 | |
j -= 1 | |
return pos - len_a - ins_del_count, pos, best_k |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment