Created
March 19, 2016 18:34
-
-
Save n-bar/0a179f782ea99b05e612 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://ru.wikibooks.org/wiki/Реализации_алгоритмов/Расстояние_Левенштейна#Python | |
def distance(a, b): | |
"Calculates the Levenshtein distance between a and b." | |
n, m = len(a), len(b) | |
if n > m: | |
# Make sure n <= m, to use O(min(n,m)) space | |
a, b = b, a | |
n, m = m, n | |
current_row = range(n+1) # Keep current and previous row, not entire matrix | |
for i in range(1, m+1): | |
previous_row, current_row = current_row, [i]+[0]*n | |
for j in range(1,n+1): | |
add, delete, change = previous_row[j]+1, current_row[j-1]+1, previous_row[j-1] | |
if a[j-1] != b[i-1]: | |
change += 1 | |
current_row[j] = min(add, delete, change) | |
return current_row[n] | |
# https://habrahabr.ru/post/279585/ | |
def distance_2(text, pattern): | |
"Calculates the Levenshtein distance between text and pattern." | |
text_len, pattern_len = len(text), len(pattern) | |
current_column = range(pattern_len+1) | |
min_value = pattern_len | |
end_pos = 0 | |
for i in range(1, text_len+1): | |
previous_column, current_column = current_column, [0]*(pattern_len+1) # !!! | |
for j in range(1,pattern_len+1): | |
add, delete, change = previous_column[j]+1, current_column[j-1]+1, previous_column[j-1] | |
if pattern[j-1] != text[i-1]: | |
change += 1 | |
current_column[j] = min(add, delete, change) | |
if min_value > current_column[pattern_len]: # !!! | |
min_value = current_column[pattern_len] | |
end_pos = i | |
return min_value, end_pos | |
print distance_2(u'аргумент', u'рудимент') #3, 8 | |
print distance_2(u'весомый аргумент в мою пользу', u'рудимент') #3, 16 | |
def distance_3(text, pattern): | |
min_value, end_pos = distance_2(text, pattern) | |
min_value, start_pos = distance_2(text[end_pos-1::-1], pattern[::-1]) | |
start_pos = end_pos - start_pos | |
return min_value, start_pos, end_pos, text[start_pos:end_pos], pattern | |
print distance_3(u'аргумент', u'рудимент') # 3 3 8 умент рудимент | |
print distance_3(u'весомый аргумент в мою пользу', u'рудимент') # 3 11 16 умент рудимент |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment