Created
November 28, 2012 03:53
-
-
Save rhybroy/4158925 to your computer and use it in GitHub Desktop.
计算字符串相似度(编辑距离算法)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/user/bin/env python | |
# -*- coding: utf-8 -*- | |
class arithmetic(): | |
def __init__(self): | |
pass | |
''' 【编辑距离算法】 【levenshtein distance】 【字符串相似度算法】 ''' | |
def levenshtein(self,first,second): | |
if len(first) > len(second): | |
first,second = second,first | |
if len(first) == 0: | |
return len(second) | |
if len(second) == 0: | |
return len(first) | |
first_length = len(first) + 1 | |
second_length = len(second) + 1 | |
distance_matrix = [range(second_length) for x in range(first_length)] | |
#print distance_matrix | |
for i in range(1,first_length): | |
for j in range(1,second_length): | |
deletion = distance_matrix[i-1][j] + 1 | |
insertion = distance_matrix[i][j-1] + 1 | |
substitution = distance_matrix[i-1][j-1] | |
if first[i-1] != second[j-1]: | |
substitution += 1 | |
distance_matrix[i][j] = min(insertion,deletion,substitution) | |
print distance_matrix | |
return distance_matrix[first_length-1][second_length-1] | |
if __name__ == "__main__": | |
arith = arithmetic() | |
print arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs','GAMBOL00000000000dfasfasfdafsafasfasdfdsa' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, 你的distance_mat初始化成了[[0, 1, 2, ..n], ..., [0, 1, 2, n]],应该初始化成[[0, 1, 2, ..n], [1, 0, ..., 0] ..., [m, 0, ..., 0]], 如果非要写成list comprehension,
distance_matrix = [range(second_length) if x==0 else [ x if y == 0 else 0for y in range(second_length)] for x in range(first_length)]