Created
August 24, 2021 15:34
-
-
Save sysopfb/80ba7accb7d26d9a38aee8c4df785984 to your computer and use it in GitHub Desktop.
Sift4 in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Based on: https://gist.github.com/lbenedix/8275d01c2289a7a20d2c6c27ee8ae68e | |
Moved an if block and added a double empty string check for input validation | |
""" | |
def sift4_simple(s1, s2, max_offset=5): | |
""" | |
:param s1: string to compare | |
:param s2: string to compare | |
:param max_offset: number of characters to search for matching letters | |
:return: edit distance | |
""" | |
if len(s1) == 0: | |
if len(s2) == 0: | |
return 0 | |
return len(s2) | |
elif len(s2) == 0: | |
return len(s1) | |
l1 = len(s1) | |
l2 = len(s2) | |
c1 = 0 # cursor for s1 | |
c2 = 0 # cursor for s1 | |
lcss = 0 # longest common subsequence | |
local_cs = 0 # local common subsequence | |
while c1 < l1 and c2 < l2: | |
if s1[c1] == s2[c2]: | |
local_cs += 1 | |
else: | |
lcss += local_cs | |
local_cs = 0 | |
if c1 != c2: | |
c1 = c2 = max(c1, c2) | |
for i in range(max_offset): | |
if c1 + i >= l1 or c2 + i >= l2: | |
break | |
if len(s1) > c1 + i and s1[c1 + i] == s2[c2]: | |
c1 += i | |
local_cs += 1 | |
break | |
if len(s2) > c1 + i and s1[c1] == s2[c2 + i]: | |
c2 += i | |
local_cs += 1 | |
break | |
c1 += 1 | |
c2 += 1 | |
lcss += local_cs | |
return round(max(l1, l2) - lcss) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment