Last active
March 12, 2020 10:03
-
-
Save alvations/989dacca76c820c48ef18c4637515b39 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import lru_cache | |
from itertools import product | |
@lru_cache(maxsize=4095) | |
def ld(s, t): | |
""" | |
Levenshtein distance memoized implementation from Rosetta code: | |
https://rosettacode.org/wiki/Levenshtein_distance#Python | |
""" | |
if not s: return len(t) | |
if not t: return len(s) | |
if s[0] == t[0]: return ld(s[1:], t[1:]) | |
l1 = ld(s, t[1:]) # Deletion. | |
l2 = ld(s[1:], t) # Insertion. | |
l3 = ld(s[1:], t[1:]) # Substitution. | |
return 1 + min(l1, l2, l3) | |
def find_shifts(hyp, ref): | |
"""Find possible shifts in hypothesis.""" | |
hyp_len, ref_len = len(hyp), len(ref) | |
for i, j in product(range(hyp_len), range(ref_len)): | |
if i == j: # Skip words in the same position. | |
continue | |
# When word matches. | |
if hyp[i] == ref[j]: | |
# Find the longest matching phrase from this position | |
l = 0 | |
for l, (h, r) in enumerate(zip(hyp[i:], ref[j:])): | |
if h != r: | |
break | |
l += 1 | |
# Compute the shifted hypothesis. | |
shifted_hyp = hyp[:i] + hyp[i+l:] | |
shifted_hyp[j:j] = hyp[i:i+l] | |
yield shifted_hyp | |
def shift(hyp, ref): | |
original = ld(tuple(hyp), tuple(ref)) | |
# Find the lowest possible shift and it distance. | |
scores = [] | |
for shifted_hyp in find_shifts(hyp, ref): | |
shifted_dist = ld(tuple(shifted_hyp), tuple(ref)) | |
scores.append((original - shifted_dist, shifted_hyp)) | |
# Return original hypothesis if shift is not better. | |
return sorted(scores)[-1] if scores else (0, hyp) | |
def ter(hyp, ref): | |
# Initialize no. of edits, e. | |
e = 0 | |
while True: | |
# Find shift, s, that most reduces min-edit-distance(h', r) | |
delta, s = shift(hyp, ref) | |
# until no shifts that reduce edit distance remain | |
if delta <= 0: | |
break | |
# if shift reduces edit distance, then | |
# h' <- apply s to h' | |
hyp = s | |
# e <- e + 1 | |
e += 1 | |
# e <- e + min-edit-distance(h', r) | |
e += ld(tuple(hyp), tuple(ref)) | |
return e / len(ref) | |
""" | |
>>> hyp = u'THIS WEEK THE SAUDIS denied information published in the new york times'.split() | |
>>> ref = u'SAUDI ARABIA denied THIS WEEK information published in the AMERICAN new york times'.split() | |
>>> ter(hyp, ref) | |
0.3076923076923077 | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment