Skip to content

Instantly share code, notes, and snippets.

@kzinmr
Created July 21, 2021 06:05
Show Gist options
  • Save kzinmr/1b31b0b84c9a5569f317bde0d8c927fb to your computer and use it in GitHub Desktop.
Save kzinmr/1b31b0b84c9a5569f317bde0d8c927fb to your computer and use it in GitHub Desktop.
from textdistance import damerau_levenshtein
import mojimoji
import regex as re
def partial_ratio(s1, s2, levenshtein_ratio=damerau_levenshtein.normalized_similarity):
    """Return the best score of the shorter string against any window of the longer.

    Both inputs are normalized first: full-width ASCII letters and digits are
    folded to half-width (kana is left untouched), the text is lower-cased and
    all whitespace is removed. The shorter normalized string is then compared
    with every contiguous substring of the longer one that has the same
    length, and the highest score is returned. When both normalized strings
    have the same length this is a single full-string comparison.

    Args:
        s1: First string.
        s2: Second string.
        levenshtein_ratio: Callable ``(a, b) -> score`` used to compare two
            equal-length candidates. Defaults to the normalized
            Damerau-Levenshtein similarity from ``textdistance``.

    Returns:
        The maximum value produced by ``levenshtein_ratio`` over all windows.
    """

    def _preprocess(s):
        # Fold width and case and drop whitespace so that purely
        # typographic differences do not lower the score.
        s = mojimoji.zen_to_han(s, kana=False, ascii=True, digit=True)
        return re.sub(r'\s+', '', s.lower())

    def _match_substring(shorter, longer):
        window = len(shorter)
        # Last valid start index; the "+ 1" below keeps the final window
        # (the suffix of ``longer``) in the candidate set.
        last_start = len(longer) - window
        # NOTE: if ``shorter`` is empty every window is the empty string, so
        # the result is whatever ``levenshtein_ratio('', '')`` returns.
        return max(
            levenshtein_ratio(shorter, longer[start:start + window])
            for start in range(last_start + 1)
        )

    s1 = _preprocess(s1)
    s2 = _preprocess(s2)
    # On a length tie s1 stays the "shorter" argument, which preserves the
    # argument order of the full-string comparison.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    return _match_substring(shorter, longer)
def fuzzy_difference(gs, ps, sim=lambda x, y: int(x == y), threshold=1.):
    """Return the elements of ``gs`` that have no sufficiently similar element in ``ps``.

    This is a set difference ``gs - ps`` where membership is decided by a
    similarity function instead of strict equality: ``g`` counts as present in
    ``ps`` when ``sim(g, p) >= threshold`` for at least one ``p``.

    Args:
        gs: Iterable of hashable items to filter.
        ps: Iterable of items to compare against. May be empty, in which case
            every element of ``gs`` is returned.
        sim: Callable ``(g, p) -> score``. Defaults to exact equality
            (1 if equal, otherwise 0).
        threshold: Minimum score for ``g`` to be considered matched.

    Returns:
        A set containing every ``g`` whose score against all ``p`` is below
        ``threshold``.
    """
    # Materialize once so that ``ps`` may be a one-shot iterator; it is
    # scanned again for every element of ``gs``.
    candidates = list(ps)
    # ``any`` on an empty sequence is False, so an empty ``ps`` yields all of
    # ``gs`` instead of failing on ``max()`` of an empty list.
    return {
        g for g in gs
        if not any(sim(g, p) >= threshold for p in candidates)
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment