Skip to content

Instantly share code, notes, and snippets.

@astoeckel
Created February 27, 2021 02:16
Show Gist options
  • Save astoeckel/88c8a45733dc02670d7a1f037292a372 to your computer and use it in GitHub Desktop.
Save astoeckel/88c8a45733dc02670d7a1f037292a372 to your computer and use it in GitHub Desktop.
Fuzzy Jaccard similarity based search
def fuzzy_find(term, lst, threshold=0.75, shingle_len=3):
# Create a set of n-grams of length shingle_len
def shingle(s):
shingles = []
for t in re.split(' ', s):
for i in range(max(1, len(t) - shingle_len)):
shingles.append(t[i:i+shingle_len])
return set(shingles)
# Computes the Jaccard similarity between two sets
def jaccard_sim(A, B):
return len(A & B) / len(A | B)
# Shingle the search term
term_shingles = shingle(term)
# Search for the best-matching element in lst
best_idx, best_score = None, None
for i, elem in enumerate(lst):
score = jaccard_sim(term_shingles, shingle(elem))
if (i == 0) or (score > best_score):
best_idx = i
best_score = score
# Return the best-matching element if the score is above the threshold
if (not best_idx is None) and (best_score > threshold):
print(term, lst[best_idx], best_score)
return best_idx
return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment