Skip to content

Instantly share code, notes, and snippets.

@skannan-maf
Created February 20, 2025 13:26
Show Gist options
  • Save skannan-maf/ecc0d124d1b6b98b071150ce08483dab to your computer and use it in GitHub Desktop.
Save skannan-maf/ecc0d124d1b6b98b071150ce08483dab to your computer and use it in GitHub Desktop.
Fuzzy match two lists of strings
import ngram
def fuzzy_match_list_of_strings(
    l1,
    l2,
    common_words=None,
    verbose=True
):
    '''
    Pair up the strings of two lists using case-insensitive n-gram similarity.

    Every string is first reduced to a canonical form: it is lower-cased, the
    first occurrence of each entry of "common_words" (also lower-cased, matched
    as a plain substring) is removed, and surrounding whitespace is stripped.
    Canonical forms are scored against each other with ngram.NGram.compare
    (N=3). Two strings are reported as a match only when each one is the
    other's best-scoring candidate.

    Arguments:
        l1, l2       -- sequences of strings to be matched against each other.
        common_words -- optional iterable of strings ignored while matching.
                        None (the default) means no words are ignored.
        verbose      -- when True, print a line for every string whose best
                        candidate preferred a different partner.

    Returns a tuple (match_dict, unmatched_l1, unmatched_l2):
        match_dict   -- 2-way mapping keyed by the ORIGINAL strings; both
                        K1 -> K2 and K2 -> K1 exist, each value being
                        {'match': <original partner>, 'score': <similarity>}.
        unmatched_l1 -- original strings of l1 without a mutual match.
        unmatched_l2 -- original strings of l2 without a mutual match.

    NOTE:
    All matches are case-insensitive.
    Strings that share a canonical form (e.g. "Apple" and "apple") are each
    reported under their own original spelling; when such a canonical form is
    needed as a partner, the first original that produced it is used.
    '''
    ngrams_N = 3
    # Debugging hook: put a string here to trace how it is scored.
    string_to_be_debugged = 'ENTER THE NAME TO DEBUG HERE'

    # Lower-case the common words once instead of once per input string.
    # Materialising the list also lets callers pass a one-shot iterable.
    if common_words is None:
        lowered_common_words = []
    else:
        lowered_common_words = [w.lower() for w in common_words]

    def canonical_form(strings):
        '''Return the canonical (comparison) form of every string, in order.'''
        canonical = []
        for s in strings:
            # Pad with spaces so that common words written with surrounding
            # blanks (e.g. ' inc ') also match at the ends of the string.
            s = ' ' + s.lower() + ' '
            for w in lowered_common_words:
                # Only the first occurrence of each common word is dropped.
                s = s.replace(w, '', 1)
            canonical.append(s.strip())
        return canonical

    def first_original_by_canonical(originals, canonical):
        '''Map each canonical form to the first original that produced it.'''
        lookup = {}
        for original, canon in zip(originals, canonical):
            lookup.setdefault(canon, original)
        return lookup

    new_l1 = canonical_form(l1)
    new_l2 = canonical_form(l2)
    original_l1 = first_original_by_canonical(l1, new_l1)
    original_l2 = first_original_by_canonical(l2, new_l2)

    # Loop-invariant: canonical form of the string being traced.
    debug_target = canonical_form([string_to_be_debugged])[0]

    def compare(sources, candidates):
        '''
        For each source string find its best-scoring candidate.
        Returns {source: (best_candidate, score)}; sources whose score against
        every candidate is 0 are left out. On ties the earliest candidate wins.
        '''
        best = {}
        for s in sources:
            best_candidate = None
            best_score = 0
            for c in candidates:
                this_score = ngram.NGram.compare(s, c, N=ngrams_N)
                if (s == debug_target) and (this_score >= 0.30):
                    print('Matching {} With {} score = {}'.format(s, c, this_score))
                if this_score > best_score:
                    best_candidate = c
                    best_score = this_score
            if best_candidate is not None:
                best[s] = (best_candidate, best_score)
            if s == debug_target:
                print('Best match for {} = {}'.format(string_to_be_debugged, best_candidate))
        return best

    l1_dict = compare(new_l1, new_l2)  # Best matching string in L2 for L1
    l2_dict = compare(new_l2, new_l1)  # Best matching string in L1 for L2

    match_dict = {}

    def collect(originals, canonical, forward, backward,
                own_originals, other_originals, own_label, other_label):
        '''
        Record the mutual matches of one side in match_dict and return the
        originals of that side that have no mutual match.
        '''
        unmatched = []
        for original, canon in zip(originals, canonical):
            hit = forward.get(canon)
            if hit is not None:
                partner, score = hit
                # n-gram similarity is expected to be symmetric, so the partner
                # should have a best match of its own; .get() guards anyway.
                back = backward.get(partner)
                if back is not None and back[0] == canon:
                    # Best match from both sides; made for each other.
                    match_dict[original] = {
                        'match': other_originals[partner],
                        'score': score,
                    }
                    continue
                if verbose:
                    rival = own_originals[back[0]] if back is not None else None
                    print(f'{original} in {own_label} matched {other_originals[partner]} in {other_label} but it matched {rival}')
            unmatched.append(original)
        return unmatched

    # Compile results (l1 first so match_dict keeps l1 keys ahead of l2 keys).
    unmatched_l1 = collect(l1, new_l1, l1_dict, l2_dict,
                           original_l1, original_l2, 'L1', 'L2')
    unmatched_l2 = collect(l2, new_l2, l2_dict, l1_dict,
                           original_l2, original_l1, 'L2', 'L1')
    return match_dict, unmatched_l1, unmatched_l2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment