Created
February 20, 2025 13:26
-
-
Save skannan-maf/ecc0d124d1b6b98b071150ce08483dab to your computer and use it in GitHub Desktop.
Fuzzy match 2 list of strings
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ngram | |
def fuzzy_match_list_of_strings( | |
l1, | |
l2, | |
common_words=[], | |
verbose=True | |
): | |
''' | |
Accepts 2 lists of strings as arguments and optionally "common_words" as 3rd argument. | |
Report case-insensitive fuzzy matches in a dictionary! | |
The dictionary will provide 2-way mappings (i.e. K1 -> K2 and K2 -> K1 both exist in the dict) | |
For those that did not match, report them in 2 separate lists | |
First list that contains unmatched strings l1 | |
Second for l2 | |
Words in "common_words" argument are ignored while matching | |
NOTE: | |
All matches are case-insensitive | |
''' | |
ngrams_N = 3 | |
string_to_be_debugged = 'ENTER THE NAME TO DEBUG HERE' | |
def canonical_form(l1): | |
new_l1 = [] | |
for l in l1: | |
l = ' ' + str.lower(l) + ' ' | |
for w in common_words: | |
w = str.lower(w) | |
if w in l: | |
idx = l.index(w) | |
l = l[0:idx] + l[(idx+len(w)):] | |
new_l1.append(l.strip()) | |
return new_l1 | |
## This code block should be together | |
new_l1 = canonical_form(l1) | |
new_l2 = canonical_form(l2) | |
def L1(l): | |
return l1[new_l1.index(l)] | |
def L2(l): | |
return l2[new_l2.index(l)] | |
# Compare items in l1 with l2 and produce matching dict | |
def compare(l1, l2): | |
d = {} | |
for l in l1: | |
best_l2_match = None | |
best_l2_match_score = 0 | |
for c in l2: | |
this_score = ngram.NGram.compare(l, c, N=ngrams_N) | |
if (l == canonical_form([string_to_be_debugged])[0]) and (this_score >= 0.30): | |
print('Matching {} With {} score = {}'.format(l, c, this_score)) | |
if this_score > best_l2_match_score: | |
best_l2_match = c | |
best_l2_match_score = this_score | |
if best_l2_match is not None: | |
d[l] = best_l2_match | |
if l == canonical_form([string_to_be_debugged])[0]: | |
print('Best match for {} = {}'.format(string_to_be_debugged, best_l2_match)) | |
return d | |
l1_dict = compare(new_l1, new_l2) # Best matching string in L2 for L1 | |
l2_dict = compare(new_l2, new_l1) # Best matching string in L1 for L2 | |
# Compile results | |
match_dict = {} | |
unmatched_l1 = [] | |
unmatched_l2 = [] | |
for l in new_l1: | |
found_match = False | |
if l in l1_dict: | |
# then l1_dict[1] has to be in l2_dict because of nGram similarity is commutative | |
if l2_dict[l1_dict[l]] == l: | |
# Best match from both sides; Made for each other | |
score = ngram.NGram.compare(l, l1_dict[l], N=ngrams_N) | |
match_dict[L1(l)] = {'match': L2(l1_dict[l]), 'score': score} | |
found_match = True | |
else: | |
if verbose: | |
print(f'{L1(l)} in L1 matched {L2(l1_dict[l])} in L2 but it matched {L1(l2_dict[l1_dict[l]])}') | |
if found_match == False: | |
unmatched_l1.append(L1(l)) | |
for l in new_l2: | |
found_match = False | |
if l in l2_dict: | |
# then l2_dict[1] has to be in l1_dict because of nGram similarity is commutative | |
if l1_dict[l2_dict[l]] == l: | |
# Best match from both sides; Made for each other | |
score = ngram.NGram.compare(l, l2_dict[l], N=ngrams_N) | |
match_dict[L2(l)] = {'match': L1(l2_dict[l]), 'score': score} | |
found_match = True | |
else: | |
if verbose: | |
print(f'{L2(l)} in L2 matched {L1(l2_dict[l])} in L1 but it matched {L2(l1_dict[l2_dict[l]])}') | |
if found_match == False: | |
unmatched_l2.append(L2(l)) | |
return match_dict, unmatched_l1, unmatched_l2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment