skannan-maf · February 20, 2025 13:26
diff --git a/fuzzy_match_list_of_strings.py b/fuzzy_match_list_of_strings.py
 import ngram

 def fuzzy_match_list_of_strings(
         l1,
         l2,
         common_words=None,
         verbose=True
     ):
     '''
     Fuzzy-match the strings of two lists against each other (case-insensitive).

     Each string is first reduced to a canonical form: it is lower-cased,
     padded with one space on each side, the first occurrence of every entry
     of "common_words" (lower-cased too) is cut out as a plain substring, and
     the result is stripped. Canonical forms are then scored pairwise with
     ngram.NGram.compare using 3-grams.
     A pair is reported as a match only when each side is the other's
     best-scoring candidate and the score is greater than 0. On equal scores
     the earliest candidate wins, so when several strings of one list share a
     canonical form only the first of them can be matched; the others are
     reported as unmatched.

     Arguments:
       l1, l2        The two collections of strings to match
       common_words  Optional substrings that are ignored while matching.
                     Because of the padding, an entry such as " inc " only
                     hits a whole word.
       verbose       When True, print a line for every string whose best
                     candidate preferred a different partner

     Returns a tuple (match_dict, unmatched_l1, unmatched_l2):
       match_dict    Original string -> {'match': partner, 'score': score}.
                     Mappings exist in both directions (K1 -> K2 and K2 -> K1).
       unmatched_l1  Strings of l1 without a mutual best match, in input order
       unmatched_l2  Same for l2
     NOTE:
     Relies on the module-level "import ngram".
     '''
     ngrams_N = 3

     string_to_be_debugged = 'ENTER THE NAME TO DEBUG HERE'

     # Materialise the inputs so they can be addressed by position below.
     l1 = list(l1)
     l2 = list(l2)
     # None replaces the former mutable default; lower-case the words only once.
     ignored_words = [] if common_words is None else [str.lower(w) for w in common_words]

     def canonical_form(strings):
         # Lower-case, pad, drop the first occurrence of each ignored word, strip.
         canonical = []
         for s in strings:
             s = ' ' + str.lower(s) + ' '
             for w in ignored_words:
                 idx = s.find(w)
                 if idx >= 0:
                     s = s[:idx] + s[idx + len(w):]
             canonical.append(s.strip())
         return canonical

     new_l1 = canonical_form(l1)
     new_l2 = canonical_form(l2)
     # Computed once instead of once per compared pair.
     debug_key = canonical_form([string_to_be_debugged])[0]

     def compare(sources, targets):
         # For every source position return (position of best target, score),
         # or (None, 0) when no target scores above 0. Working with positions
         # instead of canonical strings keeps duplicates apart.
         best = []
         for s in sources:
             best_idx = None
             best_score = 0
             for idx, c in enumerate(targets):
                 this_score = ngram.NGram.compare(s, c, N=ngrams_N)
                 if s == debug_key and this_score >= 0.30:
                     print('Matching {} With {} score = {}'.format(s, c, this_score))
                 # Strict ">" keeps the earliest candidate on ties.
                 if this_score > best_score:
                     best_idx = idx
                     best_score = this_score
             if best_idx is not None and s == debug_key:
                 print('Best match for {} = {}'.format(string_to_be_debugged, targets[best_idx]))
             best.append((best_idx, best_score))
         return best

     best_for_l1 = compare(new_l1, new_l2)  # Best matching position in l2 for each l1 item
     best_for_l2 = compare(new_l2, new_l1)  # Best matching position in l1 for each l2 item

     # Compile results

     match_dict = {}

     def collect(own, other, own_best, other_best, own_name, other_name):
         # Record mutual best matches in match_dict; return the rest of "own".
         unmatched = []
         for i, (j, score) in enumerate(own_best):
             if j is not None:
                 rival = other_best[j][0]
                 if rival == i:
                     # Best match from both sides; Made for each other
                     match_dict[own[i]] = {'match': other[j], 'score': score}
                     continue
                 # rival is None only if the similarity were not symmetric.
                 if verbose and rival is not None:
                     print(f'{own[i]} in {own_name} matched {other[j]} in {other_name} but it matched {own[rival]}')
             unmatched.append(own[i])
         return unmatched

     unmatched_l1 = collect(l1, l2, best_for_l1, best_for_l2, 'L1', 'L2')
     unmatched_l2 = collect(l2, l1, best_for_l2, best_for_l1, 'L2', 'L1')

     return match_dict, unmatched_l1, unmatched_l2
	import ngram

	def fuzzy_match_list_of_strings(
	        l1,
	        l2,
	        common_words=None,
	        verbose=True
	    ):
	    '''
	    Fuzzy-match the strings of two lists against each other (case-insensitive).

	    Each string is first reduced to a canonical form: it is lower-cased,
	    padded with one space on each side, the first occurrence of every entry
	    of "common_words" (lower-cased too) is cut out as a plain substring, and
	    the result is stripped. Canonical forms are then scored pairwise with
	    ngram.NGram.compare using 3-grams.
	    A pair is reported as a match only when each side is the other's
	    best-scoring candidate and the score is greater than 0. On equal scores
	    the earliest candidate wins, so when several strings of one list share a
	    canonical form only the first of them can be matched; the others are
	    reported as unmatched.

	    Arguments:
	      l1, l2        The two collections of strings to match
	      common_words  Optional substrings that are ignored while matching.
	                    Because of the padding, an entry such as " inc " only
	                    hits a whole word.
	      verbose       When True, print a line for every string whose best
	                    candidate preferred a different partner

	    Returns a tuple (match_dict, unmatched_l1, unmatched_l2):
	      match_dict    Original string -> {'match': partner, 'score': score}.
	                    Mappings exist in both directions (K1 -> K2 and K2 -> K1).
	      unmatched_l1  Strings of l1 without a mutual best match, in input order
	      unmatched_l2  Same for l2
	    NOTE:
	    Relies on the module-level "import ngram".
	    '''
	    ngrams_N = 3

	    string_to_be_debugged = 'ENTER THE NAME TO DEBUG HERE'

	    # Materialise the inputs so they can be addressed by position below.
	    l1 = list(l1)
	    l2 = list(l2)
	    # None replaces the former mutable default; lower-case the words only once.
	    ignored_words = [] if common_words is None else [str.lower(w) for w in common_words]

	    def canonical_form(strings):
	        # Lower-case, pad, drop the first occurrence of each ignored word, strip.
	        canonical = []
	        for s in strings:
	            s = ' ' + str.lower(s) + ' '
	            for w in ignored_words:
	                idx = s.find(w)
	                if idx >= 0:
	                    s = s[:idx] + s[idx + len(w):]
	            canonical.append(s.strip())
	        return canonical

	    new_l1 = canonical_form(l1)
	    new_l2 = canonical_form(l2)
	    # Computed once instead of once per compared pair.
	    debug_key = canonical_form([string_to_be_debugged])[0]

	    def compare(sources, targets):
	        # For every source position return (position of best target, score),
	        # or (None, 0) when no target scores above 0. Working with positions
	        # instead of canonical strings keeps duplicates apart.
	        best = []
	        for s in sources:
	            best_idx = None
	            best_score = 0
	            for idx, c in enumerate(targets):
	                this_score = ngram.NGram.compare(s, c, N=ngrams_N)
	                if s == debug_key and this_score >= 0.30:
	                    print('Matching {} With {} score = {}'.format(s, c, this_score))
	                # Strict ">" keeps the earliest candidate on ties.
	                if this_score > best_score:
	                    best_idx = idx
	                    best_score = this_score
	            if best_idx is not None and s == debug_key:
	                print('Best match for {} = {}'.format(string_to_be_debugged, targets[best_idx]))
	            best.append((best_idx, best_score))
	        return best

	    best_for_l1 = compare(new_l1, new_l2)  # Best matching position in l2 for each l1 item
	    best_for_l2 = compare(new_l2, new_l1)  # Best matching position in l1 for each l2 item

	    # Compile results

	    match_dict = {}

	    def collect(own, other, own_best, other_best, own_name, other_name):
	        # Record mutual best matches in match_dict; return the rest of "own".
	        unmatched = []
	        for i, (j, score) in enumerate(own_best):
	            if j is not None:
	                rival = other_best[j][0]
	                if rival == i:
	                    # Best match from both sides; Made for each other
	                    match_dict[own[i]] = {'match': other[j], 'score': score}
	                    continue
	                # rival is None only if the similarity were not symmetric.
	                if verbose and rival is not None:
	                    print(f'{own[i]} in {own_name} matched {other[j]} in {other_name} but it matched {own[rival]}')
	            unmatched.append(own[i])
	        return unmatched

	    unmatched_l1 = collect(l1, l2, best_for_l1, best_for_l2, 'L1', 'L2')
	    unmatched_l2 = collect(l2, l1, best_for_l2, best_for_l1, 'L2', 'L1')

	    return match_dict, unmatched_l1, unmatched_l2