Skip to content

Instantly share code, notes, and snippets.

@virgulvirgul
Forked from claczny/fuzzymatch_titles.py
Created December 15, 2020 15:27
Show Gist options
  • Save virgulvirgul/bd3c5dce12aeab11b7d3d0231282d95b to your computer and use it in GitHub Desktop.
Save virgulvirgul/bd3c5dce12aeab11b7d3d0231282d95b to your computer and use it in GitHub Desktop.
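Python code to fuzzy-match two files of titles (A and B) in order to find titles that are missing from B: when several titles in A share the same best match in B, at least one of them has no counterpart of its own in B. Not very efficient, but it does the job.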
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import Counter
A_title_file = "/tmp/A_titles.txt"
B_title_file = "/tmp/B_titles.txt"


def _read_titles(path):
    """Return the titles stored one per line in *path*.

    Each line is stripped of surrounding whitespace once, here, so callers
    do not have to re-strip inside their loops. Blank lines are skipped
    because they are not titles.
    """
    with open(path) as f:
        stripped = (line.strip() for line in f)
        return [title for title in stripped if title]


def _best_match(title, candidates, scorer):
    """Return ``(score, candidate)`` for the candidate most similar to *title*.

    *scorer* is called as ``scorer(title, candidate)`` and must return a
    number; higher means more similar. On a tie the earliest candidate is
    kept (strict ``>`` comparison). If no candidate scores above 0, the
    result is ``(0, "")``.
    """
    best_score = 0
    best_candidate = ""
    for candidate in candidates:
        score = scorer(title, candidate)
        # Keep only the candidate with the highest similarity seen so far.
        if score > best_score:
            best_score = score
            best_candidate = candidate
    return best_score, best_candidate


def main(a_title_file=None, b_title_file=None, top_n_most_common=3, scorer=None):
    """Fuzzy-match every title in file A against the titles in file B.

    For each title in A, the best-scoring title in B is printed together
    with its score. Afterwards the B titles that were chosen most often are
    printed: a B title chosen for several A titles suggests that some of
    those A titles have no counterpart of their own in B.

    Args:
        a_title_file: Path of the A titles file; defaults to the
            module-level ``A_title_file``.
        b_title_file: Path of the B titles file; defaults to the
            module-level ``B_title_file``.
        top_n_most_common: How many of the most frequently chosen B titles
            to report.
        scorer: Similarity function ``scorer(a, b) -> number``; defaults to
            ``fuzz.token_sort_ratio`` (imported at the top of this file).

    Returns:
        The list of ``(b_title, count)`` pairs that was printed.
    """
    # Defaults are resolved at call time so that changes to the module-level
    # path constants are honoured.
    if a_title_file is None:
        a_title_file = A_title_file
    if b_title_file is None:
        b_title_file = B_title_file
    if scorer is None:
        scorer = fuzz.token_sort_ratio

    A_titles = _read_titles(a_title_file)
    B_titles = _read_titles(b_title_file)

    # Collects the chosen B title for every A title; a B title that shows up
    # more than once was the best match for several A titles.
    best_matches_t = []
    for A_t in A_titles:
        max_simil, max_match_t = _best_match(A_t, B_titles, scorer)
        print("%i: %s | %s" % (max_simil, A_t, max_match_t))
        best_matches_t.append(max_match_t)

    # Get the most common hits; these make repeated matches easy to spot.
    most_common = Counter(best_matches_t).most_common(top_n_most_common)
    print("\nTop %i most common matches in B: " % top_n_most_common)
    print(most_common)
    return most_common


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment