-
-
Save virgulvirgul/bd3c5dce12aeab11b7d3d0231282d95b to your computer and use it in GitHub Desktop.
Python script that fuzzy-matches two files of titles (A and B) to find titles missing from B, i.e., cases where several titles in A end up with the same best match in B. Not very efficient, but it does the job.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# Fuzzy-match every title in file A against the titles in file B and report
# which titles in B are chosen as the best match most often. A title in B
# that is picked by several titles in A hints at a title missing from B
# (or at repeated titles in A).
A_title_file = "/tmp/A_titles.txt"
B_title_file = "/tmp/B_titles.txt"

# Read the raw lines (one title per line) from both files.
with open(A_title_file) as f:
    A_titles = f.readlines()
with open(B_title_file) as f:
    B_titles = f.readlines()

# Strip B's titles once, up front, instead of once per title in A.
# Blank lines are dropped because they are not titles.
B_candidates = [t for t in (line.strip() for line in B_titles) if t]

# Best match in B for each title in A, in the order the titles appear in A.
# A match that occurs several times indicates a missing title in B.
best_matches_t = []

for A_t in A_titles:
    A_t = A_t.strip()
    if not A_t:
        # Blank lines are not titles; every blank line would otherwise be
        # scored and recorded as if it were one, distorting the counts below.
        continue

    max_simil = 0
    max_match_t = ""
    for B_t in B_candidates:
        simil = fuzz.token_sort_ratio(A_t, B_t)
        # Strict '>' keeps the first of equally scored candidates; if no
        # candidate scores above 0, the match stays the empty string.
        if simil > max_simil:
            max_simil = simil
            max_match_t = B_t

    print("%i: %s | %s" % (max_simil, A_t, max_match_t))
    best_matches_t.append(max_match_t)

# Summarise the most frequently chosen titles in B.
top_n_most_common = 3
c = Counter(best_matches_t)
most_common = c.most_common(top_n_most_common)
print("\nTop %i most common matches in B: " % top_n_most_common)
print(most_common)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment