virgulvirgul · December 15, 2020 15:27
diff --git a/fuzzymatch_titles.py b/fuzzymatch_titles.py
 from fuzzywuzzy import fuzz
 from fuzzywuzzy import process
 from collections import Counter

 # Fuzzy-match every title listed in file A against the titles listed in
 # file B, print the best-scoring B title for each A title, and finally report
 # which B titles were picked most often.
 A_title_file = "/tmp/A_titles.txt"
 B_title_file = "/tmp/B_titles.txt"

 # Open the files and get the titles (one title per line)
 A_titles = []
 with open(A_title_file) as f:
    A_titles = f.readlines()

 B_titles = []
 with open(B_title_file) as f:
    B_titles = f.readlines()

 # Strip the B titles once, up front, instead of once per A title inside the
 # nested loop, and drop blank lines: an empty line is not a title.
 B_candidates = [
    B_line for B_line in (raw.strip() for raw in B_titles) if B_line
 ]

 # Will be used to check whether a match occurs multiple times, i.e. whether
 # several titles in A were mapped onto the same title in B
 best_matches_t = []

 # For each title in A, find the closest match in B
 for A_t in A_titles:
    A_t = A_t.strip()
    if not A_t:
        # Blank line (e.g. a trailing newline at the end of the file)
        continue
    max_simil = 0
    max_match_t = ""
    for B_t in B_candidates:
        # Fuzzy matching
        simil = fuzz.token_sort_ratio(A_t, B_t)
        # Store only title in B with max. similarity; the strict comparison
        # keeps the first B title that reaches the highest score
        if simil > max_simil:
            max_simil = simil
            max_match_t = B_t
    print("%i: %s | %s" % (max_simil, A_t, max_match_t))
    # Record the winner. The empty placeholder means "nothing scored above
    # 0"; it is not a B title, so it must not show up in the tally below.
    if max_match_t:
        best_matches_t.append(max_match_t)

 top_n_most_common = 3
 c = Counter(best_matches_t)
 # Get the most common hits; a count above 1 means several A titles share
 # the same best match in B
 most_common = c.most_common(top_n_most_common)
 print("\nTop %i most common matches in B: " % top_n_most_common)
 print(most_common)
	from fuzzywuzzy import fuzz
	from fuzzywuzzy import process
	from collections import Counter

	# Fuzzy-match every title listed in file A against the titles listed in
	# file B, print the best-scoring B title for each A title, and finally report
	# which B titles were picked most often.
	A_title_file = "/tmp/A_titles.txt"
	B_title_file = "/tmp/B_titles.txt"

	# Open the files and get the titles (one title per line)
	A_titles = []
	with open(A_title_file) as f:
	    A_titles = f.readlines()

	B_titles = []
	with open(B_title_file) as f:
	    B_titles = f.readlines()

	# Strip the B titles once, up front, instead of once per A title inside the
	# nested loop, and drop blank lines: an empty line is not a title.
	B_candidates = [
	    B_line for B_line in (raw.strip() for raw in B_titles) if B_line
	]

	# Will be used to check whether a match occurs multiple times, i.e. whether
	# several titles in A were mapped onto the same title in B
	best_matches_t = []

	# For each title in A, find the closest match in B
	for A_t in A_titles:
	    A_t = A_t.strip()
	    if not A_t:
	        # Blank line (e.g. a trailing newline at the end of the file)
	        continue
	    max_simil = 0
	    max_match_t = ""
	    for B_t in B_candidates:
	        # Fuzzy matching
	        simil = fuzz.token_sort_ratio(A_t, B_t)
	        # Store only title in B with max. similarity; the strict comparison
	        # keeps the first B title that reaches the highest score
	        if simil > max_simil:
	            max_simil = simil
	            max_match_t = B_t
	    # Plain "|" separator: a backslash before it is not a valid Python
	    # escape and would be printed literally.
	    print("%i: %s | %s" % (max_simil, A_t, max_match_t))
	    # Record the winner. The empty placeholder means "nothing scored above
	    # 0"; it is not a B title, so it must not show up in the tally below.
	    if max_match_t:
	        best_matches_t.append(max_match_t)

	top_n_most_common = 3
	c = Counter(best_matches_t)
	# Get the most common hits; a count above 1 means several A titles share
	# the same best match in B
	most_common = c.most_common(top_n_most_common)
	print("\nTop %i most common matches in B: " % top_n_most_common)
	print(most_common)