Skip to content

Instantly share code, notes, and snippets.

@kusold
Created March 18, 2011 17:17
Show Gist options
  • Select an option

  • Save kusold/876460 to your computer and use it in GitHub Desktop.

Select an option

Save kusold/876460 to your computer and use it in GitHub Desktop.
Takes two text files with one String per line. Attempts to match them.
###############################################################################
## Description: FuzzyMatch.py takes two files with one keyword per line as ##
## command line arguments. It then creates a file with 100% ##
## matches called Match100.txt. Next it tries to match the ##
## remaining keywords that are at least 75% similar. If more ##
## than one match occurs, it picks the best match and outputs ##
## it to Match75.csv. ##
## ##
## Note: Currently the two files are hardcoded in. Arguments ##
## will be implemented later. ##
## ##
## Author: Mike Kusold ##
## Created: Feb 16th, 2011 ##
## Version: 1.0 ##
## ##
## Modification Notes ##
##---------------------------------------------------------------------------##
## ##
## ##
## ##
## ##
## ##
## ##
## ##
###############################################################################
#import sys
import difflib
import csv
import time
import re
def ignoredWords(word):
    """Return ``word`` with common filler terms removed.

    The terms removed are "academy of sciences", "university", "univ",
    "pharmaceuticals", "pharmaceutical" and the standalone word "of".
    Matching is case-sensitive and the terms are lower case, so callers
    should pass lower-cased text.

    Args:
        word: The name to filter.

    Returns:
        The filtered string. Surrounding whitespace, including any trailing
        newline, is left untouched.
    """
    # Longer phrases come before the shorter terms they contain, so that
    # "university" is not reduced to "ersity" by the "univ" rule and
    # "academy of sciences" is still intact when it is looked for.
    noise_terms = (
        "academy of sciences",
        "university",
        "univ",
        "pharmaceuticals",
        "pharmaceutical",
    )
    filtered = word
    # Apply every removal to the running result so that several filler
    # terms in the same name are all stripped.
    for term in noise_terms:
        filtered = filtered.replace(term, "")
    # Drop "of" only when it stands alone between spaces; a bare "of"
    # replacement would also damage words such as "office".
    filtered = filtered.replace(" of ", " ")
    return filtered
# Measure execution time
t0 = time.time()


def _read_names(path):
    """Return the lines of ``path``, each one ending with a newline."""
    with open(path, 'r') as handle:
        lines = handle.readlines()
    # readlines() keeps line endings, but the last line of a file may not
    # have one. Without this fix-up that entry could never compare equal to
    # the same name followed by a newline in the other file.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"
    return lines


# Imports and parses the files
# TODO: take the two input paths from the command line instead of
# hard-coding them.
############INPUT FILENAME############
setA = _read_names("comm.names.txt")
############INPUT FILENAME############
setB = _read_names("acad.names.txt")
# 100% Match
# Lower-case both inputs so the exact comparison ignores case, and
# collapse duplicates by storing them as sets.
setA = set(line.lower() for line in setA)
setB = set(line.lower() for line in setB)
# Names present in both files.
setMatch100 = setA & setB
listMatch100 = sorted(setMatch100)
############ OUTPUT FILENAME############
# Each entry is written as-is, with no separator added.
with open("Match100.txt", 'w') as Match100:
    Match100.writelines(listMatch100)
# CSV of near-exact matches: for every name in file A, the single closest
# name in file B with a similarity ratio of at least 0.98.
with open("Match100.csv", 'w') as fMatch100:
    Match100 = csv.writer(fMatch100, dialect='excel')
    Match100.writerow(['File A', 'File B', 'Ratio'])
    # Set iteration order is arbitrary; walking the names in sorted order
    # makes the generated file reproducible from run to run.
    for item in sorted(setA):
        match = difflib.get_close_matches(item, setB, 1, 0.98)
        if match:
            ratio = difflib.SequenceMatcher(None, item, match[0]).ratio()
            # Strip the trailing newline kept from the input files.
            row = [item.rstrip(), match[0].rstrip(), ratio]
            Match100.writerow(row)
# Remove 100% matches from the two lists.
# sorted() turns what is left of each set into an ordered list.
setA_LeftOver = sorted(set(setA).difference(setMatch100))
setB_LeftOver = sorted(set(setB).difference(setMatch100))
# Filter the remaining names to remove common words. Each filtered list is
# index-aligned with its unfiltered counterpart.
setA_LeftOver_Filtered = [ignoredWords(name) for name in setA_LeftOver]
setB_LeftOver_Filtered = [ignoredWords(name) for name in setB_LeftOver]
# Return the best match for each leftover name of file A among the leftover
# names of file B, provided the filtered names are at least 75% similar.
#
# get_close_matches() returns the filtered B name, so build a lookup from
# each filtered name back to the unfiltered name it was derived from. When
# several names filter to the same text, the first one in sorted order wins.
setB_OriginalByFiltered = {}
for original, filtered in zip(setB_LeftOver, setB_LeftOver_Filtered):
    setB_OriginalByFiltered.setdefault(filtered, original)
############OUTPUT FILENAME############
with open("Match75.csv", 'w') as fMatch75:
    Match75 = csv.writer(fMatch75, dialect='excel')
    Match75.writerow(['File A', 'File B', 'Ratio'])
    # setA_LeftOver_Filtered is index-aligned with setA_LeftOver, so the
    # index recovers the unfiltered A name for the current item.
    for index, item in enumerate(setA_LeftOver_Filtered):
        match = difflib.get_close_matches(item, setB_LeftOver_Filtered, 1, 0.75)
        if match:
            setBMatch = setB_OriginalByFiltered[match[0]]
            # The reported ratio compares the unfiltered names; the 75%
            # cutoff above was applied to the filtered ones.
            ratio = difflib.SequenceMatcher(None, setA_LeftOver[index], setBMatch).ratio()
            # The name columns hold the filtered text used for matching.
            row = [item.rstrip(), match[0].rstrip(), ratio]
            Match75.writerow(row)
print (time.time() - t0, "seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment