kusold · March 18, 2011 17:17
diff --git a/Matching.py b/Matching.py
 ###############################################################################
 ##	Description: FuzzyMatch.py takes two files with one keyword per line as	 ##
 ##				 command line arguments. It then creates a file with 100%	 ##
 ##				 matches called Match100.txt. Next it tries to match the	 ##
 ##				 remaining keywords that are at least 75% similar. If more	 ##
 ##				 than one match occurs, it picks the best one and outputs it ##
 ##				 to Match75.csv.											 ##
 ##																			 ##
 ##				 Note: Currently the two files are hardcoded in. Arguments	 ##
 ##					  will be implemented later.							 ##
 ##																			 ##
 ##	Author:		Mike Kusold													 ##
 ##	Created:	Feb 16th, 2011												 ##
 ##	Version:	1.0															 ##
 ##																			 ##
 ##							Modification Notes								 ##
 ##---------------------------------------------------------------------------##
 ##																			 ##
 ##																			 ##
 ##																			 ##
 ##																			 ##
 ##																			 ##
 ##																			 ##
 ##																			 ##
 ###############################################################################

 #import sys
 import difflib
 import csv
 import time
 import re

 def ignoredWords(word):
 	"""Return `word` with common filler terms removed.

 	The terms removed are "academy of sciences", "pharmaceuticals",
 	"pharmaceutical", "university", "univ" and the standalone word "of".
 	Matching is case-sensitive, and whitespace left behind by a removed
 	term (including any trailing newline) is not collapsed or stripped.

 	Args:
 		word: The string to filter.

 	Returns:
 		The filtered string; `word` unchanged if no term occurs in it.
 	"""
 	# Order matters: longer phrases go before the shorter terms they contain,
 	# so "pharmaceuticals" does not leave a stray "s" behind and
 	# "academy of sciences" is removed before the bare "of" rule runs.
 	replacements = (
 		("academy of sciences", ""),
 		("pharmaceuticals", ""),
 		("pharmaceutical", ""),
 		("university", ""),
 		("univ", ""),
 		# Only the standalone word is dropped; "of" inside another word
 		# (e.g. "professor") is left alone.
 		(" of ", " "),
 	)
 	filtered = word
 	# Each replacement works on the running result, so that several terms
 	# occurring in the same string are all removed.
 	for needle, replacement in replacements:
 		filtered = filtered.replace(needle, replacement)
 	return filtered



 # Start the clock used for the elapsed-time report at the end of the run.
 t0 = time.time()

 # Read both keyword files into lists of raw lines. The paths are hardcoded
 # for now; reading them from the command line is still to be implemented.

 ############INPUT FILENAME############
 with open("comm.names.txt", 'r') as fileA:
 	setA = fileA.readlines()

 ############INPUT FILENAME############
 with open("acad.names.txt", 'r') as fileB:
 	setB = fileB.readlines()
 		
 # 100% Match
 # Lower-case every line of both inputs, then keep the lines present in both.
 setA = set(x.lower() for x in setA)
 setB = set(x.lower() for x in setB)
 setMatch100 = setA & setB
 listMatch100 = sorted(setMatch100)

 ############ OUTPUT FILENAME############
 # Entries are written back exactly as stored; no separator is added.
 with open("Match100.txt", 'w') as Match100:
 	Match100.writelines(listMatch100)
 	
 # CSV 100% Match
 # For each entry of set A, record its single closest entry in set B when the
 # similarity reaches the 0.98 cutoff, together with the similarity ratio.
 with open("Match100.csv", 'w') as fMatch100:
 	Match100 = csv.writer(fMatch100, dialect='excel')
 	Match100.writerow(['File A', 'File B', 'Ratio'])
 	for item in setA:
 		match = difflib.get_close_matches(item, setB, n=1, cutoff=0.98)
 		if not match:
 			# Nothing in set B is close enough to this entry.
 			continue
 		closest = match[0]
 		ratio = difflib.SequenceMatcher(None, item, closest).ratio()
 		Match100.writerow([item.rstrip(), closest.rstrip(), ratio])

 # Drop the exact matches from both inputs and keep the rest as sorted lists.
 setA_LeftOver = sorted(set(setA).difference(setMatch100))
 setB_LeftOver = sorted(set(setB).difference(setMatch100))

 # Build filtered copies with the common words removed. Each filtered list is
 # index-aligned with the unfiltered list it was derived from.
 setA_LeftOver_Filtered = [ignoredWords(entry) for entry in setA_LeftOver]
 setB_LeftOver_Filtered = [ignoredWords(entry) for entry in setB_LeftOver]

 # For every leftover entry of file A, find the single best match among the
 # leftover entries of file B whose filtered forms are at least 75% similar.
 # The candidate search runs on the filtered names; the reported ratio is
 # computed on the corresponding unfiltered names.

 # Map each filtered B name back to the unfiltered name it came from. When
 # two B entries filter to the same string, the first one (in sorted order)
 # is used.
 setB_Unfiltered = {}
 for original, filtered in zip(setB_LeftOver, setB_LeftOver_Filtered):
 	setB_Unfiltered.setdefault(filtered, original)

 ############OUTPUT FILENAME############
 with open("Match75.csv", 'w') as fMatch75:
 	Match75 = csv.writer(fMatch75, dialect='excel')
 	Match75.writerow(['File A', 'File B', 'Ratio'])
 	# setA_LeftOver_Filtered is index-aligned with setA_LeftOver, so the
 	# position of a filtered name identifies its unfiltered counterpart.
 	for index, item in enumerate(setA_LeftOver_Filtered):
 		match = difflib.get_close_matches(item, setB_LeftOver_Filtered, 1, 0.75)
 		if not match:
 			continue
 		setAMatch = setA_LeftOver[index]
 		setBMatch = setB_Unfiltered[match[0]]
 		ratio = difflib.SequenceMatcher(None, setAMatch, setBMatch).ratio()
 		row = [item.rstrip(), match[0].rstrip(), ratio]
 		Match75.writerow(row)

 # Report the total run time once the last output file has been closed.
 print (time.time() - t0, "seconds")
	###############################################################################
	## Description: FuzzyMatch.py takes two files with one keyword per line as ##
	## command line arguments. It then creates a file with 100% ##
	## matches called Match100.txt. Next it tries to match the ##
	## remaining keywords that are at least 75% similar. If more than ##
	## one match occurs, it picks the best one and outputs it ##
	## to Match75.csv. ##
	## ##
	## Note: Currently the two files are hardcoded in. Arguments ##
	## will be implemented later. ##
	## ##
	## Author: Mike Kusold ##
	## Created: Feb 16th, 2011 ##
	## Version: 1.0 ##
	## ##
	## Modification Notes ##
	##---------------------------------------------------------------------------##
	## ##
	## ##
	## ##
	## ##
	## ##
	## ##
	## ##
	###############################################################################

	#import sys
	import difflib
	import csv
	import time
	import re

	def ignoredWords(word):
		"""Return `word` with common filler terms removed.

		The terms removed are "academy of sciences", "pharmaceuticals",
		"pharmaceutical", "university", "univ" and the standalone word "of".
		Matching is case-sensitive, and whitespace left behind by a removed
		term (including any trailing newline) is not collapsed or stripped.

		Args:
			word: The string to filter.

		Returns:
			The filtered string; `word` unchanged if no term occurs in it.
		"""
		# Order matters: longer phrases go before the shorter terms they
		# contain, so "pharmaceuticals" does not leave a stray "s" behind and
		# "academy of sciences" is removed before the bare "of" rule runs.
		replacements = (
			("academy of sciences", ""),
			("pharmaceuticals", ""),
			("pharmaceutical", ""),
			("university", ""),
			("univ", ""),
			# Only the standalone word is dropped; "of" inside another word
			# (e.g. "professor") is left alone.
			(" of ", " "),
		)
		filtered = word
		# Each replacement works on the running result, so that several terms
		# occurring in the same string are all removed.
		for needle, replacement in replacements:
			filtered = filtered.replace(needle, replacement)
		return filtered



	# Start the clock used for the elapsed-time report at the end of the run.
	t0 = time.time()

	# Read both keyword files into lists of raw lines. The paths are hardcoded
	# for now; reading them from the command line is still to be implemented.

	############INPUT FILENAME############
	with open("comm.names.txt", 'r') as fileA:
		setA = fileA.readlines()

	############INPUT FILENAME############
	with open("acad.names.txt", 'r') as fileB:
		setB = fileB.readlines()

	# 100% Match
	# Lower-case every line of both inputs, then keep the lines present in both.
	setA = set([x.lower() for x in setA])
	setB = set([x.lower() for x in setB])
	setMatch100 = setA.intersection(setB)
	listMatch100 = list(setMatch100)
	listMatch100.sort()

	############ OUTPUT FILENAME############
	# Entries are written back exactly as stored; no separator is added.
	with open("Match100.txt", 'w') as Match100:
		for item in listMatch100:
			Match100.write(item)

	#CSV 100% Match
	# For each entry of set A, record its single closest entry in set B when
	# the similarity reaches the 0.98 cutoff, together with the ratio.
	with open("Match100.csv", 'w') as fMatch100:
		Match100 = csv.writer(fMatch100, dialect='excel')
		Match100.writerow(['File A', 'File B', 'Ratio'])
		for item in setA:
			match = difflib.get_close_matches(item, setB, 1, 0.98)
			if len(match) > 0:
				ratio = difflib.SequenceMatcher(None, item, match[0]).ratio()
				row = [item.rstrip(), match[0].rstrip(), ratio]
				Match100.writerow(row)

	# Remove 100% matches from the two lists
	setA_LeftOver = set(setA).difference(setMatch100)
	setB_LeftOver = set(setB).difference(setMatch100)

	#Convert from a Set to a List so I can sort.
	setA_LeftOver = list(setA_LeftOver)
	setB_LeftOver = list(setB_LeftOver)
	setA_LeftOver.sort()
	setB_LeftOver.sort()

	#Filter the remaining list to remove common words. Each filtered list is
	#index-aligned with the unfiltered list it was derived from.
	setA_LeftOver_Filtered = []
	for item in setA_LeftOver:
		item_filtered = ignoredWords(item)
		setA_LeftOver_Filtered.append(item_filtered)

	setB_LeftOver_Filtered = []
	for item in setB_LeftOver:
		item_filtered = ignoredWords(item)
		setB_LeftOver_Filtered.append(item_filtered)

	# Map each filtered B name back to the unfiltered name it came from. When
	# two B entries filter to the same string, the first one (in sorted order)
	# is used.
	setB_Unfiltered = {}
	for original, filtered in zip(setB_LeftOver, setB_LeftOver_Filtered):
		setB_Unfiltered.setdefault(filtered, original)

	# For every leftover entry of file A, find the single best match among the
	# leftover entries of file B whose filtered forms are at least 75% similar.
	# The candidate search runs on the filtered names; the reported ratio is
	# computed on the corresponding unfiltered names.
	############OUTPUT FILENAME############
	with open("Match75.csv", 'w') as fMatch75:
		Match75 = csv.writer(fMatch75, dialect='excel')
		Match75.writerow(['File A', 'File B', 'Ratio'])
		for index, item in enumerate(setA_LeftOver_Filtered):
			match = difflib.get_close_matches(item, setB_LeftOver_Filtered, 1, 0.75)
			if len(match) > 0:
				setAMatch = setA_LeftOver[index]
				setBMatch = setB_Unfiltered[match[0]]
				ratio = difflib.SequenceMatcher(None, setAMatch, setBMatch).ratio()
				row = [item.rstrip(), match[0].rstrip(), ratio]
				Match75.writerow(row)

	# Report the total run time once the last output file has been closed.
	print (time.time() - t0, "seconds")
No results found