Created
March 18, 2011 17:17
-
-
Save kusold/876460 to your computer and use it in GitHub Desktop.
Takes two text files with one string per line and attempts to match the lines of one file against the other.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ############################################################################### | |
| ## Description: FuzzyMatch.py takes two files with one keyword per line as ## | |
| ## command line arguments. It then creates a file with 100% ## | |
| ## matches called Match100.txt. Next it tries to match the ## | |
| ## remaining keywords that are at least 75% similar. If more than ## | |
| ## one match occurs, it picks the best match and outputs it ## | |
| ## to Match75.csv. ## | |
| ## ## | |
| ## Note: Currently the two files are hardcoded in. Arguments ## | |
| ## will be implemented later. ## | |
| ## ## | |
| ## Author: Mike Kusold ## | |
| ## Created: Feb 16th, 2011 ## | |
| ## Version: 1.0 ## | |
| ## ## | |
| ## Modification Notes ## | |
| ##---------------------------------------------------------------------------## | |
| ## ## | |
| ## ## | |
| ## ## | |
| ## ## | |
| ## ## | |
| ## ## | |
| ## ## | |
| ############################################################################### | |
| #import sys | |
| import difflib | |
| import csv | |
| import time | |
| import re | |
def ignoredWords(word):
    """Return *word* with common filler terms removed.

    The terms stripped are "academy of sciences", "university", "univ",
    "pharmaceuticals", "pharmaceutical" and the connective " of ".
    Matching is case-sensitive and the terms are lower case, so the caller
    is expected to pass lower-cased text.

    Replacements are applied cumulatively, so a name that contains several
    of the terms has all of them removed.  Anything else in the string,
    including a trailing line terminator, is left untouched.
    """
    # Longer phrases are listed before the shorter terms they contain
    # ("academy of sciences" before " of ", "university" before "univ",
    # plural before singular) so the short form cannot leave a fragment
    # of the long one behind.
    replacements = (
        ("academy of sciences", ""),
        ("university", ""),
        ("univ", ""),
        ("pharmaceuticals", ""),
        ("pharmaceutical", ""),
        # Only the stand-alone word is dropped; a single space is kept so
        # the neighbouring words do not get glued together.
        (" of ", " "),
    )
    filtered = word
    for term, substitute in replacements:
        filtered = filtered.replace(term, substitute)
    return filtered
# Start the wall-clock timer; the elapsed time is reported at the end of
# the script.
t0 = time.time()

# Load both keyword files into lists of raw lines (line terminators are
# kept).  NOTE: the file names are hard-coded; command-line arguments are
# not read yet.
############INPUT FILENAME############
with open("comm.names.txt", 'r') as fileA:
    setA = fileA.readlines()

############INPUT FILENAME############
with open("acad.names.txt", 'r') as fileB:
    setB = fileB.readlines()
# 100% Match
# Normalise both inputs before comparing: drop the line terminator and
# lower-case every entry.  Without stripping the terminator, a final line
# that lacks a trailing newline could never equal the same name in the
# other file.
setA = {x.rstrip("\r\n").lower() for x in setA}
setB = {x.rstrip("\r\n").lower() for x in setB}
# Blank lines are not keywords; keep them out of every later comparison.
setA.discard("")
setB.discard("")

# Entries present in both files are the exact (100%) matches.
setMatch100 = setA & setB
listMatch100 = sorted(setMatch100)

############ OUTPUT FILENAME############
# One matched keyword per line, in sorted order.  The terminator is added
# here because the entries no longer carry one.
with open("Match100.txt", 'w') as Match100:
    Match100.writelines(item + "\n" for item in listMatch100)
# CSV 100% Match
# For every entry of file A, record its single best counterpart in file B
# when the similarity ratio is at least 0.98, together with that ratio.
# The file is opened with newline='' as the csv module requires; otherwise
# the writer's own line endings get translated again and produce blank
# rows on some platforms.
with open("Match100.csv", 'w', newline='') as fMatch100:
    Match100 = csv.writer(fMatch100, dialect='excel')
    Match100.writerow(['File A', 'File B', 'Ratio'])
    # Iterate in sorted order so the output is reproducible between runs
    # (plain set iteration order is arbitrary).
    for item in sorted(setA):
        match = difflib.get_close_matches(item, setB, 1, 0.98)
        if match:
            ratio = difflib.SequenceMatcher(None, item, match[0]).ratio()
            Match100.writerow([item.rstrip(), match[0].rstrip(), ratio])
# Everything that was not an exact match is left over for fuzzy matching.
# The leftovers are kept as sorted lists.
setA_LeftOver = sorted(set(setA).difference(setMatch100))
setB_LeftOver = sorted(set(setB).difference(setMatch100))

# Build parallel lists with the common filler words removed; position k in
# a filtered list corresponds to position k in the unfiltered list.
setA_LeftOver_Filtered = [ignoredWords(name) for name in setA_LeftOver]
setB_LeftOver_Filtered = [ignoredWords(name) for name in setB_LeftOver]
# For every leftover entry of file A, find the best match among the
# leftover entries of file B whose filtered form is at least 75% similar.

# get_close_matches works on the filtered names, so map each filtered B
# name back to the unfiltered name it came from.  The two lists are
# parallel; when several originals filter to the same text the first one
# (in sorted order) is used.
originalB_by_filtered = {}
for filtered_name, original_name in zip(setB_LeftOver_Filtered, setB_LeftOver):
    originalB_by_filtered.setdefault(filtered_name, original_name)

############OUTPUT FILENAME############
# newline='' is what the csv module requires for files handed to a writer.
with open("Match75.csv", 'w', newline='') as fMatch75:
    Match75 = csv.writer(fMatch75, dialect='excel')
    Match75.writerow(['File A', 'File B', 'Ratio'])
    for originalA, item in zip(setA_LeftOver, setA_LeftOver_Filtered):
        match = difflib.get_close_matches(item, setB_LeftOver_Filtered, 1, 0.75)
        if match:
            originalB = originalB_by_filtered[match[0]]
            # The 0.75 cutoff above applies to the filtered names; the
            # ratio written to the file compares the unfiltered names, so
            # it can be lower than the cutoff.
            ratio = difflib.SequenceMatcher(None, originalA, originalB).ratio()
            Match75.writerow([originalA.rstrip(), originalB.rstrip(), ratio])

# Report the total run time measured from t0.
print(time.time() - t0, "seconds")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment