Created
July 9, 2018 17:59
-
-
Save a8dx/20caedaf942f1810e16994bfdb57e8dc to your computer and use it in GitHub Desktop.
Fuzzy Wuzzy String Matching Example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -- DistrictNameMatching.py
# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
# Purpose: Given CSV lists of district-state name pairs, identifies the best matches using the fuzzywuzzy library
# Notes: The default number of matches is currently set to 3, though it can be changed via the num_match input argument.
import os | |
import numpy as np | |
from fuzzywuzzy import fuzz | |
from fuzzywuzzy import process | |
import pandas as pd | |
def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match = 3):
    """
    Fuzzy-match every district-state pair of the using file against the master file.

    Both inputs are read as CSV files. For each file the district and state columns are
    joined into a single "district, state" string; every using string is then scored
    against all master strings with fuzzywuzzy's process.extract and the num_match
    best-scoring master strings are kept. The result is written to outFile + ".csv"
    and also returned. Manual work is then required for districts where a perfect
    match has not been made.

    master: file containing the master list of districts
    using: file containing the using list of districts; each of these districts is
        compared against the universe of districts from the master file
    master_dist: variable name pertaining to districts in master file
    master_state: variable name pertaining to states in master file
    using_dist: variable name pertaining to districts in using file
    using_state: variable name pertaining to states in using file
    outFile: path and filename of the output WITHOUT extension; ".csv" is appended
    num_match: number of matches generated per using district, default is 3

    Returns a pandas DataFrame with the columns Match1..Match<num_match> (matched
    master strings, best first), using_original (the using string) and perfect_match
    (True where Match1 is identical to using_original).
    """
    # sep=None asks pandas to detect the delimiter, which only the Python parsing
    # engine can do; naming the engine explicitly avoids the implicit fallback warning.
    master_dists = pd.read_csv(master, quotechar='"', skipinitialspace=True, sep=None, engine="python")
    print(" *** Now printing column values for master file *** ")
    print(list(master_dists.columns.values))

    using_dists = pd.read_csv(using, quotechar='"', skipinitialspace=True, sep=None, engine="python")
    print(" *** Now printing column values for using file *** ")
    print(list(using_dists.columns.values))

    # -- concatenate district and state names
    # Both columns are coerced to str so that a non-string state value cannot break
    # the concatenation (the district column was already treated this way).
    master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state].map(str)
    using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state].map(str)

    # One list of candidate tuples per using name; element [0] of each tuple is the
    # matched master string.
    candidates = [process.extract(name, master_names, limit=num_match) for name in using_names]

    d = {}
    for rank in range(1, num_match + 1):
        # Pad with None when fewer than num_match candidates were returned
        # (e.g. the master list has fewer rows than num_match).
        d["Match{0}".format(rank)] = [
            found[rank - 1][0] if len(found) >= rank else None
            for found in candidates
        ]
    d['using_original'] = using_names

    # Element-wise comparison of the best match with the original using string.
    best_match = pd.Series(d['Match1'], index=using_names.index)
    d['perfect_match'] = best_match == using_names

    out = pd.DataFrame(d)
    # NOTE(review): the original carried a disabled out.to_stata(...) call; CSV is
    # the format actually written.
    out.to_csv(str(outFile + ".csv"))

    print("******************************************")
    print("*** Your analysis has been completed! *** ")
    print("******************************************")
    return out
# -- BASIC FILES/PATHS WHOSE USE IS REPEATED
# Placeholder: replace "<insert path>" with the directory holding the input CSVs.
baseDir = os.path.join("<insert path>")
outDir = os.path.join(baseDir, "Matched_Results")
if not os.path.exists(outDir):
    os.makedirs(outDir)

# -- ICRISAT and 1971 Polygon borders
master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
input_1971 = os.path.join(baseDir, "District_Records_1971_Clean.csv")
outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")

# NOTE(review): input_1971 is passed as the `master` argument and master_file as the
# `using` argument, so 'NAME'/'STATE_UT' are looked up in the 1971 file and
# 'distname'/'stname' in the ICRISAT file - confirm this ordering is intended.
icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

# -- alternatively, don't save as a workspace object: call districtMatch(...) with the
#    same arguments without assigning the result. It is shown here as a comment only,
#    because executing both forms runs the whole match twice and rewrites the same CSV.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment