Created
June 11, 2014 13:09
-
-
Save Wavewash/8597030884329dc3c448 to your computer and use it in GitHub Desktop.
This was used to cleanse data input by various users for years in a record system. The years being input by different people with different methods all had different formats. Some were just plain incorrect. To reduce the effort by the data cleansing team to unify the year format. This high level parser was written to catch most obvious date form…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'mkakwan' | |
import sys | |
def clense(row, headers): | |
i=0 | |
clensedkey = "year_cleansed" | |
newAnnualReportsClensedValue = None; | |
for key in row: | |
value = row[key] | |
if(key == "year_dirty") and (value != "" ): | |
#Value only has a single 4 digit year ex:1999 | |
if(len(value) == 4) and str(value).isdigit(): | |
newAnnualReportsClensedValue = [(int(value))] | |
else: | |
newAnnualReportsClensedValue = dateCases(value) | |
i=i+1 | |
#Comment out data clense output | |
if False: | |
fo = open("clensedata.txt", "a") | |
if newAnnualReportsClensedValue == None: | |
fo.write("UNRECOGNIZED \t " + row["annual_reports_s"] + '\n') | |
else: | |
fo.write(str(newAnnualReportsClensedValue) + " \t " + row["annual_reports_s"] + '\n') | |
fo.close() | |
row[clensedkey] = newAnnualReportsClensedValue | |
return row; | |
def dateCases(value): | |
originalValue = value; | |
#clense the string of spaces | |
value = value.replace(" ",""); | |
#clense the string of periods | |
value = value.replace(".",","); | |
#clense the data of dates ex: 1979/80,1982/83 | |
value = value.replace("/",","); | |
#clense the data of dates ex: 1989;1992 | |
value = value.replace(";",","); | |
#clense the data of parts ex: 2003:2 parts | |
value = value.split(":")[0] | |
#clense the date of (BOUND) ex: 1976/1977-1985/1986; 1986/1987, 1988/1989-1994/1995 (BOUND) | |
value = value.replace("(BOUND)",""); | |
while(str(value).endswith(".") or str(value).endswith(",") or str(value).endswith(";")): | |
value = value[:-1] | |
commasplit = value.split(",") | |
valuelist = [] | |
#assume the current century is in 19th century | |
currentCentury = "19"; | |
for splitvalue in commasplit: | |
#Value only has a single 4 digit year ex:1999 | |
if(len(splitvalue) == 4) and str(splitvalue).isdigit(): | |
#get the current century to append onto dates | |
currentCentury = splitvalue[:2] | |
valuelist.append(int(splitvalue)) | |
elif(len(splitvalue) == 2) and str(splitvalue).isdigit(): | |
valuelist.append(int(currentCentury+splitvalue)) | |
#Value is a dash to denote a range | |
elif "-" in splitvalue: | |
dashsplitvalues = splitvalue.split("-") | |
#make sure we only have two dates with the dash (ex: 1999-2000, 98-99, 1988-98) | |
rangedate = [] | |
if len(dashsplitvalues) == 2 and dashsplitvalues[0].isdigit() and dashsplitvalues[1].isdigit(): | |
for dashsplitvalue in dashsplitvalues: | |
if len(dashsplitvalue) == 2: | |
rangedate.append(int(currentCentury + dashsplitvalue)) | |
else: | |
rangedate.append(int(dashsplitvalue)) | |
rangedate = range(rangedate[0],rangedate[1] + 1) | |
valuelist = valuelist+ (list(rangedate)) | |
#Value format is unrecognized | |
else: | |
print("UNRECOGNIZED ,\"" + originalValue + "\"") | |
return None | |
print(str(valuelist) + " ,\"" + originalValue + "\"") | |
return valuelist |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment