Wavewash · June 11, 2014 13:09
diff --git a/Data Clenser for Year Strings to Lists b/Data Clenser for Year Strings to Lists
 __author__ = 'mkakwan'

 import sys

 def clense(row, headers):
    i=0

    clensedkey = "year_cleansed"
    newAnnualReportsClensedValue = None;
    for key in row:
        value = row[key]
        if(key == "year_dirty") and (value != "" ):

            #Value only has a single 4 digit year ex:1999
            if(len(value) == 4) and str(value).isdigit():
                newAnnualReportsClensedValue = [(int(value))]
            else:
                 newAnnualReportsClensedValue = dateCases(value)
        i=i+1

    #Comment out data clense output
    if False:
        fo = open("clensedata.txt", "a")
        if newAnnualReportsClensedValue == None:
            fo.write("UNRECOGNIZED \t " + row["annual_reports_s"] + '\n')
        else:
            fo.write(str(newAnnualReportsClensedValue) + " \t " + row["annual_reports_s"] + '\n')
        fo.close()

    row[clensedkey] = newAnnualReportsClensedValue
    return row;


 def dateCases(value):
    originalValue = value;

    #clense the string of spaces
    value = value.replace(" ","");

    #clense the string of periods
    value = value.replace(".",",");

    #clense the data of dates ex: 1979/80,1982/83
    value = value.replace("/",",");

    #clense the data of dates ex: 1989;1992
    value = value.replace(";",",");

    #clense the data of parts ex: 2003:2 parts
    value = value.split(":")[0]

    #clense the date of (BOUND) ex: 1976/1977-1985/1986; 1986/1987, 1988/1989-1994/1995 (BOUND)
    value = value.replace("(BOUND)","");

    while(str(value).endswith(".") or str(value).endswith(",") or str(value).endswith(";")):
        value = value[:-1]

    commasplit = value.split(",")
    valuelist = []
    #assume the current century is in 19th century
    currentCentury = "19";
    for splitvalue in commasplit:
        #Value only has a single 4 digit year ex:1999
        if(len(splitvalue) == 4) and str(splitvalue).isdigit():
                #get the current century to append onto dates
                currentCentury = splitvalue[:2]
                valuelist.append(int(splitvalue))
        elif(len(splitvalue) == 2) and str(splitvalue).isdigit():
                valuelist.append(int(currentCentury+splitvalue))
        #Value is a dash to denote a range
        elif "-" in splitvalue:
            dashsplitvalues = splitvalue.split("-")
            #make sure we only have two dates with the dash (ex: 1999-2000, 98-99, 1988-98)
            rangedate = []
            if len(dashsplitvalues) == 2 and dashsplitvalues[0].isdigit() and dashsplitvalues[1].isdigit():
                for dashsplitvalue in dashsplitvalues:
                    if len(dashsplitvalue) == 2:
                        rangedate.append(int(currentCentury + dashsplitvalue))
                    else:
                        rangedate.append(int(dashsplitvalue))
                rangedate = range(rangedate[0],rangedate[1] + 1)
                valuelist = valuelist+ (list(rangedate))

        #Value format is unrecognized
        else:
            print("UNRECOGNIZED ,\"" + originalValue + "\"")
            return None

    print(str(valuelist) + " ,\"" + originalValue + "\"")
    return valuelist
	__author__ = 'mkakwan'

	import sys

	def clense(row, headers):
	i=0

	clensedkey = "year_cleansed"
	newAnnualReportsClensedValue = None;
	for key in row:
	value = row[key]
	if(key == "year_dirty") and (value != "" ):

	#Value only has a single 4 digit year ex:1999
	if(len(value) == 4) and str(value).isdigit():
	newAnnualReportsClensedValue = [(int(value))]
	else:
	newAnnualReportsClensedValue = dateCases(value)
	i=i+1

	#Comment out data clense output
	if False:
	fo = open("clensedata.txt", "a")
	if newAnnualReportsClensedValue == None:
	fo.write("UNRECOGNIZED \t " + row["annual_reports_s"] + '\n')
	else:
	fo.write(str(newAnnualReportsClensedValue) + " \t " + row["annual_reports_s"] + '\n')
	fo.close()

	row[clensedkey] = newAnnualReportsClensedValue
	return row;


	def dateCases(value):
	originalValue = value;

	#clense the string of spaces
	value = value.replace(" ","");

	#clense the string of periods
	value = value.replace(".",",");

	#clense the data of dates ex: 1979/80,1982/83
	value = value.replace("/",",");

	#clense the data of dates ex: 1989;1992
	value = value.replace(";",",");

	#clense the data of parts ex: 2003:2 parts
	value = value.split(":")[0]

	#clense the date of (BOUND) ex: 1976/1977-1985/1986; 1986/1987, 1988/1989-1994/1995 (BOUND)
	value = value.replace("(BOUND)","");

	while(str(value).endswith(".") or str(value).endswith(",") or str(value).endswith(";")):
	value = value[:-1]

	commasplit = value.split(",")
	valuelist = []
	#assume the current century is in 19th century
	currentCentury = "19";
	for splitvalue in commasplit:
	#Value only has a single 4 digit year ex:1999
	if(len(splitvalue) == 4) and str(splitvalue).isdigit():
	#get the current century to append onto dates
	currentCentury = splitvalue[:2]
	valuelist.append(int(splitvalue))
	elif(len(splitvalue) == 2) and str(splitvalue).isdigit():
	valuelist.append(int(currentCentury+splitvalue))
	#Value is a dash to denote a range
	elif "-" in splitvalue:
	dashsplitvalues = splitvalue.split("-")
	#make sure we only have two dates with the dash (ex: 1999-2000, 98-99, 1988-98)
	rangedate = []
	if len(dashsplitvalues) == 2 and dashsplitvalues[0].isdigit() and dashsplitvalues[1].isdigit():
	for dashsplitvalue in dashsplitvalues:
	if len(dashsplitvalue) == 2:
	rangedate.append(int(currentCentury + dashsplitvalue))
	else:
	rangedate.append(int(dashsplitvalue))
	rangedate = range(rangedate[0],rangedate[1] + 1)
	valuelist = valuelist+ (list(rangedate))

	#Value format is unrecognized
	else:
	print("UNRECOGNIZED ,\"" + originalValue + "\"")
	return None

	print(str(valuelist) + " ,\"" + originalValue + "\"")
	return valuelist