ejcer · October 24, 2015 19:13
diff --git a/gistfile1.txt b/gistfile1.txt
 #class survey data surveyV01.csv
 #Use Pandas to clean all the problems you can find in the data and write a
 #single clean surveyCleaned_LastName.csv file.

 #Submit a list of all the types of problems you found, and for each problem
 #show the Python code you used to fix it, in the textbox.


 ### Imports ###
 import numpy as np
 import pandas as pd
 from pandas import Series, DataFrame
 import os
 os.chdir('/home/edward/workspace/school/datavis')
 os.getcwd()
 from scipy import stats
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 import seaborn as sns
 import statsmodels.api as sm
 from numpy.random import randn


 ### Data Reading ###
 survey_df = pd.read_csv('./datafiles/surveyV01.csv')


 ### Problems ###
 #VQuestion 4, height in inches?, has a NaN as well as some input containing ' " '
    #and ' ' '  and in. and 'FT' and 'IN'
 #VQuestion 5, number of siblings?, row 24 has text input that should be numerical EG '2'
 #VQuestion 6, how far born BB? replace '~55' in row 23 with 55
 #VQuestion 7, what time bed? All items should be %d.%d not AM or PM or '%d:%d'
 #VQuestion 9, introvery scale? some clown put '50i' in row 26
 #VQuestion 12, cellphone use (min)? row 24 has a non numeric entry
 #VQuestion 13, # apps on phone? row 25 has '17.38' a nondiscrete number
 #VQuestion 17, # books read? row 21 has 'roughly 30' a non numeric entry
 #VQuestion 18, # countries visited? row 25 has '17.38' a nondiscrete number
 #VQuestion 21, preferred outdoor temp? row 37 has '60s' a non numeric entry
 #VQuestion 22, nights slept outside? entries: 'alot' and 'too many' and '17.38'
    #are unnacceptable
 #VQuestion 23, attended VT football games? there are NaN's and an entry 'lots'
 #VQuestion 26, like math (1-100)? rows 25 and 14 has value > 100
 #VQuestion 27, gigs on hard drive? rows 21 and 37 have non numeric entries
 #VQuestion 35: replace the index to be Q27, and then move all indices beyond that ahead
    #also replace none with np.nan

 ### Solutions ###

 #Question 2:
 survey_df['Q2'].replace(' Applied and Computational Mathematics', 1, inplace=True)

 #Question 4:
 survey_df['Q4'].replace({'5\'11\"':5*12+11, '5\'9\"': 5*12+9, '\"5\' 9\"\"\"': 5*12+9,
 '66 in.': 66, '5\' 3\"':5*12+3,'68\"': 68, '5 FT 10IN': 5*12+10,
 '5\'11':5*12+11,'5\'9\'\'':5*12+9, '5\' 9\"':5*12+9}, inplace=True)


 survey_df['Q5'].replace('I have two siblings.', 2, inplace=True)

 #Question 7: replace all incorrectly formmatted times to military hours
 survey_df['Q7'].replace({'2:00am':2, '12:00 AM': 0, '11:00': np.nan, '18-Aug':np.nan, '12:30 AM': 0.5, '1:00AM':1}, inplace=True)

 #Question 6: replace '~55' in row 23 with 55
 survey_df['Q6'].replace({'~55': 55, '149 miles': 149, '2,903 miles': 2903}, inplace=True)

 #Question 9: replace '50i' in row 26 with 50
 survey_df['Q9'].replace('50i', 50, inplace=True)

 #Question 12: replace '720 minutes' in row 24 with 720
 survey_df['Q12'].replace('720 minutes', 720, inplace=True)

 #Question 13: replace '17.38' in row 25 with 17
 survey_df['Q13'].replace('17.38', 17, inplace=True)

 #Question 17: replace 'roughly 30' in row 21 with 30
 survey_df['Q17'].replace('roughly 30', 30, inplace=True)

 #Question 18: replace '17.38' in row 25 with 17
 survey_df['Q18'].replace('17.38', 17, inplace=True)

 #Question 21: replace '60s' in row 37 with 60
 survey_df['Q21'].replace('60s', 60, inplace=True)

 #Question 22: replace 'alot' and 'too many' with NaN and 17.38 with 17
 survey_df['Q22'].replace({'17.38':17, 'alot': np.nan, 'too many to count': np.nan}, inplace=True)

 #Question 23: replace 'lots' with NaN
 survey_df['Q23'].replace('lots', np.nan, inplace=True)

 #Question 26: replace all values above 100 with 100
 survey_df['Q26'][survey_df['Q26'].convert_objects(convert_numeric = True) > 100] = 100

 #Question 27:
 survey_df['Q27'].replace({'no idea how to check that': np.nan, '2Tb': 2000}, inplace=True)

 #Question 35: replace the index to be Q27, and then move all indices beyond that ahead
    #also replace none with np.nan
 survey_df['Q35'].replace('none', np.nan, inplace=True)

 survey_df.columns.values[30:35] = ['Q27', 'Q28', 'Q29', 'Q30', 'Q31']

 ### Write to output ###

 survey_df.to_csv('./output/cleaned_survey.csv', sep='\t', encoding='utf-8')
	#class survey data surveyV01.csv
	#Use Pandas to clean all the problems you can find in the data and write a
	#single clean surveyCleaned_LastName.csv file.

	#Submit a list of all the types of problems you found, and for each problem
	#show the Python code you used to fix it, in the textbox.


	### Imports ###
	import numpy as np
	import pandas as pd
	from pandas import Series, DataFrame
	import os
	os.chdir('/home/edward/workspace/school/datavis')
	os.getcwd()
	from scipy import stats
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import seaborn as sns
	import statsmodels.api as sm
	from numpy.random import randn


	### Data Reading ###
	survey_df = pd.read_csv('./datafiles/surveyV01.csv')


	### Problems ###
	#VQuestion 4, height in inches?, has a NaN as well as some input containing ' " '
	#and ' ' ' and in. and 'FT' and 'IN'
	#VQuestion 5, number of siblings?, row 24 has text input that should be numerical EG '2'
	#VQuestion 6, how far born BB? replace '~55' in row 23 with 55
	#VQuestion 7, what time bed? All items should be %d.%d not AM or PM or '%d:%d'
	#VQuestion 9, introvery scale? some clown put '50i' in row 26
	#VQuestion 12, cellphone use (min)? row 24 has a non numeric entry
	#VQuestion 13, # apps on phone? row 25 has '17.38' a nondiscrete number
	#VQuestion 17, # books read? row 21 has 'roughly 30' a non numeric entry
	#VQuestion 18, # countries visited? row 25 has '17.38' a nondiscrete number
	#VQuestion 21, preferred outdoor temp? row 37 has '60s' a non numeric entry
	#VQuestion 22, nights slept outside? entries: 'alot' and 'too many' and '17.38'
	#are unnacceptable
	#VQuestion 23, attended VT football games? there are NaN's and an entry 'lots'
	#VQuestion 26, like math (1-100)? rows 25 and 14 has value > 100
	#VQuestion 27, gigs on hard drive? rows 21 and 37 have non numeric entries
	#VQuestion 35: replace the index to be Q27, and then move all indices beyond that ahead
	#also replace none with np.nan

	### Solutions ###

	#Question 2:
	survey_df['Q2'].replace(' Applied and Computational Mathematics', 1, inplace=True)

	#Question 4:
	survey_df['Q4'].replace({'5\'11\"':512+11, '5\'9\"': 512+9, '\"5\' 9\"\"\"': 5*12+9,
	'66 in.': 66, '5\' 3\"':512+3,'68\"': 68, '5 FT 10IN': 512+10,
	'5\'11':512+11,'5\'9\'\'':512+9, '5\' 9\"':5*12+9}, inplace=True)


	survey_df['Q5'].replace('I have two siblings.', 2, inplace=True)

	#Question 7: replace all incorrectly formmatted times to military hours
	survey_df['Q7'].replace({'2:00am':2, '12:00 AM': 0, '11:00': np.nan, '18-Aug':np.nan, '12:30 AM': 0.5, '1:00AM':1}, inplace=True)

	#Question 6: replace '~55' in row 23 with 55
	survey_df['Q6'].replace({'~55': 55, '149 miles': 149, '2,903 miles': 2903}, inplace=True)

	#Question 9: replace '50i' in row 26 with 50
	survey_df['Q9'].replace('50i', 50, inplace=True)

	#Question 12: replace '720 minutes' in row 24 with 720
	survey_df['Q12'].replace('720 minutes', 720, inplace=True)

	#Question 13: replace '17.38' in row 25 with 17
	survey_df['Q13'].replace('17.38', 17, inplace=True)

	#Question 17: replace 'roughly 30' in row 21 with 30
	survey_df['Q17'].replace('roughly 30', 30, inplace=True)

	#Question 18: replace '17.38' in row 25 with 17
	survey_df['Q18'].replace('17.38', 17, inplace=True)

	#Question 21: replace '60s' in row 37 with 60
	survey_df['Q21'].replace('60s', 60, inplace=True)

	#Question 22: replace 'alot' and 'too many' with NaN and 17.38 with 17
	survey_df['Q22'].replace({'17.38':17, 'alot': np.nan, 'too many to count': np.nan}, inplace=True)

	#Question 23: replace 'lots' with NaN
	survey_df['Q23'].replace('lots', np.nan, inplace=True)

	#Question 26: replace all values above 100 with 100
	survey_df['Q26'][survey_df['Q26'].convert_objects(convert_numeric = True) > 100] = 100

	#Question 27:
	survey_df['Q27'].replace({'no idea how to check that': np.nan, '2Tb': 2000}, inplace=True)

	#Question 35: replace the index to be Q27, and then move all indices beyond that ahead
	#also replace none with np.nan
	survey_df['Q35'].replace('none', np.nan, inplace=True)

	survey_df.columns.values[30:35] = ['Q27', 'Q28', 'Q29', 'Q30', 'Q31']

	### Write to output ###

	survey_df.to_csv('./output/cleaned_survey.csv', sep='\t', encoding='utf-8')