Created
October 24, 2015 19:13
-
-
Save ejcer/d115a990250d3f88566e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#class survey data surveyV01.csv | |
#Use Pandas to clean all the problems you can find in the data and write a | |
#single clean surveyCleaned_LastName.csv file. | |
#Submit a list of all the types of problems you found, and for each problem | |
#show the Python code you used to fix it, in the textbox. | |
### Imports ### | |
import numpy as np | |
import pandas as pd | |
from pandas import Series, DataFrame | |
import os | |
os.chdir('/home/edward/workspace/school/datavis') | |
os.getcwd() | |
from scipy import stats | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import statsmodels.api as sm | |
from numpy.random import randn | |
### Data Reading ### | |
survey_df = pd.read_csv('./datafiles/surveyV01.csv') | |
### Problems ### | |
#VQuestion 4, height in inches?, has a NaN as well as some input containing ' " ' | |
#and ' ' ' and in. and 'FT' and 'IN' | |
#VQuestion 5, number of siblings?, row 24 has text input that should be numerical EG '2' | |
#VQuestion 6, how far born BB? replace '~55' in row 23 with 55 | |
#VQuestion 7, what time bed? All items should be %d.%d not AM or PM or '%d:%d' | |
#VQuestion 9, introvery scale? some clown put '50i' in row 26 | |
#VQuestion 12, cellphone use (min)? row 24 has a non numeric entry | |
#VQuestion 13, # apps on phone? row 25 has '17.38' a nondiscrete number | |
#VQuestion 17, # books read? row 21 has 'roughly 30' a non numeric entry | |
#VQuestion 18, # countries visited? row 25 has '17.38' a nondiscrete number | |
#VQuestion 21, preferred outdoor temp? row 37 has '60s' a non numeric entry | |
#VQuestion 22, nights slept outside? entries: 'alot' and 'too many' and '17.38' | |
#are unnacceptable | |
#VQuestion 23, attended VT football games? there are NaN's and an entry 'lots' | |
#VQuestion 26, like math (1-100)? rows 25 and 14 has value > 100 | |
#VQuestion 27, gigs on hard drive? rows 21 and 37 have non numeric entries | |
#VQuestion 35: replace the index to be Q27, and then move all indices beyond that ahead | |
#also replace none with np.nan | |
### Solutions ### | |
#Question 2: | |
survey_df['Q2'].replace(' Applied and Computational Mathematics', 1, inplace=True) | |
#Question 4: | |
survey_df['Q4'].replace({'5\'11\"':5*12+11, '5\'9\"': 5*12+9, '\"5\' 9\"\"\"': 5*12+9, | |
'66 in.': 66, '5\' 3\"':5*12+3,'68\"': 68, '5 FT 10IN': 5*12+10, | |
'5\'11':5*12+11,'5\'9\'\'':5*12+9, '5\' 9\"':5*12+9}, inplace=True) | |
survey_df['Q5'].replace('I have two siblings.', 2, inplace=True) | |
#Question 7: replace all incorrectly formmatted times to military hours | |
survey_df['Q7'].replace({'2:00am':2, '12:00 AM': 0, '11:00': np.nan, '18-Aug':np.nan, '12:30 AM': 0.5, '1:00AM':1}, inplace=True) | |
#Question 6: replace '~55' in row 23 with 55 | |
survey_df['Q6'].replace({'~55': 55, '149 miles': 149, '2,903 miles': 2903}, inplace=True) | |
#Question 9: replace '50i' in row 26 with 50 | |
survey_df['Q9'].replace('50i', 50, inplace=True) | |
#Question 12: replace '720 minutes' in row 24 with 720 | |
survey_df['Q12'].replace('720 minutes', 720, inplace=True) | |
#Question 13: replace '17.38' in row 25 with 17 | |
survey_df['Q13'].replace('17.38', 17, inplace=True) | |
#Question 17: replace 'roughly 30' in row 21 with 30 | |
survey_df['Q17'].replace('roughly 30', 30, inplace=True) | |
#Question 18: replace '17.38' in row 25 with 17 | |
survey_df['Q18'].replace('17.38', 17, inplace=True) | |
#Question 21: replace '60s' in row 37 with 60 | |
survey_df['Q21'].replace('60s', 60, inplace=True) | |
#Question 22: replace 'alot' and 'too many' with NaN and 17.38 with 17 | |
survey_df['Q22'].replace({'17.38':17, 'alot': np.nan, 'too many to count': np.nan}, inplace=True) | |
#Question 23: replace 'lots' with NaN | |
survey_df['Q23'].replace('lots', np.nan, inplace=True) | |
#Question 26: replace all values above 100 with 100 | |
survey_df['Q26'][survey_df['Q26'].convert_objects(convert_numeric = True) > 100] = 100 | |
#Question 27: | |
survey_df['Q27'].replace({'no idea how to check that': np.nan, '2Tb': 2000}, inplace=True) | |
#Question 35: replace the index to be Q27, and then move all indices beyond that ahead | |
#also replace none with np.nan | |
survey_df['Q35'].replace('none', np.nan, inplace=True) | |
survey_df.columns.values[30:35] = ['Q27', 'Q28', 'Q29', 'Q30', 'Q31'] | |
### Write to output ### | |
survey_df.to_csv('./output/cleaned_survey.csv', sep='\t', encoding='utf-8') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment