Skip to content

Instantly share code, notes, and snippets.

@ejcer
Created October 24, 2015 19:13
Show Gist options
  • Save ejcer/d115a990250d3f88566e to your computer and use it in GitHub Desktop.
Save ejcer/d115a990250d3f88566e to your computer and use it in GitHub Desktop.
#class survey data surveyV01.csv
#Use Pandas to clean all the problems you can find in the data and write a
#single clean surveyCleaned_LastName.csv file.
#Submit a list of all the types of problems you found, and for each problem
#show the Python code you used to fix it, in the textbox.
### Imports ###
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import os
os.chdir('/home/edward/workspace/school/datavis')
os.getcwd()
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from numpy.random import randn
### Data Reading ###
survey_df = pd.read_csv('./datafiles/surveyV01.csv')
### Problems ###
#VQuestion 4, height in inches?, has a NaN as well as some input containing ' " '
#and ' ' ' and in. and 'FT' and 'IN'
#VQuestion 5, number of siblings?, row 24 has text input that should be numerical EG '2'
#VQuestion 6, how far born BB? replace '~55' in row 23 with 55
#VQuestion 7, what time bed? All items should be %d.%d not AM or PM or '%d:%d'
#VQuestion 9, introvery scale? some clown put '50i' in row 26
#VQuestion 12, cellphone use (min)? row 24 has a non numeric entry
#VQuestion 13, # apps on phone? row 25 has '17.38' a nondiscrete number
#VQuestion 17, # books read? row 21 has 'roughly 30' a non numeric entry
#VQuestion 18, # countries visited? row 25 has '17.38' a nondiscrete number
#VQuestion 21, preferred outdoor temp? row 37 has '60s' a non numeric entry
#VQuestion 22, nights slept outside? entries: 'alot' and 'too many' and '17.38'
#are unnacceptable
#VQuestion 23, attended VT football games? there are NaN's and an entry 'lots'
#VQuestion 26, like math (1-100)? rows 25 and 14 has value > 100
#VQuestion 27, gigs on hard drive? rows 21 and 37 have non numeric entries
#VQuestion 35: replace the index to be Q27, and then move all indices beyond that ahead
#also replace none with np.nan
### Solutions ###
#Question 2:
survey_df['Q2'].replace(' Applied and Computational Mathematics', 1, inplace=True)
#Question 4:
survey_df['Q4'].replace({'5\'11\"':5*12+11, '5\'9\"': 5*12+9, '\"5\' 9\"\"\"': 5*12+9,
'66 in.': 66, '5\' 3\"':5*12+3,'68\"': 68, '5 FT 10IN': 5*12+10,
'5\'11':5*12+11,'5\'9\'\'':5*12+9, '5\' 9\"':5*12+9}, inplace=True)
survey_df['Q5'].replace('I have two siblings.', 2, inplace=True)
#Question 7: replace all incorrectly formmatted times to military hours
survey_df['Q7'].replace({'2:00am':2, '12:00 AM': 0, '11:00': np.nan, '18-Aug':np.nan, '12:30 AM': 0.5, '1:00AM':1}, inplace=True)
#Question 6: replace '~55' in row 23 with 55
survey_df['Q6'].replace({'~55': 55, '149 miles': 149, '2,903 miles': 2903}, inplace=True)
#Question 9: replace '50i' in row 26 with 50
survey_df['Q9'].replace('50i', 50, inplace=True)
#Question 12: replace '720 minutes' in row 24 with 720
survey_df['Q12'].replace('720 minutes', 720, inplace=True)
#Question 13: replace '17.38' in row 25 with 17
survey_df['Q13'].replace('17.38', 17, inplace=True)
#Question 17: replace 'roughly 30' in row 21 with 30
survey_df['Q17'].replace('roughly 30', 30, inplace=True)
#Question 18: replace '17.38' in row 25 with 17
survey_df['Q18'].replace('17.38', 17, inplace=True)
#Question 21: replace '60s' in row 37 with 60
survey_df['Q21'].replace('60s', 60, inplace=True)
#Question 22: replace 'alot' and 'too many' with NaN and 17.38 with 17
survey_df['Q22'].replace({'17.38':17, 'alot': np.nan, 'too many to count': np.nan}, inplace=True)
#Question 23: replace 'lots' with NaN
survey_df['Q23'].replace('lots', np.nan, inplace=True)
#Question 26: replace all values above 100 with 100
survey_df['Q26'][survey_df['Q26'].convert_objects(convert_numeric = True) > 100] = 100
#Question 27:
survey_df['Q27'].replace({'no idea how to check that': np.nan, '2Tb': 2000}, inplace=True)
#Question 35: replace the index to be Q27, and then move all indices beyond that ahead
#also replace none with np.nan
survey_df['Q35'].replace('none', np.nan, inplace=True)
survey_df.columns.values[30:35] = ['Q27', 'Q28', 'Q29', 'Q30', 'Q31']
### Write to output ###
survey_df.to_csv('./output/cleaned_survey.csv', sep='\t', encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment