Last active
September 2, 2018 21:25
-
-
Save bhishanpdl/e453575fd66c1caeb93a5bf0f9c30baf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def clean_data(data): | |
""" | |
Clean the pandas dataframe. | |
data: pandas dataframe | |
return: cleaned dataframe | |
1. Make column names lowercase and underscored. | |
2. Rename repeated column names. | |
3. Delete a column is there are a lot of missing values. | |
4. Fill all '' or ' ' by numpy nans in the dataframe. | |
5. Give nice datatype to each columns for > 97% data. | |
""" | |
import pandas as pd | |
import numpy as np | |
import gc | |
# 1. replace ' ' by '_' | |
data.columns = [str(x.lower().strip().replace(' ', '_')) | |
for x in data.columns] | |
# 2. If colnames are repeated make them columnname_position | |
seen = {} | |
columns = [] | |
i = 0 | |
for i, x in enumerate(data.columns): | |
if x in seen: | |
columns.append(x+'_{}'.format(i)) | |
else: | |
columns.append(x) | |
seen[x] = None | |
# 3. Delete a column is there are a lot of missing values | |
for x in data.columns[data.count()/len(data) < 0.0001]: | |
del data[x] | |
gc.collect() | |
# 4. Fill all '' or ' ' by numpy nans in the dataframe | |
try: | |
data = data.replace({'': np.nan, ' ': np.nan}) | |
except: | |
pass | |
# 5. Give nice datatype to each columns | |
# e.g. if >97% integers, make them integer. | |
size = len(data) | |
for x in data.columns: | |
ints = pd.to_numeric(data[x], downcast='integer', errors='coerce') | |
# if more than 97% values in a column are integers, set type int. | |
if ints.count()/size > 0.97: | |
minimum = ints.min() | |
if minimum > 0: | |
data[x] = pd.to_numeric( | |
data[x], downcast='unsigned', errors='coerce') | |
else: | |
data[x] = pd.to_numeric( | |
data[x], downcast='integer', errors='coerce') | |
else: | |
floats = pd.to_numeric( | |
data[x], downcast='float', errors='coerce') | |
if floats.count()/size > 0.97: | |
data[x] = pd.to_numeric( | |
data[x], downcast='float', errors='coerce') | |
else: | |
dates = pd.to_datetime(data[x], errors='coerce') | |
if dates.count()/size > 0.97: | |
data[x] = pd.to_datetime(data[x], errors='coerce') | |
return data.reset_index(drop=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment