Skip to content

Instantly share code, notes, and snippets.

@bhishanpdl
Last active September 2, 2018 21:25
Show Gist options
  • Save bhishanpdl/e453575fd66c1caeb93a5bf0f9c30baf to your computer and use it in GitHub Desktop.
Save bhishanpdl/e453575fd66c1caeb93a5bf0f9c30baf to your computer and use it in GitHub Desktop.
def clean_data(data):
"""
Clean the pandas dataframe.
data: pandas dataframe
return: cleaned dataframe
1. Make column names lowercase and underscored.
2. Rename repeated column names.
3. Delete a column is there are a lot of missing values.
4. Fill all '' or ' ' by numpy nans in the dataframe.
5. Give nice datatype to each columns for > 97% data.
"""
import pandas as pd
import numpy as np
import gc
# 1. replace ' ' by '_'
data.columns = [str(x.lower().strip().replace(' ', '_'))
for x in data.columns]
# 2. If colnames are repeated make them columnname_position
seen = {}
columns = []
i = 0
for i, x in enumerate(data.columns):
if x in seen:
columns.append(x+'_{}'.format(i))
else:
columns.append(x)
seen[x] = None
# 3. Delete a column is there are a lot of missing values
for x in data.columns[data.count()/len(data) < 0.0001]:
del data[x]
gc.collect()
# 4. Fill all '' or ' ' by numpy nans in the dataframe
try:
data = data.replace({'': np.nan, ' ': np.nan})
except:
pass
# 5. Give nice datatype to each columns
# e.g. if >97% integers, make them integer.
size = len(data)
for x in data.columns:
ints = pd.to_numeric(data[x], downcast='integer', errors='coerce')
# if more than 97% values in a column are integers, set type int.
if ints.count()/size > 0.97:
minimum = ints.min()
if minimum > 0:
data[x] = pd.to_numeric(
data[x], downcast='unsigned', errors='coerce')
else:
data[x] = pd.to_numeric(
data[x], downcast='integer', errors='coerce')
else:
floats = pd.to_numeric(
data[x], downcast='float', errors='coerce')
if floats.count()/size > 0.97:
data[x] = pd.to_numeric(
data[x], downcast='float', errors='coerce')
else:
dates = pd.to_datetime(data[x], errors='coerce')
if dates.count()/size > 0.97:
data[x] = pd.to_datetime(data[x], errors='coerce')
return data.reset_index(drop=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment