bhishanpdl · September 2, 2018 21:25
diff --git a/data_clean.py b/data_clean.py
 def clean_data(data):    
    """
    Clean the pandas dataframe.

    data: pandas dataframe 

    return: cleaned dataframe


    1. Make column names lowercase and underscored.
    2. Rename repeated column names.
    3. Delete a column is there are a lot of missing values.
    4. Fill all '' or ' ' by numpy nans in the dataframe.
    5. Give nice datatype to each columns for > 97% data.

    """
    import pandas as pd
    import numpy as np
    import gc
    
    
    # 1. replace ' ' by '_'
    data.columns = [str(x.lower().strip().replace(' ', '_'))
                    for x in data.columns]
    
    # 2. If colnames are repeated make them columnname_position
    seen = {}
    columns = []
    i = 0
    for i, x in enumerate(data.columns):
        if x in seen:
            columns.append(x+'_{}'.format(i))
        else:
            columns.append(x)
        seen[x] = None

    # 3. Delete a column is there are a lot of missing values
    for x in data.columns[data.count()/len(data) < 0.0001]:
        del data[x]
    gc.collect()
    
    # 4. Fill all '' or ' ' by numpy nans in the dataframe
    try:
        data = data.replace({'': np.nan, ' ': np.nan})
    except:
        pass

    
    # 5. Give nice datatype to each columns
    # e.g. if >97% integers, make them integer.
    size = len(data)
    for x in data.columns:
        ints = pd.to_numeric(data[x], downcast='integer', errors='coerce')
        
        # if more than 97% values in a column are integers, set type int.
        if ints.count()/size > 0.97:
            minimum = ints.min()
            if minimum > 0:
                data[x] = pd.to_numeric(
                    data[x], downcast='unsigned', errors='coerce')
            else:
                data[x] = pd.to_numeric(
                    data[x], downcast='integer', errors='coerce')
        else:
            floats = pd.to_numeric(
                data[x], downcast='float', errors='coerce')
            if floats.count()/size > 0.97:
                data[x] = pd.to_numeric(
                    data[x], downcast='float', errors='coerce')
            else:
                dates = pd.to_datetime(data[x], errors='coerce')
                if dates.count()/size > 0.97:
                    data[x] = pd.to_datetime(data[x], errors='coerce')
    return data.reset_index(drop=True)
	def clean_data(data):
	"""
	Clean the pandas dataframe.

	data: pandas dataframe

	return: cleaned dataframe


	1. Make column names lowercase and underscored.
	2. Rename repeated column names.
	3. Delete a column is there are a lot of missing values.
	4. Fill all '' or ' ' by numpy nans in the dataframe.
	5. Give nice datatype to each columns for > 97% data.

	"""
	import pandas as pd
	import numpy as np
	import gc


	# 1. replace ' ' by '_'
	data.columns = [str(x.lower().strip().replace(' ', '_'))
	for x in data.columns]

	# 2. If colnames are repeated make them columnname_position
	seen = {}
	columns = []
	i = 0
	for i, x in enumerate(data.columns):
	if x in seen:
	columns.append(x+'_{}'.format(i))
	else:
	columns.append(x)
	seen[x] = None

	# 3. Delete a column is there are a lot of missing values
	for x in data.columns[data.count()/len(data) < 0.0001]:
	del data[x]
	gc.collect()

	# 4. Fill all '' or ' ' by numpy nans in the dataframe
	try:
	data = data.replace({'': np.nan, ' ': np.nan})
	except:
	pass


	# 5. Give nice datatype to each columns
	# e.g. if >97% integers, make them integer.
	size = len(data)
	for x in data.columns:
	ints = pd.to_numeric(data[x], downcast='integer', errors='coerce')

	# if more than 97% values in a column are integers, set type int.
	if ints.count()/size > 0.97:
	minimum = ints.min()
	if minimum > 0:
	data[x] = pd.to_numeric(
	data[x], downcast='unsigned', errors='coerce')
	else:
	data[x] = pd.to_numeric(
	data[x], downcast='integer', errors='coerce')
	else:
	floats = pd.to_numeric(
	data[x], downcast='float', errors='coerce')
	if floats.count()/size > 0.97:
	data[x] = pd.to_numeric(
	data[x], downcast='float', errors='coerce')
	else:
	dates = pd.to_datetime(data[x], errors='coerce')
	if dates.count()/size > 0.97:
	data[x] = pd.to_datetime(data[x], errors='coerce')
	return data.reset_index(drop=True)