Last active
October 20, 2021 07:43
-
-
Save ltfschoen/4c5d2cf26b8be5355043273493a6b8b9 to your computer and use it in GitHub Desktop.
Calculate percentage of NaN values in a Pandas Dataframe for each column. Exclude columns that do not contain any NaN values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Luke Schoen 2017 | |
import pandas as pd | |
import numpy as np | |
import functools | |
# Create DataFrame | |
# df = pd.DataFrame(np.random.randn(10,2)) | |
# Populate with NaN values | |
df = pd.DataFrame({'col1': ['1.111', '2.111', '3.111', '4.111'], 'col2': ['4.111', '5.111', np.NaN, '7.111'], 'col3': ['8', '9', np.NaN, np.NaN], 'col4': ['12', '13', '14', '15']}) | |
# Round all values to 2 decimal places | |
df.apply(functools.partial(np.round, decimals=2)) | |
# Populate DataFrame column 0 and indexed rows 2 to 6 with NaN values | |
df.iloc[3:6,0] = np.nan | |
def get_percentage_missing(series): | |
""" Calculates percentage of NaN values in DataFrame | |
:param series: Pandas DataFrame object | |
:return: float | |
""" | |
num = series.isnull().sum() | |
den = len(series) | |
return round(num/den, 2) | |
# Only include columns that contain any NaN values | |
df_with_any_null_values = df[df.columns[df.isnull().any()].tolist()] | |
get_percentage_missing(df_with_any_null_values) | |
# Show qty of each value in a Column | |
# df.astype(str).groupby(['col1']).sum() | |
# Show DataFrame | |
df | |
# df.head() | |
# Show DataFrame info | |
print(df.describe()) | |
print(df.info()) | |
# Iterate over columns in DataFrame and delete those with where >30% of the values are null/NaN | |
for name, values in df_with_any_null_values.iteritems(): | |
print("%r: %r" % (name, values) ) | |
if get_percentage_missing(df_with_any_null_values[name]) > 0.3: | |
print("Deleting Column %r: " % (name) ) | |
df_with_any_null_values.drop(name, axis=1, inplace=True) | |
# Iterate over columns in DataFrame and delete rows of columns where any values are null/NaN | |
for name, values in df_with_any_null_values.iteritems(): | |
if name != "id": | |
if get_percentage_missing(df_with_any_null_values[name]) < 0.01: | |
print("Retained Column: %r, but removed its null and NaN valued rows" % (name) ) | |
print("BEFORE %r: %r" % (name, values) ) | |
df_with_any_null_values.dropna(axis=0, how="any", subset=[name], inplace=True) | |
print("AFTER %r: %r" % (name, values) ) | |
# Select only Columns of certain types | |
# http://pandas.pydata.org/pandas-docs/version/0.19.2/generated/pandas.DataFrame.select_dtypes.html | |
df.select_dtypes(include=['int', 'float64', 'floating', 'number'], exclude=['O']) | |
# Iterate over Columns and perform modifications depending on the type | |
# IMPORTANT NOTE: ENSURE ONLY USE AFTER REMOVE NAN VALUES | |
for col in df.columns: | |
for name, values in df[col].iteritems(): | |
# print("%r, %r" % (name, values)) | |
if(values.dtype == np.float64 or values.dtype == np.int64): | |
print("float or int type %r" % (values.dtype)) | |
# treat_numeric(df[name]) | |
elif(df[name].dtype == np.str): | |
print("string type %r" % (df[name].dtype)) | |
#treat_str(df[y]) | |
elif(df[name].dtype == np.object): | |
print("object type %r" % (df[name].dtype)) | |
#treat_object(df[name]) | |
else: | |
print("other type %r" % (values.dtype)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment