Last active
March 16, 2022 10:50
-
-
Save aurelienpierre/ab427b5bb89fdc3e091a6c48cbea7a2b to your computer and use it in GitHub Desktop.
Perform usual types and values checks on columns of a pandas.DataFrame
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The snippet relies on these two aliases throughout; import them here so the
# script runs on its own.
import numpy as np
import pandas as pd

# Create a DataFrame with fake data: 30 rows of standard-normal values in
# columns A-D, labelled by a text index, with roughly 10 % of the entries
# replaced by NaN. Built explicitly because the pd.util.testing helpers are
# not available in current pandas releases.
rng = np.random.default_rng()
values = rng.standard_normal((30, 4))
values[rng.random(values.shape) > 0.9] = np.nan
df = pd.DataFrame(
    values,
    index=["row_%02d" % i for i in range(30)],
    columns=list("ABCD"),
)
df['index1'] = df.index  # create a text column by replicating index
df['A'] = 0  # create a zero column
# Helper function | |
def check_df_sanity(df, verbose=False):
    """Perform usual types and values checks on columns of a pandas.DataFrame.

    A short report is printed to stdout for every column:

    * numeric columns: mean ± population standard deviation (ddof=0), then
      the count and percentage of zero and of negative values, when any;
    * whether the column is detected as string or datetime;
    * the count and percentage of missing (null) entries, when any.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. Columns are visited in order; duplicated and
        tuple (MultiIndex) column labels are supported.
    verbose : bool, optional
        When true, also report less useful checks (currently: whether the
        column label can be compiled as a regular expression).

    Returns
    -------
    None
        The report is only printed.
    """
    # items() yields exactly one Series per column, even when two columns
    # share a label (df[label] would return a DataFrame in that case).
    for col, series in df.items():
        num_elem = series.size

        # Types checks
        is_numeric = pd.api.types.is_numeric_dtype(series)
        is_string = pd.api.types.is_string_dtype(series)
        is_date = pd.api.types.is_datetime64_any_dtype(series)

        # NaN checks
        nulls = series.isnull().sum()

        # Report
        # Wrap the label in a 1-tuple: a tuple label would otherwise be
        # unpacked by % as the argument list and raise TypeError.
        print("column %s :" % (col,))

        if is_numeric:
            print("\tis numeric")

            # average ± standard deviation (missing entries are skipped)
            print("\thas average = (%f ± %f)" % (series.mean(), series.std(ddof=0)))

            # zero values
            zeros = (series == 0).sum()
            if zeros:
                print("\thas %i zero values (%.2f %%)" % (zeros, 100. * zeros / num_elem))

            # negative values
            negatives = (series < 0).sum()
            if negatives:
                print("\thas %i negative values (%.2f %%)" % (negatives, 100. * negatives / num_elem))

        if is_string:
            print("\tis string")

        if is_date:
            print("\tis date")

        if nulls:
            print("\thas %i missing entries (%.2f %%)" % (nulls, 100. * nulls / num_elem))

        if verbose:
            # less used/useful checks go here
            if pd.api.types.is_re_compilable(col):
                print("\thas a title that can be used in regex")
# Demo run: print the sanity report for the fake DataFrame, without the verbose extras
check_df_sanity(df, verbose=False)
# example output | |
""" | |
column A : | |
is numeric | |
has average = (0.000000 ± 0.000000) | |
has 30 zero values (100.00 %) | |
column B : | |
is numeric | |
has average = (0.030225 ± 1.107637) | |
has 13 negative values (43.33 %) | |
has 6 missing entries (20.00 %) | |
column C : | |
is numeric | |
has average = (-0.169237 ± 0.999756) | |
has 14 negative values (46.67 %) | |
has 3 missing entries (10.00 %) | |
column D : | |
is numeric | |
has average = (-0.141427 ± 1.015646) | |
has 18 negative values (60.00 %) | |
column index1 : | |
is string | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment