Last active
November 21, 2019 07:14
-
-
Save ivankeller/1841120149f0aa4311ee24ea2e8fe665 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
def describe_df(df, cols=None, max_distinct=10):
    """Describe columns of the given dataframe.

    Return a dataframe with, for each requested column, its name, dtype,
    number of missing values, number of distinct values and (when there are
    few enough of them) the distinct values themselves.

    Missing values are counted as one distinct value.

    For object/string columns, empty strings ``''`` are counted as missing
    in addition to null values (``None`` / ``NaN``). For every other dtype
    only null values are counted.

    Known issue: if a column contains integers and missing values, pandas
    casts it to float and adds ``.0`` to the values. This misleads the
    ``type`` and ``distinct_values`` columns.

    Parameters
    ----------
    df : pandas dataframe
    cols : iterable of str, optional
        Column names to describe; if None (default) consider all columns.
    max_distinct : int, optional
        Distinct values are reported only for columns having at most this
        many distinct values (default 10); otherwise ``None`` is reported.

    Returns
    -------
    df : dataframe or None
        Dataframe with one row per described column and the columns
        ``column``, ``type``, ``nb_missing``, ``nb_distinct`` and
        ``distinct_values``. If ``cols`` contains names that are not
        columns of ``df``, a message is printed and None is returned.
    """
    if cols is None:
        cols = list(df.columns)
    else:
        # Materialize once: `cols` is consumed both by the validation below
        # and by the main loop, which would exhaust a one-shot iterable.
        cols = list(cols)
        unknown_cols = set(cols) - set(df.columns)
        if unknown_cols:
            print("Unkown columns:", unknown_cols)
            return None

    typ = []
    nb_missing = []
    nb_distinct = []
    distinct_values = []
    for col in cols:
        series = df[col]
        dtype = series.dtype
        typ.append(dtype)

        # Distinct values (null counts as one distinct value).
        distincts = series.unique()
        nb_dist = len(distincts)
        nb_distinct.append(nb_dist)
        distinct_values.append(distincts if nb_dist <= max_distinct else None)

        # Missing values: nulls for every dtype, plus empty strings for
        # text-like columns. The dtype must be tested with the pandas
        # helpers; an identity test against the literal 'object' is never
        # true.
        missing = int(series.isnull().sum())
        if (pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)):
            missing += int((series == '').sum())
        nb_missing.append(missing)

    return pd.DataFrame(
        {
            'column': cols,
            'type': typ,
            'nb_missing': nb_missing,
            'nb_distinct': nb_distinct,
            'distinct_values': distinct_values,
        },
        columns=['column', 'type', 'nb_missing', 'nb_distinct',
                 'distinct_values'],
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment