Skip to content

Instantly share code, notes, and snippets.

@atulkumar2
Last active March 5, 2023 14:03
Show Gist options
  • Save atulkumar2/2bdcce410e856113f6908f52f217425e to your computer and use it in GitHub Desktop.
Save atulkumar2/2bdcce410e856113f6908f52f217425e to your computer and use it in GitHub Desktop.
##############################################################
# Some functions for basic EDA
# Written with active help from ChatGPT
##############################################################
def header_comment(msg):
    ''' Print msg as a section title, underlined by a 50-dash rule and
    followed by one blank line.
    '''
    rule = '-' * 50
    # The trailing empty item yields the newline after the rule; print()
    # then adds its own, producing the blank separator line.
    print('\n'.join((msg, rule, '')))
def basic_df_info(df):
    ''' Print a quick overview of a dataframe.

    Output, in order:
      * a section header,
      * the column index (df.columns),
      * the df.info() summary: dimensions, column names, data types and
        non-null counts in a concise format,
      * the first 5 rows (df.head()).

    Args:
        df: the pandas DataFrame to summarise.
    '''
    header_comment('BASIC INFORMATION FOR DATAFRAME')
    print(df.columns)
    # df.info() writes its report itself and returns None; wrapping it in
    # print() would append a stray "None" line after the summary.
    df.info()
    # `display` is not defined in this file (IPython/Jupyter sessions
    # provide it); fall back to print so the function also works in a
    # plain interpreter.
    try:
        show = display
    except NameError:
        show = print
    show(df.head())
def display_nonnull_info(df):
    ''' Print the df.info() summary (with non-null counts) restricted to
    the columns that contain at least one null value.

    Only the section header is printed when no column has nulls.
    '''
    header_comment('NONNULL DATA INFORMATION - BASIC')
    missing_per_column = df.isnull().sum()
    affected = missing_per_column[missing_per_column > 0]
    # Nothing to report when every column is fully populated
    if affected.empty:
        return
    affected_cols = list(affected.index)
    df[affected_cols].info(show_counts=True)
def display_null_info(df):
    ''' Print the number of null values for each column that has any.

    Only the section header is printed when no column has nulls.
    '''
    header_comment('NULL DATA INFORMATION - BASIC')
    missing_per_column = df.isnull().sum()
    with_nulls = missing_per_column[missing_per_column > 0]
    # Skip the listing entirely when no column has missing data
    if not with_nulls.empty:
        print(with_nulls)
def display_null_info_detailed(df):
    ''' Print a per-column summary of missing values.

    Only columns containing at least one null are listed. For each of
    them the printed table shows:
      * 'Null Count' - number of null values in the column,
      * 'Data Type'  - the column dtype,
      * 'Percentage' - nulls as a percentage of len(df), rounded to
        two decimals.

    Args:
        df: the pandas DataFrame to inspect.
    '''
    # Imported locally: the file has no module-level pandas import, so a
    # bare `pd` resolves only if the surrounding namespace already defines it.
    import pandas as pd

    null_counts = df.isnull().sum()
    nonzero_null_counts = null_counts[null_counts > 0]
    nonzero_null_cols = list(nonzero_null_counts.index)
    dtypes = df.dtypes[nonzero_null_cols]
    # Both Series share the same index (the affected column names), so
    # concatenating along axis=1 lines them up row by row.
    df_null_info = pd.concat([nonzero_null_counts, dtypes], axis=1)
    df_null_info.columns = ['Null Count', 'Data Type']
    df_null_info['Percentage'] = (df_null_info['Null Count'] / len(df) * 100).round(2)
    print(df_null_info)
def display_columns_info(category, ls, additional=None):
    ''' Print one space-separated line: the category label, the number of
    items in ls, ls itself and, when given, the additional trailing value.
    '''
    fields = [category, len(ls), ls]
    # Only None means "omitted"; other falsy values such as '' are printed
    if additional is not None:
        fields.append(additional)
    print(*fields)
def display_allcolumns_info(df):
    ''' Print the count and names of all columns, the numeric columns and
    the non-numeric columns of df, each followed by a blank line.

    Args:
        df: the pandas DataFrame whose columns are listed.
    '''
    header_comment('ALL COLUMN DATA')
    # 'number' is the string selector pandas accepts in place of np.number,
    # so this function does not depend on a numpy import the file lacks.
    display_columns_info('All Columns', df.columns.tolist(), '\n')
    display_columns_info('Numeric Columns', df.select_dtypes(include='number').columns.tolist(), '\n')
    display_columns_info('Non Numeric Columns', df.select_dtypes(exclude='number').columns.tolist(), '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment