Last active
March 5, 2023 14:03
-
-
Save atulkumar2/2bdcce410e856113f6908f52f217425e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##############################################################
# Some functions for basic EDA (exploratory data analysis)
# Written with active help from ChatGPT
##############################################################
def header_comment(msg, width=50, char='-'):
    """Print a section heading followed by a horizontal rule.

    The output is ``msg``, a newline, ``char`` repeated ``width`` times and a
    newline; print() then appends its own newline, leaving a blank line
    after the rule.

    Args:
        msg: Heading text. Any object is accepted; it is formatted via str(),
            so non-string headings no longer raise TypeError.
        width: Number of times ``char`` is repeated (default 50, the
            previously hard-coded width).
        char: String used to draw the rule (default '-').
    """
    rule = char * width
    print(f'{msg}\n{rule}\n')
def basic_df_info(df):
    """Print a quick overview of a dataframe.

    Emits, in order: a section header, the column index, the ``df.info()``
    summary and the first five rows (``df.head()``).

    Args:
        df: The dataframe to describe.
    """
    header_comment('BASIC INFORMATION FOR DATAFRAME')
    print(df.columns)
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line after the summary.
    df.info()
    # NOTE(review): `display` is not defined in this file; presumably it is
    # the builtin that IPython/Jupyter sessions provide. Fall back to print()
    # so the function also works where that name does not exist.
    try:
        show = display
    except NameError:
        show = print
    show(df.head())
def display_nonnull_info(df):
    """Print ``info()`` with non-null counts for the columns that contain nulls.

    A section header is always printed. If the dataframe has no null values
    at all, nothing else is printed.

    Args:
        df: The dataframe to inspect.
    """
    header_comment('NONNULL DATA INFORMATION - BASIC')
    missing = df.isnull()
    if not missing.any().any():
        return
    per_column = missing.sum()
    # Restrict the report to columns with at least one null.
    affected = list(per_column[per_column > 0].index)
    df[affected].info(show_counts=True)
def display_null_info(df):
    """Print the null count of every column that has at least one null.

    A section header is always printed. If the dataframe has no null values
    at all, nothing else is printed.

    Args:
        df: The dataframe to inspect.
    """
    header_comment('NULL DATA INFORMATION - BASIC')
    missing = df.isnull()
    if not missing.any().any():
        return
    per_column = missing.sum()
    print(per_column[per_column > 0])
def display_null_info_detailed(df):
    """Print null count, data type and null percentage per affected column.

    Only columns with at least one null value are listed. The percentage is
    the null count relative to ``len(df)``, rounded to two decimals. When no
    column contains nulls nothing is printed, matching the other null
    reporters in this file, which print no table in that case.

    Args:
        df: The dataframe to inspect.
    """
    null_counts = df.isnull().sum()
    nonzero_null_counts = null_counts[null_counts > 0]
    if nonzero_null_counts.empty:
        # No nulls anywhere: skip the "Empty DataFrame" dump.
        return
    nonzero_null_cols = list(nonzero_null_counts.index)
    dtypes = df.dtypes[nonzero_null_cols]
    df_null_info = pd.concat([nonzero_null_counts, dtypes], axis=1)
    df_null_info.columns = ['Null Count', 'Data Type']
    df_null_info['Percentage'] = (
        df_null_info['Null Count'] / len(df) * 100
    ).round(2)
    print(df_null_info)
def display_columns_info(category, ls, additional=None):
    """Print a category label, the number of items in ``ls`` and ``ls`` itself.

    Args:
        category: Label printed first.
        ls: Collection of column names; its length and contents are printed.
        additional: Optional extra value printed last. It is omitted only
            when it is None.
    """
    fields = [category, len(ls), ls]
    if additional is not None:
        fields.append(additional)
    print(*fields)
def display_allcolumns_info(df):
    """Print all, numeric and non-numeric column names of a dataframe.

    A section header is printed first. Each group is then reported through
    display_columns_info with a trailing newline as the extra value.

    Args:
        df: The dataframe whose columns are listed.
    """
    header_comment('ALL COLUMN DATA')
    # (label, select_dtypes keyword arguments); None means "no filtering".
    groups = (
        ('All Columns', None),
        ('Numeric Columns', {'include': np.number}),
        ('Non Numeric Columns', {'exclude': np.number}),
    )
    for label, dtype_filter in groups:
        subset = df if dtype_filter is None else df.select_dtypes(**dtype_filter)
        display_columns_info(label, subset.columns.tolist(), '\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment