Skip to content

Instantly share code, notes, and snippets.

@jiahao87
Last active May 26, 2023 08:31
Show Gist options
  • Save jiahao87/c97214065f996b76ab8fe4ca1964b2b5 to your computer and use it in GitHub Desktop.
Save jiahao87/c97214065f996b76ab8fe4ca1964b2b5 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
def time_series_plot(df):
    """Plot every numeric column as a daily, monthly and yearly time series.

    For each ``datetime64`` column in ``df``, the frame is indexed by that
    column and each numeric column is resampled (summed) at the 'D', 'M' and
    'Y' frequencies and shown as a line plot. Nothing is plotted when ``df``
    has no ``datetime64`` column.

    Args:
        df: pandas DataFrame to explore.

    Returns:
        None. Output is printed and shown through ``plt.show()``.
    """
    print("\nTo check time series of numeric data by daily, monthly and yearly frequency")
    datetime_cols = df.select_dtypes(include='datetime64').columns
    if len(datetime_cols) == 0:
        return

    numeric_cols = df.select_dtypes(include=np.number).columns
    # NOTE(review): 'M' and 'Y' are the resample aliases the original code
    # uses; newer pandas releases prefer 'ME'/'YE' - confirm against the
    # pandas version in use.
    frequency_messages = {
        'D': "Plotting daily data",
        'M': "Plotting monthly data",
        'Y': "Plotting yearly data",
    }
    # Thousands separator on the y axis; the tick position argument is unused.
    thousands = matplotlib.ticker.FuncFormatter(
        lambda value, _pos: format(int(value), ','))

    for col in datetime_cols:
        # set_index returns a new frame, so the caller's df is left untouched.
        indexed = df.set_index(col)
        for freq, message in frequency_messages.items():
            print(message)
            for col_num in numeric_cols:
                # Resample only the column being plotted: summing the whole
                # frame would also aggregate categorical/object/datetime
                # columns, which is wasted work and can raise.
                resampled = indexed[[col_num]].resample(freq).sum()
                ax = resampled.plot()
                ax.set_ylim(bottom=0)
                ax.get_yaxis().set_major_formatter(thousands)
                plt.show()
def numeric_eda(df, hue=None):
    """Show summary statistics and distribution plots for numeric columns.

    Displays ``df.describe()`` transposed, one box plot per numeric column,
    a violin plot for every (numeric, category-dtype) column pair, and a
    pairwise joint distribution plot of the numeric columns.

    ``display`` is not imported in this file; it is expected to be provided
    by the notebook environment the code runs in.

    Args:
        df: pandas DataFrame to explore.
        hue: optional column name used to colour the pair plot.

    Returns:
        None. Output is printed/displayed and shown through ``plt.show()``.
    """
    print("\nTo check: \nDistribution of numeric data")
    display(df.describe().T)

    numeric_cols = df.select_dtypes(include=np.number).columns
    if len(numeric_cols) == 0:
        # A subplot grid with zero columns is invalid and there is nothing
        # to plot, so stop here instead of raising.
        print("\nNo numeric columns found")
        return

    # One box plot per numeric column, laid out side by side.
    figure = plt.figure(figsize=(20, 10))
    for position, col in enumerate(numeric_cols, start=1):
        figure.add_subplot(1, len(numeric_cols), position)
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'})
    figure.tight_layout()
    plt.show()

    # Violin plot of each numeric column split by each category column.
    category_cols = df.select_dtypes(include='category').columns
    for col_num in numeric_cols:
        for col in category_cols:
            fig = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            fig.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None:
        sns.pairplot(df[numeric_cols])
    else:
        # Select columns directly rather than joining on the index: a join
        # fails when hue is itself numeric (overlapping column) and
        # duplicates rows when the index is not unique.
        pair_cols = list(numeric_cols)
        if hue not in pair_cols:
            pair_cols.append(hue)
        sns.pairplot(df[pair_cols], hue=hue)
    plt.show()
def top5(df):
    """Print the five most frequent values of every non-numeric column.

    For each ``object`` or ``category`` column a two-column table is printed:
    the value (under the column's own name) and its frequency (under
    ``Count``), most frequent first.

    Args:
        df: pandas DataFrame to explore.

    Returns:
        None. Output is printed.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        # f-string so that non-string column labels do not break the message.
        print(f"Top 5 unique values of {col}")
        # Name the index and the counts explicitly instead of renaming the
        # default reset_index() labels, which differ between pandas versions.
        top_values = (
            df[col]
            .value_counts()
            .head(5)
            .rename_axis(col)
            .reset_index(name="Count")
        )
        print(top_values)
        print(" ")
def categorical_eda(df, hue=None):
    """Summarise the non-numeric columns of a dataframe.

    Prints the number of unique values of every ``object``/``category``
    column, prints their top-5 values via ``top5``, then draws a count plot
    for each ``category``-dtype column.

    Args:
        df: pandas DataFrame to explore.
        hue: optional column name passed to the count plots as ``hue``.

    Returns:
        None. Output is printed and shown through ``plt.show()``.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # Count plots are drawn for category-dtype columns only.
    category_columns = df.select_dtypes(include='category').columns
    for name in category_columns:
        grid = sns.catplot(x=name, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()
def eda(df):
    """Run the full exploratory data analysis on a dataframe.

    Steps, in order: preview the data, print ``df.info()``, preview rows with
    null values (with a ``missingno`` matrix), report duplicated rows, then
    call ``categorical_eda``, ``numeric_eda`` and ``time_series_plot``.

    ``display`` is not imported in this file; it is expected to be provided
    by the notebook environment the code runs in.

    Args:
        df: pandas DataFrame to explore. The caller's frame is not modified;
            blank-string cells are replaced with NaN on a copy.

    Returns:
        None. Output is printed/displayed and shown through ``plt.show()``.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    # check that input is pandas dataframe (subclasses are accepted)
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None").
    df.info()

    # generate preview of entries with null values
    rows_with_null = df.isnull().any(axis=1)
    if rows_with_null.any():
        print("\nPreview of data with null values:")
        display(df[rows_with_null].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        # keep=False shows every member of each duplicate group side by side.
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)

    # EDA of numeric data
    numeric_eda(df)

    # Plot time series plot of numeric data
    time_series_plot(df)
@Darveesh
Copy link

Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:

%matplotlib inline ^ SyntaxError: invalid syntax

and also the display method used throughout is undefined and throws an error as well:

display(df.head(3)) NameError: name 'display' is not defined

I am using python 3.8 and all the imported modules are pip installed.

@jiahao87
Copy link
Author

Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:

%matplotlib inline ^ SyntaxError: invalid syntax

and also the display method used throughout is undefined and throws an error as well:

display(df.head(3)) NameError: name 'display' is not defined

I am using python 3.8 and all the imported modules are pip installed.

Hi @Darveesh, I guess you are probably running the code as script in terminal. Please run the code in a notebook. Thank you.

@Darveesh
Copy link

Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
and also the display method used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using python 3.8 and all the imported modules are pip installed.

Hi @Darveesh, I guess you are probably running the code as script in terminal. Please run the code in a notebook. Thank you.

You are right. Thank you for the clarification.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment