Last active
May 26, 2023 08:31
-
-
Save jiahao87/c97214065f996b76ab8fe4ca1964b2b5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import missingno | |
import warnings | |
# Suppress every warning for the rest of the session so that library
# deprecation/runtime notices do not clutter the notebook output.
warnings.filterwarnings("ignore") | |
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline | |
def _resample_sum(frame, aliases):
    """Sum ``frame`` per period using the first frequency alias pandas accepts.

    ``aliases`` lists equivalent frequency strings, legacy spelling first
    (e.g. ``('M', 'ME')``), so the same code runs on pandas versions that only
    know the old alias and on versions that only accept the new one.
    """
    for alias in aliases[:-1]:
        try:
            return frame.resample(alias).sum()
        except ValueError:
            # Alias not recognised by this pandas version; try the next one.
            continue
    return frame.resample(aliases[-1]).sum()


def time_series_plot(df):
    """Plot each numeric column as a daily, monthly and yearly time series.

    For every ``datetime64`` column of ``df``, the numeric columns are summed
    per day, month and year (with that datetime column as the index) and each
    aggregated numeric column is drawn as its own line plot. The y axis starts
    at zero and uses thousands separators.

    Args:
        df: pandas DataFrame to plot. It is not modified.

    Returns:
        None. Figures are rendered with ``plt.show()``.
    """
    print("\nTo check time series of numeric data by daily, monthly and yearly frequency")
    datetime_cols = df.select_dtypes(include='datetime64').columns
    numeric_cols = list(df.select_dtypes(include=np.number).columns)
    frequencies = (
        ("daily", ('D',)),
        ("monthly", ('M', 'ME')),
        ("yearly", ('Y', 'YE')),
    )

    for col in datetime_cols:
        # Keep only numeric columns: summing categorical or additional
        # datetime columns raises in pandas >= 2.0, and they are never plotted.
        indexed = df.set_index(col)[numeric_cols]
        for label, aliases in frequencies:
            print(f"Plotting {label} data")
            if not numeric_cols:
                continue
            # Aggregate once per frequency instead of once per numeric column.
            aggregated = _resample_sum(indexed, aliases)
            for col_num in numeric_cols:
                ax = aggregated[[col_num]].plot()
                ax.set_ylim(bottom=0)
                ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(
                        lambda value, _pos: format(int(value), ',')))
                plt.show()
def numeric_eda(df, hue=None):
    """Display summary statistics and distribution plots for numeric columns.

    Shows ``df.describe()`` (transposed), one box plot per numeric column,
    a violin plot for every (numeric column, category column) pair, and a
    pair plot of all numeric columns.

    Relies on a ``display`` callable being available in scope (it is not
    imported in this file; presumably the IPython/Jupyter builtin).

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name used to colour the pair plot. It may be a
            numeric or a non-numeric column of ``df``.

    Returns:
        None. Output is printed/displayed and figures are shown.
    """
    print("\nTo check: \nDistribution of numeric data")
    display(df.describe().T)

    numeric_df = df.select_dtypes(include=np.number)
    columns = numeric_df.columns
    if columns.empty:
        # Without numeric columns a 1x0 subplot grid is invalid and there is
        # nothing to pair-plot, so stop instead of raising.
        print("\nNo numeric columns found; skipping numeric plots")
        return

    # One box plot per numeric column, laid out side by side.
    figure = plt.figure(figsize=(20, 10))
    for position, col in enumerate(columns, start=1):
        figure.add_subplot(1, len(columns), position)
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'})
    figure.tight_layout()
    plt.show()

    # Distribution of each numeric column split by each categorical column.
    category_cols = df.select_dtypes(include='category').columns
    for col_num in columns:
        for col in category_cols:
            fig = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            fig.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None or hue in columns:
        # A numeric hue is already part of numeric_df; joining it again would
        # fail on the overlapping column name.
        pair_data = numeric_df
    else:
        pair_data = numeric_df.join(df[[hue]])
    sns.pairplot(pair_data, hue=hue)
    plt.show()
def top5(df):
    """Print the five most frequent values of every non-numeric column.

    For each ``object`` or ``category`` column, prints a heading followed by a
    two-column table: the value (under the column's own name) and its
    frequency (under ``Count``), most frequent first, at most five rows.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Results are printed.
    """
    for col in df.select_dtypes(include=['object', 'category']).columns:
        # f-string so non-string column labels (e.g. integers) also work.
        print(f"Top 5 unique values of {col}")
        counts = df[col].value_counts().head(5)
        # Name the index and the values explicitly: the default names that
        # reset_index() produces differ between pandas 1.x and 2.x.
        print(counts.rename_axis(col).reset_index(name="Count"))
        print(" ")
def categorical_eda(df, hue=None):
    """Summarise and plot the non-numeric columns of a dataframe.

    Prints the number of unique values of every object/category column,
    delegates to ``top5`` for the most frequent values, then draws one count
    plot per ``category`` column (optionally split by ``hue``).
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # Count plots are drawn for category-dtype columns only.
    category_names = df.select_dtypes(include='category').columns
    for name in category_names:
        grid = sns.catplot(x=name, kind="count", data=df, hue=hue)
        grid.set_xticklabels(rotation=90)
        plt.show()
def eda(df):
    """Run the full exploratory data analysis on a dataframe.

    Steps, in order: preview the first rows, print ``df.info()``, preview rows
    containing nulls (with a missingno matrix) if any exist, report duplicated
    rows, then run ``categorical_eda``, ``numeric_eda`` and
    ``time_series_plot``.

    Cells that are empty or contain only whitespace are treated as missing.
    The caller's dataframe is not modified; the replacement works on a copy.

    Relies on a ``display`` callable being available in scope (it is not
    imported in this file; presumably the IPython/Jupyter builtin).

    Args:
        df: pandas DataFrame (or subclass) to analyse.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    # check that input is pandas dataframe (subclasses are accepted too)
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    display(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() writes its report itself and returns None, so it is not printed.
    df.info()

    # generate preview of entries with null values
    null_mask = df.isnull()
    if null_mask.any(axis=None):
        print("\nPreview of data with null values:")
        display(df[null_mask.any(axis=1)].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        # keep=False shows every member of each duplicate group side by side.
        display(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)

    # EDA of numeric data
    numeric_eda(df)

    # Plot time series plot of numeric data
    time_series_plot(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You are right. Thank you for the clarification.