-
-
Save jiahao87/c97214065f996b76ab8fe4ca1964b2b5 to your computer and use it in GitHub Desktop.
import pandas as pd | |
import numpy as np | |
import matplotlib | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import missingno | |
import warnings | |
warnings.filterwarnings("ignore") | |
%matplotlib inline | |
def time_series_plot(df):
    """Plot each numeric column summed per day, month and year.

    Every ``datetime64`` column of ``df`` is used in turn as the time index.
    For each of the three frequencies the numeric columns are summed per
    period and each one is drawn as its own line chart.

    Only the numeric columns are resampled: summing categorical or other
    datetime columns is meaningless and raises on recent pandas versions.

    Args:
        df: pandas DataFrame to plot. Nothing is plotted when it has no
            ``datetime64`` columns.

    Returns:
        None. Charts are shown with ``plt.show()``.
    """
    print("\nTo check time series of numeric data by daily, monthly and yearly frequency")

    datetime_cols = df.select_dtypes(include='datetime64').columns
    if len(datetime_cols) == 0:
        return

    numeric_cols = df.select_dtypes(include=np.number).columns
    # NOTE(review): newer pandas releases deprecate the 'M'/'Y' aliases in
    # favour of 'ME'/'YE' -- confirm against the pandas version in use.
    frequencies = {'D': 'daily', 'M': 'monthly', 'Y': 'yearly'}

    for col in datetime_cols:
        # set_index returns a new frame, so the caller's df is never mutated.
        indexed = df.set_index(col)[numeric_cols]
        for freq, label in frequencies.items():
            print(f"Plotting {label} data")
            if numeric_cols.empty:
                continue
            # Resample once per frequency instead of once per numeric column.
            totals = indexed.resample(freq).sum()
            for col_num in numeric_cols:
                ax = totals[[col_num]].plot()
                # Anchor the y-axis at zero.
                ax.set_ylim(bottom=0)
                # Show tick labels as integers with thousands separators.
                ax.get_yaxis().set_major_formatter(
                    matplotlib.ticker.FuncFormatter(
                        lambda value, _pos: format(int(value), ',')))
                plt.show()
def numeric_eda(df, hue=None):
    """Summarise and plot the numeric columns of a DataFrame.

    Prints ``df.describe()`` transposed, then draws:
      * one box plot per numeric column, side by side in a single figure;
      * one violin plot per (numeric column, category column) pair;
      * a seaborn pair plot of the numeric columns, optionally coloured
        by ``hue``.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name used to colour the pair plot. It may be
            a numeric or a non-numeric column.

    Returns:
        None. When ``df`` has no numeric columns only the summary table is
        printed and no figure is created.
    """
    print("\nTo check: \nDistribution of numeric data")
    summary = df.describe().T
    # `display` only exists inside IPython/Jupyter; fall back to print so the
    # function also works when run as a plain script.
    try:
        display(summary)
    except NameError:
        print(summary)

    numeric_data = df.select_dtypes(include=np.number)
    numeric_cols = numeric_data.columns
    if numeric_cols.empty:
        # A subplot grid with zero columns is invalid, so stop here.
        print("\nNo numeric data found")
        return

    # One box plot per numeric column, laid out in a single row.
    figure = plt.figure(figsize=(20, 10))
    for position, col in enumerate(numeric_cols, start=1):
        axis = figure.add_subplot(1, len(numeric_cols), position)
        sns.boxplot(y=col, data=df, boxprops={'facecolor': 'None'}, ax=axis)
    figure.tight_layout()
    plt.show()

    # Violin plot of every numeric column against every category column.
    category_cols = df.select_dtypes(include='category').columns
    for col_num in numeric_cols:
        for col in category_cols:
            grid = sns.catplot(x=col, y=col_num, kind='violin', data=df, height=5, aspect=2)
            grid.set_xticklabels(rotation=90)
            plt.show()

    # Plot the pairwise joint distributions
    print("\nTo check pairwise joint distribution of numeric data")
    if hue is None:
        sns.pairplot(numeric_data)
    else:
        # Joining a column that is already present raises on the overlapping
        # name, so only add `hue` when it is not itself numeric.
        if hue in numeric_cols:
            pair_data = numeric_data
        else:
            pair_data = numeric_data.join(df[[hue]])
        sns.pairplot(pair_data, hue=hue)
    plt.show()
def top5(df):
    """Print the five most frequent values of every non-numeric column.

    For each ``object`` or ``category`` column a two-column table is
    printed: the value (headed by the column's own name) and its frequency
    (headed ``Count``). Columns with fewer than five distinct values print
    all of them.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. Output goes to stdout.
    """
    columns = df.select_dtypes(include=['object', 'category']).columns
    for col in columns:
        # f-string so that non-string column labels do not break the header.
        print(f"Top 5 unique values of {col}")
        counts = df[col].value_counts()
        # Name both output columns explicitly rather than renaming whatever
        # reset_index() produces: its default labels differ between pandas
        # versions, which left the table mislabelled.
        top = counts.head(5).rename_axis(col).reset_index(name="Count")
        print(top)
        print(" ")
def categorical_eda(df, hue=None):
    """Report on the non-numeric columns of a DataFrame.

    Prints the number of distinct values in each ``object``/``category``
    column, delegates to ``top5`` for the most frequent values, and then
    draws a count plot for every ``category`` column.

    Args:
        df: pandas DataFrame to analyse.
        hue: optional column name passed to seaborn as ``hue`` for the
            count plots.

    Returns:
        None.
    """
    print("\nTo check: \nUnique count of non-numeric data\n")
    non_numeric = df.select_dtypes(include=['object', 'category'])
    print(non_numeric.nunique())
    top5(df)

    # Plot count distribution of categorical data
    category_names = df.select_dtypes(include='category').columns
    for name in category_names:
        grid = sns.catplot(data=df, x=name, hue=hue, kind="count")
        grid.set_xticklabels(rotation=90)
        plt.show()
def eda(df):
    """Run the full exploratory data analysis on a DataFrame.

    Steps, in order:
      1. Replace cells that are empty or whitespace-only with NaN.
      2. Show the first three rows and ``df.info()``.
      3. If any value is null, show up to three rows containing nulls and
         a ``missingno`` matrix.
      4. Report duplicated rows, if any.
      5. Run ``categorical_eda``, ``numeric_eda`` and ``time_series_plot``.

    Args:
        df: pandas DataFrame (or a subclass of it) to analyse. The caller's
            object is not modified; the blank-to-NaN replacement works on a
            new frame.

    Raises:
        TypeError: if ``df`` is not a pandas DataFrame.
    """
    # check that input is pandas dataframe (subclasses are accepted too)
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Only pandas dataframe is allowed as input")

    def show(frame):
        # `display` only exists inside IPython/Jupyter; fall back to print so
        # the function also works when run as a plain script.
        try:
            display(frame)
        except NameError:
            print(frame)

    # replace field that's entirely space (or empty) with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    print("Preview of data:")
    show(df.head(3))

    print("\nTo check: \n (1) Total number of entries \n (2) Column types \n (3) Any null values\n")
    # info() writes to stdout itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None").
    df.info()

    # generate preview of entries with null values
    rows_with_null = df.isnull().any(axis=1)
    if rows_with_null.any():
        print("\nPreview of data with null values:")
        show(df[rows_with_null].head(3))
        missingno.matrix(df)
        plt.show()

    # generate count statistics of duplicate entries
    duplicate_count = int(df.duplicated().sum())
    if duplicate_count > 0:
        print("\n***Number of duplicated entries: ", duplicate_count)
        # keep=False marks every member of a duplicate group; sorting puts
        # identical rows next to each other in the preview.
        show(df[df.duplicated(keep=False)].sort_values(by=list(df.columns)).head())
    else:
        print("\nNo duplicated entries found")

    # EDA of categorical data
    categorical_eda(df)

    # EDA of numeric data
    numeric_eda(df)

    # Plot time series plot of numeric data
    time_series_plot(df)
Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
and also the display method used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using python 3.8 and all the imported modules are pip installed.
Hi @Darveesh, I guess you are probably running the code as a script in a terminal. Please run the code in a notebook. Thank you.
Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
and also the display method used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using python 3.8 and all the imported modules are pip installed.
Hi @Darveesh, I guess you are probably running the code as a script in a terminal. Please run the code in a notebook. Thank you.
You are right. Thank you for the clarification.
Thank you for this posting. I am having difficulty running it as-is however. This line throws a syntax error:
%matplotlib inline ^ SyntaxError: invalid syntax
and also the display method used throughout is undefined and throws an error as well:
display(df.head(3)) NameError: name 'display' is not defined
I am using python 3.8 and all the imported modules are pip installed.