Skip to content

Instantly share code, notes, and snippets.

@adwiteeya3
Last active December 9, 2025 14:29
Show Gist options
  • Select an option

  • Save adwiteeya3/105218afda5cb6c01d469ba3abced3bb to your computer and use it in GitHub Desktop.

Select an option

Save adwiteeya3/105218afda5cb6c01d469ba3abced3bb to your computer and use it in GitHub Desktop.
DataScienceSeries : EDA
#####################################################
# Setting the Stage: Our Dataset
#####################################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Optional: apply the dark-grid style sheet to every matplotlib figure drawn below.
plt.style.use(['seaborn-v0_8-darkgrid'])
# Build a synthetic stand-in that mimics the structure of the Titanic training
# set, so the walkthrough runs without any external file.
# In a real scenario, you'd load 'train.csv' or similar instead.
N_PASSENGERS = 891
passenger_ids = range(1, N_PASSENGERS + 1)

# Sex is drawn first because the name title below is chosen to match it.
sex_values = np.random.choice(['male', 'female'], N_PASSENGERS)

# Names carry a "<Title>." token (as in "Surname, Mr. Given") so that the
# title-extraction step later in this script has something to find.
TITLE_CHOICES = {
    'male': (['Mr', 'Master', 'Dr', 'Rev'], [0.85, 0.08, 0.04, 0.03]),
    'female': (['Mrs', 'Miss', 'Ms', 'Mlle'], [0.45, 0.45, 0.06, 0.04]),
}
name_titles = [
    np.random.choice(TITLE_CHOICES[sex][0], p=TITLE_CHOICES[sex][1])
    for sex in sex_values
]

data = {
    'PassengerId': passenger_ids,
    'Survived': np.random.randint(0, 2, N_PASSENGERS),
    'Pclass': np.random.choice([1, 2, 3], N_PASSENGERS, p=[0.25, 0.25, 0.5]),
    'Name': [
        f'Passenger {i}, {title}. Synthetic'
        for i, title in zip(passenger_ids, name_titles)
    ],
    'Sex': sex_values,
    # Whole-year ages, but stored as float: the column must be able to hold
    # NaN once missing values are injected.
    'Age': np.random.normal(30, 15, N_PASSENGERS).clip(0, 80).astype(int).astype(float),
    'SibSp': np.random.randint(0, 4, N_PASSENGERS),
    'Parch': np.random.randint(0, 3, N_PASSENGERS),
    'Ticket': [f'T{i}' for i in passenger_ids],
    'Fare': np.random.lognormal(mean=3.5, sigma=0.8, size=N_PASSENGERS).round(2),
    # Roughly 60% of passengers get no cabin at all.
    'Cabin': [
        np.random.choice([f'C{i}', f'B{i}', None], p=[0.2, 0.2, 0.6])
        for i in range(N_PASSENGERS)
    ],
    'Embarked': np.random.choice(['S', 'C', 'Q', None], N_PASSENGERS, p=[0.7, 0.15, 0.1, 0.05]),
}
df = pd.DataFrame(data)
# Introduce some missing values for demonstration.
# Make sure 'Age' is a float column before writing NaN into it: an integer
# column cannot hold NaN, and relying on pandas to silently upcast it during
# assignment is deprecated in recent pandas releases.
df['Age'] = df['Age'].astype(float)

# Column -> fraction of rows to blank out.
missing_fractions = {
    'Age': 0.05,
    'Cabin': 0.1,
    'Embarked': 0.02,
    'Fare': 0.01,  # Just for variety
}
for column, fraction in missing_fractions.items():
    # Each column gets its own independent random sample of rows.
    df.loc[df.sample(frac=fraction).index, column] = np.nan
# Quick look at the data: the first rows, then column dtypes and non-null counts.
print("First 5 rows of the dataset:", df.head(), sep="\n")
print("\nDataset Info:")
df.info()  # info() writes its summary to stdout itself
#####################################################
# 1. Robust Missing Value Analysis
#####################################################
# Detailed Missing Values Report
def missing_data_report(df):
    """Summarise the columns of *df* that contain missing values.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with two columns:
        'Total Missing' (number of null cells) and 'Percent Missing (%)'
        (null cells as a percentage of the row count). Only columns with at
        least one missing value are included, ordered from most to least
        missing; columns with equal counts keep their order in *df*.
    """
    # Build the null mask once instead of recomputing it for every statistic.
    null_mask = df.isnull()
    # A stable sort makes the order of tied columns deterministic.
    total = null_mask.sum().sort_values(ascending=False, kind='stable')
    # Deriving the percentage from the already-sorted counts guarantees both
    # columns share one row order (no second, independent sort).
    percent = total / len(df) * 100
    missing_df = pd.concat([total, percent], axis=1, keys=['Total Missing', 'Percent Missing (%)'])
    return missing_df[missing_df['Total Missing'] > 0]
print("\nMissing Data Report:", missing_data_report(df), sep="\n")

# Visualizing Missing Data Patterns (Seaborn):
# one row per passenger, one column per feature, highlighted where the cell is null.
_, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cmap='viridis', cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()
# You can also use missingno library for more advanced missing data visualizations
# !pip install missingno
# import missingno as msno
# msno.matrix(df, figsize=(10, 6), color=(0.2, 0.2, 0.2))
# plt.title('Missing Values Matrix (missingno)')
# plt.show()
# msno.bar(df, figsize=(10, 6), color='skyblue')
# plt.title('Missing Values Bar Plot (missingno)')
# plt.show()
#####################################################
# 2. Deeper Univariate Analysis with Distributions
## A. Kernel Density Estimates (KDEs) for Smooth Distributions
#####################################################
# Age histogram with an explicit KDE overlay.
# The histogram is drawn with stat='density' so that it shares a y-scale with
# the KDE curve; on the default count scale the density curve would be
# squashed against the x-axis. histplot's own kde=True is left off because the
# styled kdeplot below already draws the curve.
age_values = df['Age'].dropna()
_, ax = plt.subplots(figsize=(12, 6))
sns.histplot(age_values, stat='density', bins=30, color='skyblue', label='Age Distribution', ax=ax)
sns.kdeplot(age_values, color='red', linestyle='--', linewidth=2, label='Age KDE', ax=ax)
ax.set_title('Age Distribution with KDE')
ax.set_xlabel('Age')
ax.set_ylabel('Density')
ax.legend()
plt.show()
# Fare histogram with KDE on a logarithmic x-axis
# (a log scale often helps for skewed data like Fare).
fare_values = df['Fare'].dropna()
_, ax = plt.subplots(figsize=(12, 6))
sns.histplot(fare_values, bins=50, kde=True, log_scale=True, color='lightgreen', ax=ax)
ax.set(
    title='Fare Distribution (Log Scale) with KDE',
    xlabel='Fare (Log Scale)',
    ylabel='Density / Count',
)
plt.show()
#####################################################
# 3. Advanced Bivariate and Multivariate Analysis
## A. Categorical vs. Numerical: Box Plots and Violin Plots (Seaborn)
#####################################################
# Box plot: spread of Age within each passenger class.
_, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Pclass', y='Age', ax=ax)
ax.set(
    title='Age Distribution by Passenger Class',
    xlabel='Passenger Class',
    ylabel='Age',
)
plt.show()
# Split violin plot: for each sex, the two halves compare the Age distribution
# of the two 'Survived' values; inner lines mark the quartiles.
_, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    data=df,
    x='Sex',
    y='Age',
    hue='Survived',
    split=True,
    inner='quartile',
    ax=ax,
)
ax.set(
    title='Age Distribution by Sex and Survival (Violin Plot)',
    xlabel='Sex',
    ylabel='Age',
)
plt.show()
#####################################################
# 3. Advanced Bivariate and Multivariate Analysis
## B. Categorical vs. Categorical: Grouped Bar Charts (Pandas & Matplotlib)
#####################################################
# Survival rate per (Pclass, Sex) cell: the mean of the 'Survived' column.
survival_pivot = df.pivot_table(
    values='Survived',
    index='Pclass',
    columns='Sex',
    aggfunc='mean',
)
print("\nSurvival Rate by Pclass and Sex:", survival_pivot, sep="\n")

# One cluster of bars per passenger class, one bar per sex
# (the bars sit side by side; they are not stacked).
ax = survival_pivot.plot.bar(figsize=(10, 6), rot=0)
ax.set_title('Survival Rate by Passenger Class and Sex')
ax.set_ylabel('Survival Rate')
ax.set_xlabel('Passenger Class')
ax.legend(title='Sex')
plt.show()
#####################################################
# 3. Advanced Bivariate and Multivariate Analysis
## C. Numerical vs. Numerical: Enhanced Scatter Plots (Seaborn & Plotly)
#####################################################
# Seaborn scatter plot encoding four variables at once:
# position (Age, Fare), colour (Survived) and marker size (Pclass).
complete_rows = df.dropna(subset=['Age', 'Fare'])
_, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=complete_rows,
    x='Age',
    y='Fare',
    hue='Survived',
    size='Pclass',
    sizes=(20, 400),
    ax=ax,
)
ax.set(
    title='Age vs. Fare with Survival and Pclass',
    xlabel='Age',
    ylabel='Fare',
)
plt.show()
# Interactive Plotly Express version of the Age-vs-Fare scatter:
# one facet per Sex, hovering a marker shows the passenger's Name.
scatter_rows = df.dropna(subset=['Age', 'Fare', 'Survived'])
fig = px.scatter(
    scatter_rows,
    x='Age',
    y='Fare',
    color='Survived',
    color_continuous_scale=px.colors.sequential.Viridis,
    size='Fare',  # Fare drives the marker size as well as the y position
    facet_col='Sex',
    hover_name='Name',
    labels={'Survived': 'Survived (0=No, 1=Yes)'},
    title='Age vs. Fare by Sex and Survival (Interactive)',
)
fig.show()
#####################################################
# 4. Correlation Analysis with a Twist
#####################################################
# Pairwise correlations between the numerical columns only.
numeric_columns = df.select_dtypes(include=np.number)
correlation_matrix = numeric_columns.corr()

# Figure 1: annotated heatmap of the full matrix.
_, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

# Figure 2: the 'Survived' column of that matrix (minus its self-correlation),
# ordered from the most positive to the most negative coefficient.
survival_correlations = (
    correlation_matrix['Survived']
    .drop('Survived')
    .sort_values(ascending=False)
)
_, ax = plt.subplots(figsize=(8, 6))
survival_correlations.plot(kind='bar', color='purple', ax=ax)
ax.set_title('Correlation with Survival')
ax.set_ylabel('Correlation Coefficient')
plt.show()
#####################################################
# 5. Feature Engineering During EDA: Extracting More Value
## A. Creating Family Size
#####################################################
# Family size = SibSp + Parch, plus one for the passenger themselves.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# Bar height is the mean of 'Survived' for each family size.
_, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='FamilySize', y='Survived', ax=ax)
ax.set(
    title='Survival Rate by Family Size',
    xlabel='Family Size',
    ylabel='Survival Rate',
)
plt.show()
# Bucket FamilySize into three labelled groups.
# Intervals are right-closed: (0, 1] -> Solo, (1, 4] -> Small, (4, 11] -> Large.
family_bin_edges = [0, 1, 4, 11]
family_bin_labels = ['Solo', 'Small', 'Large']
df['FamilyGroup'] = pd.cut(
    df['FamilySize'],
    bins=family_bin_edges,
    labels=family_bin_labels,
    right=True,
)

_, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df, x='FamilyGroup', y='Survived', ax=ax)
ax.set(
    title='Survival Rate by Family Group',
    xlabel='Family Group',
    ylabel='Survival Rate',
)
plt.show()
#####################################################
# 5. Feature Engineering During EDA: Extracting More Value
## B. Extracting Title from Name
#####################################################
# Pull the honorific out of each name: the first run of letters that is
# preceded by a space and followed by a period (e.g. " Mr.").
# The pattern is a raw string so the "\." escape reaches the regex engine
# as written instead of being treated as an (invalid) string escape.
TITLE_PATTERN = r' ([A-Za-z]+)\.'
df['Title'] = df['Name'].str.extract(TITLE_PATTERN, expand=False)

# Group less frequent titles: uncommon honorifics collapse into 'Rare',
# and alternative spellings are folded into their common equivalent.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_groups = {
    **dict.fromkeys(rare_titles, 'Rare'),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}
# Names without a recognisable title yield NaN from extract(); give them an
# explicit label so those passengers still appear in the chart below.
df['Title'] = df['Title'].replace(title_groups).fillna('Unknown')

_, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='Title', y='Survived', ax=ax)
ax.set_title('Survival Rate by Title')
ax.set_xlabel('Title')
ax.set_ylabel('Survival Rate')
plt.show()
#####################################################
# 6. Interactive Dashboards for Comprehensive Exploration (Plotly)
#####################################################
# Interactive 2x2 dashboard: four summary charts combined in one Plotly figure.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        'Age Distribution by Pclass',
        'Fare Distribution by Embarked',
        'Survival by Sex and Pclass',
        'Family Size Survival',
    ),
)

# Plot 1: Age Distribution by Pclass (Violin Plot) - one violin per class.
for pclass in sorted(df['Pclass'].unique()):
    # Filter once and reuse the subset for both axes.
    class_rows = df[df['Pclass'] == pclass]
    fig.add_trace(
        go.Violin(
            x=class_rows['Pclass'],
            y=class_rows['Age'],
            name=f'Pclass {pclass}',
            box_visible=True,
            meanline_visible=True,
        ),
        row=1, col=1,
    )

# Plot 2: Fare Distribution by Embarked (Box Plot) - rows with a missing port are skipped.
for embarked_port in sorted(df['Embarked'].dropna().unique()):
    fig.add_trace(
        go.Box(
            y=df.loc[df['Embarked'] == embarked_port, 'Fare'],
            name=f'Embarked {embarked_port}',
        ),
        row=1, col=2,
    )

# Plot 3: Survival by Sex and Pclass (Grouped Bar Chart) - one bar trace per sex.
survival_by_sex_pclass = df.groupby(['Sex', 'Pclass'])['Survived'].mean().reset_index()
sex_traces = (
    ('female', 'Female Survival', 'pink'),
    ('male', 'Male Survival', 'blue'),
)
for sex, trace_name, bar_color in sex_traces:
    sex_rows = survival_by_sex_pclass[survival_by_sex_pclass['Sex'] == sex]
    fig.add_trace(
        go.Bar(
            x=sex_rows['Pclass'],
            y=sex_rows['Survived'],
            name=trace_name,
            marker_color=bar_color,
        ),
        row=2, col=1,
    )

# Plot 4: Family Size Survival.
# 'FamilyGroup' is a categorical column (built with pd.cut earlier in this
# script). Passing `observed` explicitly keeps every category on the axis,
# even an empty one, and avoids the warning pandas emits when the argument
# is omitted for categorical groupers.
family_survival = (
    df.groupby('FamilyGroup', observed=False)['Survived'].mean().reset_index()
)
fig.add_trace(
    go.Bar(
        x=family_survival['FamilyGroup'],
        y=family_survival['Survived'],
        name='Family Group Survival',
        marker_color='lightgreen',
    ),
    row=2, col=2,
)

fig.update_layout(height=800, showlegend=True, title_text="Advanced EDA Dashboard for Titanic Dataset")
fig.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment