Last active
December 9, 2025 14:29
-
-
Save adwiteeya3/105218afda5cb6c01d469ba3abced3bb to your computer and use it in GitHub Desktop.
DataScienceSeries : EDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ##################################################### | |
| # Setting the Stage: Our Dataset | |
| ##################################################### | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
# Optional: set a style for matplotlib plots.
# The seaborn-derived style sheets carry a 'seaborn-v0_8-' prefix on recent
# matplotlib releases; fall back to the legacy name when the prefixed one is
# not installed instead of failing at start-up.
_plot_style = 'seaborn-v0_8-darkgrid'
if _plot_style not in plt.style.available:
    _plot_style = 'seaborn-darkgrid'
plt.style.use(_plot_style)
# Build a synthetic dataset that mimics the structure of the Titanic data.
# In a real scenario, you'd load 'train.csv' or similar with pd.read_csv.
N_PASSENGERS = 891
passenger_numbers = range(1, N_PASSENGERS + 1)

# Sex is drawn first so every name can carry a matching honorific
# ("Surname, Title. Given names"), which the title-extraction step parses.
sexes = np.random.choice(['male', 'female'], N_PASSENGERS)
titles = np.where(
    sexes == 'male',
    'Mr',
    np.random.choice(['Mrs', 'Miss'], N_PASSENGERS),
)

data = {
    'PassengerId': passenger_numbers,
    'Survived': np.random.randint(0, 2, N_PASSENGERS),
    'Pclass': np.random.choice([1, 2, 3], N_PASSENGERS, p=[0.25, 0.25, 0.5]),
    'Name': [
        f'Surname{i}, {title}. Passenger {i}'
        for i, title in zip(passenger_numbers, titles)
    ],
    'Sex': sexes,
    # Whole-number ages kept as floats: the column receives NaN below, and
    # assigning NaN into an integer column is a deprecated implicit upcast.
    'Age': np.trunc(np.random.normal(30, 15, N_PASSENGERS).clip(0, 80)),
    'SibSp': np.random.randint(0, 4, N_PASSENGERS),
    'Parch': np.random.randint(0, 3, N_PASSENGERS),
    'Ticket': [f'T{i}' for i in passenger_numbers],
    'Fare': np.random.lognormal(mean=3.5, sigma=0.8, size=N_PASSENGERS).round(2),
    'Cabin': [
        np.random.choice([f'C{i}', f'B{i}', None], p=[0.2, 0.2, 0.6])
        for i in range(N_PASSENGERS)
    ],
    'Embarked': np.random.choice(
        ['S', 'C', 'Q', None], N_PASSENGERS, p=[0.7, 0.15, 0.1, 0.05]
    ),
}
df = pd.DataFrame(data)

# Introduce some missing values for demonstration: blank out a random
# fraction of rows in each of these columns.
for column, fraction in (
    ('Age', 0.05),
    ('Cabin', 0.1),
    ('Embarked', 0.02),
    ('Fare', 0.01),  # Just for variety
):
    df.loc[df.sample(frac=fraction).index, column] = np.nan

print("First 5 rows of the dataset:")
print(df.head())
print("\nDataset Info:")
df.info()
| ##################################################### | |
| # 1. Robust Missing Value Analysis | |
| ##################################################### | |
| # Detailed Missing Values Report | |
def missing_data_report(df):
    """Summarise the missing values of *df* per column.

    Returns a DataFrame indexed by column name with two columns,
    'Total Missing' (count of nulls) and 'Percent Missing (%)' (share of
    rows that are null), ordered from most to least missing. Columns
    without any missing value are left out.
    """
    null_mask = df.isnull()
    missing_counts = null_mask.sum()
    missing_share = missing_counts / null_mask.count() * 100

    report = pd.concat(
        [
            missing_counts.sort_values(ascending=False),
            missing_share.sort_values(ascending=False),
        ],
        axis=1,
        keys=['Total Missing', 'Percent Missing (%)'],
    )
    has_missing = report['Total Missing'] > 0
    return report[has_missing]
print("\nMissing Data Report:")
report = missing_data_report(df)
print(report)

# Heatmap of the null mask (Seaborn): each highlighted cell is a missing
# entry, so patterns across rows and columns become visible at a glance.
_, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title('Missing Values Heatmap')
plt.show()

# The missingno library offers more advanced missing-data visualizations:
# !pip install missingno
# import missingno as msno
# msno.matrix(df, figsize=(10, 6), color=(0.2, 0.2, 0.2))
# plt.title('Missing Values Matrix (missingno)')
# plt.show()
# msno.bar(df, figsize=(10, 6), color='skyblue')
# plt.title('Missing Values Bar Plot (missingno)')
# plt.show()
| ##################################################### | |
| # 2. Deeper Univariate Analysis with Distributions | |
| ## A. Kernel Density Estimates (KDEs) for Smooth Distributions | |
| ##################################################### | |
# Age: histogram with an explicit KDE overlay.
# Both layers are drawn on the density scale. A count-scaled histogram would
# dwarf the separately drawn KDE curve (whose area is 1), leaving it flat
# against the x-axis.
age_values = df['Age'].dropna()
plt.figure(figsize=(12, 6))
sns.histplot(age_values, stat='density', bins=30, color='skyblue', label='Age Distribution')
sns.kdeplot(age_values, color='red', linestyle='--', linewidth=2, label='Age KDE')
plt.title('Age Distribution with KDE')
plt.xlabel('Age')
plt.ylabel('Density')
plt.legend()
plt.show()

# Fare: a log-scaled x-axis often helps for skewed data like Fare.
# The histogram keeps its default count statistic, and the built-in KDE is
# scaled to match it, so the y-axis is labelled as a count.
fare_values = df['Fare'].dropna()
plt.figure(figsize=(12, 6))
sns.histplot(fare_values, kde=True, bins=50, color='lightgreen', log_scale=True)
plt.title('Fare Distribution (Log Scale) with KDE')
plt.xlabel('Fare (Log Scale)')
plt.ylabel('Count')
plt.show()
| ##################################################### | |
| # 3. Advanced Bivariate and Multivariate Analysis | |
| ## A. Categorical vs. Numerical: Box Plots and Violin Plots (Seaborn) | |
| ##################################################### | |
# Box plot: spread of Age within each passenger class.
_, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Pclass', y='Age', ax=box_ax)
box_ax.set_title('Age Distribution by Passenger Class')
box_ax.set_xlabel('Passenger Class')
box_ax.set_ylabel('Age')
plt.show()

# Split violin plot: for each sex, the two halves compare the Age
# distribution of the two Survived values; inner lines mark the quartiles.
_, violin_ax = plt.subplots(figsize=(10, 6))
sns.violinplot(
    data=df,
    x='Sex',
    y='Age',
    hue='Survived',
    split=True,
    inner='quartile',
    ax=violin_ax,
)
violin_ax.set_title('Age Distribution by Sex and Survival (Violin Plot)')
violin_ax.set_xlabel('Sex')
violin_ax.set_ylabel('Age')
plt.show()
| ##################################################### | |
| # 3. Advanced Bivariate and Multivariate Analysis | |
| ## B. Categorical vs. Categorical: Grouped Bar Charts (Pandas & Matplotlib) | |
| ##################################################### | |
# Mean of Survived (i.e. the survival rate) for every Pclass / Sex pair:
# one row per passenger class, one column per sex.
survival_pivot = df.pivot_table(
    values='Survived',
    index='Pclass',
    columns='Sex',
    aggfunc='mean',
)
print("\nSurvival Rate by Pclass and Sex:")
print(survival_pivot)

# Draw the rates as grouped bars: one group per class, one bar per sex.
pivot_ax = survival_pivot.plot(kind='bar', figsize=(10, 6), rot=0)
pivot_ax.set_title('Survival Rate by Passenger Class and Sex')
pivot_ax.set_ylabel('Survival Rate')
pivot_ax.set_xlabel('Passenger Class')
pivot_ax.legend(title='Sex')
plt.show()
| ##################################################### | |
| # 3. Advanced Bivariate and Multivariate Analysis | |
| ## C. Numerical vs. Numerical: Enhanced Scatter Plots (Seaborn & Plotly) | |
| ##################################################### | |
# Static Seaborn scatter plot encoding four variables at once:
# position (Age, Fare), colour (Survived) and marker size (Pclass).
age_fare_rows = df.dropna(subset=['Age', 'Fare'])
_, scatter_ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    data=age_fare_rows,
    x='Age',
    y='Fare',
    hue='Survived',
    size='Pclass',
    sizes=(20, 400),
    ax=scatter_ax,
)
scatter_ax.set_title('Age vs. Fare with Survival and Pclass')
scatter_ax.set_xlabel('Age')
scatter_ax.set_ylabel('Fare')
plt.show()

# Interactive Plotly Express version: one facet per sex, marker size driven
# by Fare, passenger name shown on hover.
interactive_rows = df.dropna(subset=['Age', 'Fare', 'Survived'])
fig = px.scatter(
    interactive_rows,
    x='Age',
    y='Fare',
    color='Survived',
    size='Fare',
    hover_name='Name',
    facet_col='Sex',
    title='Age vs. Fare by Sex and Survival (Interactive)',
    labels={'Survived': 'Survived (0=No, 1=Yes)'},
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()
| ##################################################### | |
| # 4. Correlation Analysis with a Twist | |
| ##################################################### | |
# Pairwise correlations between the numerical columns only.
numeric_features = df.select_dtypes(include=np.number)
correlation_matrix = numeric_features.corr()

# Annotated heatmap of the full matrix.
_, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

# Bar plot of each feature's correlation with 'Survived', strongest positive
# first; the trivial self-correlation entry is removed.
survival_correlations = (
    correlation_matrix['Survived']
    .drop('Survived')
    .sort_values(ascending=False)
)
_, survival_ax = plt.subplots(figsize=(8, 6))
survival_correlations.plot(kind='bar', color='purple', ax=survival_ax)
survival_ax.set_title('Correlation with Survival')
survival_ax.set_ylabel('Correlation Coefficient')
plt.show()
| ##################################################### | |
| # 5. Feature Engineering During EDA: Extracting More Value | |
| ## A. Creating Family Size | |
| ##################################################### | |
# Family size = the passenger themselves (1) plus siblings/spouses and
# parents/children travelling with them.
df['FamilySize'] = 1 + df['SibSp'] + df['Parch']

_, size_ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df, x='FamilySize', y='Survived', ax=size_ax)
size_ax.set_title('Survival Rate by Family Size')
size_ax.set_xlabel('Family Size')
size_ax.set_ylabel('Survival Rate')
plt.show()

# Bucket the sizes into right-closed intervals:
# (0, 1] -> Solo, (1, 4] -> Small, (4, 11] -> Large.
family_bin_edges = [0, 1, 4, 11]
family_bin_labels = ['Solo', 'Small', 'Large']
df['FamilyGroup'] = pd.cut(
    df['FamilySize'],
    bins=family_bin_edges,
    labels=family_bin_labels,
    right=True,
)

_, group_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df, x='FamilyGroup', y='Survived', ax=group_ax)
group_ax.set_title('Survival Rate by Family Group')
group_ax.set_xlabel('Family Group')
group_ax.set_ylabel('Survival Rate')
plt.show()
| ##################################################### | |
| # 5. Feature Engineering During EDA: Extracting More Value | |
| ## B. Extracting Title from Name | |
| ##################################################### | |
# Extract the honorific: the first run of letters that follows a space and is
# terminated by a period (e.g. " Mr."). The pattern is a raw string so that
# '\.' reaches the regex engine as an escaped dot rather than being an invalid
# Python string escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Fold less frequent titles into 'Rare' and normalise French/alternate forms.
title_groups = {
    **dict.fromkeys(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
         'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    ),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}
# Names that contain no title extract as NaN; give them an explicit category
# so they still appear in the plot instead of being dropped.
df['Title'] = df['Title'].replace(title_groups).fillna('Unknown')

plt.figure(figsize=(10, 6))
sns.barplot(x='Title', y='Survived', data=df)
plt.title('Survival Rate by Title')
plt.xlabel('Title')
plt.ylabel('Survival Rate')
plt.show()
| ##################################################### | |
| # 6. Interactive Dashboards for Comprehensive Exploration (Plotly) | |
| ##################################################### | |
# Assemble a 2x2 Plotly dashboard: one figure, four coordinated views.
fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=(
        'Age Distribution by Pclass',
        'Fare Distribution by Embarked',
        'Survival by Sex and Pclass',
        'Family Size Survival',
    ),
)

# Plot 1: Age Distribution by Pclass (Violin Plot), one trace per class.
# groupby yields the classes in sorted order.
for pclass, class_rows in df.groupby('Pclass'):
    fig.add_trace(
        go.Violin(
            x=class_rows['Pclass'],
            y=class_rows['Age'],
            name=f'Pclass {pclass}',
            box_visible=True,
            meanline_visible=True,
        ),
        row=1,
        col=1,
    )

# Plot 2: Fare Distribution by Embarked (Box Plot), one trace per port.
# groupby skips rows whose port is missing.
for embarked_port, port_rows in df.groupby('Embarked'):
    fig.add_trace(
        go.Box(y=port_rows['Fare'], name=f'Embarked {embarked_port}'),
        row=1,
        col=2,
    )

# Plot 3: Survival by Sex and Pclass (Grouped Bar Chart), one trace per sex.
survival_by_sex_pclass = df.groupby(['Sex', 'Pclass'])['Survived'].mean().reset_index()
for sex, trace_name, bar_color in (
    ('female', 'Female Survival', 'pink'),
    ('male', 'Male Survival', 'blue'),
):
    sex_rows = survival_by_sex_pclass[survival_by_sex_pclass['Sex'] == sex]
    fig.add_trace(
        go.Bar(
            x=sex_rows['Pclass'],
            y=sex_rows['Survived'],
            name=trace_name,
            marker_color=bar_color,
        ),
        row=2,
        col=1,
    )

# Plot 4: Family Size Survival.
# FamilyGroup is categorical; observed=False is passed explicitly so every
# category keeps its bar slot and pandas does not warn about the changing
# default for categorical groupers.
family_survival = (
    df.groupby('FamilyGroup', observed=False)['Survived'].mean().reset_index()
)
fig.add_trace(
    go.Bar(
        x=family_survival['FamilyGroup'],
        y=family_survival['Survived'],
        name='Family Group Survival',
        marker_color='lightgreen',
    ),
    row=2,
    col=2,
)

fig.update_layout(
    height=800,
    showlegend=True,
    title_text="Advanced EDA Dashboard for Titanic Dataset",
)
fig.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment