Created
November 8, 2021 09:17
-
-
Save Imperial-lord/757b1d870f00253afed056d7324cb1d9 to your computer and use it in GitHub Desktop.
Titanic Survival Predictions.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Titanic Survival Predictions.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"mount_file_id": "1Z6ahIXr78kIpfQKPcPAnYYxCo1FsVC2c", | |
"authorship_tag": "ABX9TyPEajGvsiUrPLJ9aB3C71+M", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Imperial-lord/757b1d870f00253afed056d7324cb1d9/titanic-survival-predictions.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "X6qIb0mUqGy1" | |
}, | |
"source": [ | |
"# 1. Import Necessary Libraries\n", | |
"First off, we need to import several Python libraries such as numpy, pandas, matplotlib and seaborn." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ck1RRP6ipzyO" | |
}, | |
"source": [ | |
"#data analysis libraries \n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"#visualization libraries\n", | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"%matplotlib inline\n", | |
"\n", | |
"#ignore warnings\n", | |
"import warnings\n", | |
"warnings.filterwarnings('ignore')" | |
], | |
"execution_count": 223, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "IeaiCBdpqSSf" | |
}, | |
"source": [ | |
"# 2. Read in and Explore the Data\n", | |
"It's time to read in our training and testing data using pd.read_csv, and take a first look at the training data using the describe() function." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 383 | |
}, | |
"id": "KdH6bQE9qbB0", | |
"outputId": "767962bf-2dc7-47dc-ca6b-528fbcb42fe3" | |
}, | |
"source": [ | |
"#import train and test CSV files\n", | |
"train = pd.read_csv(\"/content/drive/MyDrive/Titanic Survival Prediction/input/train.csv\")\n", | |
"test = pd.read_csv(\"/content/drive/MyDrive/Titanic Survival Prediction/input/test.csv\")\n", | |
"\n", | |
"#take a look at the training data\n", | |
"train.describe(include=\"all\")" | |
], | |
"execution_count": 224, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891</td>\n", | |
" <td>714.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>204</td>\n", | |
" <td>889</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>891</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>681</td>\n", | |
" <td>NaN</td>\n", | |
" <td>147</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Woolner, Mr. Hugh</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>347082</td>\n", | |
" <td>NaN</td>\n", | |
" <td>C23 C25 C27</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>577</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7</td>\n", | |
" <td>NaN</td>\n", | |
" <td>4</td>\n", | |
" <td>644</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.383838</td>\n", | |
" <td>2.308642</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>29.699118</td>\n", | |
" <td>0.523008</td>\n", | |
" <td>0.381594</td>\n", | |
" <td>NaN</td>\n", | |
" <td>32.204208</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>257.353842</td>\n", | |
" <td>0.486592</td>\n", | |
" <td>0.836071</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.526497</td>\n", | |
" <td>1.102743</td>\n", | |
" <td>0.806057</td>\n", | |
" <td>NaN</td>\n", | |
" <td>49.693429</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.420000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>223.500000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>2.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20.125000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7.910400</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>28.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.454200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>668.500000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>38.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>31.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>80.000000</td>\n", | |
" <td>8.000000</td>\n", | |
" <td>6.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>512.329200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... Fare Cabin Embarked\n", | |
"count 891.000000 891.000000 891.000000 ... 891.000000 204 889\n", | |
"unique NaN NaN NaN ... NaN 147 3\n", | |
"top NaN NaN NaN ... NaN C23 C25 C27 S\n", | |
"freq NaN NaN NaN ... NaN 4 644\n", | |
"mean 446.000000 0.383838 2.308642 ... 32.204208 NaN NaN\n", | |
"std 257.353842 0.486592 0.836071 ... 49.693429 NaN NaN\n", | |
"min 1.000000 0.000000 1.000000 ... 0.000000 NaN NaN\n", | |
"25% 223.500000 0.000000 2.000000 ... 7.910400 NaN NaN\n", | |
"50% 446.000000 0.000000 3.000000 ... 14.454200 NaN NaN\n", | |
"75% 668.500000 1.000000 3.000000 ... 31.000000 NaN NaN\n", | |
"max 891.000000 1.000000 3.000000 ... 512.329200 NaN NaN\n", | |
"\n", | |
"[11 rows x 12 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 224 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "oqnvKmqHru0o" | |
}, | |
"source": [ | |
"# 3. Data Analysis\n", | |
"We're going to consider the features in the dataset and how complete they are." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "0TKeJh54ryD3", | |
"outputId": "e4db021f-c781-46f8-a861-c816756608b9" | |
}, | |
"source": [ | |
"#get a list of the features within the dataset\n", | |
"print(train.columns)" | |
], | |
"execution_count": 225, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n", | |
" 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n", | |
" dtype='object')\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "XE0hvrRtr2Tk", | |
"outputId": "d3054289-dda4-4c2f-89a6-f8baa6f58777" | |
}, | |
"source": [ | |
"#see a sample of the dataset to get an idea of the variables\n", | |
"train.sample(5)" | |
], | |
"execution_count": 226, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>328</th>\n", | |
" <td>329</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Goldsmith, Mrs. Frank John (Emily Alice Brown)</td>\n", | |
" <td>female</td>\n", | |
" <td>31.0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>363291</td>\n", | |
" <td>20.525</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>271</th>\n", | |
" <td>272</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Tornquist, Mr. William Henry</td>\n", | |
" <td>male</td>\n", | |
" <td>25.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>LINE</td>\n", | |
" <td>0.000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>619</th>\n", | |
" <td>620</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>Gavey, Mr. Lawrence</td>\n", | |
" <td>male</td>\n", | |
" <td>26.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>31028</td>\n", | |
" <td>10.500</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>432</th>\n", | |
" <td>433</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Louch, Mrs. Charles Alexander (Alice Adelaide ...</td>\n", | |
" <td>female</td>\n", | |
" <td>42.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>SC/AH 3085</td>\n", | |
" <td>26.000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>237</th>\n", | |
" <td>238</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Collyer, Miss. Marjorie \"Lottie\"</td>\n", | |
" <td>female</td>\n", | |
" <td>8.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>C.A. 31921</td>\n", | |
" <td>26.250</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... Fare Cabin Embarked\n", | |
"328 329 1 3 ... 20.525 NaN S\n", | |
"271 272 1 3 ... 0.000 NaN S\n", | |
"619 620 0 2 ... 10.500 NaN S\n", | |
"432 433 1 2 ... 26.000 NaN S\n", | |
"237 238 1 2 ... 26.250 NaN S\n", | |
"\n", | |
"[5 rows x 12 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 226 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "MIDW8wZLsCEU" | |
}, | |
"source": [ | |
"* **Numerical Features**: Age (Continuous), Fare (Continuous), SibSp (Discrete), Parch (Discrete)\n", | |
"* **Categorical Features**: Survived, Sex, Embarked, Pclass\n", | |
"* **Alphanumeric Features**: Ticket, Cabin\n", | |
"\n", | |
"What are the data types for each feature?\n", | |
"* Survived: int\n", | |
"* Pclass: int\n", | |
"* Name: string\n", | |
"* Sex: string\n", | |
"* Age: float\n", | |
"* SibSp: int\n", | |
"* Parch: int\n", | |
"* Ticket: string\n", | |
"* Fare: float\n", | |
"* Cabin: string\n", | |
"* Embarked: string\n", | |
"\n", | |
"Now that we have an idea of what kinds of features we're working with, we can see how much information we have about each of them." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 383 | |
}, | |
"id": "s-h6_I9ar64G", | |
"outputId": "fc2e2b66-9ff1-4a7f-da8d-aad395afeb3b" | |
}, | |
"source": [ | |
"#see a summary of the training dataset\n", | |
"train.describe(include = \"all\")" | |
], | |
"execution_count": 227, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891</td>\n", | |
" <td>714.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>204</td>\n", | |
" <td>889</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>891</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>681</td>\n", | |
" <td>NaN</td>\n", | |
" <td>147</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Woolner, Mr. Hugh</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>347082</td>\n", | |
" <td>NaN</td>\n", | |
" <td>C23 C25 C27</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>577</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7</td>\n", | |
" <td>NaN</td>\n", | |
" <td>4</td>\n", | |
" <td>644</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.383838</td>\n", | |
" <td>2.308642</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>29.699118</td>\n", | |
" <td>0.523008</td>\n", | |
" <td>0.381594</td>\n", | |
" <td>NaN</td>\n", | |
" <td>32.204208</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>257.353842</td>\n", | |
" <td>0.486592</td>\n", | |
" <td>0.836071</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.526497</td>\n", | |
" <td>1.102743</td>\n", | |
" <td>0.806057</td>\n", | |
" <td>NaN</td>\n", | |
" <td>49.693429</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.420000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>223.500000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>2.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20.125000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7.910400</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>28.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.454200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>668.500000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>38.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>31.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>80.000000</td>\n", | |
" <td>8.000000</td>\n", | |
" <td>6.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>512.329200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... Fare Cabin Embarked\n", | |
"count 891.000000 891.000000 891.000000 ... 891.000000 204 889\n", | |
"unique NaN NaN NaN ... NaN 147 3\n", | |
"top NaN NaN NaN ... NaN C23 C25 C27 S\n", | |
"freq NaN NaN NaN ... NaN 4 644\n", | |
"mean 446.000000 0.383838 2.308642 ... 32.204208 NaN NaN\n", | |
"std 257.353842 0.486592 0.836071 ... 49.693429 NaN NaN\n", | |
"min 1.000000 0.000000 1.000000 ... 0.000000 NaN NaN\n", | |
"25% 223.500000 0.000000 2.000000 ... 7.910400 NaN NaN\n", | |
"50% 446.000000 0.000000 3.000000 ... 14.454200 NaN NaN\n", | |
"75% 668.500000 1.000000 3.000000 ... 31.000000 NaN NaN\n", | |
"max 891.000000 1.000000 3.000000 ... 512.329200 NaN NaN\n", | |
"\n", | |
"[11 rows x 12 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 227 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EAJCIZJgsl7t" | |
}, | |
"source": [ | |
"#### Some Observations:\n", | |
"* There are a total of 891 passengers in our training set.\n", | |
"* The Age feature is missing approximately 19.9% of its values (177 of 891). I'm guessing that the Age feature is pretty important to survival, so we should probably attempt to fill these gaps. \n", | |
"* The Cabin feature is missing approximately 77.1% of its values. Since so much of the feature is missing, it would be hard to fill in the missing values. We'll probably drop these values from our dataset.\n", | |
"* The Embarked feature is missing 0.22% of its values, which should be relatively harmless." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "QxQqypx8snV_", | |
"outputId": "c57a1604-5ddf-4ac8-dd17-1c9388b8890f" | |
}, | |
"source": [ | |
"#check for any other unusable values\n", | |
"print(pd.isnull(train).sum())" | |
], | |
"execution_count": 228, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"PassengerId 0\n", | |
"Survived 0\n", | |
"Pclass 0\n", | |
"Name 0\n", | |
"Sex 0\n", | |
"Age 177\n", | |
"SibSp 0\n", | |
"Parch 0\n", | |
"Ticket 0\n", | |
"Fare 0\n", | |
"Cabin 687\n", | |
"Embarked 2\n", | |
"dtype: int64\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "rfPfPRtvsuF-" | |
}, | |
"source": [ | |
"We can see that, apart from the missing values noted above in Age, Cabin, and Embarked, no other NaN values exist." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "6gYiprfpsyOg" | |
}, | |
"source": [ | |
"### Some Predictions:\n", | |
"* Sex: Females are more likely to survive.\n", | |
"* SibSp/Parch: People traveling alone are more likely to survive.\n", | |
"* Age: Young children are more likely to survive.\n", | |
"* Pclass: People of higher socioeconomic class are more likely to survive." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_xSMhDoUs5CN" | |
}, | |
"source": [ | |
"# 4. Data Visualization\n", | |
"It's time to visualize our data so we can see whether our predictions were accurate! " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 314 | |
}, | |
"id": "vdBdTVsbsvOM", | |
"outputId": "32c779b8-d641-4c26-9099-5acf47326b05" | |
}, | |
"source": [ | |
"#draw a bar plot of survival by sex\n", | |
"sns.barplot(x=\"Sex\", y=\"Survived\", data=train)\n", | |
"\n", | |
"#print percentages of females vs. males that survive\n", | |
"print(\"Percentage of females who survived:\", train[\"Survived\"][train[\"Sex\"] == 'female'].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of males who survived:\", train[\"Survived\"][train[\"Sex\"] == 'male'].value_counts(normalize = True)[1]*100)" | |
], | |
"execution_count": 229, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of females who survived: 74.20382165605095\n", | |
"Percentage of males who survived: 18.890814558058924\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAT0klEQVR4nO3df5BdZ33f8ffHMooHY5xSbcceSUYqiDgqcXC8iKT5BcFOZNqR0gCJbGeCpy4apshkQowrClWoHJqJaEiTVCQorQtlAsKBDrO0SlUChibml9axsSs5IlvZIAlU1piAgNZm42//uFfO5epqdWXr3Kvd837N3NE95zx77ndXV/rseZ57nidVhSSpvc4bdwGSpPEyCCSp5QwCSWo5g0CSWs4gkKSWO3/cBZypZcuW1apVq8ZdhiQtKHfffffDVTUx6NiCC4JVq1YxPT097jIkaUFJ8oVTHbNrSJJaziCQpJYzCCSp5RoNgiTrkxxMMpNk64DjlyW5M8k9Se5L8rIm65EknayxIEiyBNgJXAusBa5Lsrav2ZuBO6rqSmAT8I6m6pEkDdbkFcE6YKaqDlXVY8BuYGNfmwKe2X1+MfClBuuRJA3QZBAsBw73bB/p7uv1FuAXkxwB9gA3DzpRks1JppNMz87ONlGrJLXWuAeLrwPeVVUrgJcB70lyUk1VtauqJqtqcmJi4P0QkqQnqckbyo4CK3u2V3T39boJWA9QVZ9KcgGwDPhKg3VJOsfdeuutHDt2jEsuuYQdO3aMu5xFr8krgn3AmiSrkyylMxg81dfmi8BLAZJ8P3ABYN+P1HLHjh3j6NGjHDt2bNyltEJjQVBVc8AWYC/wAJ1PB+1Psj3Jhm6zXwVeneRzwPuAG8sl0yRppBqda6iq9tAZBO7dt63n+QHgR5usQZI0v3EPFkuSxswgkKSWMwgkqeUMAklqOYNAklrOIJCkljMIJKnlDAJJajmDQJJartE7iyWdmS9u/4Fxl3BOmHvkWcD5zD3yBX8mwGXb7m/0/F4RSFLLGQSS1HIGgSS1nEEgSS1nEEhSyxkEktRyBoEktVyjQZBkfZKDSWaSbB1w/LeT3Nt9fD7JXzdZjyTpZI3dUJZkCbATuAY4AuxLMtVdnhKAqvqVnvY3A1c2VY8kabAmrwjWATNVdaiqHgN2AxvnaX8dnQXsJUkj1GQQLAcO92wf6e47SZJnA6uBj53i+OYk00mmZ2dnz3qhktRm58pg8SbgA1X1N4MOVtWuqpqsqsmJiYkRlyZJi1uTQXAUWNmzvaK7b5BN2C0kSWPR5Oyj+4A1SVbTCYBNwPX9jZJcDvwd4FMN1iJpAVl2wePAXPdPNa2xIKiquSRbgL3AEuD2qtqfZDswXVVT3aabgN1VVU3VImlhueUKP0k+So2uR1BVe4A9ffu29W2/pckaJEnzO1cGiyVJY2IQSFLLGQSS1HIGgSS1nEEgSS1nEEhSyxkEktRyBoEktZxBIEktZxBIUssZBJLUcgaBJLWcQSBJLWcQSFLLGQSS1HIGgSS1nEEgSS3XaBAkWZ/kYJKZJFtP0ebnkxxIsj/Je5usR5J0ssaWqkyyBNgJXAMcAfYlmaqqAz1t1gBvBH60qr6W5O81VY8kabAmrwjWATNVdaiqHgN2Axv72rwa2FlVXwOoqq80WI8kaYAmg2A5cLhn+0h3X6/nAc9LcleSTydZP+hESTYnmU4yPTs721C5ktRO4x4sPh9YA7wYuA74wyTf29+oqnZV1WRVTU5MTIy4REla3JoMgqPAyp7tFd19vY4AU1X1nap6EPg8nWCQJI1Ik0GwD1iTZHWSpcAmYKqvzYfoXA2QZBmdrqJDDdYkSerTWBBU1RywBdgLPADcUVX7k2xPsqHbbC/w1SQHgDuBN1TVV5uqSZJ0ssY+PgpQVXuAPX37tvU8L+D13YckaQzGPVgsSRozg0CSWs4gkKSWMwgkqeUMAk
lqOYNAklrOIJCkljMIJKnlDAJJajmDQJJaziCQpJYzCCSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklqOYNAklqu0SBIsj7JwSQzSbYOOH5jktkk93Yf/6zJeiRJJ2tszeIkS4CdwDXAEWBfkqmqOtDX9P1VtaWpOiRJ82vyimAdMFNVh6rqMWA3sLHB15MkPQlNBsFy4HDP9pHuvn4vT3Jfkg8kWTnoREk2J5lOMj07O9tErZLUWuMeLP4wsKqqrgA+Arx7UKOq2lVVk1U1OTExMdICJWmxazIIjgK9v+Gv6O57QlV9taoe7W7+B+CqBuuRJA3QZBDsA9YkWZ1kKbAJmOptkOTSns0NwAMN1iNJGmDeTw0lOQ7UqY5X1TPnOTaXZAuwF1gC3F5V+5NsB6aragp4XZINwBzwCHDjmX8LkqSnYt4gqKqLAJLcBnwZeA8Q4Abg0nm+9MTX7wH29O3b1vP8jcAbz7hqSdJZM2zX0IaqekdVHa+qb1TV7+NHQSVpURg2CL6V5IYkS5Kcl+QG4FtNFiZJGo1hg+B64OeB/9N9vLK7T5K0wA01xURVPYRdQZK0KA11RZDkeUk+muR/dbevSPLmZkuTJI3CsF1Df0jn0z3fAaiq++jcFyBJWuCGDYKnV9Vn+/bNne1iJEmjN2wQPJzkOXRvLkvyCjr3FUiSFrhh1yN4LbALuDzJUeBBOjeVSZIWuGGD4AtVdXWSC4Hzqup4k0VJkkZn2K6hB5PsAn4Y+GaD9UiSRmzYILgc+FM6XUQPJvn3SX6subIkSaMyVBBU1ber6o6q+jngSuCZwCcarUySNBJDr0eQ5CeTvAO4G7iAzpQTkqQFbqjB4iQPAfcAdwBvqConnJOkRWLYTw1dUVXfaLQSSdJYnG6Fsluragfw1iQnrVRWVa9rrDJJ0kicbozgxBrC03TGBvof80qyPsnBJDNJts7T7uVJKsnkkHVLks6S0y1V+eHu0/ur6i/O5MRJlgA7gWuAI8C+JFNVdaCv3UXALwOfOZPzS5LOjmE/NfRbSR5IcluS5w/5NeuAmao6VFWPAbsZvKbBbcBvAv9vyPNKks6iYe8jeAnwEmAWeGeS+4dYj2A5cLhn+0h33xOS/BCwsqr+23wnSrI5yXSS6dnZ2WFKliQNaej7CKrqWFX9LvAa4F5g21N54STnAW8HfnWI195VVZNVNTkxMfFUXlaS1GfYFcq+P8lbktwP/B7wSWDFab7sKLCyZ3tFd98JFwHPBz7evU/hh4EpB4wlabSGvY/gdjp9/D9TVV8a8mv2AWuSrKYTAJvoWfC+qr4OLDuxneTjwC1VNT3k+SVJZ8Fprwi6n/55sKp+5wxCgKqaA7YAe+l8DPWOqtqfZHuSDU+6YknSWXXaK4Kq+pskK5Ms7X76Z2hVtQfY07dv4NhCVb34TM4tSTo7hu0aehC4K8kU8MQ8Q1X19kaqkiSNzLBB8L+7j/PoDPJKkhaJoYKgqv5104VIksZj2Gmo7wQGTTr3U2e9IknSSA3bNXRLz/MLgJcDc2e/HEnSqA3bNdQ/0+hdST7bQD2SpBEbtmvoWT2b5wGTwMWNVCRJGqlhu4bu5m/HCOaAh4CbmihIkjRap1uh7IXA4apa3d1+FZ3xgYeAA/N8qSRpgTjdFBPvBB4DSPITwG8A7wa+DuxqtjRJ0iicrmtoSVU90n3+C8Cuqvog8MEk9zZbmiRpFE53RbAkyYmweCnwsZ5jw44vSJLOYaf7z/x9wCeSPAz8X+DPAJI8l073kCRpgTvd4vVvTfJR4FLgf1TViU8OnQfc3HRxkqTmDTMN9acH7Pt8M+VIkkZt6DWLJUmLk0EgSS3XaBAkWZ/kYJKZJFsHHH9NkvuT3Jvkz5OsbbIeSdLJGguC7lrHO4FrgbXAdQP+o39vVf1AVb0A2AG44pkkjViTVwTrgJmqOtRd63g3sLG3QVV9o2fzQgaseSBJalaTN4UtBw73bB8BXtTfKMlrgdcDS4GBC9
0k2QxsBrjsssvOeqGS1GZjHyyuqp1V9RzgXwBvPkWbXVU1WVWTExMToy1Qkha5JoPgKLCyZ3tFd9+p7AZ+tsF6JEkDNBkE+4A1SVYnWQpsAqZ6GyRZ07P5j4C/arAeSdIAjY0RVNVcki3AXmAJcHtV7U+yHZiuqilgS5Krge8AXwNe1VQ9kqTBGp1BtKr2AHv69m3ref7LTb6+JOn0xj5YLEkaL4NAklrOIJCkljMIJKnlDAJJajmDQJJaziCQpJYzCCSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklquUZnH9W57dZbb+XYsWNccskl7NixY9zlSBoTg6DFjh07xtGj8y0aJ6kN7BqSpJYzCCSp5RoNgiTrkxxMMpNk64Djr09yIMl9ST6a5NlN1iNJOlljQZBkCbATuBZYC1yXZG1fs3uAyaq6AvgA4IilJI1Yk1cE64CZqjpUVY8Bu4GNvQ2q6s6q+nZ389PAigbrkSQN0GQQLAcO92wf6e47lZuAPxl0IMnmJNNJpmdnZ89iiZKkc2KwOMkvApPA2wYdr6pdVTVZVZMTExOjLU6SFrkm7yM4Cqzs2V7R3fddklwNvAn4yap6tMF6JEkDNBkE+4A1SVbTCYBNwPW9DZJcCbwTWF9VX2mwlu9y1Rv+86he6px20cPHWQJ88eHj/kyAu9/2S+MuQRqLxrqGqmoO2ALsBR4A7qiq/Um2J9nQbfY24BnAHye5N8lUU/VIkgZrdIqJqtoD7Onbt63n+dVNvr4k6fTOicFiSdL4GASS1HIGgSS1nEEgSS1nEEhSyxkEktRyrlDWYo8vvfC7/pTUTgZBi31rzU+PuwRJ5wC7hiSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklqOYNAklrOIJCkljMIJKnlGg2CJOuTHEwyk2TrgOM/keQvkswleUWTtUiSBmssCJIsAXYC1wJrgeuSrO1r9kXgRuC9TdUhSZpfk3MNrQNmquoQQJLdwEbgwIkGVfVQ99jjDdYhSZpHk11Dy4HDPdtHuvskSeeQBTFYnGRzkukk07Ozs+MuR5IWlSaD4Ciwsmd7RXffGauqXVU1WVWTExMTZ6U4SVJHk0GwD1iTZHWSpcAmYKrB15MkPQmNBUFVzQFbgL3AA8AdVbU/yfYkGwCSvDDJEeCVwDuT7G+qHknSYI2uUFZVe4A9ffu29TzfR6fLSJI0JgtisFiS1ByDQJJaziCQpJYzCCSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklqOYNAklrOIJCkljMIJKnlDAJJajmDQJJaziCQpJYzCCSp5QwCSWo5g0CSWq7RIEiyPsnBJDNJtg44/j1J3t89/pkkq5qsR5J0ssaCIMkSYCdwLbAWuC7J2r5mNwFfq6rnAr8N/GZT9UiSBmvyimAdMFNVh6rqMWA3sLGvzUbg3d3nHwBemiQN1iRJ6nN+g+deDhzu2T4CvOhUbapqLsnXgb8LPNzbKMlmYHN385tJDjZScTsto+/n3Vb5t68adwn6br43T/i1s/L78bNPdaDJIDhrqmoXsGvcdSxGSaaranLcdUj9fG+OTpNdQ0eBlT3bK7r7BrZJcj5wMfDVBmuSJPVpMgj2AWuSrE6yFNgETPW1mQJOXI+/AvhYVVWDNUmS+jTWNdTt898C7AWWALdX1f4k24HpqpoC/iPwniQzwCN0wkKjZZebzlW+N0ck/gIuSe3mncWS1HIGgSS1nEGgJyR5cZL/Ou46tDgkeV2SB5L8UUPnf0uSW5o4d9ssiPsIJC1I/xy4uqqOjLsQzc8rgkUmyaokf5nkXUk+n+SPklyd5K4kf5VkXffxqST3JPlkku8bcJ4Lk9ye5LPddv3Tg0inlOQPgL8P/EmSNw16LyW5McmHknwkyUNJtiR5fbfNp5M8q9vu1Un2Jflckg8mefqA13tOkv+e5O4kf5bk8tF+xwubQbA4PRf4LeDy7uN64MeAW4B/Cfwl8ONVdSWwDfg3A87xJj
r3dawDXgK8LcmFI6hdi0BVvQb4Ep33zoWc+r30fODngBcCbwW+3X1ffgr4pW6b/1JVL6yqHwQeoDNZZb9dwM1VdRWd9/k7mvnOFie7hhanB6vqfoAk+4GPVlUluR9YRecO7ncnWQMU8LQB5/hpYENPH+wFwGV0/iFKZ+JU7yWAO6vqOHC8O9fYh7v77weu6D5/fpJfB74XeAade5OekOQZwD8E/rhnzsrvaeIbWawMgsXp0Z7nj/dsP07n7/w2Ov8A/0l3DYiPDzhHgJdXlRP86aka+F5K8iJO/14FeBfws1X1uSQ3Ai/uO/95wF9X1QvObtntYddQO13M3877dOMp2uwFbj4xLXiSK0dQlxanp/peugj4cpKnATf0H6yqbwAPJnll9/xJ8oNPseZWMQjaaQfwG0nu4dRXhbfR6TK6r9u9dNuoitOi81TfS/8K+AxwF53xrUFuAG5K8jlgPyevfaJ5OMWEJLWcVwSS1HIGgSS1nEEgSS1nEEhSyxkEktRyBoF0Brrz5uxPcl+Se7s3RUkLmncWS0NK8iPAPwZ+qKoeTbIMWDrmsqSnzCsCaXiXAg9X1aMAVfVwVX0pyVVJPtGd+XJvkkuTXJzk4ImZXZO8L8mrx1q9dAreUCYNqTu52Z8DTwf+FHg/8EngE8DGqppN8gvAz1TVP01yDbAd+B3gxqpaP6bSpXnZNSQNqaq+meQq4MfpTKf8fuDX6Uyl/JHuVDpLgC9323+kO//NTsC5b3TO8opAepKSvAJ4LXBBVf3IgOPn0blaWAW87MTU4NK5xjECaUhJvq+7hsMJL6CzPsNEdyCZJE9L8g+6x3+le/x64D91Z8+UzjleEUhD6nYL/R6dBVLmgBlgM7AC+F0603ufD/w74H8CHwLWVdXxJG8HjlfVr42jdmk+BoEktZxdQ5LUcgaBJLWcQSBJLWcQSFLLGQSS1HIGgSS1nEEgSS33/wGAikte6oaCQAAAAABJRU5ErkJggg==\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZFBncjYztA8H" | |
}, | |
"source": [ | |
"As predicted, females had a much higher survival rate than males (about 74.2% versus 18.9%), so the Sex feature is essential to our predictions." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 331 | |
}, | |
"id": "QJenNUNDtBk_", | |
"outputId": "7756fb01-8157-4603-8c75-3db0d4198bde" | |
}, | |
"source": [ | |
"#draw a bar plot of survival by Pclass\n", | |
"sns.barplot(x=\"Pclass\", y=\"Survived\", data=train)\n", | |
"\n", | |
"#print percentage of people by Pclass that survived\n", | |
"print(\"Percentage of Pclass = 1 who survived:\", train[\"Survived\"][train[\"Pclass\"] == 1].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of Pclass = 2 who survived:\", train[\"Survived\"][train[\"Pclass\"] == 2].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of Pclass = 3 who survived:\", train[\"Survived\"][train[\"Pclass\"] == 3].value_counts(normalize = True)[1]*100)" | |
], | |
"execution_count": 230, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of Pclass = 1 who survived: 62.96296296296296\n", | |
"Percentage of Pclass = 2 who survived: 47.28260869565217\n", | |
"Percentage of Pclass = 3 who survived: 24.236252545824847\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASx0lEQVR4nO3dcZBdZ33e8e9jOarBOKGgbeWxVKyAKHWoJ5SN0qk7hBDcimZGyhRI5bpJPENRmUFAm4IwbeOCKe1EJGQaqiQojSeECQgDbbNp1agUO0BcbLQCYyM5pooMSCob1jYGm9DIsn/9Y4/pZXW1e2Xv2avV+/3M3NE973nvub/rO+Nnz3vued9UFZKkdl0w7gIkSeNlEEhS4wwCSWqcQSBJjTMIJKlxF467gLO1Zs2auvzyy8ddhiStKAcPHry/qiaG7VtxQXD55ZczPT097jIkaUVJ8pUz7XNoSJIaZxBIUuMMAklqnEEgSY3rNQiSbE5yb5IjSa4fsv9XktzZPb6U5KE+65Ekna63Xw0lWQXsBq4GjgMHkkxV1eEn+lTVPxvo/wbgRX3VI0kars8zgk3Akao6WlUngb3A1gX6XwN8qMd6JElD9BkElwHHBraPd22nSfIcYANwyxn2b08ynWR6dnZ2yQuVpJadKzeUbQM+WlWPDdtZVXuAPQCTk5Pn7QIKO3fuZGZmhrVr17Jr165xlyOpEX0GwQlg/cD2uq5tmG3A63usZUWYmZnhxIkz/SeSpH70OTR0ANiYZEOS1cz9z35qfqckLwD+IvCZHmuRJJ1Bb0FQVaeAHcB+4B7g5qo6lOTGJFsGum4D9pZrZkrSWPR6jaCq9gH75rXdMG/77X3WIElamHcWS1LjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1LhzZc3iXrz4Lb8z7hLOyiX3P8wq4Kv3P7yiaj/47p8ddwmSngLPCCSpcQaBJDXOIJCkxhkEktQ4g0CSGtdrECTZnOTeJEeSXH+GPj+d5HCSQ0k+2Gc9kqTT9fbz0SSrgN3A1cBx4ECSqao6PNBnI/A24Kqq+kaSv9RXPZKk4fo8I9gEHKmqo1V1EtgLbJ3X57XA7qr6BkBVfb3HeiRJQ/QZBJcBxwa2j3dtg54PPD/JbUluT7J52IGSbE8ynWR6dna2p3IlqU3jvlh8IbAReClwDfCbSZ45v1NV7amqyaqanJiYWOYSJen81mcQnADWD2yv69oGHQemqurRqroP+BJzwSBJWiZ9BsEBYGOSDUlWA9uAqXl9/gtzZwMkWcPcUNHRHmuSJM3TWxBU1SlgB7AfuAe4uaoOJbkxyZau237ggSSHgVuBt1TVA33VJEk6Xa+zj1bVPmDfvLYbBp4X8PPdQ5I0BuO+WCxJGjODQJIaZxBIUuMMAklqnEEgSY07r9csXmkeX33x9/wrScvBIDiHfHvj3xl3CZIa5NCQJDXOMwJpCezcuZOZmRnWrl3Lrl27xl2OdFYMAmkJzMzMcOLE/DkVpZXBoSFJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxvQZBks1J7k1yJMn1Q/Zfl2Q2yZ3d4x/3WY8k6XS9TTqXZBWwG7gaOA4cSDJVVYfndf1wVe3oqw5J0sL6PCPYBBypqqNVdRLYC2zt8f0kSU9Cn0FwGXBsYPt41zbfK5PcleSjSdYPO1CS7Ummk0zPzs72UaskNWvcF4t/H7i8qq4EPg68f1inqtpTVZNVNTkxMbGsBUrS+a7PIDgBDP6Fv65r+66qeqCq/rzb/I/Ai3usR5I0RJ9BcADYmGRDktXANmBqsEOSSwc2twD39FiPJGmI3n41VFWnkuwA9gOrgJuq6lCSG4HpqpoC3phkC3AKeBC4rq96JEnD9b
pmcVXtA/bNa7th4PnbgLf1WYMkaWHjvlgsSRozg0CSGtfr0JD0VHz1xr8+7hJGdurBZwEXcurBr6youv/KDXePuwSdAzwjkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJatyCs48meRioM+2vqu9f8ookSctqwSCoqksAkrwT+BrwASDAtcClC7xUkrRCjDo0tKWqfq2qHq6qb1XVrwNb+yxMkrQ8Rg2Cbye5NsmqJBckuRb4dp+FSZKWx6hB8A+Bnwb+tHu8umtbUJLNSe5NciTJ9Qv0e2WSSjI5Yj2SpCUy0lKVVfVlznIoKMkqYDdwNXAcOJBkqqoOz+t3CfAm4I6zOb4kaWmMdEaQ5PlJPpHki932lUn+1SIv2wQcqaqjVXUS2MvwMHkn8IvA/z2LuiVJS2TUoaHfBN4GPApQVXcB2xZ5zWXAsYHt413bdyX5G8D6qvpvI9YhSVpiowbB06vqs/PaTj2VN05yAfAe4J+P0Hd7kukk07Ozs0/lbSVJ84waBPcneS7dzWVJXsXcfQULOQGsH9he17U94RLghcAfJvky8DeBqWEXjKtqT1VNVtXkxMTEiCVLy2fNRY/zl592ijUXPT7uUqSzNtLFYuD1wB7gBUlOAPcxd1PZQg4AG5NsYC4AtjHwS6Oq+iaw5ontJH8IvLmqpkeuXjpHvPnKh8ZdgvSkjRoEX6mqlye5GLigqh5e7AVVdSrJDmA/sAq4qaoOJbkRmK6qqSdftiRpqYwaBPcl+QPgw8Atox68qvYB++a13XCGvi8d9biSpKUz6jWCFwD/k7khovuS/Ickf7u/siRJy2WkIKiqP6uqm6vq7wMvAr4f+GSvlUmSlsXI6xEk+bEkvwYcBC5ibsoJSdIKN9I1gu7nnZ8HbgbeUlVOOCdJ54lRLxZfWVXf6rUSSdJYLLZC2c6q2gW8K8lpK5VV1Rt7q0yStCwWOyO4p/vXm7wk6Ty12FKVv989vbuqPrcM9UiSltmovxr65ST3JHlnkhf2WpEkaVmNeh/BjwM/DswC70ty9wjrEUiSVoCR7yOoqpmq+lXgdcCdwNCpIiRJK8uoK5T9tSRvT3I38F7gfzE3rbQkaYUb9T6Cm5hbavLvVtX/6bEeSdIyWzQIukXo76uqf78M9UiSltmiQ0NV9RiwPsnqZahHkrTMRl6PALgtyRTw3XmGquo9vVQlSVo2owbBn3SPC5hba1iSdJ4YKQiq6h19FyJJGo9Rp6G+FRg26dzLlrwiSdKyGnVo6M0Dzy8CXgmcWvpyJEnLbdShoYPzmm5L8tke6pEkLbNR7yx+1sBjTZLNwA+M8LrNSe5NciTJ9UP2v66bt+jOJH+U5Ion8RkkSU/BqENDB/n/1whOAV8GXrPQC7ob0XYDVwPHgQNJpqrq8EC3D1bVb3T9twDvATaPXL0k6Slb8IwgyY8kWVtVG6rqB4F3AH/cPQ4v9FpgE3Ckqo5W1UnmpqjYOthh3vKXFzPkgrQkqV+LDQ29DzgJkOQlwL8D3g98E9izyGsvA44NbB/v2r5Hktcn+RNgFzB06csk25NMJ5menZ1d5G0lSWdjsSBYVVUPds//AbCnqj5WVb8APG8pCqiq3VX1XOCtwNA1DqpqT1VNVtXkxMTEUrytJKmzaBAkeeI6wk8AtwzsW+z6wglg/cD2uq7tTPYCP7XIMSVJS2yxIPgQ8Mkkvwd8B/g0QJLnMTc8tJADwMYkG7oJ67YBU4Mdkmwc2PxJ4H+fRe2SpCWw2OL170ryCeBS4H9U1RMXcy8A3rDIa08l2QHsB1YBN1XVoSQ3AtNVNQXsSPJy4FHgG8DPPbWPI0k6W4v+fLSqbh/S9qVRDl5V+4B989puGHj+plGOI0nqz6j3EUjSeWvnzp3MzMywdu1adu3aNe5ylp1BIKl5MzMznDix0G9Zzm8jTTEhSTp/GQSS1DiDQJIaZxBIUuMMAklqnE
EgSY0zCCSpcQaBJDXOIJCkxhkEktQ4p5iQtOSueu9V4y7hrKx+aDUXcAHHHjq2omq/7Q23LclxPCOQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNa7XIEiyOcm9SY4kuX7I/p9PcjjJXUk+keQ5fdYjSTpdb0GQZBWwG3gFcAVwTZIr5nX7PDBZVVcCHwV29VWPJGm4Ps8INgFHqupoVZ0E9gJbBztU1a1V9Wfd5u3Auh7rkSQN0WcQXAYcG9g+3rWdyWuA/z5sR5LtSaaTTM/Ozi5hiZKkc+JicZJ/BEwC7x62v6r2VNVkVU1OTEwsb3GSznv19OLxix+nnl7jLmUs+px99ASwfmB7Xdf2PZK8HPiXwI9V1Z/3WI8kDfXoVY+Ou4Sx6vOM4ACwMcmGJKuBbcDUYIckLwLeB2ypqq/3WIsk6Qx6C4KqOgXsAPYD9wA3V9WhJDcm2dJ1ezfwDOAjSe5MMnWGw0mSetLrwjRVtQ/YN6/thoHnL+/z/SVJizsnLhZLksbHIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuN6DYIkm5Pcm+RIkuuH7H9Jks8lOZXkVX3WIkkarrcgSLIK2A28ArgCuCbJFfO6fRW4DvhgX3VIkhZ2YY/H3gQcqaqjAEn2AluBw090qKovd/se77EOSdIC+hwaugw4NrB9vGs7a0m2J5lOMj07O7skxUmS5qyIi8VVtaeqJqtqcmJiYtzlSNJ5pc8gOAGsH9he17VJks4hfQbBAWBjkg1JVgPbgKke30+S9CT0FgRVdQrYAewH7gFurqpDSW5MsgUgyY8kOQ68GnhfkkN91SNJGq7PXw1RVfuAffPabhh4foC5ISNJ0pisiIvFkqT+GASS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWpcr0GQZHOSe5McSXL9kP1/IcmHu/13JLm8z3okSafrLQiSrAJ2A68ArgCuSXLFvG6vAb5RVc8DfgX4xb7qkSQN1+cZwSbgSFUdraqTwF5g67w+W4H3d88/CvxEkvRYkyRpngt7PPZlwLGB7ePAj56pT1WdSvJN4NnA/YOdkmwHtnebjyS5t5eKzw1rmPf5z3X5pZ8bdwnnihX33fGv/btrwIr7/vLGs/r+nnOmHX0GwZKpqj3AnnHXsRySTFfV5Ljr0Nnzu1vZWv7++hwaOgGsH9he17UN7ZPkQuAHgAd6rEmSNE+fQXAA2JhkQ5LVwDZgal6fKeCJcYVXAbdUVfVYkyRpnt6Ghrox/x3AfmAVcFNVHUpyIzBdVVPAbwEfSHIEeJC5sGhdE0Ng5ym/u5Wt2e8v/gEuSW3zzmJJapxBIEmNMwjOEUluSvL1JF8cdy06O0nWJ7k1yeEkh5K8adw1aXRJLkry2SRf6L6/d4y7puXmNYJzRJKXAI8Av1NVLxx3PRpdkkuBS6vqc0kuAQ4CP1VVh8dcmkbQzWZwcVU9kuT7gD8C3lRVt4+5tGXjGcE5oqo+xdwvp7TCVNXXqupz3fOHgXuYu2teK0DNeaTb/L7u0dRfyAaBtIS6GXRfBNwx3kp0NpKsSnIn8HXg41XV1PdnEEhLJMkzgI8B/7SqvjXuejS6qnqsqn6YuRkQNiVpanjWIJCWQDe2/DHgd6vqP427Hj05VfUQcCuwedy1LCeDQHqKuouNvwXcU1XvGXc9OjtJJpI8s3v+NOBq4I/HW9XyMgjOEUk+BHwG+KtJjid5zbhr0siuAn4GeFmSO7vH3xt3URrZpcCtSe5ibo60j1fVfx1zTcvKn49KUuM8I5CkxhkEktQ4g0CSGmcQSFLjDAJJap
xBIM2T5LHuJ6BfTPKRJE9foO/bk7x5OeuTlppBIJ3uO1X1w90ssCeB1427IKlPBoG0sE8DzwNI8rNJ7urmrf/A/I5JXpvkQLf/Y0+cSSR5dXd28YUkn+rafqibA//O7pgbl/VTSQO8oUyaJ8kjVfWMJBcyN3/QHwCfAv4z8Leq6v4kz6qqB5O8HXikqn4pybOr6oHuGP8G+NOqem+Su4HNVXUiyTOr6qEk7wVur6rfTbIaWFVV3xnLB1bzPCOQTve0bkriaeCrzM0j9DLgI1V1P0BVDVs74oVJPt39j/9a4Ie69tuA307yWmBV1/YZ4F8keSvwHENA43ThuAuQzkHf6aYk/q65eeUW9dvMrUz2hSTXAS8FqKrXJflR4CeBg0leXFUfTHJH17YvyT+pqluW8DNII/OMQBrNLcCrkzwbIMmzhvS5BPhaNyX1tU80JnluVd1RVTcAs8D6JD8IHK2qXwV+D7iy908gnYFnBNIIqupQkncBn0zyGPB54Lp53X6BuZXJZrt/L+na391dDA7wCeALwFuBn0nyKDAD/NveP4R0Bl4slqTGOTQkSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLj/h+SBBZzqACz9AAAAABJRU5ErkJggg==\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "AY4w_LVmtG7C" | |
}, | |
"source": [ | |
"As predicted, people with higher socioeconomic class had a higher rate of survival. (62.9% vs. 47.3% vs. 24.2%)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 331 | |
}, | |
"id": "NKvART6otHvf", | |
"outputId": "0100aae8-6f1e-4db5-bf22-4b236f90534d" | |
}, | |
"source": [ | |
"#draw a bar plot for SibSp vs. survival\n", | |
"sns.barplot(x=\"SibSp\", y=\"Survived\", data=train)\n", | |
"\n", | |
"#I won't be printing individual percent values for all of these.\n", | |
"print(\"Percentage of SibSp = 0 who survived:\", train[\"Survived\"][train[\"SibSp\"] == 0].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of SibSp = 1 who survived:\", train[\"Survived\"][train[\"SibSp\"] == 1].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of SibSp = 2 who survived:\", train[\"Survived\"][train[\"SibSp\"] == 2].value_counts(normalize = True)[1]*100)" | |
], | |
"execution_count": 231, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of SibSp = 0 who survived: 34.53947368421053\n", | |
"Percentage of SibSp = 1 who survived: 53.588516746411486\n", | |
"Percentage of SibSp = 2 who survived: 46.42857142857143\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATeklEQVR4nO3df7BndX3f8eeLJRsCkqQJm0LYJTDJxnRriD9ukBTHn5CuNV1mKjGAWp3RbDPjRlt/MDBxqMHJdIKJpjUb68YwtaaKFJN2bbZBq4CVJroXRXDZoisgu6tbdkUMogUW3v3je6BfL9/d+93lnu93v/t5Pmbu3PPjc873fZnlvu75nHM+n1QVkqR2HTPtAiRJ02UQSFLjDAJJapxBIEmNMwgkqXHHTruAQ3XSSSfV6aefPu0yJGmm3HzzzfuqasWofTMXBKeffjrz8/PTLkOSZkqSrx9on11DktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMbN3AtlOnpdcskl7Nmzh5NPPpkrr7xy2uVIzTAIdMTYs2cPu3fvnnYZUnPsGpKkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMb1GgRJ1ia5I8mOJJceoM0rktyeZFuSD/dZjyTpyXobYiLJMmAjcB6wC9iaZHNV3T7UZjVwGXBOVX07yU/1VY8kabQ+rwjOAnZU1Z1V9TBwNXD+gja/CWysqm8DVNW9PdYjSRqhz0HnTgV2Dq3vAp67oM3PAyS5CVgGvKOq/rrHmo5qjt4p6XBMe/TRY4HVwAuBlcBnkvxiVd0/3CjJemA9wGmnnTbpGmeGo3dKOhx9dg3tBlYNra/stg3bBWyuqkeq6i7gKwyC4QdU1aaqmququRUrVvRWsCS1qM8g2AqsTnJGkuXAhcDmBW3+C4OrAZKcxKCr6M4ea5IkLdBbEFTVfmADcB2wHbimqrYluSLJuq7ZdcC3ktwOXA+8raq+1VdNkqQn6/UeQVVtAbYs2Hb50HIBb+6+JElT4JvFktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNW7acxYLuOeKX1yS8+y/7yeAY9l/39eX5JynXX7bUy9K0hHPKwJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS43oNgiRrk9yRZEeSS0fsf22SvUlu6b5e32c9kqQn6+3N4iTLgI3AecAuYGuSzVV1+4KmH62qDX3VIUk6uD6vCM4CdlTVnVX1MHA1cH6PnydJOgx9BsGpwM6h9V3dtoVenuTWJNcmWTXqREnWJ5lPMr93794+apWkZk37ZvHHgdOr6kzgk8AHRzWqqk1VNVdVcytWrJhogZJ0tOszCHYDw3/hr+y2PaGqvlVVD3WrHwCe02M9kqQR+gyCrcDqJGckWQ5cCGwebpDklKHVdcD2HuuRJI3Q21NDVbU/yQbgOmAZcFVVbUtyBTBfVZuBNyZZB+wH7gNe21c9kqTRep2Ypqq2AFsWbLt8aPky4LI+a5AkHdy0bxZLkqbMIJCkxhkEktQ4g0CSGmcQSFLjen1qSJN10nGPAfu775NzznvPWZLzLL9/OcdwDDvv37kk57zpt29agqqko59BcBR565n3T7sESTPIriFJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOB8flZbAJZdcwp49ezj55JO58sorp12OdEgMAmkJ7Nmzh927dy/eUDoC2TUkSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJalyvQZBkbZI7kuxIculB2r08SSWZ67MeSdKT9RYESZYBG4GXAmuAi5KsGdHuROBNwOf6qkWSdGB9XhGcBeyoqjur6mHgauD8Ee3eCfw+8H97rEWSdAB9BsGpwM
6h9V3dtickeTawqqr+6mAnSrI+yXyS+b179y59pZLUsKndLE5yDPBu4C2Lta2qTVU1V1VzK1as6L84SWrIQUcfTfIAUAfaX1U/epDDdwOrhtZXdtsedyLwDOCGJAAnA5uTrKuq+UXqliQtkYMGQVWdCJDkncA3gQ8BAV4JnLLIubcCq5OcwSAALgQuHjr3d4CTHl9PcgPwVkNAkiZr3PkI1lXVLw2tvy/Jl4DLD3RAVe1PsgG4DlgGXFVV25JcAcxX1ebDrronTi4iqUXjBsGDSV7J4MmfAi4CHlzsoKraAmxZsG1keFTVC8espTdOLiKpRePeLL4YeAXwf7qvX2eom0eSNLvGuiKoqrsZ/Q6AJGnGjXVFkOTnk3wqyZe79TOTvL3f0iRJkzBu19CfApcBjwBU1a0MngKSJM24cYPg+Kr6/IJt+5e6GEnS5I0bBPuS/Czdy2VJLmDwXoEkacaN+/joG4BNwC8k2Q3cxeClMknSjBs3CL5eVecmOQE4pqoe6LMoSdLkjNs1dFeSTcDZwHd7rEeSNGHjBsEvAP+DQRfRXUn+OMnz+itLkjQpYwVBVX2vqq6pqn8GPAv4UeDGXiuTJE3E2PMRJHlBkj8BbgaOYzDkhCRpxo11szjJ3cAXgWuAt1XVogPOTdJz3vYfl+Q8J+57gGXAPfseWJJz3vyuf/7Ui5Kkno371NCZVfV3vVYiSZqKxWYou6SqrgR+L8mTZiqrqjf2VpmkiXEujrYtdkWwvfvurGHSUcy5ONq22FSVH+8Wb6uqL0ygHknShI371NAfJtme5J1JntFrRZKkiRr3PYIXAS8C9gLvT3Kb8xFI0tFh7PcIqmpPVf074LeAWzjIxPWSpNkx7gxl/yDJO5LcBrwX+F/Ayl4rkyRNxLjvEVwFXA3846r6Ro/1SJImbNEgSLIMuKuq/u0E6pEkTdiiXUNV9SiwKsnyQz15krVJ7kiyI8mlI/b/Vnfj+ZYkn02y5lA/Q5L01IzbNXQXcFOSzcAT4wxV1bsPdEB3JbEROA/YBWxNsrmqbh9q9uGq+vdd+3XAu4G1h/YjLJ3Hlp/wA98lqQXjBsHXuq9jgBPHPOYsYEdV3QmQ5GrgfOCJIFgwftEJdHMiT8uDq391mh8vSVMxVhBU1e8exrlPBXYOre8CnruwUZI3AG8GlgMvHnWiJOuB9QCnnXbaYZQiSTqQcYehvp4Rf61X1chf3IeiqjYCG5NcDLwdeM2INpuATQBzc3NTvWqQpKPNuF1Dbx1aPg54ObB/kWN2A6uG1ld22w7kauB9Y9YjSVoi43YN3bxg001JPr/IYVuB1UnOYBAAFwIXDzdIsrqqvtqtvgz4KpKkiRq3a+gnhlaPAeaAHzvYMVW1P8kG4DpgGXBVVW1LcgUwX1WbgQ1JzgUeAb7NiG4hSVK/xu0aupn/f49gP3A38LrFDqqqLcCWBdsuH1p+05ifL0nqyWIzlP0ysLOqzujWX8Pg/sDdDD0GKi2FOr54jMeo430eQJqkxd4sfj/wMECS5wP/Bvgg8B26p3ikpfLIOY/w8HkP88g5j0y7FKkpi3UNLauq+7rl3wA2VdXHgI8luaXf0iRJk7DYFcGyJI+HxUuATw/tG/f+giTpCLbYL/OPADcm2Qd8H/ifAEl+jkH3kCRpxi02ef3vJfkUcArwiap6/C7eMcBv912cJKl/i3bvVNXfjtj2lX7KkSRN2thzFkuSjk4GgSQ1ziCQpMb5CKiaduPzX7Ak5/n+scsg4fu7di3ZOV/wmRuX5DzSYrwikKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjeg2CJGuT3JFkR5JLR+x/c5Lbk9ya5FNJfqbPeiRJT9ZbECRZBmwEXgqsAS5KsmZBsy8Cc1V1JnAtcGVf9UiSRuvziuAsYEdV3VlVDwNXA+cPN6iq66vqe93q3wIre6xHkjRCn0FwKrBzaH1Xt+
1AXgf891E7kqxPMp9kfu/evUtYoiTpiLhZnORVwBzwrlH7q2pTVc1V1dyKFSsmW5wkHeX6nKFsN7BqaH1lt+0HJDkX+B3gBVX1UI/1SJJG6POKYCuwOskZSZYDFwKbhxskeRbwfmBdVd3bYy2SpAPoLQiqaj+wAbgO2A5cU1XbklyRZF3X7F3A04D/nOSWJJsPcDpJUk96nby+qrYAWxZsu3xo+dw+P1+StLgj4maxJGl6DAJJapxBIEmNMwgkqXEGgSQ1rtenhiT164/f8vElOc/9+x584vtSnHPDH/7Tp3wOTY5XBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktS4XoMgydokdyTZkeTSEfufn+QLSfYnuaDPWiRJo/UWBEmWARuBlwJrgIuSrFnQ7B7gtcCH+6pDknRwfc5ZfBawo6ruBEhyNXA+cPvjDarq7m7fYz3WIUk6iD67hk4Fdg6t7+q2SZKOIDNxszjJ+iTzSeb37t077XIk6ajSZxDsBlYNra/sth2yqtpUVXNVNbdixYolKU6SNNBnEGwFVic5I8ly4EJgc4+fJ0k6DL0FQVXtBzYA1wHbgWuqaluSK5KsA0jyy0l2Ab8OvD/Jtr7qkSSN1udTQ1TVFmDLgm2XDy1vZdBlJEmakpm4WSxJ6o9BIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS43p9oUxqxY9X/cB3aZYYBNISeNWjTqmh2WXXkCQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqXK9BkGRtkjuS7Ehy6Yj9P5zko93+zyU5vc96JElP1lsQJFkGbAReCqwBLkqyZkGz1wHfrqqfA94D/H5f9UiSRuvziuAsYEdV3VlVDwNXA+cvaHM+8MFu+VrgJUnSY02SpAVSPU2tl+QCYG1Vvb5bfzXw3KraMNTmy12bXd3617o2+xacaz2wvlt9OnBHL0UPnATsW7TVkcv6p2eWawfrn7a+6/+ZqloxasdMTFVZVZuATZP4rCTzVTU3ic/qg/VPzyzXDtY/bdOsv8+uod3AqqH1ld22kW2SHAv8GPCtHmuSJC3QZxBsBVYnOSPJcuBCYPOCNpuB13TLFwCfrr76qiRJI/XWNVRV+5NsAK4DlgFXVdW2JFcA81W1Gfgz4ENJdgD3MQiLaZtIF1SPrH96Zrl2sP5pm1r9vd0sliTNBt8slqTGGQSS1DiDoLPYcBhHuiRXJbm3ezdjpiRZleT6JLcn2ZbkTdOu6VAkOS7J55N8qav/d6dd0+FIsizJF5P8t2nXcqiS3J3ktiS3JJmfdj2HIsm/6v7dfDnJR5IcN+kaDALGHg7jSPcfgLXTLuIw7QfeUlVrgLOBN8zYf/+HgBdX1S8BzwTWJjl7yjUdjjcB26ddxFPwoqp65iy9S5DkVOCNwFxVPYPBgzUTf2jGIBgYZziMI1pVfYbBk1czp6q+WVVf6JYfYPDL6NTpVjW+Gvhut/pD3ddMPYWRZCXwMuAD066lQccCP9K9S3U88I1JF2AQDJwK7Bxa38UM/SI6mnQj0D4L+Nx0Kzk0XbfKLcC9wCeraqbqB/4IuAR4bNqFHKYCPpHk5m5ImplQVbuBPwDuAb4JfKeqPjHpOgwCHTGSPA34GPAvq+rvpl3PoaiqR6vqmQzeoD8ryTOmXdO4kvwacG9V3TztWp6C51XVsxl0774hyfOnXdA4kvw9Br0PZwA/DZyQ5FWTrsMgGBhnOAz1KMkPMQiB/1RVfzHteg5XVd0PXM9s3a85B1iX5G4G3aIvTvLn0y3p0HR/WVNV9wJ/yaC7dxacC9xVVXur6hHgL4B/NOkiDIKBcYbDUE+6ocf/DNheVe+edj2HKsmKJD/eLf8IcB7wv6db1fiq6r
KqWllVpzP4t//pqpr4X6WHK8kJSU58fBn4VWBWnp67Bzg7yfHd/wcvYQo37A0CBsNhAI8Ph7EduKaqtk23qkOT5CPA3wBPT7IryeumXdMhOAd4NYO/RG/pvv7JtIs6BKcA1ye5lcEfFZ+sqpl7BHOG/X3gs0m+BHwe+Kuq+usp1zSW7l7StcAXgNsY/E6e+FATDjEhSY3zikCSGmcQSFLjDAJJapxBIEmNMwgkqXEGgXQASX6nGxXy1u6R1ucm+cDjA+Il+e4Bjjs7yee6Y7YnecdEC5cOUW9TVUqzLMmvAL8GPLuqHkpyErC8ql4/xuEfBF5RVV/qRrZ9ep+1Sk+VVwTSaKcA+6rqIYCq2ldV30hyQ5InhjlO8p7uquFTSVZ0m3+KwQBij49BdHvX9h1JPpTkb5J8NclvTvhnkkYyCKTRPgGsSvKVJH+S5AUj2pwAzFfVPwRuBP51t/09wB1J/jLJv1gw0ciZwIuBXwEuT/LTPf4M0lgMAmmEbn6B5wDrgb3AR5O8dkGzx4CPdst/DjyvO/YKYI5BmFwMDA938F+r6vtVtY/B4HSzMjiajmLeI5AOoKoeBW4AbkhyG/CaxQ4ZOvZrwPuS/CmwN8lPLmxzgHVp4rwikEZI8vQkq4c2PRP4+oJmxwAXdMsXA5/tjn1ZN5IkwGrgUeD+bv38bo7jnwReyGCQOmmqvCKQRnsa8N5ueOn9wA4G3UTXDrV5kMEkNG9nMDPZb3TbXw28J8n3umNfWVWPdtlwK4MuoZOAd1bVxKcllBZy9FFpQrr3Cb5bVX8w7VqkYXYNSVLjvCKQpMZ5RSBJjTMIJKlxBoEkNc4gkKTGGQSS1Lj/By9iaFLZT7AmAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EnXtHdI7tKaQ" | |
}, | |
"source": [ | |
"In general, it's clear that people with more siblings or spouses aboard were less likely to survive. However, contrary to expectations, people with no siblings or spouses were less to likely to survive than those with one or two. (34.5% vs 53.4% vs. 46.4%)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 279 | |
}, | |
"id": "IYasBzrbtNru", | |
"outputId": "cb3e37e9-d53d-42af-8000-3e0ac44e2839" | |
}, | |
"source": [ | |
"#draw a bar plot for Parch vs. survival\n", | |
"sns.barplot(x=\"Parch\", y=\"Survived\", data=train)\n", | |
"plt.show()" | |
], | |
"execution_count": 232, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASb0lEQVR4nO3dfZBdd33f8fdHEo5jx9hlpCKPJUWeopC4hMRUNbTOgHlKbUrtTqGpnTimGRpNO5jSIbDjNBnHMWU6EVPawjgEBRwekqAYUzpqqsZpwQHiFrDEk7EcM4pt0CpsbWPsGOPGyP72j3vkXFar3bvSnnt19Xu/Znb2nnN/e+5nPR599vzOU6oKSVK7Vk06gCRpsiwCSWqcRSBJjbMIJKlxFoEkNW7NpAMs19q1a2vz5s2TjiFJU2Xv3r0PVtW6hd6buiLYvHkze/bsmXQMSZoqSb5+tPecGpKkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmN660IktyY5P4kXz3K+0nyriT7k3wlyQv6yiJJOro+9wg+AFy8yPuXAFu6r23Ae3rMIkk6it4uKKuqTyfZvMiQy4AP1eCBCJ9NclaSs6vqm31lkvoyMzPD3Nwc69evZ/v27ZOOIy3LJK8sPgc4MLQ82607ogiSbGOw18CmTZvGEk5ajrm5OQ4ePDjpGNIxmYqDxVW1o6q2VtXWdesWvFWGJOkYTbIIDgIbh5Y3dOskSWM0ySLYBVzVnT30IuARjw9I0vj1dowgyUeAi4C1SWaBXwOeAVBVvwXsBl4F7Ae+C/xCX1kkSUfX51lDVyzxfgFv6OvzJUmjmYqDxZKk/lgEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxFoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcRaBJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1rtciSHJxkruT7E9yzQLvb0pya5IvJvlKklf1mUeSdKTeiiDJauAG4BLgPOCKJOfNG/arwE1VdT5wOfCbfeWRJC2szz2CC4D9VXVPVT0B7AQumzemgGd2r88E/qLHPJKkBfRZBOcAB4aWZ7t1w64DrkwyC+wG3rjQhpJsS7InyZ4HHnigj6yS1KxJHyy+AvhAVW0AXgV8OMkRmapqR1Vtraqt69atG3tISTqZ9VkEB4GNQ8sbunXDXg/cBFBV/wc4FVjbYyZJ0jx9FsHtwJYk5yY5hcHB4F3zxnwDeDlAkh9jUATO/UjSGPVWBFV1CLgauAW4i8HZQXcmuT7Jpd2wXwJ+McmXgY8A/7yqqq9MkqQjrelz41W1m8FB4OF11w693gdc2GcGSdLiJn2wWJI0YRaBJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxFoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcWsmHUArZ2Zmhrm5OdavX8/27dsnHUfSlLAITiJzc3McPHhw0jEkTRmnhiSpcRaBJDXOIpCkxlkEktQ4i0CSGtdrESS5OMndSfYnueYoY34myb4kdyb5/T7z6MQ2MzPDVVddxczMzKSjSE1Z9PTRJI8CdbT3q+qZi/zsauAG4JXALHB7kl1VtW9ozBbgl4ELq+rbSf7mMvPrJOLpr9JkLFoEVXUGQJK3Ad8EPgwE+Dng7CW2fQGwv6ru6baxE7gM2Dc05heBG6rq293n3X8Mv4Mk6TiMekHZpVX1E0PL70nyZeDaRX7mHODA0PIs8MJ5Y34EIMltwGrguqr6oxEzSRLgVfXHa9QieCzJzwE7GUwVXQE8tkKfvwW4CNgAfDrJj1fVw8
ODkmwDtgFs2rRpBT5WGvjUi1+yItt5fM1qSHh8dnbFtvmST39qRbbTAqcVj8+oB4t/FvgZ4P92X/+0W7eYg8DGoeUN3bphs8CuqvpeVd0LfI1BMXyfqtpRVVurauu6detGjCxJGsVIewRVdR+D+f3luB3YkuRcBgVwOUeWx39lsHfxO0nWMpgqumeZnyNJOg4j7REk+ZEkn0jy1W75+Ul+dbGfqapDwNXALcBdwE1VdWeS65Nc2g27BfhWkn3ArcBbq+pbx/rLSJKWb9RjBL8NvBV4L0BVfaU75//fLfZDVbUb2D1v3bVDrwt4c/clSZqAUY8RnFZVn5+37tBKh5Ekjd+oRfBgkr9Fd3FZktcyuK5AkjTlRp0aegOwA/jRJAeBexlcVCZJmnKjFsHXq+oVSU4HVlXVo32GkiSNz6hTQ/cm2QG8CPhOj3kkSWM26h7BjwKvZjBF9P4kfwjsrKo/7S1ZQ75x/Y+vyHYOPfQsYA2HHvr6imxz07V3HH8oSSe8kfYIquq7VXVTVf0T4HzgmYDXv0vSSWDk5xEkeUmS3wT2AqcyuOWEJGnKjTQ1lOQ+4IvATQyu/l2JG85Jkk4Aox4jeH5V/WWvSSRJE7HUE8pmqmo78PYkRzyprKr+dW/JJEljsdQewV3d9z19B5EkTcZSj6r8b93LO6rqC2PII0kas1HPGvoPSe5K8rYkz+s1kSRprEa9juClwEuBB4D3JrljqecRSJKmw6hnDVFVc8C7ktwKzDB4cP2izyNQGy5894Ursp1THj6FVaziwMMHVmSbt73xthVIJZ38Rn1C2Y8luS7JHcC7gf/N4BnEkqQpN+oewY3ATuAfVNVf9JhHkjRmSxZBktXAvVX1n8eQR5I0ZktODVXVk8DGJKeMIY8kacxGnRq6F7gtyS7g6fsMVdU7e0klSRqbUYvgz7uvVcAZ/cWRJI3bSEVQVb/edxBJ0mSMehvqW4GFbjr3shVPJEkaq1Gnht4y9PpU4DXAoZWPI0kat1GnhvbOW3Vbks/3kEfHYe2pTwGHuu+SNJpRp4aeNbS4CtgKnNlLIh2ztzz/4UlHkDSFRp0a2stfHyM4BNwHvL6PQJKk8VrqCWV/FzhQVed2y69jcHzgPmBf7+kkSb1b6sri9wJPACR5MfDvgQ8CjwA7+o0mSRqHpaaGVlfVQ93rfwbsqKqPAR9L8qV+o0mSxmGpPYLVSQ6XxcuBTw69N/KzDCRJJ66l/jH/CPCpJA8CjwOfAUjyHAbTQ5KkKbfoHkFVvR34JeADwE9V1eEzh1YBb1xq40kuTnJ3kv1Jrllk3GuSVJKto0eXJK2EJad3quqzC6z72lI/1z3H4AbglcAscHuSXVW1b964M4A3AZ8bNbQkaeWM9KjKY3QBsL+q7qmqJxg84eyyBca9DfgN4P/1mEWSdBR9FsE5wIGh5dlu3dOSvADYWFX/fbENJdmWZE+SPQ888MDKJ5WkhvVZBItKsgp4J4NjEIuqqh1VtbWqtq5bt67/cJLUkD6L4CCwcWh5Q7fusDOA5wF/kuQ+4EXALg8YS9J49XktwO3AliTnMiiAy4GfPfxmVT0CrD28nORPgLdU1Z4eMy1qZmaGubk51q9fz/bt2ycVQ5LGqrciqKpDSa4GbgFWAzdW1Z1Jrgf2VNWuvj77WM3NzXHw4MGlB6oXdVrxFE9Rpx3xDCRJPer16uCq2g3snrfu2qOMvajPLDrxfe/C7006gtSkiR0sliSdGCwCSWqcRSBJjbMIJKlxFoEkNc4ikKTGnRQPl/k7b/3QimznjAcfZTXwjQcfXZFt7n3HVccfSpJ65h6BJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNOymuI1gpT51y+vd9l6QWWARDHtvy05OOIElj59SQJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxvR
ZBkouT3J1kf5JrFnj/zUn2JflKkk8k+eE+80iSjtRbESRZDdwAXAKcB1yR5Lx5w74IbK2q5wM3A9v7yiNJWlifewQXAPur6p6qegLYCVw2PKCqbq2q73aLnwU29JhHkrSAPp9ZfA5wYGh5FnjhIuNfD/yPhd5Isg3YBrBp06aVyidpwt5+5WtXZDsP3f/I4PvcN1dkm7/yuzcf9zamyQlxsDjJlcBW4B0LvV9VO6pqa1VtXbdu3XjDSSM4q4pnVXFW1aSjSMvW5x7BQWDj0PKGbt33SfIK4FeAl1TVX/WYR+rNlU8+NekI0jHrc4/gdmBLknOTnAJcDuwaHpDkfOC9wKVVdX+PWSRJR9FbEVTVIeBq4BbgLuCmqrozyfVJLu2GvQP4IeCjSb6UZNdRNidJ6kmfU0NU1W5g97x11w69fkWfny9JWtoJcbBYkjQ5FoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcRaBJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxFoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcb0WQZKLk9ydZH+SaxZ4/weS/EH3/ueSbO4zjyTpSL0VQZLVwA3AJcB5wBVJzps37PXAt6vqOcB/BH6jrzySpIX1uUdwAbC/qu6pqieAncBl88ZcBnywe30z8PIk6TGTJGmeVFU/G05eC1xcVf+iW/554IVVdfXQmK92Y2a75T/vxjw4b1vbgG3d4nOBu3sJPbAWeHDJUScu80/ONGcH809a3/l/uKrWLfTGmh4/dMVU1Q5gxzg+K8meqto6js/qg/knZ5qzg/knbZL5+5waOghsHFre0K1bcEySNcCZwLd6zCRJmqfPIrgd2JLk3CSnAJcDu+aN2QW8rnv9WuCT1ddclSRpQb1NDVXVoSRXA7cAq4Ebq+rOJNcDe6pqF/B+4MNJ9gMPMSiLSRvLFFSPzD8505wdzD9pE8vf28FiSdJ08MpiSWqcRSBJjbMIOkvdDuNEl+TGJPd312ZMlSQbk9yaZF+SO5O8adKZliPJqUk+n+TLXf5fn3SmY5FkdZIvJvnDSWdZriT3JbkjyZeS7Jl0nuVKclaSm5P8WZK7kvy9sX6+xwievh3G14BXArMMzni6oqr2TTTYMiR5MfAd4ENV9bxJ51mOJGcDZ1fVF5KcAewF/vG0/PfvroY/vaq+k+QZwJ8Cb6qqz0442rIkeTOwFXhmVb160nmWI8l9wNb5F6NOiyQfBD5TVe/rzrI8raoeHtfnu0cwMMrtME5oVfVpBmdeTZ2q+mZVfaF7/ShwF3DOZFONrga+0y0+o/uaqr+wkmwA/iHwvklnaU2SM4EXMziLkqp6YpwlABbBYecAB4aWZ5mif4hOJt0daM8HPjfZJMvTTat8Cbgf+J9VNVX5gf8EzABPTTrIMSrgj5Ps7W5JM03OBR4AfqebmntfktPHGcAi0AkjyQ8BHwP+TVX95aTzLEdVPVlVP8ngCvoLkkzN9FySVwP3V9XeSWc5Dj9VVS9gcLfjN3RTpdNiDfAC4D1VdT7wGDDW45QWwcAot8NQj7q59Y8Bv1dV/2XSeY5Vt0t/K3DxpLMsw4XApd08+07gZUl+d7KRlqeqDnbf7wc+zmC6d1rMArNDe5E3MyiGsbEIBka5HYZ60h1sfT9wV1W9c9J5livJuiRnda9/kMFJB3822VSjq6pfrqoNVbWZwf/7n6yqKycca2RJTu9OMqCbUvlpYGrOnquqOeBAkud2q14OjPVEiam4+2jfjnY7jAnHWpYkHwEuAtYmmQV+rareP9lUI7sQ+Hngjm6eHeDfVtXuCWZajrOBD3Znn60CbqqqqTsFc4o9G/h49yiTNcDvV9UfTTbSsr0R+L3uD9F7gF8Y54d7+qgkNc6pIUlqnEUgSY2zCC
SpcRaBJDXOIpCkxlkE0lEkebK7m+VXk3w0yWnHub3N03h3WJ38LALp6B6vqp/s7ub6BPAvR/mhJF6fo6liEUij+QzwnCT/KMnnupuD/a8kzwZIcl2SDye5jcFzuJ+d5OPdMwq+nOTvd9tZneS3u+cW/HF3JbI0URaBtITuL/xLgDsYPGvgRd3NwXYyuGPnYecBr6iqK4B3AZ+qqp9gcN+Yw1eqbwFuqKq/DTwMvGY8v4V0dO7CSkf3g0O3vPgMg/shPRf4g+5hOqcA9w6N31VVj3evXwZcBYM7kwKPJPkbwL1VdXibe4HN/f4K0tIsAunoHu9uLf20JO8G3llVu5JcBFw39PZjI2zzr4ZePwk4NaSJc2pIWp4z+etblL9ukXGfAP4VPP3QmjP7DiYdK4tAWp7rgI8m2Qss9nzcNwEvTXIHgymg88aQTTom3n1UkhrnHoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY37/wIC7A9FChxzAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "KAzwQUtxtSNE" | |
}, | |
"source": [ | |
"People with less than four parents or children aboard are more likely to survive than those with four or more. Again, people traveling alone are less likely to survive than those with 1-3 parents or children." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "-VSRphEPtWHL" | |
}, | |
"source": [ | |
"### Age Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 279 | |
}, | |
"id": "57imtxRFtTKI", | |
"outputId": "58dca94e-538b-49e5-e21c-ad1525893634" | |
}, | |
"source": [ | |
"#sort the ages into logical categories\n", | |
"train[\"Age\"] = train[\"Age\"].fillna(-0.5)\n", | |
"test[\"Age\"] = test[\"Age\"].fillna(-0.5)\n", | |
"bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]\n", | |
"labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']\n", | |
"train['AgeGroup'] = pd.cut(train[\"Age\"], bins, labels = labels)\n", | |
"test['AgeGroup'] = pd.cut(test[\"Age\"], bins, labels = labels)\n", | |
"\n", | |
"#draw a bar plot of Age vs. survival\n", | |
"sns.barplot(x=\"AgeGroup\", y=\"Survived\", data=train)\n", | |
"plt.show()" | |
], | |
"execution_count": 233, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "qRTwsi23tck_" | |
}, | |
"source": [ | |
"Babies are more likely to survive than any other age group. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "BG8EV4yato4Y" | |
}, | |
"source": [ | |
"### Cabin Feature\n", | |
"The idea here is that people with recorded cabin numbers are of higher socioeconomic class, and thus more likely to survive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 314 | |
}, | |
"id": "on8FHk2stdys", | |
"outputId": "866d3fe3-2015-4d79-ffb9-8904dac3be09" | |
}, | |
"source": [ | |
"train[\"CabinBool\"] = (train[\"Cabin\"].notnull().astype('int'))\n", | |
"test[\"CabinBool\"] = (test[\"Cabin\"].notnull().astype('int'))\n", | |
"\n", | |
"#calculate percentages of CabinBool vs. survived\n", | |
"print(\"Percentage of CabinBool = 1 who survived:\", train[\"Survived\"][train[\"CabinBool\"] == 1].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of CabinBool = 0 who survived:\", train[\"Survived\"][train[\"CabinBool\"] == 0].value_counts(normalize = True)[1]*100)\n", | |
"#draw a bar plot of CabinBool vs. survival\n", | |
"sns.barplot(x=\"CabinBool\", y=\"Survived\", data=train)\n", | |
"plt.show()" | |
], | |
"execution_count": 234, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of CabinBool = 1 who survived: 66.66666666666666\n", | |
"Percentage of CabinBool = 0 who survived: 29.985443959243085\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAR7klEQVR4nO3df5BdZ13H8fen6YRKBQS7UCZJmwyEHx2oQJeiA/JDioZxphX5YQoKzAAZRgIIQikjUzAMMgSB8UfARuzwQyHUMuii0Si/BSlkA6ElCcElLSSRwJYWKCC0C1//2Bu4bO9mb9qcvUme92tmJ+d5znPP+XZnu589z7n3OakqJEntOmXUBUiSRssgkKTGGQSS1DiDQJIaZxBIUuNOHXUBR+uMM86olStXjroMSTqh7Nix44aqGhu074QLgpUrVzI5OTnqMiTphJLkq/Ptc2pIkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1LgT7gNlkk5+l1xyCYcOHeLMM89k48aNoy7npGcQSDruHDp0iIMHD466jGY4NSRJjTMIJKlxBoEkNc4gkKTGdRoESdYk2ZtkKsmlA/a/JcnO3teXk3y7y3okSbfV2buGkiwBNgFPAA4A25NMVNXuw2Oq6iV9418IPLSreiRJg3V5RXA+MFVV+6rqFmALcNERxl8MvLfDeiRJA3QZBMuA/X3tA72+20hyNrAK+Mg8+9clmUwyOT09fcwLlaSWHS83i9cCV1XVjwftrKrNVTVeVeNjYwMfuSlJup26DIKDwIq+9vJe3yBrcVpIkkaiyyDYDqxOsirJUmZ/2U/MHZTkAcDdgU93WIskaR6dBUFVzQDrgW3AHuDKqtqVZEOSC/uGrgW2VFV1VYskaX6dLjpXVVuBrXP6LpvTfk2XNUiSjux4uVksSRoRg0CSGmcQSFLjDAJJapxPKJOOI1/b8OBRl3BcmLnxHsCpzNz4Vb8nwFmXXdvp8b0ikKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXGdBkGSNUn2JplKcuk8Y56WZHeSXUne02U9kqTb6uzBNEmWAJuAJwAHgO1JJqpqd9+Y1cArgUdW1U1J7tlVPZKkwbq8IjgfmKqqfVV1C7AFuGjOmOcBm6rqJoCq+maH9UiSBugyCJYB+/vaB3p9/e4H3C/Jp5JcnWTNoAMlWZdkMsnk9PR0R+VKOl6ccdpPuNcvzHDGaT8ZdSlNGPUzi08FVgOPBZYDn0jy4Kr6dv+gqtoMbAYYHx+vxS5S0uJ62bnfXniQjpkurwgOAiv62st7ff0OABNVdWtVXQd8mdlgkCQtki6DYDuwOsmqJEuBtcDEnDH/xOzVAEnOYHaqaF+HNUmS5ugsCKpqBlgPbAP2AFdW1a4kG5Jc2Bu2DfhWkt3AR4GXV9W3uqpJknRbnd4jqKqtwNY5fZf1bRfw0t6XJGkE/GSxJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1LhOgyDJmiR7k0wluXTA/mcnmU6ys/f13C7rkSTdVmcPr0+yBNgEPAE4AGxPMlFVu+cMfV9Vre+qDknSkXV5RXA+MFVV+6rqFmALcFGH55Mk3Q5dBsEyYH9f+0Cvb64nJ7kmyVVJVgw6UJJ1SSaTTE5PT3dRqyQ1a9Q3iz8IrKyqc4H/BN45aFBVba6q8aoaHxsbW9QCJelk12UQHAT6/8Jf3uv7qar6VlX9qNd8O3Beh/VIkgboMgi2A6uTrEqyFFgLTPQPSHLvvuaFwJ4O65EkDdDZu4aqaibJemAbsAS4oqp2JdkATFbVBPCiJBcCM8CNwLO7qkeSNF
hnQQBQVVuBrXP6LuvbfiXwyi5rkCQd2ahvFkuSRswgkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXuiJ8sTnIzUPPtr6q7HvOKJEmL6ohBUFV3AUjyWuDrwLuBAM8A7n2El0qSThDDTg1dWFVvraqbq+q7VfU2fNqYJJ0Uhg2C7yd5RpIlSU5J8gzg+10WJklaHMMGwdOBpwHf6H09tdcnSTrBDbUMdVVdj1NBknRSGuqKIMn9knw4yRd77XOTvKrb0iRJi2HYqaG/ZfYBMrcCVNU1zD56UpJ0ghs2CO5cVZ+d0zdzrIuRJC2+YYPghiT3offhsiRPYfZzBZKkE9ywQfAC4HLgAUkOAn8EPH+hFyVZk2Rvkqkklx5h3JOTVJLxIeuRJB0jwz68/qtVdUGS04FTqurmhV6QZAmwCXgCcADYnmSiqnbPGXcX4MXAZ46udEnSsTDsFcF1STYDvwp8b8jXnA9MVdW+qroF2MLgt6C+FngD8MMhjytJOoaGDYIHAB9idorouiR/neRRC7xmGbC/r32g1/dTSR4GrKiqfz3SgZKsSzKZZHJ6enrIkiVJwxgqCKrqB1V1ZVX9LvBQ4K7Ax+/IiZOcArwZ+OMhzr+5qsaranxsbOyOnFaSNMfQzyNI8pgkbwV2AKcxu+TEkRwEVvS1l/f6DrsL8CDgY0muZ3baacIbxpK0uIa6Wdz7Rf154Erg5VU1zIJz24HVSVYxGwBr6VufqKq+A5zRd46PAS+rqslhi5ck3XHDvmvo3Kr67tEcuKpmkqwHtgFLgCuqaleSDcBkVU0cZa2SpA4s9ISyS6pqI/C6JLd5UllVvehIr6+qrcDWOX2XzTP2sQtWK0k65ha6ItjT+9fpGkk6SS30qMoP9javrarPLUI9kqRFNuy7ht6UZE+S1yZ5UKcVSZIW1bCfI3gc8DhgGrg8ybU+j0CSTg5Df46gqg5V1V8yu9jcTmDgTV9J0oll2CeUPTDJa5JcC/wV8N/MfkBMknSCG/ZzBFcwu2jcb1XV/3ZYjyRpkS0YBL3lpK+rqr9YhHokSYtswamhqvoxsCLJ0kWoR5K0yIadGroO+FSSCeCn6wxV1Zs7qUqStGiGDYKv9L5OYXbVUEnSSWKoIKiqP+26EEnSaAy7DPVHgUGLzv3GMa9Ii+aSSy7h0KFDnHnmmWzcuHHU5UgakWGnhl7Wt30a8GRg5tiXo8V06NAhDh48uPBASSe1YaeGdszp+lSSz3ZQjyRpkQ07NXSPvuYpwDhwt04qkiQtqmGnhnbws3sEM8D1wHO6KEiStLgWekLZw4H9VbWq134Ws/cHrgd2d16dJKlzC32y+HLgFoAkjwZeD7wT+A6wudvSJEmLYaGpoSVVdWNv+/eAzVX1fuD9SXZ2W5okaTEsdEWwJMnhsHg88JG+fcMsWLcmyd4kU0kuHbD/+b2H3OxM8skk5wxfuiTpWFgoCN4LfDzJPwP/B/wXQJL7Mjs9NK/eqqWbgCcC5wAXD/hF/56qenBVPQTYCLh2kSQtsoUeXv+6JB8G7g38R1UdfufQKcALFzj2+cBUVe0DSLIFuIi+m8xV9d2+8acz4NPLkqRuLTi9U1VXD+j78hDHXgbs72sfAB4xd1CSFwAvBZYCA5esSLIOWAdw1llnDXHqIzvv5e+6w8c4GdzlhptZAnzthpv9ngA73vjMUZcgjcTQzyzuSlVtqqr7AK8AXjXPmM1VNV5V42NjY4tboCSd5LoMgoPAir728l7ffLYAv9NhPZKkAboMgu3A6iSrek83WwtM9A9Isrqv+dvA/3RYjyRpgGGXmDhqVTWTZD2wDVgCXFFVu5JsACaragJYn+QC4FbgJuBZXdUjSRqssyAAqKqtwNY5fZf1bb+4y/NLkhY28pvFkqTRMggkqXEGgSQ1ziCQpMYZBJLUuE7fNaTj20+Wnv5z/0pqk0HQsO+v/s1RlyDpOODUkCQ1ziCQpMYZBJLUOINAkhpnEE
hS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIa12kQJFmTZG+SqSSXDtj/0iS7k1yT5MNJzu6yHknSbXUWBEmWAJuAJwLnABcnOWfOsM8D41V1LnAVsLGreiRJg3V5RXA+MFVV+6rqFmALcFH/gKr6aFX9oNe8GljeYT2SpAG6DIJlwP6+9oFe33yeA/zboB1J1iWZTDI5PT19DEuUJB0XN4uT/D4wDrxx0P6q2lxV41U1PjY2trjFSdJJrssnlB0EVvS1l/f6fk6SC4A/AR5TVT/qsB5J0gBdXhFsB1YnWZVkKbAWmOgfkOShwOXAhVX1zQ5rkSTNo7MgqKoZYD2wDdgDXFlVu5JsSHJhb9gbgV8E/jHJziQT8xxOktSRTh9eX1Vbga1z+i7r276gy/NLkhZ2XNwsliSNjkEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxnQZBkjVJ9iaZSnLpgP2PTvK5JDNJntJlLZKkwToLgiRLgE3AE4FzgIuTnDNn2NeAZwPv6aoOSdKRndrhsc8HpqpqH0CSLcBFwO7DA6rq+t6+n3RYhyTpCLqcGloG7O9rH+j1HbUk65JMJpmcnp4+JsVJkmadEDeLq2pzVY1X1fjY2Nioy5Gkk0qXQXAQWNHXXt7rkyQdR7oMgu3A6iSrkiwF1gITHZ5PknQ7dBYEVTUDrAe2AXuAK6tqV5INSS4ESPLwJAeApwKXJ9nVVT2SpMG6fNcQVbUV2Dqn77K+7e3MThlJkkbkhLhZLEnqjkEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNa7TIEiyJsneJFNJLh2w/05J3tfb/5kkK7usR5J0W50FQZIlwCbgicA5wMVJzpkz7DnATVV1X+AtwBu6qkeSNFiXVwTnA1NVta+qbgG2ABfNGXMR8M7e9lXA45Okw5okSXOc2uGxlwH7+9oHgEfMN6aqZpJ8B/hl4Ib+QUnWAet6ze8l2dtJxW06gznf71blz5816hL08/zZPOzVx+Tv47Pn29FlEBwzVbUZ2DzqOk5GSSaranzUdUhz+bO5eLqcGjoIrOhrL+/1DRyT5FTgbsC3OqxJkjRHl0GwHVidZFWSpcBaYGLOmAng8PX4U4CPVFV1WJMkaY7OpoZ6c/7rgW3AEuCKqtqVZAMwWVUTwN8B704yBdzIbFhocTnlpuOVP5uLJP4BLklt85PFktQ4g0CSGmcQNGqh5T+kUUlyRZJvJvniqGtphUHQoCGX/5BG5R3AmlEX0RKDoE3DLP8hjURVfYLZdxFqkRgEbRq0/MeyEdUiacQMAklqnEHQpmGW/5DUCIOgTcMs/yGpEQZBg6pqBji8/Mce4Mqq2jXaqqRZSd4LfBq4f5IDSZ4z6ppOdi4xIUmN84pAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoGakuTMJFuSfCXJjiRbk9xvnrEr51sBM8nbF1qoL8lrkhxMsjPJl5K8Lcnt+n8uyTuSPOX2vFZaiEGgZiQJ8AHgY1V1n6o6D3glcK+jPVZVPbeqdg8x9C1V9RBmV3l9MPCYoz2X1DWDQC15HHBrVf3N4Y6q+gLw+SQfTvK5JNcm6V+J9dQk/5BkT5KrktwZIMnHkoz3tr+X5HVJvpDk6iSDgmUpcBpwU+81D+mNvSbJB5Lc/Uj9UpcMArXkQcCOAf0/BJ5UVQ9jNize1Lt6ALg/8NaqeiDwXeAPB7z+dODqqvoV4BPA8/r2vSTJTuDrwJeramev/13AK6rqXOBa4NUL9EudMQgkCPBnSa4BPsTsktyH/6rfX1Wf6m3/PfCoAa+/BfiX3vYOYGXfvsNTQ/cETk+yNs
ndgF+qqo/3xrwTePR8/Xf4v05agEGgluwCzhvQ/wxgDDiv90v7G8xO4wDMXYNl0Jost9bP1mr5MXDq3AFVdSvw7/iLXcchg0At+QhwpyTrDnckORc4G/hmVd2a5HG99mFnJfm13vbTgU/enhP3ppoeCXylqr4D3JTk13u7/wD4+Hz9t+d80tEwCNSM3l/tTwIu6L19dBfwemArMJ7kWuCZwJf6XrYXeEGSPcDdgbcd5WkP3yP4IrAEeGuv/1nAG3vTUQ8BNizQL3XG1UclqXFeEUhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1Lj/ByBiuM6Ym8+rAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "j-TF9J2_t4py" | |
}, | |
"source": [ | |
"People with a recorded Cabin number are, in fact, more likely to survive. (66.6% vs 29.9%)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jYP7Ocq1t8fM" | |
}, | |
"source": [ | |
"# 5. Cleaning Data\n", | |
"Time to clean our data to account for missing values and unnecessary information!" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mQwCjZNJt-SK" | |
}, | |
"source": [ | |
"### Looking at the Test Data\n", | |
"Let's see how our test data looks!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 383 | |
}, | |
"id": "a7aR8zMtuBmA", | |
"outputId": "fcf82c0f-992b-429c-9f1b-e97384558f5a" | |
}, | |
"source": [ | |
"test.describe(include=\"all\")" | |
], | |
"execution_count": 235, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>418.000000</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418</td>\n", | |
" <td>418</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418</td>\n", | |
" <td>417.000000</td>\n", | |
" <td>91</td>\n", | |
" <td>418</td>\n", | |
" <td>418</td>\n", | |
" <td>418.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>418</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>363</td>\n", | |
" <td>NaN</td>\n", | |
" <td>76</td>\n", | |
" <td>3</td>\n", | |
" <td>8</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Dulles, Mr. William Crothers</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>PC 17608</td>\n", | |
" <td>NaN</td>\n", | |
" <td>B57 B59 B63 B66</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>266</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>5</td>\n", | |
" <td>NaN</td>\n", | |
" <td>3</td>\n", | |
" <td>270</td>\n", | |
" <td>96</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>1100.500000</td>\n", | |
" <td>2.265550</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>23.941388</td>\n", | |
" <td>0.447368</td>\n", | |
" <td>0.392344</td>\n", | |
" <td>NaN</td>\n", | |
" <td>35.627188</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.217703</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>120.810458</td>\n", | |
" <td>0.841838</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>17.741080</td>\n", | |
" <td>0.896760</td>\n", | |
" <td>0.981429</td>\n", | |
" <td>NaN</td>\n", | |
" <td>55.907576</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.413179</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>892.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>-0.500000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>996.250000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>9.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7.895800</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>1100.500000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>24.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.454200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>1204.750000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>35.750000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>31.500000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>1309.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>76.000000</td>\n", | |
" <td>8.000000</td>\n", | |
" <td>9.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>512.329200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Pclass ... AgeGroup CabinBool\n", | |
"count 418.000000 418.000000 ... 418 418.000000\n", | |
"unique NaN NaN ... 8 NaN\n", | |
"top NaN NaN ... Young Adult NaN\n", | |
"freq NaN NaN ... 96 NaN\n", | |
"mean 1100.500000 2.265550 ... NaN 0.217703\n", | |
"std 120.810458 0.841838 ... NaN 0.413179\n", | |
"min 892.000000 1.000000 ... NaN 0.000000\n", | |
"25% 996.250000 1.000000 ... NaN 0.000000\n", | |
"50% 1100.500000 3.000000 ... NaN 0.000000\n", | |
"75% 1204.750000 3.000000 ... NaN 0.000000\n", | |
"max 1309.000000 3.000000 ... NaN 1.000000\n", | |
"\n", | |
"[11 rows x 13 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 235 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XFXtv9iHuFxn" | |
}, | |
"source": [ | |
"* We have a total of 418 passengers.\n", | |
"* 1 value from the Fare feature is missing.\n", | |
"* Around 20.5% of the Age values are missing (they show up above as the placeholder -0.5, which is why the count still reads 418); we will need to fill those in." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "t1PB5592uJci" | |
}, | |
"source": [ | |
"### Cabin Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "CJUuS8SjuGnm" | |
}, | |
"source": [ | |
"#we'll start off by dropping the Cabin feature since not a lot more useful information can be extracted from it.\n", | |
"train = train.drop('Cabin', axis = 1)\n", | |
"test = test.drop('Cabin', axis = 1)" | |
], | |
"execution_count": 236, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mjAfZJJyuNK_" | |
}, | |
"source": [ | |
"### Ticket Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "mEo0tUsruQJp" | |
}, | |
"source": [ | |
"#we can also drop the Ticket feature since it's unlikely to yield any useful information\n", | |
"train = train.drop(['Ticket'], axis = 1)\n", | |
"test = test.drop(['Ticket'], axis = 1)" | |
], | |
"execution_count": 237, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "pEUYnq_uuWAc" | |
}, | |
"source": [ | |
"### Embarked Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "ZM9tNtZBuYbX", | |
"outputId": "d4f71708-d40b-485d-b950-0919e5d41990" | |
}, | |
"source": [ | |
"#now we need to fill in the missing values in the Embarked feature\n", | |
"print(\"Number of people embarking in Southampton (S):\")\n", | |
"southampton = train[train[\"Embarked\"] == \"S\"].shape[0]\n", | |
"print(southampton)\n", | |
"\n", | |
"print(\"Number of people embarking in Cherbourg (C):\")\n", | |
"cherbourg = train[train[\"Embarked\"] == \"C\"].shape[0]\n", | |
"print(cherbourg)\n", | |
"\n", | |
"print(\"Number of people embarking in Queenstown (Q):\")\n", | |
"queenstown = train[train[\"Embarked\"] == \"Q\"].shape[0]\n", | |
"print(queenstown)" | |
], | |
"execution_count": 238, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Number of people embarking in Southampton (S):\n", | |
"644\n", | |
"Number of people embarking in Cherbourg (C):\n", | |
"168\n", | |
"Number of people embarking in Queenstown (Q):\n", | |
"77\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZEIg2AAducwE" | |
}, | |
"source": [ | |
"It's clear that the majority of people embarked in Southampton (S). Let's go ahead and fill in the missing values with S." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ImxMUmtrudjW" | |
}, | |
"source": [ | |
"#replacing the missing values in the Embarked feature with S\n", | |
"train = train.fillna({\"Embarked\": \"S\"})" | |
], | |
"execution_count": 239, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_NNHRS12uhA3" | |
}, | |
"source": [ | |
"### Age Feature" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Bvns7Temulfq" | |
}, | |
"source": [ | |
"Next we'll fill in the missing values in the Age feature. Since a higher percentage of values are missing, it would be illogical to fill all of them with the same value (as we did with Embarked). Instead, let's try to find a way to predict the missing ages. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 597 | |
}, | |
"id": "s4LCIWwwumEN", | |
"outputId": "8f79c1a0-b836-45ce-c46f-3dccbd8c3557" | |
}, | |
"source": [ | |
"#create a combined group of both datasets\n", | |
"combine = [train, test]\n", | |
"\n", | |
"#extract a title for each Name in the train and test datasets\n", | |
"for dataset in combine:\n", | |
" dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\\.', expand=False)\n", | |
"\n", | |
"pd.crosstab(train['Title'], train['Sex'])" | |
], | |
"execution_count": 240, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>Sex</th>\n", | |
" <th>female</th>\n", | |
" <th>male</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Title</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>Capt</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Col</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Countess</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Don</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Dr</th>\n", | |
" <td>1</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Jonkheer</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Lady</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Major</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Master</th>\n", | |
" <td>0</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Miss</th>\n", | |
" <td>182</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mlle</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mme</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mr</th>\n", | |
" <td>0</td>\n", | |
" <td>517</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mrs</th>\n", | |
" <td>125</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Ms</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Rev</th>\n", | |
" <td>0</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Sir</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"Sex female male\n", | |
"Title \n", | |
"Capt 0 1\n", | |
"Col 0 2\n", | |
"Countess 1 0\n", | |
"Don 0 1\n", | |
"Dr 1 6\n", | |
"Jonkheer 0 1\n", | |
"Lady 1 0\n", | |
"Major 0 2\n", | |
"Master 0 40\n", | |
"Miss 182 0\n", | |
"Mlle 2 0\n", | |
"Mme 1 0\n", | |
"Mr 0 517\n", | |
"Mrs 125 0\n", | |
"Ms 1 0\n", | |
"Rev 0 6\n", | |
"Sir 0 1" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 240 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 231 | |
}, | |
"id": "p68zx8fCuttk", | |
"outputId": "c7ec7902-0690-44b9-b633-aa8a0489b4f4" | |
}, | |
"source": [ | |
"#replace various titles with more common names\n", | |
"for dataset in combine:\n", | |
" dataset['Title'] = dataset['Title'].replace(['Lady', 'Capt', 'Col',\n", | |
" 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')\n", | |
" \n", | |
" dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')\n", | |
" dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')\n", | |
" dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')\n", | |
" dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')\n", | |
"\n", | |
"train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()" | |
], | |
"execution_count": 241, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Title</th>\n", | |
" <th>Survived</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Master</td>\n", | |
" <td>0.575000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Miss</td>\n", | |
" <td>0.702703</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Mr</td>\n", | |
" <td>0.156673</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Mrs</td>\n", | |
" <td>0.793651</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Rare</td>\n", | |
" <td>0.285714</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Royal</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Title Survived\n", | |
"0 Master 0.575000\n", | |
"1 Miss 0.702703\n", | |
"2 Mr 0.156673\n", | |
"3 Mrs 0.793651\n", | |
"4 Rare 0.285714\n", | |
"5 Royal 1.000000" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 241 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "cYcvHx0fuxHw", | |
"outputId": "6ad77d05-5b98-4922-b725-42a3845479a5" | |
}, | |
"source": [ | |
"#map each of the title groups to a numerical value\n", | |
"title_mapping = {\"Mr\": 1, \"Miss\": 2, \"Mrs\": 3, \"Master\": 4, \"Royal\": 5, \"Rare\": 6}\n", | |
"for dataset in combine:\n", | |
" dataset['Title'] = dataset['Title'].map(title_mapping)\n", | |
" dataset['Title'] = dataset['Title'].fillna(0)\n", | |
"\n", | |
"train.head()" | |
], | |
"execution_count": 242, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Braund, Mr. Owen Harris</td>\n", | |
" <td>male</td>\n", | |
" <td>22.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>S</td>\n", | |
" <td>Student</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", | |
" <td>female</td>\n", | |
" <td>38.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>C</td>\n", | |
" <td>Adult</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Heikkinen, Miss. Laina</td>\n", | |
" <td>female</td>\n", | |
" <td>26.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", | |
" <td>female</td>\n", | |
" <td>35.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Allen, Mr. William Henry</td>\n", | |
" <td>male</td>\n", | |
" <td>35.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... AgeGroup CabinBool Title\n", | |
"0 1 0 3 ... Student 0 1\n", | |
"1 2 1 1 ... Adult 1 3\n", | |
"2 3 1 3 ... Young Adult 0 2\n", | |
"3 4 1 1 ... Young Adult 1 3\n", | |
"4 5 0 3 ... Young Adult 0 1\n", | |
"\n", | |
"[5 rows x 13 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 242 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "vLO6pv2Su0rP" | |
}, | |
"source": [ | |
"Next, we'll fill in the missing AgeGroup values using the most common age group among passengers with the same Title." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "r0c4pC2Ru2Mz" | |
}, | |
"source": [ | |
"# fill missing age with mode age group for each title\n", | |
"mr_age = train[train[\"Title\"] == 1][\"AgeGroup\"].mode() #Young Adult\n", | |
"miss_age = train[train[\"Title\"] == 2][\"AgeGroup\"].mode() #Student\n", | |
"mrs_age = train[train[\"Title\"] == 3][\"AgeGroup\"].mode() #Adult\n", | |
"master_age = train[train[\"Title\"] == 4][\"AgeGroup\"].mode() #Baby\n", | |
"royal_age = train[train[\"Title\"] == 5][\"AgeGroup\"].mode() #Adult\n", | |
"rare_age = train[train[\"Title\"] == 6][\"AgeGroup\"].mode() #Adult\n", | |
"\n", | |
"age_title_mapping = {1: \"Young Adult\", 2: \"Student\", 3: \"Adult\", 4: \"Baby\", 5: \"Adult\", 6: \"Adult\"}\n", | |
"\n", | |
"#I tried to get this code to work with using .map(), but couldn't.\n", | |
"#I've put down a less elegant, temporary solution for now.\n", | |
"#train = train.fillna({\"Age\": train[\"Title\"].map(age_title_mapping)})\n", | |
"#test = test.fillna({\"Age\": test[\"Title\"].map(age_title_mapping)})\n", | |
"\n", | |
"for x in range(len(train[\"AgeGroup\"])):\n", | |
" if train[\"AgeGroup\"][x] == \"Unknown\":\n", | |
" train[\"AgeGroup\"][x] = age_title_mapping[train[\"Title\"][x]]\n", | |
" \n", | |
"for x in range(len(test[\"AgeGroup\"])):\n", | |
" if test[\"AgeGroup\"][x] == \"Unknown\":\n", | |
" test[\"AgeGroup\"][x] = age_title_mapping[test[\"Title\"][x]]" | |
], | |
"execution_count": 243, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5HWgQU0su72S" | |
}, | |
"source": [ | |
"Now that we've filled in the missing values at least *somewhat* accurately, it's time to map each age group to a numerical value." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tLfLTbA7u-v3" | |
}, | |
"source": [ | |
"#map each Age value to a numerical value\n", | |
"age_mapping = {'Baby': 1, 'Child': 2, 'Teenager': 3, 'Student': 4, 'Young Adult': 5, 'Adult': 6, 'Senior': 7}\n", | |
"train['AgeGroup'] = train['AgeGroup'].map(age_mapping)\n", | |
"test['AgeGroup'] = test['AgeGroup'].map(age_mapping)\n", | |
"\n", | |
"train.head()\n", | |
"\n", | |
"#dropping the Age feature for now, might change\n", | |
"train = train.drop(['Age'], axis = 1)\n", | |
"test = test.drop(['Age'], axis = 1)" | |
], | |
"execution_count": 244, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9FxV8e_hvEPQ" | |
}, | |
"source": [ | |
"### Name Feature\n", | |
"We can drop the name feature now that we've extracted the titles." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Cff86r6hvFIR" | |
}, | |
"source": [ | |
"#drop the name feature since it contains no more useful information.\n", | |
"train = train.drop(['Name'], axis = 1)\n", | |
"test = test.drop(['Name'], axis = 1)" | |
], | |
"execution_count": 245, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "uu2fON76vG_C" | |
}, | |
"source": [ | |
"### Sex Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "LVaGaHHBvMHG", | |
"outputId": "86f04c99-bbf3-4254-969f-ef984e4871b0" | |
}, | |
"source": [ | |
"#map each Sex value to a numerical value\n", | |
"sex_mapping = {\"male\": 0, \"female\": 1}\n", | |
"train['Sex'] = train['Sex'].map(sex_mapping)\n", | |
"test['Sex'] = test['Sex'].map(sex_mapping)\n", | |
"\n", | |
"train.head()" | |
], | |
"execution_count": 246, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>S</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>C</td>\n", | |
" <td>6.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>S</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>S</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>S</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass Sex ... Embarked AgeGroup CabinBool Title\n", | |
"0 1 0 3 0 ... S 4.0 0 1\n", | |
"1 2 1 1 1 ... C 6.0 1 3\n", | |
"2 3 1 3 1 ... S 5.0 0 2\n", | |
"3 4 1 1 1 ... S 5.0 1 3\n", | |
"4 5 0 3 0 ... S 5.0 0 1\n", | |
"\n", | |
"[5 rows x 11 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 246 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "h0j_2wK6vPT2" | |
}, | |
"source": [ | |
"### Embarked Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "rQcyd1dUvRbO", | |
"outputId": "3768bd4f-b41c-4402-f8e3-5b9ed0fcdf2e" | |
}, | |
"source": [ | |
"#map each Embarked value to a numerical value\n", | |
"embarked_mapping = {\"S\": 1, \"C\": 2, \"Q\": 3}\n", | |
"train['Embarked'] = train['Embarked'].map(embarked_mapping)\n", | |
"test['Embarked'] = test['Embarked'].map(embarked_mapping)\n", | |
"\n", | |
"train.head()" | |
], | |
"execution_count": 247, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>1</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>2</td>\n", | |
" <td>6.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass Sex ... Embarked AgeGroup CabinBool Title\n", | |
"0 1 0 3 0 ... 1 4.0 0 1\n", | |
"1 2 1 1 1 ... 2 6.0 1 3\n", | |
"2 3 1 3 1 ... 1 5.0 0 2\n", | |
"3 4 1 1 1 ... 1 5.0 1 3\n", | |
"4 5 0 3 0 ... 1 5.0 0 1\n", | |
"\n", | |
"[5 rows x 11 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 247 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XC1zmKehvVKW" | |
}, | |
"source": [ | |
"### Fare Feature\n", | |
"It's time to separate the fare values into some logical groups, as well as fill in the single missing value in the test dataset." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "koA1FxUOvWAA" | |
}, | |
"source": [ | |
"#fill in missing Fare value in test set based on mean fare for that Pclass \n", | |
"for x in range(len(test[\"Fare\"])):\n", | |
" if pd.isnull(test[\"Fare\"][x]):\n", | |
" pclass = test[\"Pclass\"][x] #Pclass = 3\n", | |
" test[\"Fare\"][x] = round(train[train[\"Pclass\"] == pclass][\"Fare\"].mean(), 4)\n", | |
" \n", | |
"#map Fare values into groups of numerical values\n", | |
"train['FareBand'] = pd.qcut(train['Fare'], 4, labels = [1, 2, 3, 4])\n", | |
"test['FareBand'] = pd.qcut(test['Fare'], 4, labels = [1, 2, 3, 4])\n", | |
"\n", | |
"#drop Fare values\n", | |
"train = train.drop(['Fare'], axis = 1)\n", | |
"test = test.drop(['Fare'], axis = 1)" | |
], | |
"execution_count": 248, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "luA66cftvcZY", | |
"outputId": "5196158f-51c8-48b3-88a4-1ea20b4db589" | |
}, | |
"source": [ | |
"#check train data\n", | |
"train.head()" | |
], | |
"execution_count": 249, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" <th>FareBand</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass Sex ... AgeGroup CabinBool Title FareBand\n", | |
"0 1 0 3 0 ... 4.0 0 1 1\n", | |
"1 2 1 1 1 ... 6.0 1 3 4\n", | |
"2 3 1 3 1 ... 5.0 0 2 2\n", | |
"3 4 1 1 1 ... 5.0 1 3 4\n", | |
"4 5 0 3 0 ... 5.0 0 1 2\n", | |
"\n", | |
"[5 rows x 11 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 249 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "CNOa7nIgvftl", | |
"outputId": "ca1ff93b-3a99-420e-b3f8-4af3fbc19622" | |
}, | |
"source": [ | |
"# preview the first five rows of the test frame after the Fare column was dropped\n", | |
"test.head(n=5)" | |
], | |
"execution_count": 250, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" <th>FareBand</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>892</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>893</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>6.0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>894</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>7.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>895</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>896</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Pclass Sex SibSp ... AgeGroup CabinBool Title FareBand\n", | |
"0 892 3 0 0 ... 5.0 0 1 1\n", | |
"1 893 3 1 1 ... 6.0 0 3 1\n", | |
"2 894 2 0 0 ... 7.0 0 1 2\n", | |
"3 895 3 0 0 ... 5.0 0 1 2\n", | |
"4 896 3 1 1 ... 4.0 0 3 2\n", | |
"\n", | |
"[5 rows x 10 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 250 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "a25aZaBfvX8b" | |
}, | |
"source": [ | |
"# 6. Choosing the Best Model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "sEE0wHtZvnbF" | |
}, | |
"source": [ | |
"### Splitting the Training Data\n", | |
"We will use part of our training data (22% in this case) to test the accuracy of our different models." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Acq6b-covlK9" | |
}, | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"# features: every column except the label and the row identifier\n", | |
"predictors = train.drop(columns=['Survived', 'PassengerId'])\n", | |
"target = train['Survived']\n", | |
"\n", | |
"# hold out 22% of the labelled rows for validation; the fixed seed keeps the split repeatable\n", | |
"x_train, x_val, y_train, y_val = train_test_split(\n", | |
"    predictors, target, test_size=0.22, random_state=0\n", | |
")" | |
], | |
"execution_count": 251, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "GXgAOb8fvs1t" | |
}, | |
"source": [ | |
"### Testing Different Models\n", | |
"I will be testing the following models with my training data:\n", | |
"* Gaussian Naive Bayes\n", | |
"* Logistic Regression\n", | |
"* Support Vector Machines\n", | |
"* Linear SVC\n", | |
"* Perceptron\n", | |
"* Decision Tree Classifier\n", | |
"* Random Forest Classifier\n", | |
"* KNN or k-Nearest Neighbors\n", | |
"* Stochastic Gradient Descent\n", | |
"* Gradient Boosting Classifier\n", | |
"\n", | |
"For each model, we set the model, fit it with 78% of our training data, predict for the remaining 22% of the training data and check the accuracy." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "vBDzceBjvvMh", | |
"outputId": "ccbd2053-adcc-48d0-9aa0-639bf7e2537a" | |
}, | |
"source": [ | |
"# Gaussian Naive Bayes: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.naive_bayes import GaussianNB\n", | |
"from sklearn.metrics import accuracy_score\n", | |
"\n", | |
"gaussian = GaussianNB()\n", | |
"gaussian.fit(x_train, y_train)\n", | |
"y_pred = gaussian.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_gaussian = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_gaussian)" | |
], | |
"execution_count": 252, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.68\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xiZSNXByvzCR", | |
"outputId": "c5cc1b94-8673-4cc1-b264-f7452e1e4a57" | |
}, | |
"source": [ | |
"# Logistic Regression: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"\n", | |
"logreg = LogisticRegression()\n", | |
"logreg.fit(x_train, y_train)\n", | |
"y_pred = logreg.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_logreg = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_logreg)" | |
], | |
"execution_count": 253, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"79.7\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "PaK_xdQTv19k", | |
"outputId": "4fbeebc0-3242-4768-c735-5da527c0df24" | |
}, | |
"source": [ | |
"# Support Vector Machines: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.svm import SVC\n", | |
"\n", | |
"svc = SVC()\n", | |
"svc.fit(x_train, y_train)\n", | |
"y_pred = svc.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_svc = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_svc)" | |
], | |
"execution_count": 254, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"82.74\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "nE53_qvGv5JP", | |
"outputId": "a6cbf7bd-bf1b-4a65-ecb3-8659a70e596d" | |
}, | |
"source": [ | |
"# Linear SVC: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.svm import LinearSVC\n", | |
"\n", | |
"linear_svc = LinearSVC()\n", | |
"linear_svc.fit(x_train, y_train)\n", | |
"y_pred = linear_svc.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_linear_svc = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_linear_svc)" | |
], | |
"execution_count": 255, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.17\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "4Znyh4arv70I", | |
"outputId": "d22c55b5-d609-4c6d-9a25-bac3e2c007f1" | |
}, | |
"source": [ | |
"# Perceptron: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.linear_model import Perceptron\n", | |
"\n", | |
"perceptron = Perceptron()\n", | |
"perceptron.fit(x_train, y_train)\n", | |
"y_pred = perceptron.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_perceptron = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_perceptron)" | |
], | |
"execution_count": 256, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.68\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "2GNgsEfEv95_", | |
"outputId": "16b55078-e9c2-47a7-886e-711c86f8f953" | |
}, | |
"source": [ | |
"# Decision Tree: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.tree import DecisionTreeClassifier\n", | |
"\n", | |
"# NOTE(review): no random_state is passed, so the score may differ between runs\n", | |
"decisiontree = DecisionTreeClassifier()\n", | |
"decisiontree.fit(x_train, y_train)\n", | |
"y_pred = decisiontree.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_decisiontree = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_decisiontree)" | |
], | |
"execution_count": 257, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"80.71\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "-bPUuU4vv_8X", | |
"outputId": "9c3dadf5-5b56-4e17-bfcc-94045de03a0a" | |
}, | |
"source": [ | |
"# Random Forest: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"\n", | |
"# NOTE(review): no random_state is passed, so the score may differ between runs\n", | |
"randomforest = RandomForestClassifier()\n", | |
"randomforest.fit(x_train, y_train)\n", | |
"y_pred = randomforest.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_randomforest = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_randomforest)" | |
], | |
"execution_count": 258, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"83.76\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "yJoxSZzzwDiL", | |
"outputId": "45a231c9-405b-46e7-a897-20196e0d0385" | |
}, | |
"source": [ | |
"# KNN or k-Nearest Neighbors: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.neighbors import KNeighborsClassifier\n", | |
"\n", | |
"knn = KNeighborsClassifier()\n", | |
"knn.fit(x_train, y_train)\n", | |
"y_pred = knn.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_knn = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_knn)" | |
], | |
"execution_count": 259, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"77.66\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "vSQjtKo3wFjz", | |
"outputId": "9094e4e3-dd0e-425d-e7f3-54b26138ac45" | |
}, | |
"source": [ | |
"# Stochastic Gradient Descent: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.linear_model import SGDClassifier\n", | |
"\n", | |
"# NOTE(review): no random_state is passed, so the score may differ between runs\n", | |
"sgd = SGDClassifier()\n", | |
"sgd.fit(x_train, y_train)\n", | |
"y_pred = sgd.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_sgd = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_sgd)" | |
], | |
"execution_count": 260, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.17\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "2bhbILIUwHYB", | |
"outputId": "75f7bbb4-d6af-46b2-ec7c-4a51809ca244" | |
}, | |
"source": [ | |
"# Gradient Boosting Classifier: fit on the training split, score on the held-out validation split\n", | |
"from sklearn.ensemble import GradientBoostingClassifier\n", | |
"\n", | |
"# NOTE(review): no random_state is passed, so the score may differ between runs\n", | |
"gbk = GradientBoostingClassifier()\n", | |
"gbk.fit(x_train, y_train)\n", | |
"y_pred = gbk.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_gbk = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_gbk)" | |
], | |
"execution_count": 261, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"84.26\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 353 | |
}, | |
"id": "7c_kEs6EwJeO", | |
"outputId": "7f436b0c-24d1-44e3-f672-4e4f433930b1" | |
}, | |
"source": [ | |
"# one (model name, validation accuracy in %) record per classifier;\n", | |
"# the list order fixes the row index shown in the ranking below\n", | |
"model_scores = [\n", | |
"    ('Support Vector Machines', acc_svc),\n", | |
"    ('KNN', acc_knn),\n", | |
"    ('Logistic Regression', acc_logreg),\n", | |
"    ('Random Forest', acc_randomforest),\n", | |
"    ('Naive Bayes', acc_gaussian),\n", | |
"    ('Perceptron', acc_perceptron),\n", | |
"    ('Linear SVC', acc_linear_svc),\n", | |
"    ('Decision Tree', acc_decisiontree),\n", | |
"    ('Stochastic Gradient Descent', acc_sgd),\n", | |
"    ('Gradient Boosting Classifier', acc_gbk),\n", | |
"]\n", | |
"models = pd.DataFrame(model_scores, columns=['Model', 'Score'])\n", | |
"\n", | |
"# rank the classifiers from best to worst validation accuracy\n", | |
"models.sort_values(by='Score', ascending=False)" | |
], | |
"execution_count": 262, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Model</th>\n", | |
" <th>Score</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>Gradient Boosting Classifier</td>\n", | |
" <td>84.26</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Random Forest</td>\n", | |
" <td>83.76</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Support Vector Machines</td>\n", | |
" <td>82.74</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>Decision Tree</td>\n", | |
" <td>80.71</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Logistic Regression</td>\n", | |
" <td>79.70</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Naive Bayes</td>\n", | |
" <td>78.68</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Perceptron</td>\n", | |
" <td>78.68</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>Linear SVC</td>\n", | |
" <td>78.17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>Stochastic Gradient Descent</td>\n", | |
" <td>78.17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>KNN</td>\n", | |
" <td>77.66</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Model Score\n", | |
"9 Gradient Boosting Classifier 84.26\n", | |
"3 Random Forest 83.76\n", | |
"0 Support Vector Machines 82.74\n", | |
"7 Decision Tree 80.71\n", | |
"2 Logistic Regression 79.70\n", | |
"4 Naive Bayes 78.68\n", | |
"5 Perceptron 78.68\n", | |
"6 Linear SVC 78.17\n", | |
"8 Stochastic Gradient Descent 78.17\n", | |
"1 KNN 77.66" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 262 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "gAlM6IzawNFj" | |
}, | |
"source": [ | |
"We will use all these models for the COBRA Classifier. The first 4 can be combined as basic machines, and all of them will be in the advanced machine list." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YR8ExkzuzSGV", | |
"outputId": "3d11b027-c396-4c2d-832a-0b77e4fbf89e" | |
}, | |
"source": [ | |
"import sys\n", | |
"\n", | |
"# folder that provides the local classifiercobra module\n", | |
"# NOTE(review): hardcoded path under /content/drive - presumably requires the Drive mount; confirm\n", | |
"cobra_dir = '/content/drive/MyDrive/Titanic Survival Prediction/'\n", | |
"if cobra_dir not in sys.path:  # avoid stacking duplicate entries when the cell is re-run\n", | |
"    sys.path.append(cobra_dir)\n", | |
"import classifiercobra\n", | |
"\n", | |
"# COBRA classifier with the 'advanced' machine list: fit on the training split, score on validation\n", | |
"cobra = classifiercobra.ClassifierCobra(machine_list='advanced')\n", | |
"cobra.fit(x_train, y_train)\n", | |
"y_pred = cobra.predict(x_val)\n", | |
"# accuracy_score takes (y_true, y_pred): ground truth first, predictions second\n", | |
"acc_cobra = round(accuracy_score(y_val, y_pred) * 100, 2)  # percentage, 2 decimals\n", | |
"print(acc_cobra)" | |
], | |
"execution_count": 263, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"83.76\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "l4_2JNIrwYoS" | |
}, | |
"source": [ | |
"# keep the passenger ids aside; the model only receives the remaining feature columns\n", | |
"ids = test['PassengerId']\n", | |
"test_features = test.drop(columns=['PassengerId'])\n", | |
"predictions = cobra.predict(test_features)\n", | |
"\n", | |
"# assemble the two-column submission table and write it to submission.csv without the index\n", | |
"output = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})\n", | |
"output.to_csv('submission.csv', index=False)" | |
], | |
"execution_count": 264, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment