Created
November 8, 2021 09:17
-
-
Save Imperial-lord/757b1d870f00253afed056d7324cb1d9 to your computer and use it in GitHub Desktop.
Titanic Survival Predictions.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Titanic Survival Predictions.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"mount_file_id": "1Z6ahIXr78kIpfQKPcPAnYYxCo1FsVC2c", | |
"authorship_tag": "ABX9TyPEajGvsiUrPLJ9aB3C71+M", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/Imperial-lord/757b1d870f00253afed056d7324cb1d9/titanic-survival-predictions.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "X6qIb0mUqGy1" | |
}, | |
"source": [ | |
"# 1. Import Necessary Libraries\n", | |
"First off, we need to import several Python libraries such as numpy, pandas, matplotlib and seaborn." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ck1RRP6ipzyO" | |
}, | |
"source": [ | |
"#data analysis libraries \n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"#visualization libraries\n", | |
"import matplotlib.pyplot as plt\n", | |
"import seaborn as sns\n", | |
"%matplotlib inline\n", | |
"\n", | |
"#ignore warnings\n", | |
"import warnings\n", | |
"warnings.filterwarnings('ignore')" | |
], | |
"execution_count": 223, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "IeaiCBdpqSSf" | |
}, | |
"source": [ | |
"# 2. Read in and Explore the Data\n", | |
"It's time to read in our training and testing data using pd.read_csv, and take a first look at the training data using the describe() function." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 383 | |
}, | |
"id": "KdH6bQE9qbB0", | |
"outputId": "767962bf-2dc7-47dc-ca6b-528fbcb42fe3" | |
}, | |
"source": [ | |
"#import train and test CSV files\n", | |
"train = pd.read_csv(\"/content/drive/MyDrive/Titanic Survival Prediction/input/train.csv\")\n", | |
"test = pd.read_csv(\"/content/drive/MyDrive/Titanic Survival Prediction/input/test.csv\")\n", | |
"\n", | |
"#take a look at the training data\n", | |
"train.describe(include=\"all\")" | |
], | |
"execution_count": 224, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891</td>\n", | |
" <td>714.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>204</td>\n", | |
" <td>889</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>891</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>681</td>\n", | |
" <td>NaN</td>\n", | |
" <td>147</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Woolner, Mr. Hugh</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>347082</td>\n", | |
" <td>NaN</td>\n", | |
" <td>C23 C25 C27</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>577</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7</td>\n", | |
" <td>NaN</td>\n", | |
" <td>4</td>\n", | |
" <td>644</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.383838</td>\n", | |
" <td>2.308642</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>29.699118</td>\n", | |
" <td>0.523008</td>\n", | |
" <td>0.381594</td>\n", | |
" <td>NaN</td>\n", | |
" <td>32.204208</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>257.353842</td>\n", | |
" <td>0.486592</td>\n", | |
" <td>0.836071</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.526497</td>\n", | |
" <td>1.102743</td>\n", | |
" <td>0.806057</td>\n", | |
" <td>NaN</td>\n", | |
" <td>49.693429</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.420000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>223.500000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>2.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20.125000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7.910400</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>28.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.454200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>668.500000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>38.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>31.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>80.000000</td>\n", | |
" <td>8.000000</td>\n", | |
" <td>6.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>512.329200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... Fare Cabin Embarked\n", | |
"count 891.000000 891.000000 891.000000 ... 891.000000 204 889\n", | |
"unique NaN NaN NaN ... NaN 147 3\n", | |
"top NaN NaN NaN ... NaN C23 C25 C27 S\n", | |
"freq NaN NaN NaN ... NaN 4 644\n", | |
"mean 446.000000 0.383838 2.308642 ... 32.204208 NaN NaN\n", | |
"std 257.353842 0.486592 0.836071 ... 49.693429 NaN NaN\n", | |
"min 1.000000 0.000000 1.000000 ... 0.000000 NaN NaN\n", | |
"25% 223.500000 0.000000 2.000000 ... 7.910400 NaN NaN\n", | |
"50% 446.000000 0.000000 3.000000 ... 14.454200 NaN NaN\n", | |
"75% 668.500000 1.000000 3.000000 ... 31.000000 NaN NaN\n", | |
"max 891.000000 1.000000 3.000000 ... 512.329200 NaN NaN\n", | |
"\n", | |
"[11 rows x 12 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 224 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "oqnvKmqHru0o" | |
}, | |
"source": [ | |
"# 3. Data Analysis\n", | |
"We're going to consider the features in the dataset and how complete they are." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "0TKeJh54ryD3", | |
"outputId": "e4db021f-c781-46f8-a861-c816756608b9" | |
}, | |
"source": [ | |
"#get a list of the features within the dataset\n", | |
"print(train.columns)" | |
], | |
"execution_count": 225, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',\n", | |
" 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],\n", | |
" dtype='object')\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "XE0hvrRtr2Tk", | |
"outputId": "d3054289-dda4-4c2f-89a6-f8baa6f58777" | |
}, | |
"source": [ | |
"#see a sample of the dataset to get an idea of the variables\n", | |
"train.sample(5)" | |
], | |
"execution_count": 226, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>328</th>\n", | |
" <td>329</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Goldsmith, Mrs. Frank John (Emily Alice Brown)</td>\n", | |
" <td>female</td>\n", | |
" <td>31.0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>363291</td>\n", | |
" <td>20.525</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>271</th>\n", | |
" <td>272</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Tornquist, Mr. William Henry</td>\n", | |
" <td>male</td>\n", | |
" <td>25.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>LINE</td>\n", | |
" <td>0.000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>619</th>\n", | |
" <td>620</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>Gavey, Mr. Lawrence</td>\n", | |
" <td>male</td>\n", | |
" <td>26.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>31028</td>\n", | |
" <td>10.500</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>432</th>\n", | |
" <td>433</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Louch, Mrs. Charles Alexander (Alice Adelaide ...</td>\n", | |
" <td>female</td>\n", | |
" <td>42.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>SC/AH 3085</td>\n", | |
" <td>26.000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>237</th>\n", | |
" <td>238</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Collyer, Miss. Marjorie \"Lottie\"</td>\n", | |
" <td>female</td>\n", | |
" <td>8.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>C.A. 31921</td>\n", | |
" <td>26.250</td>\n", | |
" <td>NaN</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... Fare Cabin Embarked\n", | |
"328 329 1 3 ... 20.525 NaN S\n", | |
"271 272 1 3 ... 0.000 NaN S\n", | |
"619 620 0 2 ... 10.500 NaN S\n", | |
"432 433 1 2 ... 26.000 NaN S\n", | |
"237 238 1 2 ... 26.250 NaN S\n", | |
"\n", | |
"[5 rows x 12 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 226 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "MIDW8wZLsCEU" | |
}, | |
"source": [ | |
"* **Numerical Features**: Age (Continuous), Fare (Continuous), SibSp (Discrete), Parch (Discrete)\n", | |
"* **Categorical Features**: Survived, Sex, Embarked, Pclass\n", | |
"* **Alphanumeric Features**: Ticket, Cabin\n", | |
"\n", | |
"What are the data types for each feature?\n", | |
"* Survived: int\n", | |
"* Pclass: int\n", | |
"* Name: string\n", | |
"* Sex: string\n", | |
"* Age: float\n", | |
"* SibSp: int\n", | |
"* Parch: int\n", | |
"* Ticket: string\n", | |
"* Fare: float\n", | |
"* Cabin: string\n", | |
"* Embarked: string\n", | |
"\n", | |
"Now that we have an idea of what kinds of features we're working with, we can see how much information we have about each of them." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 383 | |
}, | |
"id": "s-h6_I9ar64G", | |
"outputId": "fc2e2b66-9ff1-4a7f-da8d-aad395afeb3b" | |
}, | |
"source": [ | |
"#see a summary of the training dataset\n", | |
"train.describe(include = \"all\")" | |
], | |
"execution_count": 227, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891</td>\n", | |
" <td>714.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>891</td>\n", | |
" <td>891.000000</td>\n", | |
" <td>204</td>\n", | |
" <td>889</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>891</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>681</td>\n", | |
" <td>NaN</td>\n", | |
" <td>147</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Woolner, Mr. Hugh</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>347082</td>\n", | |
" <td>NaN</td>\n", | |
" <td>C23 C25 C27</td>\n", | |
" <td>S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>577</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7</td>\n", | |
" <td>NaN</td>\n", | |
" <td>4</td>\n", | |
" <td>644</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.383838</td>\n", | |
" <td>2.308642</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>29.699118</td>\n", | |
" <td>0.523008</td>\n", | |
" <td>0.381594</td>\n", | |
" <td>NaN</td>\n", | |
" <td>32.204208</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>257.353842</td>\n", | |
" <td>0.486592</td>\n", | |
" <td>0.836071</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.526497</td>\n", | |
" <td>1.102743</td>\n", | |
" <td>0.806057</td>\n", | |
" <td>NaN</td>\n", | |
" <td>49.693429</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.420000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>223.500000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>2.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20.125000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7.910400</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>446.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>28.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.454200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>668.500000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>38.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>31.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>891.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>80.000000</td>\n", | |
" <td>8.000000</td>\n", | |
" <td>6.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>512.329200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... Fare Cabin Embarked\n", | |
"count 891.000000 891.000000 891.000000 ... 891.000000 204 889\n", | |
"unique NaN NaN NaN ... NaN 147 3\n", | |
"top NaN NaN NaN ... NaN C23 C25 C27 S\n", | |
"freq NaN NaN NaN ... NaN 4 644\n", | |
"mean 446.000000 0.383838 2.308642 ... 32.204208 NaN NaN\n", | |
"std 257.353842 0.486592 0.836071 ... 49.693429 NaN NaN\n", | |
"min 1.000000 0.000000 1.000000 ... 0.000000 NaN NaN\n", | |
"25% 223.500000 0.000000 2.000000 ... 7.910400 NaN NaN\n", | |
"50% 446.000000 0.000000 3.000000 ... 14.454200 NaN NaN\n", | |
"75% 668.500000 1.000000 3.000000 ... 31.000000 NaN NaN\n", | |
"max 891.000000 1.000000 3.000000 ... 512.329200 NaN NaN\n", | |
"\n", | |
"[11 rows x 12 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 227 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EAJCIZJgsl7t" | |
}, | |
"source": [ | |
"#### Some Observations:\n", | |
"* There are a total of 891 passengers in our training set.\n", | |
"* The Age feature is missing approximately 19.9% of its values (177 of 891). I'm guessing that the Age feature is pretty important to survival, so we should probably attempt to fill these gaps. \n", | |
"* The Cabin feature is missing approximately 77.1% of its values. Since so much of the feature is missing, it would be hard to fill in the missing values. We'll probably drop these values from our dataset.\n", | |
"* The Embarked feature is missing 0.22% of its values, which should be relatively harmless." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "QxQqypx8snV_", | |
"outputId": "c57a1604-5ddf-4ac8-dd17-1c9388b8890f" | |
}, | |
"source": [ | |
"#check for any other unusable values\n", | |
"print(pd.isnull(train).sum())" | |
], | |
"execution_count": 228, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"PassengerId 0\n", | |
"Survived 0\n", | |
"Pclass 0\n", | |
"Name 0\n", | |
"Sex 0\n", | |
"Age 177\n", | |
"SibSp 0\n", | |
"Parch 0\n", | |
"Ticket 0\n", | |
"Fare 0\n", | |
"Cabin 687\n", | |
"Embarked 2\n", | |
"dtype: int64\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "rfPfPRtvsuF-" | |
}, | |
"source": [ | |
"We can see that, apart from the missing Age, Cabin, and Embarked values noted above, no other NaN values exist." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "6gYiprfpsyOg" | |
}, | |
"source": [ | |
"### Some Predictions:\n", | |
"* Sex: Females are more likely to survive.\n", | |
"* SibSp/Parch: People traveling alone are more likely to survive.\n", | |
"* Age: Young children are more likely to survive.\n", | |
"* Pclass: People of higher socioeconomic class are more likely to survive." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_xSMhDoUs5CN" | |
}, | |
"source": [ | |
"# 4. Data Visualization\n", | |
"It's time to visualize our data so we can see whether our predictions were accurate! " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 314 | |
}, | |
"id": "vdBdTVsbsvOM", | |
"outputId": "32c779b8-d641-4c26-9099-5acf47326b05" | |
}, | |
"source": [ | |
"#draw a bar plot of survival by sex\n", | |
"sns.barplot(x=\"Sex\", y=\"Survived\", data=train)\n", | |
"\n", | |
"#print percentages of females vs. males that survive\n", | |
"print(\"Percentage of females who survived:\", train[\"Survived\"][train[\"Sex\"] == 'female'].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of males who survived:\", train[\"Survived\"][train[\"Sex\"] == 'male'].value_counts(normalize = True)[1]*100)" | |
], | |
"execution_count": 229, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of females who survived: 74.20382165605095\n", | |
"Percentage of males who survived: 18.890814558058924\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAT0klEQVR4nO3df5BdZ33f8ffHMooHY5xSbcceSUYqiDgqcXC8iKT5BcFOZNqR0gCJbGeCpy4apshkQowrClWoHJqJaEiTVCQorQtlAsKBDrO0SlUChibml9axsSs5IlvZIAlU1piAgNZm42//uFfO5epqdWXr3Kvd837N3NE95zx77ndXV/rseZ57nidVhSSpvc4bdwGSpPEyCCSp5QwCSWo5g0CSWs4gkKSWO3/cBZypZcuW1apVq8ZdhiQtKHfffffDVTUx6NiCC4JVq1YxPT097jIkaUFJ8oVTHbNrSJJaziCQpJYzCCSp5RoNgiTrkxxMMpNk64DjlyW5M8k9Se5L8rIm65EknayxIEiyBNgJXAusBa5Lsrav2ZuBO6rqSmAT8I6m6pEkDdbkFcE6YKaqDlXVY8BuYGNfmwKe2X1+MfClBuuRJA3QZBAsBw73bB/p7uv1FuAXkxwB9gA3DzpRks1JppNMz87ONlGrJLXWuAeLrwPeVVUrgJcB70lyUk1VtauqJqtqcmJi4P0QkqQnqckbyo4CK3u2V3T39boJWA9QVZ9KcgGwDPhKg3VJOsfdeuutHDt2jEsuuYQdO3aMu5xFr8krgn3AmiSrkyylMxg81dfmi8BLAZJ8P3ABYN+P1HLHjh3j6NGjHDt2bNyltEJjQVBVc8AWYC/wAJ1PB+1Psj3Jhm6zXwVeneRzwPuAG8sl0yRppBqda6iq9tAZBO7dt63n+QHgR5usQZI0v3EPFkuSxswgkKSWMwgkqeUMAklqOYNAklrOIJCkljMIJKnlDAJJajmDQJJartE7iyWdmS9u/4Fxl3BOmHvkWcD5zD3yBX8mwGXb7m/0/F4RSFLLGQSS1HIGgSS1nEEgSS1nEEhSyxkEktRyBoEktVyjQZBkfZKDSWaSbB1w/LeT3Nt9fD7JXzdZjyTpZI3dUJZkCbATuAY4AuxLMtVdnhKAqvqVnvY3A1c2VY8kabAmrwjWATNVdaiqHgN2AxvnaX8dnQXsJUkj1GQQLAcO92wf6e47SZJnA6uBj53i+OYk00mmZ2dnz3qhktRm58pg8SbgA1X1N4MOVtWuqpqsqsmJiYkRlyZJi1uTQXAUWNmzvaK7b5BN2C0kSWPR5Oyj+4A1SVbTCYBNwPX9jZJcDvwd4FMN1iJpAVl2wePAXPdPNa2xIKiquSRbgL3AEuD2qtqfZDswXVVT3aabgN1VVU3VImlhueUKP0k+So2uR1BVe4A9ffu29W2/pckaJEnzO1cGiyVJY2IQSFLLGQSS1HIGgSS1nEEgSS1nEEhSyxkEktRyBoEktZxBIEktZxBIUssZBJLUcgaBJLWcQSBJLWcQSFLLGQSS1HIGgSS1nEEgSS3XaBAkWZ/kYJKZJFtP0ebnkxxIsj/Je5usR5J0ssaWqkyyBNgJXAMcAfYlmaqqAz1t1gBvBH60qr6W5O81VY8kabAmrwjWATNVdaiqHgN2Axv72rwa2FlVXwOoqq80WI8kaYAmg2A5cLhn+0h3X6/nAc9LcleSTydZP+hESTYnmU4yPTs721C5ktRO4x4sPh9YA7wYuA74wyTf29+oqnZV1WRVTU5MTIy4REla3JoMgqPAyp7tFd19vY4AU1X1nap6EPg8nWCQJI1Ik0GwD1iTZHWSpcAmYKqvzYfoXA2QZBmdrqJDDdYkSerTWBBU1RywBdgLPADcUVX7k2xPsqHbbC/w1SQHgDuBN1TVV5uqSZJ0ssY+PgpQVXuAPX37tvU8L+D13YckaQzGPVgsSRozg0CSWs4gkKSWMwgkqeUMAk
lqOYNAklrOIJCkljMIJKnlDAJJajmDQJJaziCQpJYzCCSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklqOYNAklqu0SBIsj7JwSQzSbYOOH5jktkk93Yf/6zJeiRJJ2tszeIkS4CdwDXAEWBfkqmqOtDX9P1VtaWpOiRJ82vyimAdMFNVh6rqMWA3sLHB15MkPQlNBsFy4HDP9pHuvn4vT3Jfkg8kWTnoREk2J5lOMj07O9tErZLUWuMeLP4wsKqqrgA+Arx7UKOq2lVVk1U1OTExMdICJWmxazIIjgK9v+Gv6O57QlV9taoe7W7+B+CqBuuRJA3QZBDsA9YkWZ1kKbAJmOptkOTSns0NwAMN1iNJGmDeTw0lOQ7UqY5X1TPnOTaXZAuwF1gC3F5V+5NsB6aragp4XZINwBzwCHDjmX8LkqSnYt4gqKqLAJLcBnwZeA8Q4Abg0nm+9MTX7wH29O3b1vP8jcAbz7hqSdJZM2zX0IaqekdVHa+qb1TV7+NHQSVpURg2CL6V5IYkS5Kcl+QG4FtNFiZJGo1hg+B64OeB/9N9vLK7T5K0wA01xURVPYRdQZK0KA11RZDkeUk+muR/dbevSPLmZkuTJI3CsF1Df0jn0z3fAaiq++jcFyBJWuCGDYKnV9Vn+/bNne1iJEmjN2wQPJzkOXRvLkvyCjr3FUiSFrhh1yN4LbALuDzJUeBBOjeVSZIWuGGD4AtVdXWSC4Hzqup4k0VJkkZn2K6hB5PsAn4Y+GaD9UiSRmzYILgc+FM6XUQPJvn3SX6subIkSaMyVBBU1ber6o6q+jngSuCZwCcarUySNBJDr0eQ5CeTvAO4G7iAzpQTkqQFbqjB4iQPAfcAdwBvqConnJOkRWLYTw1dUVXfaLQSSdJYnG6Fsluragfw1iQnrVRWVa9rrDJJ0kicbozgxBrC03TGBvof80qyPsnBJDNJts7T7uVJKsnkkHVLks6S0y1V+eHu0/ur6i/O5MRJlgA7gWuAI8C+JFNVdaCv3UXALwOfOZPzS5LOjmE/NfRbSR5IcluS5w/5NeuAmao6VFWPAbsZvKbBbcBvAv9vyPNKks6iYe8jeAnwEmAWeGeS+4dYj2A5cLhn+0h33xOS/BCwsqr+23wnSrI5yXSS6dnZ2WFKliQNaej7CKrqWFX9LvAa4F5g21N54STnAW8HfnWI195VVZNVNTkxMfFUXlaS1GfYFcq+P8lbktwP/B7wSWDFab7sKLCyZ3tFd98JFwHPBz7evU/hh4EpB4wlabSGvY/gdjp9/D9TVV8a8mv2AWuSrKYTAJvoWfC+qr4OLDuxneTjwC1VNT3k+SVJZ8Fprwi6n/55sKp+5wxCgKqaA7YAe+l8DPWOqtqfZHuSDU+6YknSWXXaK4Kq+pskK5Ms7X76Z2hVtQfY07dv4NhCVb34TM4tSTo7hu0aehC4K8kU8MQ8Q1X19kaqkiSNzLBB8L+7j/PoDPJKkhaJoYKgqv5104VIksZj2Gmo7wQGTTr3U2e9IknSSA3bNXRLz/MLgJcDc2e/HEnSqA3bNdQ/0+hdST7bQD2SpBEbtmvoWT2b5wGTwMWNVCRJGqlhu4bu5m/HCOaAh4CbmihIkjRap1uh7IXA4apa3d1+FZ3xgYeAA/N8qSRpgTjdFBPvBB4DSPITwG8A7wa+DuxqtjRJ0iicrmtoSVU90n3+C8Cuqvog8MEk9zZbmiRpFE53RbAkyYmweCnwsZ5jw44vSJLOYaf7z/x9wCeSPAz8X+DPAJI8l073kCRpgTvd4vVvTfJR4FLgf1TViU8OnQfc3HRxkqTmDTMN9acH7Pt8M+VIkkZt6DWLJUmLk0EgSS3XaBAkWZ/kYJKZJFsHHH9NkvuT3Jvkz5OsbbIeSdLJGguC7lrHO4FrgbXAdQP+o39vVf1AVb0A2AG44pkkjViTVwTrgJmqOtRd63g3sLG3QVV9o2fzQgaseSBJalaTN4UtBw73bB8BXtTfKMlrgdcDS4GBC9
0k2QxsBrjsssvOeqGS1GZjHyyuqp1V9RzgXwBvPkWbXVU1WVWTExMToy1Qkha5JoPgKLCyZ3tFd9+p7AZ+tsF6JEkDNBkE+4A1SVYnWQpsAqZ6GyRZ07P5j4C/arAeSdIAjY0RVNVcki3AXmAJcHtV7U+yHZiuqilgS5Krge8AXwNe1VQ9kqTBGp1BtKr2AHv69m3ref7LTb6+JOn0xj5YLEkaL4NAklrOIJCkljMIJKnlDAJJajmDQJJaziCQpJYzCCSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklquUZnH9W57dZbb+XYsWNccskl7NixY9zlSBoTg6DFjh07xtGj8y0aJ6kN7BqSpJYzCCSp5RoNgiTrkxxMMpNk64Djr09yIMl9ST6a5NlN1iNJOlljQZBkCbATuBZYC1yXZG1fs3uAyaq6AvgA4IilJI1Yk1cE64CZqjpUVY8Bu4GNvQ2q6s6q+nZ389PAigbrkSQN0GQQLAcO92wf6e47lZuAPxl0IMnmJNNJpmdnZ89iiZKkc2KwOMkvApPA2wYdr6pdVTVZVZMTExOjLU6SFrkm7yM4Cqzs2V7R3fddklwNvAn4yap6tMF6JEkDNBkE+4A1SVbTCYBNwPW9DZJcCbwTWF9VX2mwlu9y1Rv+86he6px20cPHWQJ88eHj/kyAu9/2S+MuQRqLxrqGqmoO2ALsBR4A7qiq/Um2J9nQbfY24BnAHye5N8lUU/VIkgZrdIqJqtoD7Onbt63n+dVNvr4k6fTOicFiSdL4GASS1HIGgSS1nEEgSS1nEEhSyxkEktRyrlDWYo8vvfC7/pTUTgZBi31rzU+PuwRJ5wC7hiSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklqOYNAklrOIJCkljMIJKnlGg2CJOuTHEwyk2TrgOM/keQvkswleUWTtUiSBmssCJIsAXYC1wJrgeuSrO1r9kXgRuC9TdUhSZpfk3MNrQNmquoQQJLdwEbgwIkGVfVQ99jjDdYhSZpHk11Dy4HDPdtHuvskSeeQBTFYnGRzkukk07Ozs+MuR5IWlSaD4Ciwsmd7RXffGauqXVU1WVWTExMTZ6U4SVJHk0GwD1iTZHWSpcAmYKrB15MkPQmNBUFVzQFbgL3AA8AdVbU/yfYkGwCSvDDJEeCVwDuT7G+qHknSYI2uUFZVe4A9ffu29TzfR6fLSJI0JgtisFiS1ByDQJJaziCQpJYzCCSp5QwCSWo5g0CSWs4gkKSWMwgkqeUMAklqOYNAklrOIJCkljMIJKnlDAJJajmDQJJaziCQpJYzCCSp5QwCSWo5g0CSWq7RIEiyPsnBJDNJtg44/j1J3t89/pkkq5qsR5J0ssaCIMkSYCdwLbAWuC7J2r5mNwFfq6rnAr8N/GZT9UiSBmvyimAdMFNVh6rqMWA3sLGvzUbg3d3nHwBemiQN1iRJ6nN+g+deDhzu2T4CvOhUbapqLsnXgb8LPNzbKMlmYHN385tJDjZScTsto+/n3Vb5t68adwn6br43T/i1s/L78bNPdaDJIDhrqmoXsGvcdSxGSaaranLcdUj9fG+OTpNdQ0eBlT3bK7r7BrZJcj5wMfDVBmuSJPVpMgj2AWuSrE6yFNgETPW1mQJOXI+/AvhYVVWDNUmS+jTWNdTt898C7AWWALdX1f4k24HpqpoC/iPwniQzwCN0wkKjZZebzlW+N0ck/gIuSe3mncWS1HIGgSS1nEGgJyR5cZL/Ou46tDgkeV2SB5L8UUPnf0uSW5o4d9ssiPsIJC1I/xy4uqqOjLsQzc8rgkUmyaokf5nkXUk+n+SPklyd5K4kf5VkXffxqST3JPlkku8bcJ4Lk9ye5LPddv3Tg0inlOQPgL8P/EmSNw16LyW5McmHknwkyUNJtiR5fbfNp5M8q9vu1Un2Jflckg8mefqA13tOkv+e5O4kf5bk8tF+xwubQbA4PRf4LeDy7uN64MeAW4B/Cfwl8ONVdSWwDfg3A87xJj
r3dawDXgK8LcmFI6hdi0BVvQb4Ep33zoWc+r30fODngBcCbwW+3X1ffgr4pW6b/1JVL6yqHwQeoDNZZb9dwM1VdRWd9/k7mvnOFie7hhanB6vqfoAk+4GPVlUluR9YRecO7ncnWQMU8LQB5/hpYENPH+wFwGV0/iFKZ+JU7yWAO6vqOHC8O9fYh7v77weu6D5/fpJfB74XeAade5OekOQZwD8E/rhnzsrvaeIbWawMgsXp0Z7nj/dsP07n7/w2Ov8A/0l3DYiPDzhHgJdXlRP86aka+F5K8iJO/14FeBfws1X1uSQ3Ai/uO/95wF9X1QvObtntYddQO13M3877dOMp2uwFbj4xLXiSK0dQlxanp/peugj4cpKnATf0H6yqbwAPJnll9/xJ8oNPseZWMQjaaQfwG0nu4dRXhbfR6TK6r9u9dNuoitOi81TfS/8K+AxwF53xrUFuAG5K8jlgPyevfaJ5OMWEJLWcVwSS1HIGgSS1nEEgSS1nEEhSyxkEktRyBoF0Brrz5uxPcl+Se7s3RUkLmncWS0NK8iPAPwZ+qKoeTbIMWDrmsqSnzCsCaXiXAg9X1aMAVfVwVX0pyVVJPtGd+XJvkkuTXJzk4ImZXZO8L8mrx1q9dAreUCYNqTu52Z8DTwf+FHg/8EngE8DGqppN8gvAz1TVP01yDbAd+B3gxqpaP6bSpXnZNSQNqaq+meQq4MfpTKf8fuDX6Uyl/JHuVDpLgC9323+kO//NTsC5b3TO8opAepKSvAJ4LXBBVf3IgOPn0blaWAW87MTU4NK5xjECaUhJvq+7hsMJL6CzPsNEdyCZJE9L8g+6x3+le/x64D91Z8+UzjleEUhD6nYL/R6dBVLmgBlgM7AC+F0603ufD/w74H8CHwLWVdXxJG8HjlfVr42jdmk+BoEktZxdQ5LUcgaBJLWcQSBJLWcQSFLLGQSS1HIGgSS1nEEgSS33/wGAikte6oaCQAAAAABJRU5ErkJggg==\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZFBncjYztA8H" | |
}, | |
"source": [ | |
"As predicted, females have a much higher chance of survival than males. The Sex feature is essential in our predictions." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 331 | |
}, | |
"id": "QJenNUNDtBk_", | |
"outputId": "7756fb01-8157-4603-8c75-3db0d4198bde" | |
}, | |
"source": [ | |
"#draw a bar plot of survival by Pclass\n", | |
"sns.barplot(x=\"Pclass\", y=\"Survived\", data=train)\n", | |
"\n", | |
"#print percentage of people by Pclass that survived\n", | |
"print(\"Percentage of Pclass = 1 who survived:\", train[\"Survived\"][train[\"Pclass\"] == 1].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of Pclass = 2 who survived:\", train[\"Survived\"][train[\"Pclass\"] == 2].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of Pclass = 3 who survived:\", train[\"Survived\"][train[\"Pclass\"] == 3].value_counts(normalize = True)[1]*100)" | |
], | |
"execution_count": 230, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of Pclass = 1 who survived: 62.96296296296296\n", | |
"Percentage of Pclass = 2 who survived: 47.28260869565217\n", | |
"Percentage of Pclass = 3 who survived: 24.236252545824847\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASx0lEQVR4nO3dcZBdZ33e8e9jOarBOKGgbeWxVKyAKHWoJ5SN0qk7hBDcimZGyhRI5bpJPENRmUFAm4IwbeOCKe1EJGQaqiQojSeECQgDbbNp1agUO0BcbLQCYyM5pooMSCob1jYGm9DIsn/9Y4/pZXW1e2Xv2avV+/3M3NE973nvub/rO+Nnz3vued9UFZKkdl0w7gIkSeNlEEhS4wwCSWqcQSBJjTMIJKlxF467gLO1Zs2auvzyy8ddhiStKAcPHry/qiaG7VtxQXD55ZczPT097jIkaUVJ8pUz7XNoSJIaZxBIUuMMAklqnEEgSY3rNQiSbE5yb5IjSa4fsv9XktzZPb6U5KE+65Ekna63Xw0lWQXsBq4GjgMHkkxV1eEn+lTVPxvo/wbgRX3VI0kars8zgk3Akao6WlUngb3A1gX6XwN8qMd6JElD9BkElwHHBraPd22nSfIcYANwyxn2b08ynWR6dnZ2yQuVpJadKzeUbQM+WlWPDdtZVXuAPQCTk5Pn7QIKO3fuZGZmhrVr17Jr165xlyOpEX0GwQlg/cD2uq5tmG3A63usZUWYmZnhxIkz/SeSpH70OTR0ANiYZEOS1cz9z35qfqckLwD+IvCZHmuRJJ1Bb0FQVaeAHcB+4B7g5qo6lOTGJFsGum4D9pZrZkrSWPR6jaCq9gH75rXdMG/77X3WIElamHcWS1LjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1LhzZc3iXrz4Lb8z7hLOyiX3P8wq4Kv3P7yiaj/47p8ddwmSngLPCCSpcQaBJDXOIJCkxhkEktQ4g0CSGtdrECTZnOTeJEeSXH+GPj+d5HCSQ0k+2Gc9kqTT9fbz0SSrgN3A1cBx4ECSqao6PNBnI/A24Kqq+kaSv9RXPZKk4fo8I9gEHKmqo1V1EtgLbJ3X57XA7qr6BkBVfb3HeiRJQ/QZBJcBxwa2j3dtg54PPD/JbUluT7J52IGSbE8ynWR6dna2p3IlqU3jvlh8IbAReClwDfCbSZ45v1NV7amqyaqanJiYWOYSJen81mcQnADWD2yv69oGHQemqurRqroP+BJzwSBJWiZ9BsEBYGOSDUlWA9uAqXl9/gtzZwMkWcPcUNHRHmuSJM3TWxBU1SlgB7AfuAe4uaoOJbkxyZau237ggSSHgVuBt1TVA33VJEk6Xa+zj1bVPmDfvLYbBp4X8PPdQ5I0BuO+WCxJGjODQJIaZxBIUuMMAklqnEEgSY07r9csXmkeX33x9/wrScvBIDiHfHvj3xl3CZIa5NCQJDXOMwJpCezcuZOZmRnWrl3Lrl27xl2OdFYMAmkJzMzMcOLE/DkVpZXBoSFJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxvQZBks1J7k1yJMn1Q/Zfl2Q2yZ3d4x/3WY8k6XS9TTqXZBWwG7gaOA4cSDJVVYfndf1wVe3oqw5J0sL6PCPYBBypqqNVdRLYC2zt8f0kSU9Cn0FwGXBsYPt41zbfK5PcleSjSdYPO1CS7Ummk0zPzs72UaskNWvcF4t/H7i8qq4EPg68f1inqtpTVZNVNTkxMbGsBUrS+a7PIDgBDP6Fv65r+66qeqCq/rzb/I/Ai3usR5I0RJ9BcADYmGRDktXANmBqsEOSSwc2twD39FiPJGmI3n41VFWnkuwA9gOrgJuq6lCSG4HpqpoC3phkC3AKeBC4rq96JEnD9b
pmcVXtA/bNa7th4PnbgLf1WYMkaWHjvlgsSRozg0CSGtfr0JD0VHz1xr8+7hJGdurBZwEXcurBr6youv/KDXePuwSdAzwjkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJatyCs48meRioM+2vqu9f8ookSctqwSCoqksAkrwT+BrwASDAtcClC7xUkrRCjDo0tKWqfq2qHq6qb1XVrwNb+yxMkrQ8Rg2Cbye5NsmqJBckuRb4dp+FSZKWx6hB8A+Bnwb+tHu8umtbUJLNSe5NciTJ9Qv0e2WSSjI5Yj2SpCUy0lKVVfVlznIoKMkqYDdwNXAcOJBkqqoOz+t3CfAm4I6zOb4kaWmMdEaQ5PlJPpHki932lUn+1SIv2wQcqaqjVXUS2MvwMHkn8IvA/z2LuiVJS2TUoaHfBN4GPApQVXcB2xZ5zWXAsYHt413bdyX5G8D6qvpvI9YhSVpiowbB06vqs/PaTj2VN05yAfAe4J+P0Hd7kukk07Ozs0/lbSVJ84waBPcneS7dzWVJXsXcfQULOQGsH9he17U94RLghcAfJvky8DeBqWEXjKtqT1VNVtXkxMTEiCVLy2fNRY/zl592ijUXPT7uUqSzNtLFYuD1wB7gBUlOAPcxd1PZQg4AG5NsYC4AtjHwS6Oq+iaw5ontJH8IvLmqpkeuXjpHvPnKh8ZdgvSkjRoEX6mqlye5GLigqh5e7AVVdSrJDmA/sAq4qaoOJbkRmK6qqSdftiRpqYwaBPcl+QPgw8Atox68qvYB++a13XCGvi8d9biSpKUz6jWCFwD/k7khovuS/Ickf7u/siRJy2WkIKiqP6uqm6vq7wMvAr4f+GSvlUmSlsXI6xEk+bEkvwYcBC5ibsoJSdIKN9I1gu7nnZ8HbgbeUlVOOCdJ54lRLxZfWVXf6rUSSdJYLLZC2c6q2gW8K8lpK5VV1Rt7q0yStCwWOyO4p/vXm7wk6Ty12FKVv989vbuqPrcM9UiSltmovxr65ST3JHlnkhf2WpEkaVmNeh/BjwM/DswC70ty9wjrEUiSVoCR7yOoqpmq+lXgdcCdwNCpIiRJK8uoK5T9tSRvT3I38F7gfzE3rbQkaYUb9T6Cm5hbavLvVtX/6bEeSdIyWzQIukXo76uqf78M9UiSltmiQ0NV9RiwPsnqZahHkrTMRl6PALgtyRTw3XmGquo9vVQlSVo2owbBn3SPC5hba1iSdJ4YKQiq6h19FyJJGo9Rp6G+FRg26dzLlrwiSdKyGnVo6M0Dzy8CXgmcWvpyJEnLbdShoYPzmm5L8tke6pEkLbNR7yx+1sBjTZLNwA+M8LrNSe5NciTJ9UP2v66bt+jOJH+U5Ion8RkkSU/BqENDB/n/1whOAV8GXrPQC7ob0XYDVwPHgQNJpqrq8EC3D1bVb3T9twDvATaPXL0k6Slb8IwgyY8kWVtVG6rqB4F3AH/cPQ4v9FpgE3Ckqo5W1UnmpqjYOthh3vKXFzPkgrQkqV+LDQ29DzgJkOQlwL8D3g98E9izyGsvA44NbB/v2r5Hktcn+RNgFzB06csk25NMJ5menZ1d5G0lSWdjsSBYVVUPds//AbCnqj5WVb8APG8pCqiq3VX1XOCtwNA1DqpqT1VNVtXkxMTEUrytJKmzaBAkeeI6wk8AtwzsW+z6wglg/cD2uq7tTPYCP7XIMSVJS2yxIPgQ8Mkkvwd8B/g0QJLnMTc8tJADwMYkG7oJ67YBU4Mdkmwc2PxJ4H+fRe2SpCWw2OL170ryCeBS4H9U1RMXcy8A3rDIa08l2QHsB1YBN1XVoSQ3AtNVNQXsSPJy4FHgG8DPPbWPI0k6W4v+fLSqbh/S9qVRDl5V+4B989puGHj+plGOI0nqz6j3EUjSeWvnzp3MzMywdu1adu3aNe5ylp1BIKl5MzMznDix0G9Zzm8jTTEhSTp/GQSS1DiDQJIaZxBIUuMMAklqnE
EgSY0zCCSpcQaBJDXOIJCkxhkEktQ4p5iQtOSueu9V4y7hrKx+aDUXcAHHHjq2omq/7Q23LclxPCOQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNa7XIEiyOcm9SY4kuX7I/p9PcjjJXUk+keQ5fdYjSTpdb0GQZBWwG3gFcAVwTZIr5nX7PDBZVVcCHwV29VWPJGm4Ps8INgFHqupoVZ0E9gJbBztU1a1V9Wfd5u3Auh7rkSQN0WcQXAYcG9g+3rWdyWuA/z5sR5LtSaaTTM/Ozi5hiZKkc+JicZJ/BEwC7x62v6r2VNVkVU1OTEwsb3GSznv19OLxix+nnl7jLmUs+px99ASwfmB7Xdf2PZK8HPiXwI9V1Z/3WI8kDfXoVY+Ou4Sx6vOM4ACwMcmGJKuBbcDUYIckLwLeB2ypqq/3WIsk6Qx6C4KqOgXsAPYD9wA3V9WhJDcm2dJ1ezfwDOAjSe5MMnWGw0mSetLrwjRVtQ/YN6/thoHnL+/z/SVJizsnLhZLksbHIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuN6DYIkm5Pcm+RIkuuH7H9Jks8lOZXkVX3WIkkarrcgSLIK2A28ArgCuCbJFfO6fRW4DvhgX3VIkhZ2YY/H3gQcqaqjAEn2AluBw090qKovd/se77EOSdIC+hwaugw4NrB9vGs7a0m2J5lOMj07O7skxUmS5qyIi8VVtaeqJqtqcmJiYtzlSNJ5pc8gOAGsH9he17VJks4hfQbBAWBjkg1JVgPbgKke30+S9CT0FgRVdQrYAewH7gFurqpDSW5MsgUgyY8kOQ68GnhfkkN91SNJGq7PXw1RVfuAffPabhh4foC5ISNJ0pisiIvFkqT+GASS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWpcr0GQZHOSe5McSXL9kP1/IcmHu/13JLm8z3okSafrLQiSrAJ2A68ArgCuSXLFvG6vAb5RVc8DfgX4xb7qkSQN1+cZwSbgSFUdraqTwF5g67w+W4H3d88/CvxEkvRYkyRpngt7PPZlwLGB7ePAj56pT1WdSvJN4NnA/YOdkmwHtnebjyS5t5eKzw1rmPf5z3X5pZ8bdwnnihX33fGv/btrwIr7/vLGs/r+nnOmHX0GwZKpqj3AnnHXsRySTFfV5Ljr0Nnzu1vZWv7++hwaOgGsH9he17UN7ZPkQuAHgAd6rEmSNE+fQXAA2JhkQ5LVwDZgal6fKeCJcYVXAbdUVfVYkyRpnt6Ghrox/x3AfmAVcFNVHUpyIzBdVVPAbwEfSHIEeJC5sGhdE0Ng5ym/u5Wt2e8v/gEuSW3zzmJJapxBIEmNMwjOEUluSvL1JF8cdy06O0nWJ7k1yeEkh5K8adw1aXRJLkry2SRf6L6/d4y7puXmNYJzRJKXAI8Av1NVLxx3PRpdkkuBS6vqc0kuAQ4CP1VVh8dcmkbQzWZwcVU9kuT7gD8C3lRVt4+5tGXjGcE5oqo+xdwvp7TCVNXXqupz3fOHgXuYu2teK0DNeaTb/L7u0dRfyAaBtIS6GXRfBNwx3kp0NpKsSnIn8HXg41XV1PdnEEhLJMkzgI8B/7SqvjXuejS6qnqsqn6YuRkQNiVpanjWIJCWQDe2/DHgd6vqP427Hj05VfUQcCuwedy1LCeDQHqKuouNvwXcU1XvGXc9OjtJJpI8s3v+NOBq4I/HW9XyMgjOEUk+BHwG+KtJjid5zbhr0siuAn4GeFmSO7vH3xt3URrZpcCtSe5ibo60j1fVfx1zTcvKn49KUuM8I5CkxhkEktQ4g0CSGmcQSFLjDAJJap
xBIM2T5LHuJ6BfTPKRJE9foO/bk7x5OeuTlppBIJ3uO1X1w90ssCeB1427IKlPBoG0sE8DzwNI8rNJ7urmrf/A/I5JXpvkQLf/Y0+cSSR5dXd28YUkn+rafqibA//O7pgbl/VTSQO8oUyaJ8kjVfWMJBcyN3/QHwCfAv4z8Leq6v4kz6qqB5O8HXikqn4pybOr6oHuGP8G+NOqem+Su4HNVXUiyTOr6qEk7wVur6rfTbIaWFVV3xnLB1bzPCOQTve0bkriaeCrzM0j9DLgI1V1P0BVDVs74oVJPt39j/9a4Ie69tuA307yWmBV1/YZ4F8keSvwHENA43ThuAuQzkHf6aYk/q65eeUW9dvMrUz2hSTXAS8FqKrXJflR4CeBg0leXFUfTHJH17YvyT+pqluW8DNII/OMQBrNLcCrkzwbIMmzhvS5BPhaNyX1tU80JnluVd1RVTcAs8D6JD8IHK2qXwV+D7iy908gnYFnBNIIqupQkncBn0zyGPB54Lp53X6BuZXJZrt/L+na391dDA7wCeALwFuBn0nyKDAD/NveP4R0Bl4slqTGOTQkSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLj/h+SBBZzqACz9AAAAABJRU5ErkJggg==\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "AY4w_LVmtG7C" | |
}, | |
"source": [ | |
"As predicted, people with higher socioeconomic class had a higher rate of survival. (62.9% vs. 47.3% vs. 24.2%)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 331 | |
}, | |
"id": "NKvART6otHvf", | |
"outputId": "0100aae8-6f1e-4db5-bf22-4b236f90534d" | |
}, | |
"source": [ | |
"#draw a bar plot for SibSp vs. survival\n", | |
"sns.barplot(x=\"SibSp\", y=\"Survived\", data=train)\n", | |
"\n", | |
"#I won't be printing individual percent values for all of these.\n", | |
"print(\"Percentage of SibSp = 0 who survived:\", train[\"Survived\"][train[\"SibSp\"] == 0].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of SibSp = 1 who survived:\", train[\"Survived\"][train[\"SibSp\"] == 1].value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"print(\"Percentage of SibSp = 2 who survived:\", train[\"Survived\"][train[\"SibSp\"] == 2].value_counts(normalize = True)[1]*100)" | |
], | |
"execution_count": 231, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of SibSp = 0 who survived: 34.53947368421053\n", | |
"Percentage of SibSp = 1 who survived: 53.588516746411486\n", | |
"Percentage of SibSp = 2 who survived: 46.42857142857143\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAATeklEQVR4nO3df7BndX3f8eeLJRsCkqQJm0LYJTDJxnRriD9ukBTHn5CuNV1mKjGAWp3RbDPjRlt/MDBxqMHJdIKJpjUb68YwtaaKFJN2bbZBq4CVJroXRXDZoisgu6tbdkUMogUW3v3je6BfL9/d+93lnu93v/t5Pmbu3PPjc873fZnlvu75nHM+n1QVkqR2HTPtAiRJ02UQSFLjDAJJapxBIEmNMwgkqXHHTruAQ3XSSSfV6aefPu0yJGmm3HzzzfuqasWofTMXBKeffjrz8/PTLkOSZkqSrx9on11DktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMbN3AtlOnpdcskl7Nmzh5NPPpkrr7xy2uVIzTAIdMTYs2cPu3fvnnYZUnPsGpKkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMb1GgRJ1ia5I8mOJJceoM0rktyeZFuSD/dZjyTpyXobYiLJMmAjcB6wC9iaZHNV3T7UZjVwGXBOVX07yU/1VY8kabQ+rwjOAnZU1Z1V9TBwNXD+gja/CWysqm8DVNW9PdYjSRqhz0HnTgV2Dq3vAp67oM3PAyS5CVgGvKOq/rrHmo5qjt4p6XBMe/TRY4HVwAuBlcBnkvxiVd0/3CjJemA9wGmnnTbpGmeGo3dKOhx9dg3tBlYNra/stg3bBWyuqkeq6i7gKwyC4QdU1aaqmququRUrVvRWsCS1qM8g2AqsTnJGkuXAhcDmBW3+C4OrAZKcxKCr6M4ea5IkLdBbEFTVfmADcB2wHbimqrYluSLJuq7ZdcC3ktwOXA+8raq+1VdNkqQn6/UeQVVtAbYs2Hb50HIBb+6+JElT4JvFktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNW7acxYLuOeKX1yS8+y/7yeAY9l/39eX5JynXX7bUy9K0hHPKwJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS43oNgiRrk9yRZEeSS0fsf22SvUlu6b5e32c9kqQn6+3N4iTLgI3AecAuYGuSzVV1+4KmH62qDX3VIUk6uD6vCM4CdlTVnVX1MHA1cH6PnydJOgx9BsGpwM6h9V3dtoVenuTWJNcmWTXqREnWJ5lPMr93794+apWkZk37ZvHHgdOr6kzgk8AHRzWqqk1VNVdVcytWrJhogZJ0tOszCHYDw3/hr+y2PaGqvlVVD3WrHwCe02M9kqQR+gyCrcDqJGckWQ5cCGwebpDklKHVdcD2HuuRJI3Q21NDVbU/yQbgOmAZcFVVbUtyBTBfVZuBNyZZB+wH7gNe21c9kqTRep2Ypqq2AFsWbLt8aPky4LI+a5AkHdy0bxZLkqbMIJCkxhkEktQ4g0CSGmcQSFLjen1qSJN10nGPAfu775NzznvPWZLzLL9/OcdwDDvv37kk57zpt29agqqko59BcBR565n3T7sESTPIriFJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOB8flZbAJZdcwp49ezj55JO58sorp12OdEgMAmkJ7Nmzh927dy/eUDoC2TUkSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJalyvQZBkbZI7kuxIculB2r08SSWZ67MeSdKT9RYESZYBG4GXAmuAi5KsGdHuROBNwOf6qkWSdGB9XhGcBeyoqjur6mHgauD8Ee3eCfw+8H97rEWSdAB9BsGpwM
6h9V3dtickeTawqqr+6mAnSrI+yXyS+b179y59pZLUsKndLE5yDPBu4C2Lta2qTVU1V1VzK1as6L84SWrIQUcfTfIAUAfaX1U/epDDdwOrhtZXdtsedyLwDOCGJAAnA5uTrKuq+UXqliQtkYMGQVWdCJDkncA3gQ8BAV4JnLLIubcCq5OcwSAALgQuHjr3d4CTHl9PcgPwVkNAkiZr3PkI1lXVLw2tvy/Jl4DLD3RAVe1PsgG4DlgGXFVV25JcAcxX1ebDrronTi4iqUXjBsGDSV7J4MmfAi4CHlzsoKraAmxZsG1keFTVC8espTdOLiKpRePeLL4YeAXwf7qvX2eom0eSNLvGuiKoqrsZ/Q6AJGnGjXVFkOTnk3wqyZe79TOTvL3f0iRJkzBu19CfApcBjwBU1a0MngKSJM24cYPg+Kr6/IJt+5e6GEnS5I0bBPuS/Czdy2VJLmDwXoEkacaN+/joG4BNwC8k2Q3cxeClMknSjBs3CL5eVecmOQE4pqoe6LMoSdLkjNs1dFeSTcDZwHd7rEeSNGHjBsEvAP+DQRfRXUn+OMnz+itLkjQpYwVBVX2vqq6pqn8GPAv4UeDGXiuTJE3E2PMRJHlBkj8BbgaOYzDkhCRpxo11szjJ3cAXgWuAt1XVogPOTdJz3vYfl+Q8J+57gGXAPfseWJJz3vyuf/7Ui5Kkno371NCZVfV3vVYiSZqKxWYou6SqrgR+L8mTZiqrqjf2VpmkiXEujrYtdkWwvfvurGHSUcy5ONq22FSVH+8Wb6uqL0ygHknShI371NAfJtme5J1JntFrRZKkiRr3PYIXAS8C9gLvT3Kb8xFI0tFh7PcIqmpPVf074LeAWzjIxPWSpNkx7gxl/yDJO5LcBrwX+F/Ayl4rkyRNxLjvEVwFXA3846r6Ro/1SJImbNEgSLIMuKuq/u0E6pEkTdiiXUNV9SiwKsnyQz15krVJ7kiyI8mlI/b/Vnfj+ZYkn02y5lA/Q5L01IzbNXQXcFOSzcAT4wxV1bsPdEB3JbEROA/YBWxNsrmqbh9q9uGq+vdd+3XAu4G1h/YjLJ3Hlp/wA98lqQXjBsHXuq9jgBPHPOYsYEdV3QmQ5GrgfOCJIFgwftEJdHMiT8uDq391mh8vSVMxVhBU1e8exrlPBXYOre8CnruwUZI3AG8GlgMvHnWiJOuB9QCnnXbaYZQiSTqQcYehvp4Rf61X1chf3IeiqjYCG5NcDLwdeM2INpuATQBzc3NTvWqQpKPNuF1Dbx1aPg54ObB/kWN2A6uG1ld22w7kauB9Y9YjSVoi43YN3bxg001JPr/IYVuB1UnOYBAAFwIXDzdIsrqqvtqtvgz4KpKkiRq3a+gnhlaPAeaAHzvYMVW1P8kG4DpgGXBVVW1LcgUwX1WbgQ1JzgUeAb7NiG4hSVK/xu0aupn/f49gP3A38LrFDqqqLcCWBdsuH1p+05ifL0nqyWIzlP0ysLOqzujWX8Pg/sDdDD0GKi2FOr54jMeo430eQJqkxd4sfj/wMECS5wP/Bvgg8B26p3ikpfLIOY/w8HkP88g5j0y7FKkpi3UNLauq+7rl3wA2VdXHgI8luaXf0iRJk7DYFcGyJI+HxUuATw/tG/f+giTpCLbYL/OPADcm2Qd8H/ifAEl+jkH3kCRpxi02ef3vJfkUcArwiap6/C7eMcBv912cJKl/i3bvVNXfjtj2lX7KkSRN2thzFkuSjk4GgSQ1ziCQpMb5CKiaduPzX7Ak5/n+scsg4fu7di3ZOV/wmRuX5DzSYrwikKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjeg2CJGuT3JFkR5JLR+x/c5Lbk9ya5FNJfqbPeiRJT9ZbECRZBmwEXgqsAS5KsmZBsy8Cc1V1JnAtcGVf9UiSRuvziuAsYEdV3VlVDwNXA+cPN6iq66vqe93q3wIre6xHkjRCn0FwKrBzaH1Xt+
1AXgf891E7kqxPMp9kfu/evUtYoiTpiLhZnORVwBzwrlH7q2pTVc1V1dyKFSsmW5wkHeX6nKFsN7BqaH1lt+0HJDkX+B3gBVX1UI/1SJJG6POKYCuwOskZSZYDFwKbhxskeRbwfmBdVd3bYy2SpAPoLQiqaj+wAbgO2A5cU1XbklyRZF3X7F3A04D/nOSWJJsPcDpJUk96nby+qrYAWxZsu3xo+dw+P1+StLgj4maxJGl6DAJJapxBIEmNMwgkqXEGgSQ1rtenhiT164/f8vElOc/9+x584vtSnHPDH/7Tp3wOTY5XBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktS4XoMgydokdyTZkeTSEfufn+QLSfYnuaDPWiRJo/UWBEmWARuBlwJrgIuSrFnQ7B7gtcCH+6pDknRwfc5ZfBawo6ruBEhyNXA+cPvjDarq7m7fYz3WIUk6iD67hk4Fdg6t7+q2SZKOIDNxszjJ+iTzSeb37t077XIk6ajSZxDsBlYNra/sth2yqtpUVXNVNbdixYolKU6SNNBnEGwFVic5I8ly4EJgc4+fJ0k6DL0FQVXtBzYA1wHbgWuqaluSK5KsA0jyy0l2Ab8OvD/Jtr7qkSSN1udTQ1TVFmDLgm2XDy1vZdBlJEmakpm4WSxJ6o9BIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS43p9oUxqxY9X/cB3aZYYBNISeNWjTqmh2WXXkCQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIaZxBIUuMMAklqXK9BkGRtkjuS7Ehy6Yj9P5zko93+zyU5vc96JElP1lsQJFkGbAReCqwBLkqyZkGz1wHfrqqfA94D/H5f9UiSRuvziuAsYEdV3VlVDwNXA+cvaHM+8MFu+VrgJUnSY02SpAVSPU2tl+QCYG1Vvb5bfzXw3KraMNTmy12bXd3617o2+xacaz2wvlt9OnBHL0UPnATsW7TVkcv6p2eWawfrn7a+6/+ZqloxasdMTFVZVZuATZP4rCTzVTU3ic/qg/VPzyzXDtY/bdOsv8+uod3AqqH1ld22kW2SHAv8GPCtHmuSJC3QZxBsBVYnOSPJcuBCYPOCNpuB13TLFwCfrr76qiRJI/XWNVRV+5NsAK4DlgFXVdW2JFcA81W1Gfgz4ENJdgD3MQiLaZtIF1SPrH96Zrl2sP5pm1r9vd0sliTNBt8slqTGGQSS1DiDoLPYcBhHuiRXJbm3ezdjpiRZleT6JLcn2ZbkTdOu6VAkOS7J55N8qav/d6dd0+FIsizJF5P8t2nXcqiS3J3ktiS3JJmfdj2HIsm/6v7dfDnJR5IcN+kaDALGHg7jSPcfgLXTLuIw7QfeUlVrgLOBN8zYf/+HgBdX1S8BzwTWJjl7yjUdjjcB26ddxFPwoqp65iy9S5DkVOCNwFxVPYPBgzUTf2jGIBgYZziMI1pVfYbBk1czp6q+WVVf6JYfYPDL6NTpVjW+Gvhut/pD3ddMPYWRZCXwMuAD066lQccCP9K9S3U88I1JF2AQDJwK7Bxa38UM/SI6mnQj0D4L+Nx0Kzk0XbfKLcC9wCeraqbqB/4IuAR4bNqFHKYCPpHk5m5ImplQVbuBPwDuAb4JfKeqPjHpOgwCHTGSPA34GPAvq+rvpl3PoaiqR6vqmQzeoD8ryTOmXdO4kvwacG9V3TztWp6C51XVsxl0774hyfOnXdA4kvw9Br0PZwA/DZyQ5FWTrsMgGBhnOAz1KMkPMQiB/1RVfzHteg5XVd0PXM9s3a85B1iX5G4G3aIvTvLn0y3p0HR/WVNV9wJ/yaC7dxacC9xVVXur6hHgL4B/NOkiDIKBcYbDUE+6ocf/DNheVe+edj2HKsmKJD/eLf8IcB7wv6db1fiq6r
KqWllVpzP4t//pqpr4X6WHK8kJSU58fBn4VWBWnp67Bzg7yfHd/wcvYQo37A0CBsNhAI8Ph7EduKaqtk23qkOT5CPA3wBPT7IryeumXdMhOAd4NYO/RG/pvv7JtIs6BKcA1ye5lcEfFZ+sqpl7BHOG/X3gs0m+BHwe+Kuq+usp1zSW7l7StcAXgNsY/E6e+FATDjEhSY3zikCSGmcQSFLjDAJJapxBIEmNMwgkqXEGgXQASX6nGxXy1u6R1ucm+cDjA+Il+e4Bjjs7yee6Y7YnecdEC5cOUW9TVUqzLMmvAL8GPLuqHkpyErC8ql4/xuEfBF5RVV/qRrZ9ep+1Sk+VVwTSaKcA+6rqIYCq2ldV30hyQ5InhjlO8p7uquFTSVZ0m3+KwQBij49BdHvX9h1JPpTkb5J8NclvTvhnkkYyCKTRPgGsSvKVJH+S5AUj2pwAzFfVPwRuBP51t/09wB1J/jLJv1gw0ciZwIuBXwEuT/LTPf4M0lgMAmmEbn6B5wDrgb3AR5O8dkGzx4CPdst/DjyvO/YKYI5BmFwMDA938F+r6vtVtY/B4HSzMjiajmLeI5AOoKoeBW4AbkhyG/CaxQ4ZOvZrwPuS/CmwN8lPLmxzgHVp4rwikEZI8vQkq4c2PRP4+oJmxwAXdMsXA5/tjn1ZN5IkwGrgUeD+bv38bo7jnwReyGCQOmmqvCKQRnsa8N5ueOn9wA4G3UTXDrV5kMEkNG9nMDPZb3TbXw28J8n3umNfWVWPdtlwK4MuoZOAd1bVxKcllBZy9FFpQrr3Cb5bVX8w7VqkYXYNSVLjvCKQpMZ5RSBJjTMIJKlxBoEkNc4gkKTGGQSS1Lj/By9iaFLZT7AmAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EnXtHdI7tKaQ" | |
}, | |
"source": [ | |
"In general, it's clear that people with more siblings or spouses aboard were less likely to survive. However, contrary to expectations, people with no siblings or spouses were less likely to survive than those with one or two. (34.5% vs. 53.6% vs. 46.4%)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 279 | |
}, | |
"id": "IYasBzrbtNru", | |
"outputId": "cb3e37e9-d53d-42af-8000-3e0ac44e2839" | |
}, | |
"source": [ | |
"#draw a bar plot for Parch vs. survival\n", | |
"sns.barplot(x=\"Parch\", y=\"Survived\", data=train)\n", | |
"plt.show()" | |
], | |
"execution_count": 232, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASb0lEQVR4nO3dfZBdd33f8fdHEo5jx9hlpCKPJUWeopC4hMRUNbTOgHlKbUrtTqGpnTimGRpNO5jSIbDjNBnHMWU6EVPawjgEBRwekqAYUzpqqsZpwQHiFrDEk7EcM4pt0CpsbWPsGOPGyP72j3vkXFar3bvSnnt19Xu/Znb2nnN/e+5nPR599vzOU6oKSVK7Vk06gCRpsiwCSWqcRSBJjbMIJKlxFoEkNW7NpAMs19q1a2vz5s2TjiFJU2Xv3r0PVtW6hd6buiLYvHkze/bsmXQMSZoqSb5+tPecGpKkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmN660IktyY5P4kXz3K+0nyriT7k3wlyQv6yiJJOro+9wg+AFy8yPuXAFu6r23Ae3rMIkk6it4uKKuqTyfZvMiQy4AP1eCBCJ9NclaSs6vqm31lkvoyMzPD3Nwc69evZ/v27ZOOIy3LJK8sPgc4MLQ82607ogiSbGOw18CmTZvGEk5ajrm5OQ4ePDjpGNIxmYqDxVW1o6q2VtXWdesWvFWGJOkYTbIIDgIbh5Y3dOskSWM0ySLYBVzVnT30IuARjw9I0vj1dowgyUeAi4C1SWaBXwOeAVBVvwXsBl4F7Ae+C/xCX1kkSUfX51lDVyzxfgFv6OvzJUmjmYqDxZKk/lgEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxFoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcRaBJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1rtciSHJxkruT7E9yzQLvb0pya5IvJvlKklf1mUeSdKTeiiDJauAG4BLgPOCKJOfNG/arwE1VdT5wOfCbfeWRJC2szz2CC4D9VXVPVT0B7AQumzemgGd2r88E/qLHPJKkBfRZBOcAB4aWZ7t1w64DrkwyC+wG3rjQhpJsS7InyZ4HHnigj6yS1KxJHyy+AvhAVW0AXgV8OMkRmapqR1Vtraqt69atG3tISTqZ9VkEB4GNQ8sbunXDXg/cBFBV/wc4FVjbYyZJ0jx9FsHtwJYk5yY5hcHB4F3zxnwDeDlAkh9jUATO/UjSGPVWBFV1CLgauAW4i8HZQXcmuT7Jpd2wXwJ+McmXgY8A/7yqqq9MkqQjrelz41W1m8FB4OF11w693gdc2GcGSdLiJn2wWJI0YRaBJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxFoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcWsmHUArZ2Zmhrm5OdavX8/27dsnHUfSlLAITiJzc3McPHhw0jEkTRmnhiSpcRaBJDXOIpCkxlkEktQ4i0CSGtdrESS5OMndSfYnueYoY34myb4kdyb5/T7z6MQ2MzPDVVddxczMzKSjSE1Z9PTRJI8CdbT3q+qZi/zsauAG4JXALHB7kl1VtW9ozBbgl4ELq+rbSf7mMvPrJOLpr9JkLFoEVXUGQJK3Ad8EPgwE+Dng7CW2fQGwv6ru6baxE7gM2Dc05heBG6rq293n3X8Mv4Mk6TiMekHZpVX1E0PL70nyZeDaRX7mHODA0PIs8MJ5Y34EIMltwGrguqr6oxEzSRLgVfXHa9QieCzJzwE7GUwVXQE8tkKfvwW4CNgAfDrJj1fVw8
ODkmwDtgFs2rRpBT5WGvjUi1+yItt5fM1qSHh8dnbFtvmST39qRbbTAqcVj8+oB4t/FvgZ4P92X/+0W7eYg8DGoeUN3bphs8CuqvpeVd0LfI1BMXyfqtpRVVurauu6detGjCxJGsVIewRVdR+D+f3luB3YkuRcBgVwOUeWx39lsHfxO0nWMpgqumeZnyNJOg4j7REk+ZEkn0jy1W75+Ul+dbGfqapDwNXALcBdwE1VdWeS65Nc2g27BfhWkn3ArcBbq+pbx/rLSJKWb9RjBL8NvBV4L0BVfaU75//fLfZDVbUb2D1v3bVDrwt4c/clSZqAUY8RnFZVn5+37tBKh5Ekjd+oRfBgkr9Fd3FZktcyuK5AkjTlRp0aegOwA/jRJAeBexlcVCZJmnKjFsHXq+oVSU4HVlXVo32GkiSNz6hTQ/cm2QG8CPhOj3kkSWM26h7BjwKvZjBF9P4kfwjsrKo/7S1ZQ75x/Y+vyHYOPfQsYA2HHvr6imxz07V3HH8oSSe8kfYIquq7VXVTVf0T4HzgmYDXv0vSSWDk5xEkeUmS3wT2AqcyuOWEJGnKjTQ1lOQ+4IvATQyu/l2JG85Jkk4Aox4jeH5V/WWvSSRJE7HUE8pmqmo78PYkRzyprKr+dW/JJEljsdQewV3d9z19B5EkTcZSj6r8b93LO6rqC2PII0kas1HPGvoPSe5K8rYkz+s1kSRprEa9juClwEuBB4D3JrljqecRSJKmw6hnDVFVc8C7ktwKzDB4cP2izyNQGy5894Ursp1THj6FVaziwMMHVmSbt73xthVIJZ38Rn1C2Y8luS7JHcC7gf/N4BnEkqQpN+oewY3ATuAfVNVf9JhHkjRmSxZBktXAvVX1n8eQR5I0ZktODVXVk8DGJKeMIY8kacxGnRq6F7gtyS7g6fsMVdU7e0klSRqbUYvgz7uvVcAZ/cWRJI3bSEVQVb/edxBJ0mSMehvqW4GFbjr3shVPJEkaq1Gnht4y9PpU4DXAoZWPI0kat1GnhvbOW3Vbks/3kEfHYe2pTwGHuu+SNJpRp4aeNbS4CtgKnNlLIh2ztzz/4UlHkDSFRp0a2stfHyM4BNwHvL6PQJKk8VrqCWV/FzhQVed2y69jcHzgPmBf7+kkSb1b6sri9wJPACR5MfDvgQ8CjwA7+o0mSRqHpaaGVlfVQ93rfwbsqKqPAR9L8qV+o0mSxmGpPYLVSQ6XxcuBTw69N/KzDCRJJ66l/jH/CPCpJA8CjwOfAUjyHAbTQ5KkKbfoHkFVvR34JeADwE9V1eEzh1YBb1xq40kuTnJ3kv1Jrllk3GuSVJKto0eXJK2EJad3quqzC6z72lI/1z3H4AbglcAscHuSXVW1b964M4A3AZ8bNbQkaeWM9KjKY3QBsL+q7qmqJxg84eyyBca9DfgN4P/1mEWSdBR9FsE5wIGh5dlu3dOSvADYWFX/fbENJdmWZE+SPQ888MDKJ5WkhvVZBItKsgp4J4NjEIuqqh1VtbWqtq5bt67/cJLUkD6L4CCwcWh5Q7fusDOA5wF/kuQ+4EXALg8YS9J49XktwO3AliTnMiiAy4GfPfxmVT0CrD28nORPgLdU1Z4eMy1qZmaGubk51q9fz/bt2ycVQ5LGqrciqKpDSa4GbgFWAzdW1Z1Jrgf2VNWuvj77WM3NzXHw4MGlB6oXdVrxFE9Rpx3xDCRJPer16uCq2g3snrfu2qOMvajPLDrxfe/C7006gtSkiR0sliSdGCwCSWqcRSBJjbMIJKlxFoEkNc4ikKTGnRQPl/k7b/3QimznjAcfZTXwjQcfXZFt7n3HVccfSpJ65h6BJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNOymuI1gpT51y+vd9l6QWWARDHtvy05OOIElj59SQJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxvR
ZBkouT3J1kf5JrFnj/zUn2JflKkk8k+eE+80iSjtRbESRZDdwAXAKcB1yR5Lx5w74IbK2q5wM3A9v7yiNJWlifewQXAPur6p6qegLYCVw2PKCqbq2q73aLnwU29JhHkrSAPp9ZfA5wYGh5FnjhIuNfD/yPhd5Isg3YBrBp06aVyidpwt5+5WtXZDsP3f/I4PvcN1dkm7/yuzcf9zamyQlxsDjJlcBW4B0LvV9VO6pqa1VtXbdu3XjDSSM4q4pnVXFW1aSjSMvW5x7BQWDj0PKGbt33SfIK4FeAl1TVX/WYR+rNlU8+NekI0jHrc4/gdmBLknOTnAJcDuwaHpDkfOC9wKVVdX+PWSRJR9FbEVTVIeBq4BbgLuCmqrozyfVJLu2GvQP4IeCjSb6UZNdRNidJ6kmfU0NU1W5g97x11w69fkWfny9JWtoJcbBYkjQ5FoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcRaBJDXOIpCkxlkEktQ4i0CSGmcRSFLjLAJJapxFIEmNswgkqXEWgSQ1ziKQpMZZBJLUOItAkhpnEUhS4ywCSWqcRSBJjbMIJKlxFoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY2zCCSpcb0WQZKLk9ydZH+SaxZ4/weS/EH3/ueSbO4zjyTpSL0VQZLVwA3AJcB5wBVJzps37PXAt6vqOcB/BH6jrzySpIX1uUdwAbC/qu6pqieAncBl88ZcBnywe30z8PIk6TGTJGmeVFU/G05eC1xcVf+iW/554IVVdfXQmK92Y2a75T/vxjw4b1vbgG3d4nOBu3sJPbAWeHDJUScu80/ONGcH809a3/l/uKrWLfTGmh4/dMVU1Q5gxzg+K8meqto6js/qg/knZ5qzg/knbZL5+5waOghsHFre0K1bcEySNcCZwLd6zCRJmqfPIrgd2JLk3CSnAJcDu+aN2QW8rnv9WuCT1ddclSRpQb1NDVXVoSRXA7cAq4Ebq+rOJNcDe6pqF/B+4MNJ9gMPMSiLSRvLFFSPzD8505wdzD9pE8vf28FiSdJ08MpiSWqcRSBJjbMIOkvdDuNEl+TGJPd312ZMlSQbk9yaZF+SO5O8adKZliPJqUk+n+TLXf5fn3SmY5FkdZIvJvnDSWdZriT3JbkjyZeS7Jl0nuVKclaSm5P8WZK7kvy9sX6+xwievh3G14BXArMMzni6oqr2TTTYMiR5MfAd4ENV9bxJ51mOJGcDZ1fVF5KcAewF/vG0/PfvroY/vaq+k+QZwJ8Cb6qqz0442rIkeTOwFXhmVb160nmWI8l9wNb5F6NOiyQfBD5TVe/rzrI8raoeHtfnu0cwMMrtME5oVfVpBmdeTZ2q+mZVfaF7/ShwF3DOZFONrga+0y0+o/uaqr+wkmwA/iHwvklnaU2SM4EXMziLkqp6YpwlABbBYecAB4aWZ5mif4hOJt0daM8HPjfZJMvTTat8Cbgf+J9VNVX5gf8EzABPTTrIMSrgj5Ps7W5JM03OBR4AfqebmntfktPHGcAi0AkjyQ8BHwP+TVX95aTzLEdVPVlVP8ngCvoLkkzN9FySVwP3V9XeSWc5Dj9VVS9gcLfjN3RTpdNiDfAC4D1VdT7wGDDW45QWwcAot8NQj7q59Y8Bv1dV/2XSeY5Vt0t/K3DxpLMsw4XApd08+07gZUl+d7KRlqeqDnbf7wc+zmC6d1rMArNDe5E3MyiGsbEIBka5HYZ60h1sfT9wV1W9c9J5livJuiRnda9/kMFJB3822VSjq6pfrqoNVbWZwf/7n6yqKycca2RJTu9OMqCbUvlpYGrOnquqOeBAkud2q14OjPVEiam4+2jfjnY7jAnHWpYkHwEuAtYmmQV+rareP9lUI7sQ+Hngjm6eHeDfVtXuCWZajrOBD3Znn60CbqqqqTsFc4o9G/h49yiTNcDvV9UfTTbSsr0R+L3uD9F7gF8Y54d7+qgkNc6pIUlqnEUgSY2zCC
SpcRaBJDXOIpCkxlkE0lEkebK7m+VXk3w0yWnHub3N03h3WJ38LALp6B6vqp/s7ub6BPAvR/mhJF6fo6liEUij+QzwnCT/KMnnupuD/a8kzwZIcl2SDye5jcFzuJ+d5OPdMwq+nOTvd9tZneS3u+cW/HF3JbI0URaBtITuL/xLgDsYPGvgRd3NwXYyuGPnYecBr6iqK4B3AZ+qqp9gcN+Yw1eqbwFuqKq/DTwMvGY8v4V0dO7CSkf3g0O3vPgMg/shPRf4g+5hOqcA9w6N31VVj3evXwZcBYM7kwKPJPkbwL1VdXibe4HN/f4K0tIsAunoHu9uLf20JO8G3llVu5JcBFw39PZjI2zzr4ZePwk4NaSJc2pIWp4z+etblL9ukXGfAP4VPP3QmjP7DiYdK4tAWp7rgI8m2Qss9nzcNwEvTXIHgymg88aQTTom3n1UkhrnHoEkNc4ikKTGWQSS1DiLQJIaZxFIUuMsAklqnEUgSY37/wIC7A9FChxzAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "KAzwQUtxtSNE" | |
}, | |
"source": [ | |
"People with fewer than four parents or children aboard are more likely to survive than those with four or more. Again, people traveling alone are less likely to survive than those with 1-3 parents or children." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "-VSRphEPtWHL" | |
}, | |
"source": [ | |
"### Age Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 279 | |
}, | |
"id": "57imtxRFtTKI", | |
"outputId": "58dca94e-538b-49e5-e21c-ad1525893634" | |
}, | |
"source": [ | |
"#sort the ages into logical categories\n", | |
"train[\"Age\"] = train[\"Age\"].fillna(-0.5)\n", | |
"test[\"Age\"] = test[\"Age\"].fillna(-0.5)\n", | |
"bins = [-1, 0, 5, 12, 18, 24, 35, 60, np.inf]\n", | |
"labels = ['Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult', 'Adult', 'Senior']\n", | |
"train['AgeGroup'] = pd.cut(train[\"Age\"], bins, labels = labels)\n", | |
"test['AgeGroup'] = pd.cut(test[\"Age\"], bins, labels = labels)\n", | |
"\n", | |
"#draw a bar plot of Age vs. survival\n", | |
"sns.barplot(x=\"AgeGroup\", y=\"Survived\", data=train)\n", | |
"plt.show()" | |
], | |
"execution_count": 233, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAeAklEQVR4nO3de5wdZZ3n8c83jQl3FNIumAvJalSioEgTVJCLggYviQpqAgyyOmbdMV5WMRNXFmMcxhVHVDQikcELKhFx0MbJGmcVMCBKOhCISQzbJMEkGkkIKCC3JL/5o56mKyfn1umuPunU9/169eucqvPUU79Tp7p+VU9VPaWIwMzMymtYqwMwM7PWciIwMys5JwIzs5JzIjAzKzknAjOzktun1QH01ciRI2PcuHGtDsPMbEhZunTplohor/bZkEsE48aNo6urq9VhmJkNKZLur/WZm4bMzErOicDMrOScCMzMSs6JwMys5JwIzMxKzonAzKzknAjMzErOicDMrOSG3A1lZTVr1iw2bdrE4YcfzqWXXtrqcMxsL+JEMERs2rSJjRs3tjoMM9sLuWnIzKzknAjMzErOicDMrOScCMzMSs6JwMys5JwIzMxKrtBEIGmypNWSuiXNrvL5WEk3SbpL0j2S3lhkPGZmtqvCEoGkNmAecCYwEZguaWJFsYuA6yLiWGAa8LWi4jEzs+qKPCKYBHRHxJqIeApYAEytKBPAwen9IcAfC4zHzMyqKDIRjALW54Y3pHF5c4DzJG0AFgIfrFaRpBmSuiR1bd68uYhYzcxKq9Uni6cD34qI0cAbgWsk7RJTRMyPiI6I6Ghvbx/0IM3M9mZFJoKNwJjc8Og0Lu+9wHUAEXE7sC8wssCYzMysQpGJYAkwQdJ4ScPJTgZ3VpT5A/A6AElHkSUCt/2YmQ2iwhJBRGwDZgKLgFVkVwetkDRX0pRU7GPA+yTdDVwLXBARUVRMZma2q0K7oY6IhWQngfPjLs69XwmcWGQMZmZWX6tPFpuZWYs5EZiZlZwTgZlZyTkRmJmVnBOBmVnJORGYmZWcE4GZWck5EZiZlZwTgZlZyTkRmJmVnBOBmVnJORGYmZWcE4GZWck5EZiZlZwTgZlZyRWaCCRNlrRaUrek2VU+/6KkZenvXkkPFxmPmZntqrAH00hqA+YBZwAbgCWSOtPDaACIiP+ZK/9B4Nii4jEzs+qKfELZJKA7ItYASFoATAVW1ig/HfhUgfG01B/mHt2v6bdtPRTYh21b7+9XXWMvXt6vOMxs71Nk09AoYH1ueEMatwtJRwLjgV/W+HyGpC5JXZs3+9n2ZmYDaU85WTwNuD4itlf7MCLmR0RHRHS0t7cPcmhmZnu3IhPBRmBMbnh0GlfNNODaAmMxM7MaikwES4AJksZLGk62se+sLCTpxcBzgNsLjMXMzGooLBFExDZgJrAIWAVcFxErJM2VNCVXdBqwICKiqFjMzKy2Iq8aIiIWAgsrxl1cMTynyBjMzKy+PeVksZmZtYgTgZlZyTkRmJmVnBOBmVnJORGYmZWcE4GZWck5EZiZlZwTgZlZyTkRmJmVnBOBmVnJORGYmZWcE4GZWck5EZiZlZwTgZlZyTkRmJmVnBOBmVnJFZoIJE2WtFpSt6TZNcq8U9JKSSskfb/IeMzMbFeFPaFMUhswDzgD2AAskdQZEStzZSYAnwBOjIiHJD23qHjMzKy6Io8IJgHdEbEmIp4CFgBTK8q8D5gXEQ8BRMQDBcZjZmZVFJkIRgHrc8Mb0ri8FwIvlHSbpN9ImlytIkkzJHVJ6tq8eXNB4ZqZlVOrTxbvA0wATgWmA9+Q9OzKQhExPyI6IqKjvb19kEM0M9u7FZkINgJjcsOj07i8DUBnRDwdEWuBe8kSg5mZDZIiE8ESYIKk8ZKGA9OAzooyPyY7GkDSSLKmojUFxmRmZhUKSwQRsQ2YCSwCVgHXRcQKSXMlTUnFFg
EPSloJ3AR8PCIeLComMzPbVWGXjwJExEJgYcW4i3PvA/ho+jMzsxZo9cliMzNrMScCM7OScyIwMyu5Qs8R2MAZue8OYFt63XPNmjWLTZs2cfjhh3PppZe2Ohwza4ITwRBx4TEPtzqEpmzatImNGytvFzGzPZmbhszMSs6JwMys5Nw0ZKXkcxlmvZwIrJR8LsOsl5uGzMxKzonAzKzknAjMzErOicDMrOScCMzMSs6JwMys5OpePirpESBqfR4RBw94RGZmNqjqHhFExEFpY/9lYDYwiuzZw/8IfKlR5ZImS1otqVvS7CqfXyBps6Rl6e/vd+9rmJnZ7mr2hrIpEfGy3PAVku4GLq41gaQ2YB5wBtlD6pdI6oyIlRVFfxARM/sStJmZDZxmzxE8JulcSW2Shkk6F3iswTSTgO6IWBMRTwELgKn9CdbMzAZes4ngHOCdwJ/T3zvSuHpGAetzwxvSuEpnSbpH0vWSxlSrSNIMSV2SujZv3txkyGZm1oymEkFErIuIqRExMiLaI+KtEbFuAOZ/IzAuIo4B/gP4do35z4+IjojoaG9vH4DZmplZj6YSgaQXSvqFpN+l4WMkXdRgso1Afg9/dBr3jIh4MCKeTINXAcc1F7aZmQ2UZpuGvgF8AngaICLuAaY1mGYJMEHSeEnDU/nOfAFJR+QGpwCrmozHzMwGSLNXDe0fEXdIyo/bVm+CiNgmaSawCGgDro6IFZLmAl0R0Ql8SNKUVNdW4IK+fgEzM+ufZhPBFknPJ91cJuls4E+NJoqIhcDCinEX595/guxIw8yGKD/kZ+hrNhF8AJgPvFjSRmAtcG5hUZnZkDFUHvLjhFVbs4ng/og4XdIBwLCIeKTIoMzMBtpQSVit0OzJ4rWS5gOvBB4tMB4zMxtkzSaCFwP/j6yJaK2kr0o6qbiwzMxssDR7Q9nfIuK6iHg7cCxwMHBLoZGZmdmgaPp5BJJOkfQ1YCmwL1mXE2ZmNsQ1dbJY0jrgLuA64OMR0ajDOTMzGyKavWromIj4a6GRmJlZSzR6QtmsiLgUuETSLk8qi4gPFRaZmZkNikZHBD19/3QVHYiZmbVG3UQQETemt8sj4s5BiMfMzAZZs+cIviDpcOB6skdL/q7AmMwauuXkU/o1/eP7tIHE4xs27HZdp/yq2Cuo3SWCDZZm7yM4DTgN2AxcKWl5E88jMLN+6OkSYdOmTa0OxfZyTd9HEBGbIuJy4P3AMuo8uN7MzIaOZp9QdpSkOZKWA18Bfk32xDEzMxvimj0iuBp4CHhDRJwaEVdExAONJpI0WdJqSd2SZtcpd5akkNTRZDxmZnulWbNmcf755zNr1qxBm2fDk8WS2oC1EfHlvlScppsHnAFsAJZI6oyIlRXlDgI+DPy2L/Wbme2NWtFddsNEEBHbJY2RNDwinupD3ZOA7ohYAyBpATAVWFlR7jPA54CP96FuMxtAl5x39m5Pu/WBv2Svm/7Ur3o++d3rd3ta659mLx9dC9wmqRN4pp+hiLiszjSjgPW54Q3ACfkCkl4BjImIf5dUMxFImgHMABg7dmyTIZvZ3mTVJb/s1/RPbX38mdf+1HXUJ1/brzj2RM2eI7gP+Gkqf1Dub7dJGgZcBnysUdmImB8RHRHR0d7e3p/ZmplZhaaOCCLi07tR90ZgTG54dBrX4yDgpcDNkgAOBzolTYkId2lhZjZImu2G+iagWqdz9Y6RlgATJI0nSwDTgHNy0/4FGJmbx83AhU4CZmaDq9lzBBfm3u8LnAVsqzdBRGyTNBNYBLQBV0fECklzga6I6NydgM3MbGA12zS0tGLUbZLuaGK6hcDCinFV70iOiFObicXMzAZWs01Dh+YGhwEdwCGFRGRmZoOq2aahpfSeI9gGrAPeW0RAZmY2uBo9oex4YH1EjE/D7yY7P7COXW8MMzOzIajRfQRXAk8BSDoZ+CzwbeAvwPxiQzMzs8HQqGmoLSK2pvfvAuZHxI+AH0laVmxog8MP/9jZiV
85sV/TD394OMMYxvqH1+92Xbd98LZ+xWBmfdMwEUjaJyK2Aa8jdfPQ5LRDQis6eDIz25M02phfC9wiaQvwOLAYQNILyJqHzMxsiGv08PpLJP0COAL4eUT0XDk0DPhg0cGZDXVf/diNuz3tw1see+a1P/XM/MJbdntaK4dmuqH+TZVx9xYTjpmZDbamn1lsZmZ7JycCM7OS2yuu/DGz1tm3bdhOrzb0OBGYWb8ce1i/nlE1aA7b95CdXq2XE4GZlcLMY89pXKikfCxnZlZyTgRmZiVXaCKQNFnSakndkmZX+fz9kpZLWibpVkkTi4zHrMezIzg0gmfHLk9gNSudws4RSGoD5gFnABuAJZI6IyLfffX3I+LrqfwU4DJgclExmfU4b/uOVodgtsco8ohgEtAdEWsi4ilgATA1XyAi/pobPIDeh9+YmdkgKfKqoVHA+tzwBuCEykKSPgB8FBgOvLZaRZJmkHo+HTt27IAHamZWZi2/fDQi5gHzJJ0DXAS8u0qZ+aQH4XR0dOx01HDcx7/Tr/kftOUR2oA/bHmkX3Ut/fz5/YrDzKxVimwa2giMyQ2PTuNqWQC8tcB4zMysiiITwRJggqTxkoYD04DOfAFJE3KDbwL+f4HxmJlZFYU1DUXENkkzgUVAG3B1RKyQNBfoiohOYKak04GngYeo0ixkZmbFKvQcQUQsBBZWjLs49/7DRc7fzMwa853FZmYl50RgZlZyTgRmZiXX8vsIzKy6A4YfvNOrWVGcCMz2UCc+/+2tDsFKwk1DZmYl50RgZlZyTgRmZiXnRGBmVnJOBGZmJedEYGZWck4EZmYl5/sIzMwG0Jw5c/o1/datW5957U9dfZm29Ilgx/ADdno1Myub0ieCxya8vtUhmJm1VKHnCCRNlrRaUrek2VU+/6iklZLukfQLSUcWGY+Zme2qsEQgqQ2YB5wJTASmS5pYUewuoCMijgGuBy4tKh4zM6uuyCOCSUB3RKyJiKfIHk4/NV8gIm6KiL+lwd+QPeDezMwGUZGJYBSwPje8IY2r5b3A/y0wHjMzq2KPOFks6TygAzilxuczgBkAY8eOHcTIzMz2fkUeEWwExuSGR6dxO5F0OvBJYEpEPFmtooiYHxEdEdHR3t5eSLBmZmVVZCJYAkyQNF7ScGAa0JkvIOlY4EqyJPBAgbGYmVkNhSWCiNgGzAQWAauA6yJihaS5kqakYp8HDgR+KGmZpM4a1ZmZWUEKPUcQEQuBhRXjLs69P73I+ZuZWWPudM7MrOScCMzMSs6JwMys5JwIzMxKzonAzKzk9og7i23vEfsHO9hB7B+tDsXMmuREYAPq6ROfbnUIZtZHbhoyMys5JwIzs5JzIjAzKzknAjOzknMiMDMrOScCM7OScyIwMys5JwIzs5JzIjAzK7lCE4GkyZJWS+qWNLvK5ydLulPSNklnFxmLmZlVV1gikNQGzAPOBCYC0yVNrCj2B+AC4PtFxWFmZvUV2dfQJKA7ItYASFoATAVW9hSIiHXpsx0FxmFmZnUU2TQ0ClifG96QxvWZpBmSuiR1bd68eUCCMzOzzJA4WRwR8yOiIyI62tvbWx2OmdlepchEsBEYkxsencaZmdkepMhEsASYIGm8pOHANKCzwPmZmdluKCwRRMQ2YCawCFgFXBcRKyTNlTQFQNLxkjYA7wCulLSiqHjMzIaCESNGsN9++zFixIhBm2ehTyiLiIXAwopxF+feLyFrMjIzM+Doo48e9HkOiZPFZmZWHCcCM7OScyIwMys5JwIzs5JzIjAzKzknAjOzknMiMDMrOScCM7OScyIwMys5JwIzs5JzIjAzKzknAjOzknMiMDMrOScCM7OScyIwMys5JwIzs5IrNBFImixptaRuSbOrfD5C0g/S57+VNK7IeMzMbFeFJQJJbcA84ExgIjBd0sSKYu8FHoqIFwBfBD5XVDxmZlZdkUcEk4DuiFgTEU8BC4CpFWWmAt9O768HXidJBcZkZmYVFBHFVC
ydDUyOiL9Pw38HnBARM3NlfpfKbEjD96UyWyrqmgHMSIMvAlYPcLgjgS0NS7We4xxYQyHOoRAjOM6BVkScR0ZEe7UPCn14/UCJiPnA/KLql9QVER1F1T9QHOfAGgpxDoUYwXEOtMGOs8imoY3AmNzw6DSuahlJ+wCHAA8WGJOZmVUoMhEsASZIGi9pODAN6Kwo0wm8O70/G/hlFNVWZWZmVRXWNBQR2yTNBBYBbcDVEbFC0lygKyI6gX8FrpHUDWwlSxatUFiz0wBznANrKMQ5FGIExznQBjXOwk4Wm5nZ0OA7i83MSs6JwMys5IZUIpA0Lt17kB83R9KFdaa5QNJXi4+u7yRtl7RM0t2S7pT06gbld/n+g0XS4ZIWSLpP0lJJCyXNkPTTGuWv6rmTXNI6SSOrlKn726Uyh6VltEzSJkkbc8PDB+bbDTxJn5S0QtI9KdYTJH1E0v67UdejfSwvSbdKOjOt/8+T9A5JP+vrvPsrfecnJB1Sp0zV9aOizLfSvUlULkdJb5UUkl5cY9qbJdW9FDO/LvYss3rlm1VtPdiNOjokXT4Q8dQyJO4j2Is9HhEvB5D0BuCzwCmtDWlX6W7vG4BvR8S0NO5lwJRa0/TcSNhfEfEg0LOM5gCPRsS/DETdRZF0EvBm4BUR8WTayA0HfgB8F/hbkfOPiJD0fuCHwGbgPuCfgclFzreG6WRXEL4d+OYA1fkRdl6O04Fb0+unBqD+C4DfAX/sTyWSXkX19aBPIqIL6OrDfPeJiG19mceQOiKoJ2X9z0m6Q9K9kl5TpcybJN0uaWTaw7hc0q8lrcntbUjS5yX9TtJySe9K4+dJmpLe3yDp6vT+PZIuSXvrqyR9I+0B/FzSfn34CgcDD6U6D5T0i3SUsFxSvmuOfSR9L83rekn7S3qtpB/nvucZkm7o80Ks7TTg6Yj4es+IiLgbWAwcmOL4fYpLKYaqe2FpD+leSbeS3SXeZ5KOk3SLsiOTRZKOSOOfL+lnafzinj3EOr91zeUs6X8r6zDxVknX5vYW683j65J+C8wCtkTEk2lZbSG7PPp5wE2SbkrTPJqb39mSvpXej0/r6XJJ/1Tx3T8uaUnaw/x0GrfLuke28V8NnAD8BDgQ+GKa7jeSjknT7nRUltb7cfXWZ0nHq3cP9/OqcZQq6flpvheRbaR7xh+W6lsh6SqgZ53Z6YhX0oXKkn++zg/ll6OkA4GTyPot69lJ2U/Z0euq9H+wX276qss8Pw7oAL6Xvl9f/ocrHUHFehARf6yz/lbdhkk6VenIW9Khkn5c43e8RtJtwDV9DXSvSQTJPhExiWyPYac9A0lvA2YDb8x1YXEE2Ur0ZuD/pHFvJ9sDfRlwOvD59EMtBnqSyyiyjvRI436V3k8A5kXES4CHgbMaxLtfWtl+D1wFfCaNfwJ4W0S8gmwj/IWeDSzZxvNrEXEU8FfgH4CbgBdL6rl9/L8BVzeYd1+8FFha47NjyZb3ROC/AifWqkTScWT/rC8H3ggcvxuxCPgKcHZEHEf2PS9Jn80HPpjGXwh8LTddtd+66nKWdDzZb/cysk4T8wmt3jxGA68GzgHGpH/mr0k6JSIuJ9vDPC0iTmvwHb8MXBERRwN/euaLS68nW8cmkS3D4ySdnD6utu6dC2wnuzT7x8DSiDgG+F/AdxrEUKtOyPbs/3s6mt1eZ/ppZH2MLQZeJOm/pPGfAm5N9d4AjG0iFgCqLMepwM8i4l7gwbSO/Q/gb+l/5FPAcX2o/3qyve9zI+LlEfF4s9NW8XMq1gNJz6L2+gt1tmHJp4G7avyOE4HTI2J6lenqGmpNQ7Wude0Z/2/pdSkwLvf5a8n+mV8fEX/Njf9xROwAVuZW0pOAayNiO/BnSbeQbbAWAx9R1u69EnhOShCvAj4EHAasjYhlNWKoJt809CrgO5JeSrax++f0T76DLPH0xLc+Im5L778LfCgi/kXSNcB5kr6ZYjq/wbwHyh25vqKWkX3nW2uUfQ1wQ0
T8LZWvvMGwGSPIEtN/pNzYBvwp7Rm+Gvhhb85kRG66ar91reV8IvCTiHgCeELSjSneRvP4YVpvHk0bpNeQJZgfqEo37HWcSO9G9xp6e+V9ffq7Kw0fSLax/gNV1r2IeEzSA8DCFMtZABHxy7RXfnCDOHapU9KzgYMi4vY0/vtkybWa6WSJdoekHwHvAL4KnEy2w0VE/LukhxrEUc90ssQJWdKZDrwAuDzVf4+ke/pR/26LiF3WA+CfqLL+5iartQ3rcRK1f8fO3U1cQy0RPAg8p2LcocDa9P7J9Lqdnb/bfWR7qy9k57a2J3Pv6/Z6GhEb0z/BZLIjgEOBd5K1WT8i6bCK+raTOyRtJCJuV9aG2E62t9wOHBcRT0taB+zbU7Ry0vT6TeBGsr3cH/a1jbCBFWRNG9VUfuei1ykBKyLiVTuNzP4ZHu5JrFVU+63PpfZyrmZYg3k81vMmJYSbgZslLaf3Dvq8/G9ZOd9qOz0CPhsRV+40MnuOR711b0eNeAG2sXPLQD6O3V6fJR1NlqR6NnjDyf5P6124US+WavM4lGwn72hJQbZRDXoTZTX1lvmAq7IefIAq629OrW1YMx5rXKS6IdU0FBGPku39vRaeWREmU3sPtMf9ZFn0O5Je0qDsYuBdktpSU8vJwB3ps9+QHbL9KpW7ML32m7K25jayZHcI8EDaOJ0GHJkrOjYdPUDWBHErQET8keyQ+SIG7qRcj18CI5T1AtsT7zH0NpU161fAW1Mb7kHAW3YjlieB9p5lIOlZkl6SjvTWSnpHGi9lJ7TrqbWcbwPeImnfdBTwZoBm5yHpRZIm5Ea9nGwdfAQ4KDf+z5KOkjQMeFtu/G303mV/bm78IuA9KSYkjZL03Abf8Umyo5bFPXVJOpWs7fqvwDrgFWn8K4Dx9SqLiIeBR9R79Uut3gCmA3MiYlz6ex7wPElHkq0H56R5nknvzt2fgeemvdwR1D7S6FmOZwPXRMSRaR5jyJLN0lz9LwWOyU1ba5lXq79faqwHq6iy/vah2lq/Y78MtSMCyJo85km6LA1/OiLuU4PHGETE7yWdS3ZYX28DdANZ08rdZHsPsyJiU/psMVnzUrek+8mOCvqTCPZLzSmQ7e29OyK2S/oecGPag+gCfp+bZjXwAWUnq1cCV+Q++x7QHhGr+hHTLtJVKG8DviTpH8mOOtaRtTv3pZ47Jf2AbNk+QHY1SV/tINsAXK7sksR9gC+RHbWcC1wh6SLgWWRNBXfXqavqco6IJanZ6h6yjdNy4C9pmmbmcSDwlXQEuQ3oJutGfTrwM0l/TO3bs4Gfkl3Z05WmA/gw8P20rH/SU2lE/FzSUcDtaX1/FDiP+u30y8h2gv5MlsTuIbvapucI5UfA+ZJWAL8F7q1TV4/3At+QtAO4Jbds8qaRHdnm3ZDGfxq4Ns3z12RNW6SEPJdsx2sjO6/3efOBnwHPJXcSOvd9jiX731pFtuHNn9+qtczzvgV8XdLjwKv6cZ6g1nown+rrbzPmAFdX+R37xV1M7EWU3S9xV0T8a6tjGeokHZjaePcn24OdERF3tjquPUHPsknvZwNHRMSHWxyW9cNQPCKwKiQtJWsj/FirY9lLzFd2YcC+ZPdPOAn0epOkT5BtP+4nu+7ehjAfEZiZldyQOllsZmYDz4nAzKzknAjMzErOicBKRw16q+xDPR9V1sfScmU9yF6mrAsBsyHFicDKKN9b5W5R1rvn64FXpj6Bjie7N2KXu28lte3ufMwGg68aslJJd+WuJuv75caIeFG6y/SrZN0VrAeeJnvG9vWpr5jLyG4O2gJcEBF/krQeODki1taYz6PAlWQdF36ArKO496SPr4qIL6WuIX4aES9N01wIHBgRcyTdTHaj2ilkl2m+JyLuwKwAPiKwsqnWW+XbyTr4mgj8Hdmd5ahGT5GpX6MDayWB5ADgtxHxMuBxsh5hTwBeCbxP0rFNxLp/6tfoHxjY3mTNduJEYGUznaxbCOjtrfIkso
76dqTuRG5Kn7+I3p4il5H14zS6skJJb1DWnfg69T5lbjtZdwek+m+IiMfSHbn/RnP9NF0LEBG/Ag5OXRWYDTjfWWylUae3yloP8ana02mq61FJ4yNibUQsAhYpe3hIzxOonkg9T9bTqLfNWj3Nmg0oHxFYmdTqrXIrcJakYcqeVXBqKr+a2j1Ffpas87lnp89E7W6NF5P1urq/pAPIer1cTOPeNnuejncS8JeIqNa5m1m/+YjAymQ6vQ956fEj4ChgA1lvruuBO8k2vE8pe3RhtZ4iryCdB5D0JFlPoLdRpS/81Ovqt+jtzvyqiLgLoEFvm09Iuousl9P3YFYQXzVkxk69jR5GtmE+Mdf9eCviuRm4MLIHl5sVykcEZpmfpmae4cBnWpkEzAabjwjMzErOJ4vNzErOicDMrOScCMzMSs6JwMys5JwIzMxK7j8BLxSSyV+qGZ0AAAAASUVORK5CYII=\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "qRTwsi23tck_" | |
}, | |
"source": [ | |
"Babies are more likely to survive than any other age group. " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "BG8EV4yato4Y" | |
}, | |
"source": [ | |
"### Cabin Feature\n", | |
"The idea here is that people with recorded cabin numbers are of higher socioeconomic class, and thus more likely to survive." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 314 | |
}, | |
"id": "on8FHk2stdys", | |
"outputId": "866d3fe3-2015-4d79-ffb9-8904dac3be09" | |
}, | |
"source": [ | |
"#CabinBool is 1 when a cabin number was recorded for the passenger and 0 when it is missing\n", | |
"for frame in (train, test):\n", | |
"    frame[\"CabinBool\"] = frame[\"Cabin\"].notnull().astype('int')\n", | |
"\n", | |
"#share of survivors (in percent) inside each CabinBool group\n", | |
"survived_with_cabin = train.loc[train[\"CabinBool\"] == 1, \"Survived\"]\n", | |
"survived_without_cabin = train.loc[train[\"CabinBool\"] == 0, \"Survived\"]\n", | |
"print(\"Percentage of CabinBool = 1 who survived:\", survived_with_cabin.value_counts(normalize = True)[1]*100)\n", | |
"print(\"Percentage of CabinBool = 0 who survived:\", survived_without_cabin.value_counts(normalize = True)[1]*100)\n", | |
"\n", | |
"#bar plot of Survived against CabinBool\n", | |
"sns.barplot(x=\"CabinBool\", y=\"Survived\", data=train)\n", | |
"plt.show()" | |
], | |
"execution_count": 234, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Percentage of CabinBool = 1 who survived: 66.66666666666666\n", | |
"Percentage of CabinBool = 0 who survived: 29.985443959243085\n" | |
] | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEGCAYAAABo25JHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAR7klEQVR4nO3df5BdZ13H8fen6YRKBQS7UCZJmwyEHx2oQJeiA/JDioZxphX5YQoKzAAZRgIIQikjUzAMMgSB8UfARuzwQyHUMuii0Si/BSlkA6ElCcElLSSRwJYWKCC0C1//2Bu4bO9mb9qcvUme92tmJ+d5znPP+XZnu589z7n3OakqJEntOmXUBUiSRssgkKTGGQSS1DiDQJIaZxBIUuNOHXUBR+uMM86olStXjroMSTqh7Nix44aqGhu074QLgpUrVzI5OTnqMiTphJLkq/Ptc2pIkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1LgT7gNlkk5+l1xyCYcOHeLMM89k48aNoy7npGcQSDruHDp0iIMHD466jGY4NSRJjTMIJKlxBoEkNc4gkKTGdRoESdYk2ZtkKsmlA/a/JcnO3teXk3y7y3okSbfV2buGkiwBNgFPAA4A25NMVNXuw2Oq6iV9418IPLSreiRJg3V5RXA+MFVV+6rqFmALcNERxl8MvLfDeiRJA3QZBMuA/X3tA72+20hyNrAK+Mg8+9clmUwyOT09fcwLlaSWHS83i9cCV1XVjwftrKrNVTVeVeNjYwMfuSlJup26DIKDwIq+9vJe3yBrcVpIkkaiyyDYDqxOsirJUmZ/2U/MHZTkAcDdgU93WIskaR6dBUFVzQDrgW3AHuDKqtqVZEOSC/uGrgW2VFV1VYskaX6dLjpXVVuBrXP6LpvTfk2XNUiSjux4uVksSRoRg0CSGmcQSFLjDAJJapxPKJOOI1/b8OBRl3BcmLnxHsCpzNz4Vb8nwFmXXdvp8b0ikKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXGdBkGSNUn2JplKcuk8Y56WZHeSXUne02U9kqTb6uzBNEmWAJuAJwAHgO1JJqpqd9+Y1cArgUdW1U1J7tlVPZKkwbq8IjgfmKqqfVV1C7AFuGjOmOcBm6rqJoCq+maH9UiSBugyCJYB+/vaB3p9/e4H3C/Jp5JcnWTNoAMlWZdkMsnk9PR0R+VKOl6ccdpPuNcvzHDGaT8ZdSlNGPUzi08FVgOPBZYDn0jy4Kr6dv+gqtoMbAYYHx+vxS5S0uJ62bnfXniQjpkurwgOAiv62st7ff0OABNVdWtVXQd8mdlgkCQtki6DYDuwOsmqJEuBtcDEnDH/xOzVAEnOYHaqaF+HNUmS5ugsCKpqBlgPbAP2AFdW1a4kG5Jc2Bu2DfhWkt3AR4GXV9W3uqpJknRbnd4jqKqtwNY5fZf1bRfw0t6XJGkE/GSxJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1LhOgyDJmiR7k0wluXTA/mcnmU6ys/f13C7rkSTdVmcPr0+yBNgEPAE4AGxPMlFVu+cMfV9Vre+qDknSkXV5RXA+MFVV+6rqFmALcFGH55Mk3Q5dBsEyYH9f+0Cvb64nJ7kmyVVJVgw6UJJ1SSaTTE5PT3dRqyQ1a9Q3iz8IrKyqc4H/BN45aFBVba6q8aoaHxsbW9QCJelk12UQHAT6/8Jf3uv7qar6VlX9qNd8O3Beh/VIkgboMgi2A6uTrEqyFFgLTPQPSHLvvuaFwJ4O65EkDdDZu4aqaibJemAbsAS4oqp2JdkATFbVBPCiJBcCM8CNwLO7qkeSNF
hnQQBQVVuBrXP6LuvbfiXwyi5rkCQd2ahvFkuSRswgkKTGGQSS1DiDQJIaZxBIUuMMAklqnEEgSY0zCCSpcQaBJDXuiJ8sTnIzUPPtr6q7HvOKJEmL6ohBUFV3AUjyWuDrwLuBAM8A7n2El0qSThDDTg1dWFVvraqbq+q7VfU2fNqYJJ0Uhg2C7yd5RpIlSU5J8gzg+10WJklaHMMGwdOBpwHf6H09tdcnSTrBDbUMdVVdj1NBknRSGuqKIMn9knw4yRd77XOTvKrb0iRJi2HYqaG/ZfYBMrcCVNU1zD56UpJ0ghs2CO5cVZ+d0zdzrIuRJC2+YYPghiT3offhsiRPYfZzBZKkE9ywQfAC4HLgAUkOAn8EPH+hFyVZk2Rvkqkklx5h3JOTVJLxIeuRJB0jwz68/qtVdUGS04FTqurmhV6QZAmwCXgCcADYnmSiqnbPGXcX4MXAZ46udEnSsTDsFcF1STYDvwp8b8jXnA9MVdW+qroF2MLgt6C+FngD8MMhjytJOoaGDYIHAB9idorouiR/neRRC7xmGbC/r32g1/dTSR4GrKiqfz3SgZKsSzKZZHJ6enrIkiVJwxgqCKrqB1V1ZVX9LvBQ4K7Ax+/IiZOcArwZ+OMhzr+5qsaranxsbOyOnFaSNMfQzyNI8pgkbwV2AKcxu+TEkRwEVvS1l/f6DrsL8CDgY0muZ3baacIbxpK0uIa6Wdz7Rf154Erg5VU1zIJz24HVSVYxGwBr6VufqKq+A5zRd46PAS+rqslhi5ck3XHDvmvo3Kr67tEcuKpmkqwHtgFLgCuqaleSDcBkVU0cZa2SpA4s9ISyS6pqI/C6JLd5UllVvehIr6+qrcDWOX2XzTP2sQtWK0k65ha6ItjT+9fpGkk6SS30qMoP9javrarPLUI9kqRFNuy7ht6UZE+S1yZ5UKcVSZIW1bCfI3gc8DhgGrg8ybU+j0CSTg5Df46gqg5V1V8yu9jcTmDgTV9J0oll2CeUPTDJa5JcC/wV8N/MfkBMknSCG/ZzBFcwu2jcb1XV/3ZYjyRpkS0YBL3lpK+rqr9YhHokSYtswamhqvoxsCLJ0kWoR5K0yIadGroO+FSSCeCn6wxV1Zs7qUqStGiGDYKv9L5OYXbVUEnSSWKoIKiqP+26EEnSaAy7DPVHgUGLzv3GMa9Ii+aSSy7h0KFDnHnmmWzcuHHU5UgakWGnhl7Wt30a8GRg5tiXo8V06NAhDh48uPBASSe1YaeGdszp+lSSz3ZQjyRpkQ07NXSPvuYpwDhwt04qkiQtqmGnhnbws3sEM8D1wHO6KEiStLgWekLZw4H9VbWq134Ws/cHrgd2d16dJKlzC32y+HLgFoAkjwZeD7wT+A6wudvSJEmLYaGpoSVVdWNv+/eAzVX1fuD9SXZ2W5okaTEsdEWwJMnhsHg88JG+fcMsWLcmyd4kU0kuHbD/+b2H3OxM8skk5wxfuiTpWFgoCN4LfDzJPwP/B/wXQJL7Mjs9NK/eqqWbgCcC5wAXD/hF/56qenBVPQTYCLh2kSQtsoUeXv+6JB8G7g38R1UdfufQKcALFzj2+cBUVe0DSLIFuIi+m8xV9d2+8acz4NPLkqRuLTi9U1VXD+j78hDHXgbs72sfAB4xd1CSFwAvBZYCA5esSLIOWAdw1llnDXHqIzvv5e+6w8c4GdzlhptZAnzthpv9ngA73vjMUZcgjcTQzyzuSlVtqqr7AK8AXjXPmM1VNV5V42NjY4tboCSd5LoMgoPAir728l7ffLYAv9NhPZKkAboMgu3A6iSrek83WwtM9A9Isrqv+dvA/3RYjyRpgGGXmDhqVTWTZD2wDVgCXFFVu5JsACaragJYn+QC4FbgJuBZXdUjSRqssyAAqKqtwNY5fZf1bb+4y/NLkhY28pvFkqTRMggkqXEGgSQ1ziCQpMYZBJLUuE7fNaTj20+Wnv5z/0pqk0HQsO+v/s1RlyDpOODUkCQ1ziCQpMYZBJLUOINAkhpnEE
hS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1DiDQJIa12kQJFmTZG+SqSSXDtj/0iS7k1yT5MNJzu6yHknSbXUWBEmWAJuAJwLnABcnOWfOsM8D41V1LnAVsLGreiRJg3V5RXA+MFVV+6rqFmALcFH/gKr6aFX9oNe8GljeYT2SpAG6DIJlwP6+9oFe33yeA/zboB1J1iWZTDI5PT19DEuUJB0XN4uT/D4wDrxx0P6q2lxV41U1PjY2trjFSdJJrssnlB0EVvS1l/f6fk6SC4A/AR5TVT/qsB5J0gBdXhFsB1YnWZVkKbAWmOgfkOShwOXAhVX1zQ5rkSTNo7MgqKoZYD2wDdgDXFlVu5JsSHJhb9gbgV8E/jHJziQT8xxOktSRTh9eX1Vbga1z+i7r276gy/NLkhZ2XNwsliSNjkEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxnQZBkjVJ9iaZSnLpgP2PTvK5JDNJntJlLZKkwToLgiRLgE3AE4FzgIuTnDNn2NeAZwPv6aoOSdKRndrhsc8HpqpqH0CSLcBFwO7DA6rq+t6+n3RYhyTpCLqcGloG7O9rH+j1HbUk65JMJpmcnp4+JsVJkmadEDeLq2pzVY1X1fjY2Nioy5Gkk0qXQXAQWNHXXt7rkyQdR7oMgu3A6iSrkiwF1gITHZ5PknQ7dBYEVTUDrAe2AXuAK6tqV5INSS4ESPLwJAeApwKXJ9nVVT2SpMG6fNcQVbUV2Dqn77K+7e3MThlJkkbkhLhZLEnqjkEgSY0zCCSpcQaBJDXOIJCkxhkEktQ4g0CSGmcQSFLjDAJJapxBIEmNMwgkqXEGgSQ1ziCQpMYZBJLUOINAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoEkNa7TIEiyJsneJFNJLh2w/05J3tfb/5kkK7usR5J0W50FQZIlwCbgicA5wMVJzpkz7DnATVV1X+AtwBu6qkeSNFiXVwTnA1NVta+qbgG2ABfNGXMR8M7e9lXA45Okw5okSXOc2uGxlwH7+9oHgEfMN6aqZpJ8B/hl4Ib+QUnWAet6ze8l2dtJxW06gznf71blz5816hL08/zZPOzVx+Tv47Pn29FlEBwzVbUZ2DzqOk5GSSaranzUdUhz+bO5eLqcGjoIrOhrL+/1DRyT5FTgbsC3OqxJkjRHl0GwHVidZFWSpcBaYGLOmAng8PX4U4CPVFV1WJMkaY7OpoZ6c/7rgW3AEuCKqtqVZAMwWVUTwN8B704yBdzIbFhocTnlpuOVP5uLJP4BLklt85PFktQ4g0CSGmcQNGqh5T+kUUlyRZJvJvniqGtphUHQoCGX/5BG5R3AmlEX0RKDoE3DLP8hjURVfYLZdxFqkRgEbRq0/MeyEdUiacQMAklqnEHQpmGW/5DUCIOgTcMs/yGpEQZBg6pqBji8/Mce4Mqq2jXaqqRZSd4LfBq4f5IDSZ4z6ppOdi4xIUmN84pAkhpnEEhS4wwCSWqcQSBJjTMIJKlxBoGakuTMJFuSfCXJjiRbk9xvnrEr51sBM8nbF1qoL8lrkhxMsjPJl5K8Lcnt+n8uyTuSPOX2vFZaiEGgZiQJ8AHgY1V1n6o6D3glcK+jPVZVPbeqdg8x9C1V9RBmV3l9MPCYoz2X1DWDQC15HHBrVf3N4Y6q+gLw+SQfTvK5JNcm6V+J9dQk/5BkT5KrktwZIMnHkoz3tr+X5HVJvpDk6iSDgmUpcBpwU+81D+mNvSbJB5Lc/Uj9UpcMArXkQcCOAf0/BJ5UVQ9jNize1Lt6ALg/8NaqeiDwXeAPB7z+dODqqvoV4BPA8/r2vSTJTuDrwJeramev/13AK6rqXOBa4NUL9EudMQgkCPBnSa4BPsTsktyH/6rfX1Wf6m3/PfCoAa+/BfiX3vYOYGXfvsNTQ/cETk+yNs
ndgF+qqo/3xrwTePR8/Xf4v05agEGgluwCzhvQ/wxgDDiv90v7G8xO4wDMXYNl0Jost9bP1mr5MXDq3AFVdSvw7/iLXcchg0At+QhwpyTrDnckORc4G/hmVd2a5HG99mFnJfm13vbTgU/enhP3ppoeCXylqr4D3JTk13u7/wD4+Hz9t+d80tEwCNSM3l/tTwIu6L19dBfwemArMJ7kWuCZwJf6XrYXeEGSPcDdgbcd5WkP3yP4IrAEeGuv/1nAG3vTUQ8BNizQL3XG1UclqXFeEUhS4wwCSWqcQSBJjTMIJKlxBoEkNc4gkKTGGQSS1Lj/ByBiuM6Ym8+rAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<Figure size 432x288 with 1 Axes>" | |
] | |
}, | |
"metadata": { | |
"needs_background": "light" | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "j-TF9J2_t4py" | |
}, | |
"source": [ | |
"People with a recorded Cabin number are, in fact, more likely to survive (66.7% vs 30.0%)." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "jYP7Ocq1t8fM" | |
}, | |
"source": [ | |
"# 5. Cleaning Data\n", | |
"Time to clean our data to account for missing values and unnecessary information!" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mQwCjZNJt-SK" | |
}, | |
"source": [ | |
"### Looking at the Test Data\n", | |
"Let's see how our test data looks!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 383 | |
}, | |
"id": "a7aR8zMtuBmA", | |
"outputId": "fcf82c0f-992b-429c-9f1b-e97384558f5a" | |
}, | |
"source": [ | |
"test.describe(include=\"all\")" | |
], | |
"execution_count": 235, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>418.000000</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418</td>\n", | |
" <td>418</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418.000000</td>\n", | |
" <td>418</td>\n", | |
" <td>417.000000</td>\n", | |
" <td>91</td>\n", | |
" <td>418</td>\n", | |
" <td>418</td>\n", | |
" <td>418.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>unique</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>418</td>\n", | |
" <td>2</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>363</td>\n", | |
" <td>NaN</td>\n", | |
" <td>76</td>\n", | |
" <td>3</td>\n", | |
" <td>8</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>top</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>Dulles, Mr. William Crothers</td>\n", | |
" <td>male</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>PC 17608</td>\n", | |
" <td>NaN</td>\n", | |
" <td>B57 B59 B63 B66</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>freq</th>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1</td>\n", | |
" <td>266</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>5</td>\n", | |
" <td>NaN</td>\n", | |
" <td>3</td>\n", | |
" <td>270</td>\n", | |
" <td>96</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>1100.500000</td>\n", | |
" <td>2.265550</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>23.941388</td>\n", | |
" <td>0.447368</td>\n", | |
" <td>0.392344</td>\n", | |
" <td>NaN</td>\n", | |
" <td>35.627188</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.217703</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>120.810458</td>\n", | |
" <td>0.841838</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>17.741080</td>\n", | |
" <td>0.896760</td>\n", | |
" <td>0.981429</td>\n", | |
" <td>NaN</td>\n", | |
" <td>55.907576</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.413179</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>892.000000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>-0.500000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>996.250000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>9.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>7.895800</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>1100.500000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>24.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>14.454200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>1204.750000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>35.750000</td>\n", | |
" <td>1.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>31.500000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>0.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>1309.000000</td>\n", | |
" <td>3.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>76.000000</td>\n", | |
" <td>8.000000</td>\n", | |
" <td>9.000000</td>\n", | |
" <td>NaN</td>\n", | |
" <td>512.329200</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>NaN</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Pclass ... AgeGroup CabinBool\n", | |
"count 418.000000 418.000000 ... 418 418.000000\n", | |
"unique NaN NaN ... 8 NaN\n", | |
"top NaN NaN ... Young Adult NaN\n", | |
"freq NaN NaN ... 96 NaN\n", | |
"mean 1100.500000 2.265550 ... NaN 0.217703\n", | |
"std 120.810458 0.841838 ... NaN 0.413179\n", | |
"min 892.000000 1.000000 ... NaN 0.000000\n", | |
"25% 996.250000 1.000000 ... NaN 0.000000\n", | |
"50% 1100.500000 3.000000 ... NaN 0.000000\n", | |
"75% 1204.750000 3.000000 ... NaN 0.000000\n", | |
"max 1309.000000 3.000000 ... NaN 1.000000\n", | |
"\n", | |
"[11 rows x 13 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 235 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XFXtv9iHuFxn" | |
}, | |
"source": [ | |
"* We have a total of 418 passengers.\n", | |
"* 1 value from the Fare feature is missing.\n", | |
"* Around 20.5% of the Age values are missing; we will need to fill them in." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "t1PB5592uJci" | |
}, | |
"source": [ | |
"### Cabin Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "CJUuS8SjuGnm" | |
}, | |
"source": [ | |
"#the raw Cabin column is removed from both frames; little further information can be extracted from it\n", | |
"train = train.drop(columns=['Cabin'])\n", | |
"test = test.drop(columns=['Cabin'])" | |
], | |
"execution_count": 236, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "mjAfZJJyuNK_" | |
}, | |
"source": [ | |
"### Ticket Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "mEo0tUsruQJp" | |
}, | |
"source": [ | |
"#Ticket is removed from both frames as well: it is unlikely to yield any useful information\n", | |
"train = train.drop(columns=['Ticket'])\n", | |
"test = test.drop(columns=['Ticket'])" | |
], | |
"execution_count": 237, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "pEUYnq_uuWAc" | |
}, | |
"source": [ | |
"### Embarked Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "ZM9tNtZBuYbX", | |
"outputId": "d4f71708-d40b-485d-b950-0919e5d41990" | |
}, | |
"source": [ | |
"#count the passengers per port of embarkation to pick a fill value for the missing Embarked entries\n", | |
"port = train[\"Embarked\"]\n", | |
"southampton = int((port == \"S\").sum())\n", | |
"cherbourg = int((port == \"C\").sum())\n", | |
"queenstown = int((port == \"Q\").sum())\n", | |
"\n", | |
"print(\"Number of people embarking in Southampton (S):\")\n", | |
"print(southampton)\n", | |
"\n", | |
"print(\"Number of people embarking in Cherbourg (C):\")\n", | |
"print(cherbourg)\n", | |
"\n", | |
"print(\"Number of people embarking in Queenstown (Q):\")\n", | |
"print(queenstown)" | |
], | |
"execution_count": 238, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Number of people embarking in Southampton (S):\n", | |
"644\n", | |
"Number of people embarking in Cherbourg (C):\n", | |
"168\n", | |
"Number of people embarking in Queenstown (Q):\n", | |
"77\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "ZEIg2AAducwE" | |
}, | |
"source": [ | |
"It's clear that the majority of people embarked in Southampton (S). Let's go ahead and fill in the missing values with S." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ImxMUmtrudjW" | |
}, | |
"source": [ | |
"#missing Embarked entries in the training data are set to S\n", | |
"train = train.assign(Embarked=train[\"Embarked\"].fillna(\"S\"))" | |
], | |
"execution_count": 239, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_NNHRS12uhA3" | |
}, | |
"source": [ | |
"### Age Feature" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Bvns7Temulfq" | |
}, | |
"source": [ | |
"Next we'll fill in the missing values in the Age feature. Since a higher percentage of values are missing, it would be illogical to fill all of them with the same value (as we did with Embarked). Instead, let's try to find a way to predict the missing ages. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 597 | |
}, | |
"id": "s4LCIWwwumEN", | |
"outputId": "8f79c1a0-b836-45ce-c46f-3dccbd8c3557" | |
}, | |
"source": [ | |
"#create a combined group of both datasets\n", | |
"combine = [train, test]\n", | |
"\n", | |
"#extract a title for each Name in the train and test datasets: the run of letters that\n", | |
"#follows a space and ends with a period, e.g. \"Mr\" in \"Braund, Mr. Owen Harris\".\n", | |
"#The pattern is a raw string so that \\. reaches the regex engine as an escaped dot\n", | |
"#instead of being an invalid escape sequence in a normal Python string literal.\n", | |
"for dataset in combine:\n", | |
"    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\\.', expand=False)\n", | |
"\n", | |
"#cross-tabulate the extracted titles against Sex\n", | |
"pd.crosstab(train['Title'], train['Sex'])" | |
], | |
"execution_count": 240, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th>Sex</th>\n", | |
" <th>female</th>\n", | |
" <th>male</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Title</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>Capt</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Col</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Countess</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Don</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Dr</th>\n", | |
" <td>1</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Jonkheer</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Lady</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Major</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Master</th>\n", | |
" <td>0</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Miss</th>\n", | |
" <td>182</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mlle</th>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mme</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mr</th>\n", | |
" <td>0</td>\n", | |
" <td>517</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Mrs</th>\n", | |
" <td>125</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Ms</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Rev</th>\n", | |
" <td>0</td>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Sir</th>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
"Sex female male\n", | |
"Title \n", | |
"Capt 0 1\n", | |
"Col 0 2\n", | |
"Countess 1 0\n", | |
"Don 0 1\n", | |
"Dr 1 6\n", | |
"Jonkheer 0 1\n", | |
"Lady 1 0\n", | |
"Major 0 2\n", | |
"Master 0 40\n", | |
"Miss 182 0\n", | |
"Mlle 2 0\n", | |
"Mme 1 0\n", | |
"Mr 0 517\n", | |
"Mrs 125 0\n", | |
"Ms 1 0\n", | |
"Rev 0 6\n", | |
"Sir 0 1" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 240 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 231 | |
}, | |
"id": "p68zx8fCuttk", | |
"outputId": "c7ec7902-0690-44b9-b633-aa8a0489b4f4" | |
}, | |
"source": [ | |
"#collapse the extracted titles into a handful of groups.\n", | |
"#Each raw title must appear in exactly one group: 'Lady' belongs to 'Royal', and listing it\n", | |
"#under 'Rare' as well would make whichever rule runs first win silently.\n", | |
"title_groups = {\n", | |
"    'Rare': ['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'],\n", | |
"    'Royal': ['Countess', 'Lady', 'Sir'],\n", | |
"    'Miss': ['Mlle', 'Ms'],\n", | |
"    'Mrs': ['Mme'],\n", | |
"}\n", | |
"#invert to a flat {raw title: group} lookup so a single replace() call handles every group;\n", | |
"#titles that are not listed (Mr, Miss, Mrs, Master, ...) are left unchanged\n", | |
"title_to_group = {title: group for group, titles in title_groups.items() for title in titles}\n", | |
"for dataset in combine:\n", | |
"    dataset['Title'] = dataset['Title'].replace(title_to_group)\n", | |
"\n", | |
"#mean of Survived for each title group\n", | |
"train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()" | |
], | |
"execution_count": 241, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Title</th>\n", | |
" <th>Survived</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Master</td>\n", | |
" <td>0.575000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Miss</td>\n", | |
" <td>0.702703</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Mr</td>\n", | |
" <td>0.156673</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Mrs</td>\n", | |
" <td>0.793651</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Rare</td>\n", | |
" <td>0.285714</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Royal</td>\n", | |
" <td>1.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Title Survived\n", | |
"0 Master 0.575000\n", | |
"1 Miss 0.702703\n", | |
"2 Mr 0.156673\n", | |
"3 Mrs 0.793651\n", | |
"4 Rare 0.285714\n", | |
"5 Royal 1.000000" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 241 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "cYcvHx0fuxHw", | |
"outputId": "6ad77d05-5b98-4922-b725-42a3845479a5" | |
}, | |
"source": [ | |
"#encode each title group as a number; a title that is not a key of title_mapping becomes 0\n", | |
"title_mapping = {\"Mr\": 1, \"Miss\": 2, \"Mrs\": 3, \"Master\": 4, \"Royal\": 5, \"Rare\": 6}\n", | |
"for dataset in combine:\n", | |
"    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0)\n", | |
"\n", | |
"train.head()" | |
], | |
"execution_count": 242, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Braund, Mr. Owen Harris</td>\n", | |
" <td>male</td>\n", | |
" <td>22.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>S</td>\n", | |
" <td>Student</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", | |
" <td>female</td>\n", | |
" <td>38.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>C</td>\n", | |
" <td>Adult</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Heikkinen, Miss. Laina</td>\n", | |
" <td>female</td>\n", | |
" <td>26.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n", | |
" <td>female</td>\n", | |
" <td>35.0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>Allen, Mr. William Henry</td>\n", | |
" <td>male</td>\n", | |
" <td>35.0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>S</td>\n", | |
" <td>Young Adult</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass ... AgeGroup CabinBool Title\n", | |
"0 1 0 3 ... Student 0 1\n", | |
"1 2 1 1 ... Adult 1 3\n", | |
"2 3 1 3 ... Young Adult 0 2\n", | |
"3 4 1 1 ... Young Adult 1 3\n", | |
"4 5 0 3 ... Young Adult 0 1\n", | |
"\n", | |
"[5 rows x 13 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 242 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "vLO6pv2Su0rP" | |
}, | |
"source": [ | |
"Next, we'll try to predict the missing age groups from the most common age group for each Title." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "r0c4pC2Ru2Mz" | |
}, | |
"source": [ | |
"# mode age group for each title (kept for inspection; the fill below uses the hard-coded mapping)\n", | |
"mr_age = train[train[\"Title\"] == 1][\"AgeGroup\"].mode() #Young Adult\n", | |
"miss_age = train[train[\"Title\"] == 2][\"AgeGroup\"].mode() #Student\n", | |
"mrs_age = train[train[\"Title\"] == 3][\"AgeGroup\"].mode() #Adult\n", | |
"master_age = train[train[\"Title\"] == 4][\"AgeGroup\"].mode() #Baby\n", | |
"royal_age = train[train[\"Title\"] == 5][\"AgeGroup\"].mode() #Adult\n", | |
"rare_age = train[train[\"Title\"] == 6][\"AgeGroup\"].mode() #Adult\n", | |
"\n", | |
"#age group used to fill an \"Unknown\" AgeGroup, keyed by the numeric Title code\n", | |
"age_title_mapping = {1: \"Young Adult\", 2: \"Student\", 3: \"Adult\", 4: \"Baby\", 5: \"Adult\", 6: \"Adult\"}\n", | |
"\n", | |
"#fill every \"Unknown\" AgeGroup from the passenger's Title with one .loc assignment per frame.\n", | |
"#Chained indexing (frame[\"AgeGroup\"][x] = ...) is avoided on purpose: it raises\n", | |
"#SettingWithCopyWarning, is not guaranteed to write through to the frame, and only works\n", | |
"#when the index is exactly 0..n-1.\n", | |
"for dataset in (train, test):\n", | |
"    unknown_age = dataset[\"AgeGroup\"] == \"Unknown\"\n", | |
"    #plain dict indexing (not .get) so that a Title missing from the mapping raises KeyError\n", | |
"    dataset.loc[unknown_age, \"AgeGroup\"] = dataset.loc[unknown_age, \"Title\"].map(lambda title: age_title_mapping[title])" | |
], | |
"execution_count": 243, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5HWgQU0su72S" | |
}, | |
"source": [ | |
"Now that we've filled in the missing values at least *somewhat* accurately, it's time to map each age group to a numerical value." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tLfLTbA7u-v3" | |
}, | |
"source": [ | |
"#map each AgeGroup label to an ordinal code (Baby = 1 ... Senior = 7);\n", | |
"#a label that is not a key of age_mapping becomes NaN\n", | |
"age_mapping = {'Baby': 1, 'Child': 2, 'Teenager': 3, 'Student': 4, 'Young Adult': 5, 'Adult': 6, 'Senior': 7}\n", | |
"train['AgeGroup'] = train['AgeGroup'].map(age_mapping)\n", | |
"test['AgeGroup'] = test['AgeGroup'].map(age_mapping)\n", | |
"\n", | |
"#dropping the Age feature for now, might change\n", | |
"train = train.drop(['Age'], axis = 1)\n", | |
"test = test.drop(['Age'], axis = 1)" | |
], | |
"execution_count": 244, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "9FxV8e_hvEPQ" | |
}, | |
"source": [ | |
"### Name Feature\n", | |
"We can drop the name feature now that we've extracted the titles." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Cff86r6hvFIR" | |
}, | |
"source": [ | |
"#drop the name feature since it contains no more useful information.\n", | |
"train = train.drop(['Name'], axis = 1)\n", | |
"test = test.drop(['Name'], axis = 1)" | |
], | |
"execution_count": 245, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "uu2fON76vG_C" | |
}, | |
"source": [ | |
"### Sex Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "LVaGaHHBvMHG", | |
"outputId": "86f04c99-bbf3-4254-969f-ef984e4871b0" | |
}, | |
"source": [ | |
"#map each Sex value to a numerical value\n", | |
"sex_mapping = {\"male\": 0, \"female\": 1}\n", | |
"train['Sex'] = train['Sex'].map(sex_mapping)\n", | |
"test['Sex'] = test['Sex'].map(sex_mapping)\n", | |
"\n", | |
"train.head()" | |
], | |
"execution_count": 246, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>S</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>C</td>\n", | |
" <td>6.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>S</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>S</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>S</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass Sex ... Embarked AgeGroup CabinBool Title\n", | |
"0 1 0 3 0 ... S 4.0 0 1\n", | |
"1 2 1 1 1 ... C 6.0 1 3\n", | |
"2 3 1 3 1 ... S 5.0 0 2\n", | |
"3 4 1 1 1 ... S 5.0 1 3\n", | |
"4 5 0 3 0 ... S 5.0 0 1\n", | |
"\n", | |
"[5 rows x 11 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 246 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "h0j_2wK6vPT2" | |
}, | |
"source": [ | |
"### Embarked Feature" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "rQcyd1dUvRbO", | |
"outputId": "3768bd4f-b41c-4402-f8e3-5b9ed0fcdf2e" | |
}, | |
"source": [ | |
"#map each Embarked value to a numerical value\n", | |
"embarked_mapping = {\"S\": 1, \"C\": 2, \"Q\": 3}\n", | |
"train['Embarked'] = train['Embarked'].map(embarked_mapping)\n", | |
"test['Embarked'] = test['Embarked'].map(embarked_mapping)\n", | |
"\n", | |
"train.head()" | |
], | |
"execution_count": 247, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>7.2500</td>\n", | |
" <td>1</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>71.2833</td>\n", | |
" <td>2</td>\n", | |
" <td>6.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>7.9250</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>53.1000</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>8.0500</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass Sex ... Embarked AgeGroup CabinBool Title\n", | |
"0 1 0 3 0 ... 1 4.0 0 1\n", | |
"1 2 1 1 1 ... 2 6.0 1 3\n", | |
"2 3 1 3 1 ... 1 5.0 0 2\n", | |
"3 4 1 1 1 ... 1 5.0 1 3\n", | |
"4 5 0 3 0 ... 1 5.0 0 1\n", | |
"\n", | |
"[5 rows x 11 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 247 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "XC1zmKehvVKW" | |
}, | |
"source": [ | |
"### Fare Feature\n", | |
"It's time separate the fare values into some logical groups as well as filling in the single missing value in the test dataset." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "koA1FxUOvWAA" | |
}, | |
"source": [ | |
"#fill in missing Fare value in test set based on mean fare for that Pclass \n", | |
"for x in range(len(test[\"Fare\"])):\n", | |
" if pd.isnull(test[\"Fare\"][x]):\n", | |
" pclass = test[\"Pclass\"][x] #Pclass = 3\n", | |
" test[\"Fare\"][x] = round(train[train[\"Pclass\"] == pclass][\"Fare\"].mean(), 4)\n", | |
" \n", | |
"#map Fare values into groups of numerical values\n", | |
"train['FareBand'] = pd.qcut(train['Fare'], 4, labels = [1, 2, 3, 4])\n", | |
"test['FareBand'] = pd.qcut(test['Fare'], 4, labels = [1, 2, 3, 4])\n", | |
"\n", | |
"#drop Fare values\n", | |
"train = train.drop(['Fare'], axis = 1)\n", | |
"test = test.drop(['Fare'], axis = 1)" | |
], | |
"execution_count": 248, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "luA66cftvcZY", | |
"outputId": "5196158f-51c8-48b3-88a4-1ea20b4db589" | |
}, | |
"source": [ | |
"#check train data\n", | |
"train.head()" | |
], | |
"execution_count": 249, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" <th>FareBand</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>6.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>5</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass Sex ... AgeGroup CabinBool Title FareBand\n", | |
"0 1 0 3 0 ... 4.0 0 1 1\n", | |
"1 2 1 1 1 ... 6.0 1 3 4\n", | |
"2 3 1 3 1 ... 5.0 0 2 2\n", | |
"3 4 1 1 1 ... 5.0 1 3 4\n", | |
"4 5 0 3 0 ... 5.0 0 1 2\n", | |
"\n", | |
"[5 rows x 11 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 249 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 201 | |
}, | |
"id": "CNOa7nIgvftl", | |
"outputId": "ca1ff93b-3a99-420e-b3f8-4af3fbc19622" | |
}, | |
"source": [ | |
"#check test data\n", | |
"test.head()" | |
], | |
"execution_count": 250, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>PassengerId</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Sex</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Embarked</th>\n", | |
" <th>AgeGroup</th>\n", | |
" <th>CabinBool</th>\n", | |
" <th>Title</th>\n", | |
" <th>FareBand</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>892</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>893</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>6.0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>894</td>\n", | |
" <td>2</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>7.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>895</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>5.0</td>\n", | |
" <td>0</td>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>896</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0</td>\n", | |
" <td>3</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" PassengerId Pclass Sex SibSp ... AgeGroup CabinBool Title FareBand\n", | |
"0 892 3 0 0 ... 5.0 0 1 1\n", | |
"1 893 3 1 1 ... 6.0 0 3 1\n", | |
"2 894 2 0 0 ... 7.0 0 1 2\n", | |
"3 895 3 0 0 ... 5.0 0 1 2\n", | |
"4 896 3 1 1 ... 4.0 0 3 2\n", | |
"\n", | |
"[5 rows x 10 columns]" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 250 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "a25aZaBfvX8b" | |
}, | |
"source": [ | |
"# 6. Choosing the Best Model" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "sEE0wHtZvnbF" | |
}, | |
"source": [ | |
"### Splitting the Training Data\n", | |
"We will use part of our training data (22% in this case) to test the accuracy of our different models." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Acq6b-covlK9" | |
}, | |
"source": [ | |
"from sklearn.model_selection import train_test_split\n", | |
"\n", | |
"predictors = train.drop(['Survived', 'PassengerId'], axis=1)\n", | |
"target = train[\"Survived\"]\n", | |
"x_train, x_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.22, random_state = 0)" | |
], | |
"execution_count": 251, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "GXgAOb8fvs1t" | |
}, | |
"source": [ | |
"### Testing Different Models\n", | |
"I will be testing the following models with my training data :\n", | |
"* Gaussian Naive Bayes\n", | |
"* Logistic Regression\n", | |
"* Support Vector Machines\n", | |
"* Perceptron\n", | |
"* Decision Tree Classifier\n", | |
"* Random Forest Classifier\n", | |
"* KNN or k-Nearest Neighbors\n", | |
"* Stochastic Gradient Descent\n", | |
"* Gradient Boosting Classifier\n", | |
"\n", | |
"For each model, we set the model, fit it with 80% of our training data, predict for 20% of the training data and check the accuracy." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "vBDzceBjvvMh", | |
"outputId": "ccbd2053-adcc-48d0-9aa0-639bf7e2537a" | |
}, | |
"source": [ | |
"# Gaussian Naive Bayes\n", | |
"from sklearn.naive_bayes import GaussianNB\n", | |
"from sklearn.metrics import accuracy_score\n", | |
"\n", | |
"gaussian = GaussianNB()\n", | |
"gaussian.fit(x_train, y_train)\n", | |
"y_pred = gaussian.predict(x_val)\n", | |
"acc_gaussian = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_gaussian)" | |
], | |
"execution_count": 252, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.68\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "xiZSNXByvzCR", | |
"outputId": "c5cc1b94-8673-4cc1-b264-f7452e1e4a57" | |
}, | |
"source": [ | |
"# Logistic Regression\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"\n", | |
"logreg = LogisticRegression()\n", | |
"logreg.fit(x_train, y_train)\n", | |
"y_pred = logreg.predict(x_val)\n", | |
"acc_logreg = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_logreg)" | |
], | |
"execution_count": 253, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"79.7\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "PaK_xdQTv19k", | |
"outputId": "4fbeebc0-3242-4768-c735-5da527c0df24" | |
}, | |
"source": [ | |
"# Support Vector Machines\n", | |
"from sklearn.svm import SVC\n", | |
"\n", | |
"svc = SVC()\n", | |
"svc.fit(x_train, y_train)\n", | |
"y_pred = svc.predict(x_val)\n", | |
"acc_svc = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_svc)" | |
], | |
"execution_count": 254, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"82.74\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "nE53_qvGv5JP", | |
"outputId": "a6cbf7bd-bf1b-4a65-ecb3-8659a70e596d" | |
}, | |
"source": [ | |
"# Linear SVC\n", | |
"from sklearn.svm import LinearSVC\n", | |
"\n", | |
"linear_svc = LinearSVC()\n", | |
"linear_svc.fit(x_train, y_train)\n", | |
"y_pred = linear_svc.predict(x_val)\n", | |
"acc_linear_svc = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_linear_svc)" | |
], | |
"execution_count": 255, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.17\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "4Znyh4arv70I", | |
"outputId": "d22c55b5-d609-4c6d-9a25-bac3e2c007f1" | |
}, | |
"source": [ | |
"# Perceptron\n", | |
"from sklearn.linear_model import Perceptron\n", | |
"\n", | |
"perceptron = Perceptron()\n", | |
"perceptron.fit(x_train, y_train)\n", | |
"y_pred = perceptron.predict(x_val)\n", | |
"acc_perceptron = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_perceptron)" | |
], | |
"execution_count": 256, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.68\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "2GNgsEfEv95_", | |
"outputId": "16b55078-e9c2-47a7-886e-711c86f8f953" | |
}, | |
"source": [ | |
"#Decision Tree\n", | |
"from sklearn.tree import DecisionTreeClassifier\n", | |
"\n", | |
"decisiontree = DecisionTreeClassifier()\n", | |
"decisiontree.fit(x_train, y_train)\n", | |
"y_pred = decisiontree.predict(x_val)\n", | |
"acc_decisiontree = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_decisiontree)" | |
], | |
"execution_count": 257, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"80.71\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "-bPUuU4vv_8X", | |
"outputId": "9c3dadf5-5b56-4e17-bfcc-94045de03a0a" | |
}, | |
"source": [ | |
"# Random Forest\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"\n", | |
"randomforest = RandomForestClassifier()\n", | |
"randomforest.fit(x_train, y_train)\n", | |
"y_pred = randomforest.predict(x_val)\n", | |
"acc_randomforest = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_randomforest)" | |
], | |
"execution_count": 258, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"83.76\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "yJoxSZzzwDiL", | |
"outputId": "45a231c9-405b-46e7-a897-20196e0d0385" | |
}, | |
"source": [ | |
"# KNN or k-Nearest Neighbors\n", | |
"from sklearn.neighbors import KNeighborsClassifier\n", | |
"\n", | |
"knn = KNeighborsClassifier()\n", | |
"knn.fit(x_train, y_train)\n", | |
"y_pred = knn.predict(x_val)\n", | |
"acc_knn = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_knn)" | |
], | |
"execution_count": 259, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"77.66\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "vSQjtKo3wFjz", | |
"outputId": "9094e4e3-dd0e-425d-e7f3-54b26138ac45" | |
}, | |
"source": [ | |
"# Stochastic Gradient Descent\n", | |
"from sklearn.linear_model import SGDClassifier\n", | |
"\n", | |
"sgd = SGDClassifier()\n", | |
"sgd.fit(x_train, y_train)\n", | |
"y_pred = sgd.predict(x_val)\n", | |
"acc_sgd = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_sgd)" | |
], | |
"execution_count": 260, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"78.17\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "2bhbILIUwHYB", | |
"outputId": "75f7bbb4-d6af-46b2-ec7c-4a51809ca244" | |
}, | |
"source": [ | |
"# Gradient Boosting Classifier\n", | |
"from sklearn.ensemble import GradientBoostingClassifier\n", | |
"\n", | |
"gbk = GradientBoostingClassifier()\n", | |
"gbk.fit(x_train, y_train)\n", | |
"y_pred = gbk.predict(x_val)\n", | |
"acc_gbk = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_gbk)" | |
], | |
"execution_count": 261, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"84.26\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 353 | |
}, | |
"id": "7c_kEs6EwJeO", | |
"outputId": "7f436b0c-24d1-44e3-f672-4e4f433930b1" | |
}, | |
"source": [ | |
"models = pd.DataFrame({\n", | |
" 'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', \n", | |
" 'Random Forest', 'Naive Bayes', 'Perceptron', 'Linear SVC', \n", | |
" 'Decision Tree', 'Stochastic Gradient Descent', 'Gradient Boosting Classifier'],\n", | |
" 'Score': [acc_svc, acc_knn, acc_logreg, \n", | |
" acc_randomforest, acc_gaussian, acc_perceptron,acc_linear_svc, acc_decisiontree,\n", | |
" acc_sgd, acc_gbk]})\n", | |
"models.sort_values(by='Score', ascending=False)" | |
], | |
"execution_count": 262, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Model</th>\n", | |
" <th>Score</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>Gradient Boosting Classifier</td>\n", | |
" <td>84.26</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Random Forest</td>\n", | |
" <td>83.76</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Support Vector Machines</td>\n", | |
" <td>82.74</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>Decision Tree</td>\n", | |
" <td>80.71</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Logistic Regression</td>\n", | |
" <td>79.70</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Naive Bayes</td>\n", | |
" <td>78.68</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Perceptron</td>\n", | |
" <td>78.68</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>Linear SVC</td>\n", | |
" <td>78.17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>Stochastic Gradient Descent</td>\n", | |
" <td>78.17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>KNN</td>\n", | |
" <td>77.66</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Model Score\n", | |
"9 Gradient Boosting Classifier 84.26\n", | |
"3 Random Forest 83.76\n", | |
"0 Support Vector Machines 82.74\n", | |
"7 Decision Tree 80.71\n", | |
"2 Logistic Regression 79.70\n", | |
"4 Naive Bayes 78.68\n", | |
"5 Perceptron 78.68\n", | |
"6 Linear SVC 78.17\n", | |
"8 Stochastic Gradient Descent 78.17\n", | |
"1 KNN 77.66" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 262 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "gAlM6IzawNFj" | |
}, | |
"source": [ | |
"We will all these models for COBRA Classifier. The first 4 can be combined as basic machines. And all of them will be in the advanced machine list." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YR8ExkzuzSGV", | |
"outputId": "3d11b027-c396-4c2d-832a-0b77e4fbf89e" | |
}, | |
"source": [ | |
"import sys\n", | |
"sys.path.append('/content/drive/MyDrive/Titanic Survival Prediction/')\n", | |
"import classifiercobra\n", | |
"\n", | |
"cobra = classifiercobra.ClassifierCobra(machine_list='advanced')\n", | |
"cobra.fit(x_train, y_train)\n", | |
"y_pred = cobra.predict(x_val)\n", | |
"acc_cobra = round(accuracy_score(y_pred, y_val) * 100, 2)\n", | |
"print(acc_cobra)" | |
], | |
"execution_count": 263, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"83.76\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "l4_2JNIrwYoS" | |
}, | |
"source": [ | |
"#set ids as PassengerId and predict survival \n", | |
"ids = test['PassengerId']\n", | |
"predictions = cobra.predict(test.drop('PassengerId', axis=1))\n", | |
"\n", | |
"#set the output as a dataframe and convert to csv file named submission.csv\n", | |
"output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })\n", | |
"output.to_csv('submission.csv', index=False)" | |
], | |
"execution_count": 264, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment