Created
September 1, 2017 12:56
-
-
Save Swarchal/64a31101b1df40e65c0e543825856ad6 to your computer and use it in GitHub Desktop.
kaggle titanic R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Kaggle Titanic dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Loading required package: lattice\n", | |
"Loading required package: foreach\n", | |
"Loading required package: iterators\n", | |
"Loading required package: parallel\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<thead><tr><th scope=col>PassengerId</th><th scope=col>Survived</th><th scope=col>Pclass</th><th scope=col>Name</th><th scope=col>Sex</th><th scope=col>Age</th><th scope=col>SibSp</th><th scope=col>Parch</th><th scope=col>Ticket</th><th scope=col>Fare</th><th scope=col>Cabin</th><th scope=col>Embarked</th><th scope=col>dataset</th></tr></thead>\n", | |
"<tbody>\n", | |
"\t<tr><td>1 </td><td>0 </td><td>3 </td><td>Braund, Mr. Owen Harris </td><td>male </td><td>22 </td><td>1 </td><td>0 </td><td>A/5 21171 </td><td> 7.2500 </td><td>NA </td><td>S </td><td>train </td></tr>\n", | |
"\t<tr><td>2 </td><td>1 </td><td>1 </td><td>Cumings, Mrs. John Bradley (Florence Briggs Thayer)</td><td>female </td><td>38 </td><td>1 </td><td>0 </td><td>PC 17599 </td><td>71.2833 </td><td>C85 </td><td>C </td><td>train </td></tr>\n", | |
"\t<tr><td>3 </td><td>1 </td><td>3 </td><td>Heikkinen, Miss. Laina </td><td>female </td><td>26 </td><td>0 </td><td>0 </td><td>STON/O2. 3101282 </td><td> 7.9250 </td><td>NA </td><td>S </td><td>train </td></tr>\n", | |
"\t<tr><td>4 </td><td>1 </td><td>1 </td><td>Futrelle, Mrs. Jacques Heath (Lily May Peel) </td><td>female </td><td>35 </td><td>1 </td><td>0 </td><td>113803 </td><td>53.1000 </td><td>C123 </td><td>S </td><td>train </td></tr>\n", | |
"\t<tr><td>5 </td><td>0 </td><td>3 </td><td>Allen, Mr. William Henry </td><td>male </td><td>35 </td><td>0 </td><td>0 </td><td>373450 </td><td> 8.0500 </td><td>NA </td><td>S </td><td>train </td></tr>\n", | |
"\t<tr><td>6 </td><td>0 </td><td>3 </td><td>Moran, Mr. James </td><td>male </td><td>NA </td><td>0 </td><td>0 </td><td>330877 </td><td> 8.4583 </td><td>NA </td><td>Q </td><td>train </td></tr>\n", | |
"</tbody>\n", | |
"</table>\n" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|lllllllllllll}\n", | |
" PassengerId & Survived & Pclass & Name & Sex & Age & SibSp & Parch & Ticket & Fare & Cabin & Embarked & dataset\\\\\n", | |
"\\hline\n", | |
"\t 1 & 0 & 3 & Braund, Mr. Owen Harris & male & 22 & 1 & 0 & A/5 21171 & 7.2500 & NA & S & train \\\\\n", | |
"\t 2 & 1 & 1 & Cumings, Mrs. John Bradley (Florence Briggs Thayer) & female & 38 & 1 & 0 & PC 17599 & 71.2833 & C85 & C & train \\\\\n", | |
"\t 3 & 1 & 3 & Heikkinen, Miss. Laina & female & 26 & 0 & 0 & STON/O2. 3101282 & 7.9250 & NA & S & train \\\\\n", | |
"\t 4 & 1 & 1 & Futrelle, Mrs. Jacques Heath (Lily May Peel) & female & 35 & 1 & 0 & 113803 & 53.1000 & C123 & S & train \\\\\n", | |
"\t 5 & 0 & 3 & Allen, Mr. William Henry & male & 35 & 0 & 0 & 373450 & 8.0500 & NA & S & train \\\\\n", | |
"\t 6 & 0 & 3 & Moran, Mr. James & male & NA & 0 & 0 & 330877 & 8.4583 & NA & Q & train \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/markdown": [ | |
"\n", | |
"PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | dataset | \n", | |
"|---|---|---|---|---|---|\n", | |
"| 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NA | S | train | \n", | |
"| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | train | \n", | |
"| 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NA | S | train | \n", | |
"| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S | train | \n", | |
"| 5 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | NA | S | train | \n", | |
"| 6 | 0 | 3 | Moran, Mr. James | male | NA | 0 | 0 | 330877 | 8.4583 | NA | Q | train | \n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass\n", | |
"1 1 0 3 \n", | |
"2 2 1 1 \n", | |
"3 3 1 3 \n", | |
"4 4 1 1 \n", | |
"5 5 0 3 \n", | |
"6 6 0 3 \n", | |
" Name Sex Age SibSp Parch\n", | |
"1 Braund, Mr. Owen Harris male 22 1 0 \n", | |
"2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0 \n", | |
"3 Heikkinen, Miss. Laina female 26 0 0 \n", | |
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 \n", | |
"5 Allen, Mr. William Henry male 35 0 0 \n", | |
"6 Moran, Mr. James male NA 0 0 \n", | |
" Ticket Fare Cabin Embarked dataset\n", | |
"1 A/5 21171 7.2500 NA S train \n", | |
"2 PC 17599 71.2833 C85 C train \n", | |
"3 STON/O2. 3101282 7.9250 NA S train \n", | |
"4 113803 53.1000 C123 S train \n", | |
"5 373450 8.0500 NA S train \n", | |
"6 330877 8.4583 NA Q train " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"library(ggplot2)\n", | |
"library(caret)\n", | |
"library(doParallel)\n", | |
"\n", | |
"set.seed(42)\n", | |
"\n", | |
"# Read both CSV splits; empty strings in the files are treated as NA.\n", | |
"orig_train <- read.csv(\"../raw_data/train.csv\", na.strings = \"\")\n", | |
"orig_test <- read.csv(\"../raw_data/test.csv\", na.strings = \"\")\n", | |
"\n", | |
"# Tag every row with its split of origin so the combined frame can be\n", | |
"# separated again after the shared preprocessing.\n", | |
"orig_train[[\"dataset\"]] <- \"train\"\n", | |
"orig_test[[\"dataset\"]] <- \"test\"\n", | |
"\n", | |
"# Give the test frame a Survived column filled with NA so that rbind()\n", | |
"# sees the same set of columns in both frames.\n", | |
"orig_test[[\"Survived\"]] <- NA\n", | |
"\n", | |
"orig_all_data <- rbind(orig_train, orig_test)\n", | |
"\n", | |
"head(orig_all_data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# binarise sex feature: 1 where Sex is 'male', 0 otherwise\n", | |
"orig_all_data$Sex = as.integer(orig_all_data[, \"Sex\"] == \"male\")\n", | |
"\n", | |
"# convert Survived into a factor\n", | |
"orig_all_data$Survived = as.factor(orig_all_data$Survived)\n", | |
"\n", | |
"# split back into training and test sets after binarising\n", | |
"# NOTE: the trailing comma is required to select rows. Without it the data\n", | |
"# frame is indexed by a logical matrix and comes back as a flattened vector\n", | |
"# of cell values instead of a data frame of the matching rows.\n", | |
"orig_train = orig_all_data[orig_all_data[\"dataset\"] == \"train\", ]\n", | |
"orig_test = orig_all_data[orig_all_data[\"dataset\"] == \"test\", ]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"How imbalanced is the dataset?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWgAAAHgCAMAAAC/2AdSAAAC61BMVEUAAAABAQECAgIDAwMF\nBQUGBgYHBwcIBQAICAgJCQkKCgoLCwsLDg4MDAwNDQ0ODg4PDw8QCgAQEBARERESEhITExMU\nFBQVFRUXFxcYGBgZEAAZGRkaGhoaISMbGxscHBwdHR0eHh4fHx8gICAhISEiIiIiKi0jIyMk\nJCQlJSUmJiYnJycoGgAoKCgqKiorKyssLCwtLS0uLi4vLy8wMDAxIAAxMTEyMjIzIQAzMzM0\nNDQ1NTU2NjY3Nzc4ODg5OTk6Ojo7Ozs9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVFVlxG\nRkZHR0dISEhJSUlLS0tNTU1PT09QUFBRUVFSUlJTU1NUVFRVVVVVanFWVlZXV1dYWFhZWVla\nWlpbW1tcXFxdXV1eXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmQgBmZmZnZ2doaGhpaWlqampr\na2tsbGxtbW1ubm5vb29wcHBxcXFycnJzc3N0dHR1dXV2dnZ4eHh5eXl7e3t8UAB8fHx/f3+A\ngICBgYGCgoKDg4OGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OW\nlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+jo6OkpKSlpaWnp6eoqKipqamqqqqsrKyt\nra2t2Oavr6+wsLCxsbGysrKzs7O0tLS1tbW3t7e4uLi5ubm6urq7u7u8vLy9vb2+vr6/v7/A\nwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS\n0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk\n5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w8PDx8fHy8vLz8/P09PT19fX2\n9vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7/pQD///9xxUsOAAAACXBIWXMAABJ0AAASdAHe\nZh94AAANLElEQVR4nO3dfZwUdQHH8R+gdA/xEA95F3aCcKGQIg9BHIcPPQheoEhAF4c8CJom\nIiCopVSEmFnZg6lBimVqGEdpT2iWKYrlAxqWoilqosiD6D1u82c7v71b5rh7zc7MNd/bYz+f\nP3Zn9ua3u/d+3WuY37E3YxySZDr6DeRKQIsCWhTQooAWBbQooEUBLQpoUUCLAloU0KKAFgW0\nKKBFAS0KaFFAiwJaFNCigBYFtCigRQEtCmhRQIsCWhTQooAWBbQooEUBLQpoUUCLAloU0KKA\nFgW0KKBFAS0KaFFAiwJaFNCigBYFtCigRQEtCmhRQIsCWhTQooAWBbQooEUBLQpoUUCLAloU\n0KKAFgW0KKBFAS0KaFFAiwJaFNCigBYFtCigRQEtCmhRQIsCWhTQooAWBbQooEUBLQpoUUCL\nAloU0KKAFgW0KKBFAS0KaFFAiwJaFNCigBYFtCigRQEtSgdd+3hcvSH7HtqRDvo2E1fnyL6H\ndqSDvuWjv4ynL06WfQ/tCGhRQIsCWhTQooAW1T7od1/elwi6LdDRSmxbMbQweRCbP/Tyvwca\nAHSkamcb06esYmZFWT9jFtQHGAF0pFabUx9K8TY8WmGuCzAC6EgNHvh+erl+1AkBRgAdqaMq\nPStLjw4wAuhIDR5Yk15uGFsaYATQkVpzaB+9tcKsDTAC6EjVVSWPOiZMmTW1vL8x59UFGAF0\ntBLblg/JSx5H5w1Zvi3QpAXo6CX2vcTMMGBMwUUxBRfFFFwUU3BR8UzBG351V7o7f5x6DOhI\n+U/Bdxb1TdfTpI6ygY5U8Cn4X0ytvQc6UsGn4EDb4p+CA22LfwoOtC3+KTjQtv
Z+3GD/1h0Z\nZitA26JC3/o19/b5KcmdR/eV+/y2BNoWFfoMd+Dr/cywBReMNCNrfLYE2tYu6EVmTUNyT32D\nWe2zJdC2dkGfMML+U5g4ZYLPlkDb2gVdMDe1Mq/AZ0ugbe2CHnNqauXMQT5bAm2LDn3lzb+/\nrss97vJmM8dnS6BtUaHndLd/p9PTcQ7M6trjOZ8tgbZFnrA0/nvL+lWzT08e45lPbffbEGhb\n+z+IXvOK/9eBtsX/iX+gbUCLAloU0KKAFgW0KKBFAS0KaFFAiwJaFNCigBYFtCigRQEtCmhR\nQIsCWhTQooAWBbQooEUBLQpoUUCLAloU0KKAFhUP9N4Vl6SbDrRbPNC751amm2RSfxwHdMyx\n67ABLQpoUUCLAloU0KKAFgW0KKBFAS0KaFFAiwJaFNCigBYFtCigRQEtCmhRQIsCWhTQooAW\nBbQooEUdkdAPXB9TN+yKzHBEQld85JPx9IFbIjMckdBnff2/8fTxmyMzAN0poINcoRNoW/xX\n6ATaFv8VOoG2xX+FTqBt8Vyh0xvQtniu0OkNaJsXenfzD+mBtzOOy+ordGY7tLm9aWHVhzOO\ny+ordGY19IYNG8zFG2zrR+ZnHJfVV+jMamjjbVbmgdl8hc6shq6urjYrqlP9sTbY4Gy9QmdW\nQyeb+tuwo5mCB44peEdA3z1nclMZxzEFD5kXer0xhf1TZRzHFDxkXuiTCx/IuMdtjil4yLzQ\n3S8NPs5/Cv5K+fh0w/VnN8h26OMuCz7Ofwpe89Nb013NT7SbF3p1ye7A45iCh8wLXb9o+B3P\n73nHLeM4puAh80L37p2eg2ceyBQ8XF7SxYcKNpgpePDa/XGDujcyUANtiwxdt35x1Y9qGlZ2\nN73m/MdvQ6BtXuh5h8o47sB4d1c+89tmwOQhpmSvz5ZA21r8D0tzg4ZmHHeNmbN1x/Umf0aN\nk/iB+arPlkCncD3L9W51uzaOmnww47iTRzQkb8vN08nbxJgxPlsCbWtrH71n4JUZx+Uvcm8v\nTk2v5xf4bAm0rc1/DC8dkHFc6ST3duPS1Dc20GdLoG1tQl+Ql3HcXHNbY/PyY92m+WwJtK0N\n6Ib780ZlHPdqX1My3y5VLzy6y2M+WwJt80IXpjrKmDsyD9x1cenJdmGeKb3fb0OgbV7oaU0t\n3BxsbOq3d0+84D81BNrGJ/47CHrvM5k/dxcuoG0toPd+45jkvLDfKr8ZdeiAtnmh3x1uimdc\nNmuAOSnzzDB4QNu80FeZa9yJXs1XfH93ETagbV7oMaNSxw+NI8ZGfr7WAW3zQhcsaFqYXxj5\n+VoHtM0LPWJC6j5Rdkrk52sd0DYv9BJzo7vvSNxolkR+vtYBbfNC7xloRi5bu2ykGbgn8vO1\nDmhbi+PoXRd1Sx5Hd7so+skS2gho22Ezw9odW3YE/Lh/0IC2tYR+9XfJm/X+HywPG9A2L3Td\nFWZ08q7ELGmI/HytA9rmhb7JlG9M3m0520Q/0UrrgLZ5oU85PvXh8vph4yI/X+uAtrWYGZ7f\ntLCwR+Tnax3QNi/08DObFj57YuTnax3QNi/0l7rcY+/v7XJ+2xtHCmibF/rNElOxdt23pplj\nXov8fK0D2tbiOPrFufajd9Oejfx0bQS07bCZ4e6H79zyauQnazOgbfwvONBAhwtoG9BAAx0u\noG1Ad27o/XvS3Qe0WzzQ/+riPecY0E5sP9HPPp7uNqDd2EcDDXS4gLYBDTTQ4QLaBjTQQIcL\naBvQQAMdLqBtQAMNdLiAtgENNNDhAtoGNNBAhwtoG9BAAx0uoG1AAw10uIC2AQ000IfiUqiB\n41KoWQ7NpVBDFhWaS6GGLCo0l0INWVRo/0uhegPaFv0n2u9SqN6AtkWF5lKoIYsKzaVQQ9aO\n42guhRqmds0MuRRq8JiCZzs0U/BwMQXPcmim4CGLZwrecO9d6dYC7RbPFPylAX3T
9UxdLhXo\naDEFDxlT8CyHZgoeMqbg2Q7txhQ8eHzcAGigwwW0Degsh+7dMp8tgbZFhV5XZszg0el8tgTa\nFnnXUT/VbAy0IdC26PvoaqDDFB16V+GmQNsBbeOoA2igwwW0DWiggQ4X0DaggQY6XEDbgAYa\n6HABbQMaaKDDBbQNaKCBDhfQNqCBBjpcQNuABhrocAFtAxpooMMFtI3TSGQ7NKeRCBenkchy\naE4jEbJ4TiOxd/kl6aY3Q3/wc/H0sdbQn/hyPH0o204j8da8ynTTRzfax56qqoypdYe/ufVx\nvVLVkxG5FKeRIFv8p5EgW/ynkSBb/KeRIFv8p5EgW/y/6yAb0KKAFgW0KKBFAS0KaFFAiwJa\nFNCigBYFtCigRQEtCmhRQIsCWhTQooAWBbQooEUBLQpoUUCLAlpUNkLXrS09uvRa1ef51vld\noOD/VxZCJ84zx32hxMzRfNisfkLOQm8zE9933i83Twhe67Vfn21yFnq5eSh5+5BZKXgt9+/K\ncha6tI/7Cff6Pof/YUwcbdq4cXCuQifyyux9WaHm9UbnKvQ+M8XeV5h3Ja+Xs9AvmVn2fqZ5\nWfJ6OQu9z0y19xVmn+T1chY6kVdu78sKNAfSOQvtDOnv/gVoQ/+hmpfLXehlZmvy9lFzuebl\nchd6m5nS4NRPMX/TvFzuQieqzLhlY8w80cvlLrRTu2Zw/hnXq357l8PQR2RAiwJaFNCigBYF\ntCigRQEtCmhRQIsCWhTQooAWBbQooEUBLQpoUUCLAloU0KKAFgW0KKBFAS0KaFFAiwJaFNCi\ngBYFtCigRQEtCmhRRxr0ePMb36+fUaR5H60CWtSRBr1r53u+XwdaFNCZary9vHe/T9+XXJpm\nzy9R7/593OL+jSt73OQssKdSce42VzuLzTueNadu7cTCIVe86a5v//yxJbOfBDpT15re0ysL\nuj54GPQ3Td+fOZvMVe5Ds83TLrRnreZ0c9L8MebE1x3ngQJzWuWAXoOKOuj9dxboRP/jDzjO\ng2ZRS+iuJX9KLtf0GppwnIMFYx0X2rP2XbO0wUmsSY5qHG1+4Tj7PmOKOugb6CzQtV1L65P7\nj79ubwltfmK/usA85e4rbrTQnrWSYvei2o0j8useMTPcDZ8EOlPnmhHff8Ze+LoF9D/sF6vN\ntY5T1e31FHR6bb85e6fbXLN9g7nVbllc1EHvv9NAH7im2JjiFW81Q9eloN+yX6zpNc45WHiO\nk4JOrz1jmnv4O2aT3XJcUQe9/04DndwDPP69cWZsYxP0rhT0O6mvnW9evtv83Gl6pHntbXPW\nxlRv3tm0kzm2qGPefKeBfmH1H5K3iUnmRWfaUQ3JxXtbQFebH1b1POg0PZJe6zfRfvWRzYkn\nzEx36Vn20Rnaacpqk7uI8d0OOAvN/Y6zZ3QL6JrepxZe6C7YR9JrXzPrk7fbup/lJMrdo44D\nk4HOUGKaGXbhuf3c09JUm7wLlwycNMgLndxbmC3ufeqR5rX9I0z5wvJufZJHIX/uYU6bXVIy\ntaiDvoHOAu3sXTUsv9/Ede5eY8PIvOIrDg5tAb3ZDLKHJKlHmtec964ekz940T/dxedmlBRX\nvrC0qAPeu1unge7sAS0KaFFAiwJaFNCigBYFtCigRQEtCmhRQIsCWhTQooAWBbQooEUBLQpo\nUUCLAloU0KKAFgW0KKBFAS0KaFFAiwJaFNCigBb1P/Ix6Qlt01rvAAAAAElFTkSuQmCC", | |
"text/plain": [ | |
"plot without title" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Shrink the inline figure rendered by the notebook.\n", | |
"options(repr.plot.height = 4, repr.plot.width = 3)\n", | |
"\n", | |
"# One bar per Survived level of the training rows, to eyeball class balance.\n", | |
"barplot(\n", | |
"    table(orig_train$Survived),\n", | |
"    xlab = \"survived\",\n", | |
"    ylab = \"count\",\n", | |
"    col = c(\"lightblue\", \"orange\")\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Set a basic benchmark with the features we can currently use" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<dl class=dl-horizontal>\n", | |
"\t<dt>PassengerId</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Survived</dt>\n", | |
"\t\t<dd>418</dd>\n", | |
"\t<dt>Pclass</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Name</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Sex</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Age</dt>\n", | |
"\t\t<dd>263</dd>\n", | |
"\t<dt>SibSp</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Parch</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Ticket</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Fare</dt>\n", | |
"\t\t<dd>1</dd>\n", | |
"\t<dt>Cabin</dt>\n", | |
"\t\t<dd>1014</dd>\n", | |
"\t<dt>Embarked</dt>\n", | |
"\t\t<dd>2</dd>\n", | |
"\t<dt>dataset</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"</dl>\n" | |
], | |
"text/latex": [ | |
"\\begin{description*}\n", | |
"\\item[PassengerId] 0\n", | |
"\\item[Survived] 418\n", | |
"\\item[Pclass] 0\n", | |
"\\item[Name] 0\n", | |
"\\item[Sex] 0\n", | |
"\\item[Age] 263\n", | |
"\\item[SibSp] 0\n", | |
"\\item[Parch] 0\n", | |
"\\item[Ticket] 0\n", | |
"\\item[Fare] 1\n", | |
"\\item[Cabin] 1014\n", | |
"\\item[Embarked] 2\n", | |
"\\item[dataset] 0\n", | |
"\\end{description*}\n" | |
], | |
"text/markdown": [ | |
"PassengerId\n", | |
": 0Survived\n", | |
": 418Pclass\n", | |
": 0Name\n", | |
": 0Sex\n", | |
": 0Age\n", | |
": 263SibSp\n", | |
": 0Parch\n", | |
": 0Ticket\n", | |
": 0Fare\n", | |
": 1Cabin\n", | |
": 1014Embarked\n", | |
": 2dataset\n", | |
": 0\n", | |
"\n" | |
], | |
"text/plain": [ | |
"PassengerId Survived Pclass Name Sex Age \n", | |
" 0 418 0 0 0 263 \n", | |
" SibSp Parch Ticket Fare Cabin Embarked \n", | |
" 0 0 0 1 1014 2 \n", | |
" dataset \n", | |
" 0 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Number of NA entries in each column of the combined data.\n", | |
"sapply(orig_all_data, function(column) sum(is.na(column)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Impute the missing Age values (and the one missing Fare) with the median computed over all the data (train and test combined)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Fill missing Age values with the median Age of the combined data.\n", | |
"orig_all_data$Age <- with(orig_all_data, replace(Age, is.na(Age), median(Age, na.rm = TRUE)))\n", | |
"# Same treatment for the missing Fare.\n", | |
"orig_all_data$Fare <- with(orig_all_data, replace(Fare, is.na(Fare), median(Fare, na.rm = TRUE)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<dl class=dl-horizontal>\n", | |
"\t<dt>PassengerId</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Survived</dt>\n", | |
"\t\t<dd>418</dd>\n", | |
"\t<dt>Pclass</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Name</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Sex</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Age</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>SibSp</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Parch</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Ticket</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Fare</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"\t<dt>Cabin</dt>\n", | |
"\t\t<dd>1014</dd>\n", | |
"\t<dt>Embarked</dt>\n", | |
"\t\t<dd>2</dd>\n", | |
"\t<dt>dataset</dt>\n", | |
"\t\t<dd>0</dd>\n", | |
"</dl>\n" | |
], | |
"text/latex": [ | |
"\\begin{description*}\n", | |
"\\item[PassengerId] 0\n", | |
"\\item[Survived] 418\n", | |
"\\item[Pclass] 0\n", | |
"\\item[Name] 0\n", | |
"\\item[Sex] 0\n", | |
"\\item[Age] 0\n", | |
"\\item[SibSp] 0\n", | |
"\\item[Parch] 0\n", | |
"\\item[Ticket] 0\n", | |
"\\item[Fare] 0\n", | |
"\\item[Cabin] 1014\n", | |
"\\item[Embarked] 2\n", | |
"\\item[dataset] 0\n", | |
"\\end{description*}\n" | |
], | |
"text/markdown": [ | |
"PassengerId\n", | |
": 0Survived\n", | |
": 418Pclass\n", | |
": 0Name\n", | |
": 0Sex\n", | |
": 0Age\n", | |
": 0SibSp\n", | |
": 0Parch\n", | |
": 0Ticket\n", | |
": 0Fare\n", | |
": 0Cabin\n", | |
": 1014Embarked\n", | |
": 2dataset\n", | |
": 0\n", | |
"\n" | |
], | |
"text/plain": [ | |
"PassengerId Survived Pclass Name Sex Age \n", | |
" 0 418 0 0 0 0 \n", | |
" SibSp Parch Ticket Fare Cabin Embarked \n", | |
" 0 0 0 0 1014 2 \n", | |
" dataset \n", | |
" 0 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Re-count NA entries per column to confirm the imputation took effect.\n", | |
"sapply(orig_all_data, function(column) sum(is.na(column)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Random row split of the training rows: two thirds of the indices are drawn\n", | |
"# for training, the remainder form the validation indices.\n", | |
"# NOTE(review): the later split in this notebook uses createDataPartition();\n", | |
"# confirm whether train_idx / validation_idx are still needed.\n", | |
"n <- nrow(orig_train)\n", | |
"train_idx <- sample(n, size = 2/3 * n)\n", | |
"validation_idx <- setdiff(1:n, train_idx)\n", | |
"\n", | |
"# Keep only the benchmark features for the training rows; the dataset tag is\n", | |
"# needed for the row filter and dropped afterwards.\n", | |
"to_keep <- c(\"Survived\", \"Pclass\", \"Sex\", \"Age\", \"Fare\", \"SibSp\", \"Parch\", \"dataset\")\n", | |
"basic_df <- orig_all_data[orig_all_data$dataset == \"train\", to_keep]\n", | |
"basic_df[[\"dataset\"]] <- NULL" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<thead><tr><th scope=col>Survived</th><th scope=col>Pclass</th><th scope=col>Sex</th><th scope=col>Age</th><th scope=col>Fare</th><th scope=col>SibSp</th><th scope=col>Parch</th></tr></thead>\n", | |
"<tbody>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>22 </td><td> 7.2500</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>38 </td><td> 71.2833</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>26 </td><td> 7.9250</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>35 </td><td> 53.1000</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>35 </td><td> 8.0500</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>28 </td><td> 8.4583</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>1 </td><td>1 </td><td>54 </td><td> 51.8625</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td> 2 </td><td> 21.0750</td><td>3 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>27 </td><td> 11.1333</td><td>0 </td><td>2 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>0 </td><td>14 </td><td> 30.0708</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td> 4 </td><td> 16.7000</td><td>1 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>58 </td><td> 26.5500</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>20 </td><td> 8.0500</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>39 </td><td> 31.2750</td><td>1 </td><td>5 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td>14 </td><td> 7.8542</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>0 </td><td>55 </td><td> 16.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td> 2 </td><td> 29.1250</td><td>4 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>1 </td><td>28 </td><td> 13.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td>31 </td><td> 18.0000</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>28 </td><td> 7.2250</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>2 </td><td>1 </td><td>35 </td><td> 26.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>1 </td><td>34 </td><td> 13.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>15 </td><td> 8.0292</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>1 </td><td>28 </td><td> 35.5000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td> 8 </td><td> 21.0750</td><td>3 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>38 </td><td> 31.3875</td><td>1 </td><td>5 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>28 </td><td> 7.2250</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>1 </td><td>1 </td><td>19 </td><td>263.0000</td><td>3 </td><td>2 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>28 </td><td> 7.8792</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>28 </td><td> 7.8958</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td><td>⋮</td></tr>\n", | |
"\t<tr><td>0 </td><td>2 </td><td>1 </td><td>21 </td><td>11.5000</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>48 </td><td>25.9292</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td>28 </td><td>69.5500</td><td>8 </td><td>2 </td></tr>\n", | |
"\t<tr><td>0 </td><td>2 </td><td>1 </td><td>24 </td><td>13.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>0 </td><td>42 </td><td>13.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>0 </td><td>27 </td><td>13.8583</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>1 </td><td>1 </td><td>31 </td><td>50.4958</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>28 </td><td> 9.5000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>1 </td><td> 4 </td><td>11.1333</td><td>1 </td><td>1 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>26 </td><td> 7.8958</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>47 </td><td>52.5542</td><td>1 </td><td>1 </td></tr>\n", | |
"\t<tr><td>0 </td><td>1 </td><td>1 </td><td>33 </td><td> 5.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>47 </td><td> 9.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>0 </td><td>28 </td><td>24.0000</td><td>1 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>15 </td><td> 7.2250</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>20 </td><td> 9.8458</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>19 </td><td> 7.8958</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>28 </td><td> 7.8958</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>56 </td><td>83.1583</td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>2 </td><td>0 </td><td>25 </td><td>26.0000</td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>33 </td><td> 7.8958</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td>22 </td><td>10.5167</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>2 </td><td>1 </td><td>28 </td><td>10.5000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>25 </td><td> 7.0500</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td>39 </td><td>29.1250</td><td>0 </td><td>5 </td></tr>\n", | |
"\t<tr><td>0 </td><td>2 </td><td>1 </td><td>27 </td><td>13.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>19 </td><td>30.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>0 </td><td>28 </td><td>23.4500</td><td>1 </td><td>2 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>1 </td><td>26 </td><td>30.0000</td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>32 </td><td> 7.7500</td><td>0 </td><td>0 </td></tr>\n", | |
"</tbody>\n", | |
"</table>\n" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|lllllll}\n", | |
" Survived & Pclass & Sex & Age & Fare & SibSp & Parch\\\\\n", | |
"\\hline\n", | |
"\t 0 & 3 & 1 & 22 & 7.2500 & 1 & 0 \\\\\n", | |
"\t 1 & 1 & 0 & 38 & 71.2833 & 1 & 0 \\\\\n", | |
"\t 1 & 3 & 0 & 26 & 7.9250 & 0 & 0 \\\\\n", | |
"\t 1 & 1 & 0 & 35 & 53.1000 & 1 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 35 & 8.0500 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 28 & 8.4583 & 0 & 0 \\\\\n", | |
"\t 0 & 1 & 1 & 54 & 51.8625 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 2 & 21.0750 & 3 & 1 \\\\\n", | |
"\t 1 & 3 & 0 & 27 & 11.1333 & 0 & 2 \\\\\n", | |
"\t 1 & 2 & 0 & 14 & 30.0708 & 1 & 0 \\\\\n", | |
"\t 1 & 3 & 0 & 4 & 16.7000 & 1 & 1 \\\\\n", | |
"\t 1 & 1 & 0 & 58 & 26.5500 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 20 & 8.0500 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 39 & 31.2750 & 1 & 5 \\\\\n", | |
"\t 0 & 3 & 0 & 14 & 7.8542 & 0 & 0 \\\\\n", | |
"\t 1 & 2 & 0 & 55 & 16.0000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 2 & 29.1250 & 4 & 1 \\\\\n", | |
"\t 1 & 2 & 1 & 28 & 13.0000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 0 & 31 & 18.0000 & 1 & 0 \\\\\n", | |
"\t 1 & 3 & 0 & 28 & 7.2250 & 0 & 0 \\\\\n", | |
"\t 0 & 2 & 1 & 35 & 26.0000 & 0 & 0 \\\\\n", | |
"\t 1 & 2 & 1 & 34 & 13.0000 & 0 & 0 \\\\\n", | |
"\t 1 & 3 & 0 & 15 & 8.0292 & 0 & 0 \\\\\n", | |
"\t 1 & 1 & 1 & 28 & 35.5000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 0 & 8 & 21.0750 & 3 & 1 \\\\\n", | |
"\t 1 & 3 & 0 & 38 & 31.3875 & 1 & 5 \\\\\n", | |
"\t 0 & 3 & 1 & 28 & 7.2250 & 0 & 0 \\\\\n", | |
"\t 0 & 1 & 1 & 19 & 263.0000 & 3 & 2 \\\\\n", | |
"\t 1 & 3 & 0 & 28 & 7.8792 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 28 & 7.8958 & 0 & 0 \\\\\n", | |
"\t ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮ & ⋮\\\\\n", | |
"\t 0 & 2 & 1 & 21 & 11.5000 & 1 & 0 \\\\\n", | |
"\t 1 & 1 & 0 & 48 & 25.9292 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 0 & 28 & 69.5500 & 8 & 2 \\\\\n", | |
"\t 0 & 2 & 1 & 24 & 13.0000 & 0 & 0 \\\\\n", | |
"\t 1 & 2 & 0 & 42 & 13.0000 & 0 & 0 \\\\\n", | |
"\t 1 & 2 & 0 & 27 & 13.8583 & 1 & 0 \\\\\n", | |
"\t 0 & 1 & 1 & 31 & 50.4958 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 28 & 9.5000 & 0 & 0 \\\\\n", | |
"\t 1 & 3 & 1 & 4 & 11.1333 & 1 & 1 \\\\\n", | |
"\t 0 & 3 & 1 & 26 & 7.8958 & 0 & 0 \\\\\n", | |
"\t 1 & 1 & 0 & 47 & 52.5542 & 1 & 1 \\\\\n", | |
"\t 0 & 1 & 1 & 33 & 5.0000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 47 & 9.0000 & 0 & 0 \\\\\n", | |
"\t 1 & 2 & 0 & 28 & 24.0000 & 1 & 0 \\\\\n", | |
"\t 1 & 3 & 0 & 15 & 7.2250 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 20 & 9.8458 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 19 & 7.8958 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 28 & 7.8958 & 0 & 0 \\\\\n", | |
"\t 1 & 1 & 0 & 56 & 83.1583 & 0 & 1 \\\\\n", | |
"\t 1 & 2 & 0 & 25 & 26.0000 & 0 & 1 \\\\\n", | |
"\t 0 & 3 & 1 & 33 & 7.8958 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 0 & 22 & 10.5167 & 0 & 0 \\\\\n", | |
"\t 0 & 2 & 1 & 28 & 10.5000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 25 & 7.0500 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 0 & 39 & 29.1250 & 0 & 5 \\\\\n", | |
"\t 0 & 2 & 1 & 27 & 13.0000 & 0 & 0 \\\\\n", | |
"\t 1 & 1 & 0 & 19 & 30.0000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 0 & 28 & 23.4500 & 1 & 2 \\\\\n", | |
"\t 1 & 1 & 1 & 26 & 30.0000 & 0 & 0 \\\\\n", | |
"\t 0 & 3 & 1 & 32 & 7.7500 & 0 & 0 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/markdown": [ | |
"\n", | |
"Survived | Pclass | Sex | Age | Fare | SibSp | Parch | \n", | |
"|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", | |
"| 0 | 3 | 1 | 22 | 7.2500 | 1 | 0 | \n", | |
"| 1 | 1 | 0 | 38 | 71.2833 | 1 | 0 | \n", | |
"| 1 | 3 | 0 | 26 | 7.9250 | 0 | 0 | \n", | |
"| 1 | 1 | 0 | 35 | 53.1000 | 1 | 0 | \n", | |
"| 0 | 3 | 1 | 35 | 8.0500 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 28 | 8.4583 | 0 | 0 | \n", | |
"| 0 | 1 | 1 | 54 | 51.8625 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 2 | 21.0750 | 3 | 1 | \n", | |
"| 1 | 3 | 0 | 27 | 11.1333 | 0 | 2 | \n", | |
"| 1 | 2 | 0 | 14 | 30.0708 | 1 | 0 | \n", | |
"| 1 | 3 | 0 | 4 | 16.7000 | 1 | 1 | \n", | |
"| 1 | 1 | 0 | 58 | 26.5500 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 20 | 8.0500 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 39 | 31.2750 | 1 | 5 | \n", | |
"| 0 | 3 | 0 | 14 | 7.8542 | 0 | 0 | \n", | |
"| 1 | 2 | 0 | 55 | 16.0000 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 2 | 29.1250 | 4 | 1 | \n", | |
"| 1 | 2 | 1 | 28 | 13.0000 | 0 | 0 | \n", | |
"| 0 | 3 | 0 | 31 | 18.0000 | 1 | 0 | \n", | |
"| 1 | 3 | 0 | 28 | 7.2250 | 0 | 0 | \n", | |
"| 0 | 2 | 1 | 35 | 26.0000 | 0 | 0 | \n", | |
"| 1 | 2 | 1 | 34 | 13.0000 | 0 | 0 | \n", | |
"| 1 | 3 | 0 | 15 | 8.0292 | 0 | 0 | \n", | |
"| 1 | 1 | 1 | 28 | 35.5000 | 0 | 0 | \n", | |
"| 0 | 3 | 0 | 8 | 21.0750 | 3 | 1 | \n", | |
"| 1 | 3 | 0 | 38 | 31.3875 | 1 | 5 | \n", | |
"| 0 | 3 | 1 | 28 | 7.2250 | 0 | 0 | \n", | |
"| 0 | 1 | 1 | 19 | 263.0000 | 3 | 2 | \n", | |
"| 1 | 3 | 0 | 28 | 7.8792 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 28 | 7.8958 | 0 | 0 | \n", | |
"| ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | \n", | |
"| 0 | 2 | 1 | 21 | 11.5000 | 1 | 0 | \n", | |
"| 1 | 1 | 0 | 48 | 25.9292 | 0 | 0 | \n", | |
"| 0 | 3 | 0 | 28 | 69.5500 | 8 | 2 | \n", | |
"| 0 | 2 | 1 | 24 | 13.0000 | 0 | 0 | \n", | |
"| 1 | 2 | 0 | 42 | 13.0000 | 0 | 0 | \n", | |
"| 1 | 2 | 0 | 27 | 13.8583 | 1 | 0 | \n", | |
"| 0 | 1 | 1 | 31 | 50.4958 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 28 | 9.5000 | 0 | 0 | \n", | |
"| 1 | 3 | 1 | 4 | 11.1333 | 1 | 1 | \n", | |
"| 0 | 3 | 1 | 26 | 7.8958 | 0 | 0 | \n", | |
"| 1 | 1 | 0 | 47 | 52.5542 | 1 | 1 | \n", | |
"| 0 | 1 | 1 | 33 | 5.0000 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 47 | 9.0000 | 0 | 0 | \n", | |
"| 1 | 2 | 0 | 28 | 24.0000 | 1 | 0 | \n", | |
"| 1 | 3 | 0 | 15 | 7.2250 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 20 | 9.8458 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 19 | 7.8958 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 28 | 7.8958 | 0 | 0 | \n", | |
"| 1 | 1 | 0 | 56 | 83.1583 | 0 | 1 | \n", | |
"| 1 | 2 | 0 | 25 | 26.0000 | 0 | 1 | \n", | |
"| 0 | 3 | 1 | 33 | 7.8958 | 0 | 0 | \n", | |
"| 0 | 3 | 0 | 22 | 10.5167 | 0 | 0 | \n", | |
"| 0 | 2 | 1 | 28 | 10.5000 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 25 | 7.0500 | 0 | 0 | \n", | |
"| 0 | 3 | 0 | 39 | 29.1250 | 0 | 5 | \n", | |
"| 0 | 2 | 1 | 27 | 13.0000 | 0 | 0 | \n", | |
"| 1 | 1 | 0 | 19 | 30.0000 | 0 | 0 | \n", | |
"| 0 | 3 | 0 | 28 | 23.4500 | 1 | 2 | \n", | |
"| 1 | 1 | 1 | 26 | 30.0000 | 0 | 0 | \n", | |
"| 0 | 3 | 1 | 32 | 7.7500 | 0 | 0 | \n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
" Survived Pclass Sex Age Fare SibSp Parch\n", | |
"1 0 3 1 22 7.2500 1 0 \n", | |
"2 1 1 0 38 71.2833 1 0 \n", | |
"3 1 3 0 26 7.9250 0 0 \n", | |
"4 1 1 0 35 53.1000 1 0 \n", | |
"5 0 3 1 35 8.0500 0 0 \n", | |
"6 0 3 1 28 8.4583 0 0 \n", | |
"7 0 1 1 54 51.8625 0 0 \n", | |
"8 0 3 1 2 21.0750 3 1 \n", | |
"9 1 3 0 27 11.1333 0 2 \n", | |
"10 1 2 0 14 30.0708 1 0 \n", | |
"11 1 3 0 4 16.7000 1 1 \n", | |
"12 1 1 0 58 26.5500 0 0 \n", | |
"13 0 3 1 20 8.0500 0 0 \n", | |
"14 0 3 1 39 31.2750 1 5 \n", | |
"15 0 3 0 14 7.8542 0 0 \n", | |
"16 1 2 0 55 16.0000 0 0 \n", | |
"17 0 3 1 2 29.1250 4 1 \n", | |
"18 1 2 1 28 13.0000 0 0 \n", | |
"19 0 3 0 31 18.0000 1 0 \n", | |
"20 1 3 0 28 7.2250 0 0 \n", | |
"21 0 2 1 35 26.0000 0 0 \n", | |
"22 1 2 1 34 13.0000 0 0 \n", | |
"23 1 3 0 15 8.0292 0 0 \n", | |
"24 1 1 1 28 35.5000 0 0 \n", | |
"25 0 3 0 8 21.0750 3 1 \n", | |
"26 1 3 0 38 31.3875 1 5 \n", | |
"27 0 3 1 28 7.2250 0 0 \n", | |
"28 0 1 1 19 263.0000 3 2 \n", | |
"29 1 3 0 28 7.8792 0 0 \n", | |
"30 0 3 1 28 7.8958 0 0 \n", | |
"⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ \n", | |
"862 0 2 1 21 11.5000 1 0 \n", | |
"863 1 1 0 48 25.9292 0 0 \n", | |
"864 0 3 0 28 69.5500 8 2 \n", | |
"865 0 2 1 24 13.0000 0 0 \n", | |
"866 1 2 0 42 13.0000 0 0 \n", | |
"867 1 2 0 27 13.8583 1 0 \n", | |
"868 0 1 1 31 50.4958 0 0 \n", | |
"869 0 3 1 28 9.5000 0 0 \n", | |
"870 1 3 1 4 11.1333 1 1 \n", | |
"871 0 3 1 26 7.8958 0 0 \n", | |
"872 1 1 0 47 52.5542 1 1 \n", | |
"873 0 1 1 33 5.0000 0 0 \n", | |
"874 0 3 1 47 9.0000 0 0 \n", | |
"875 1 2 0 28 24.0000 1 0 \n", | |
"876 1 3 0 15 7.2250 0 0 \n", | |
"877 0 3 1 20 9.8458 0 0 \n", | |
"878 0 3 1 19 7.8958 0 0 \n", | |
"879 0 3 1 28 7.8958 0 0 \n", | |
"880 1 1 0 56 83.1583 0 1 \n", | |
"881 1 2 0 25 26.0000 0 1 \n", | |
"882 0 3 1 33 7.8958 0 0 \n", | |
"883 0 3 0 22 10.5167 0 0 \n", | |
"884 0 2 1 28 10.5000 0 0 \n", | |
"885 0 3 1 25 7.0500 0 0 \n", | |
"886 0 3 0 39 29.1250 0 5 \n", | |
"887 0 2 1 27 13.0000 0 0 \n", | |
"888 1 1 0 19 30.0000 0 0 \n", | |
"889 0 3 0 28 23.4500 1 2 \n", | |
"890 1 1 1 26 30.0000 0 0 \n", | |
"891 0 3 1 32 7.7500 0 0 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"basic_df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Split training data into training and validation sets." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# fix the RNG so the random 70/30 split (and everything fitted on it)\n", | |
"# is the same on every Restart & Run All\n", | |
"set.seed(42)\n", | |
"\n", | |
"train_index = createDataPartition(\n", | |
"    basic_df$Survived, p=0.7, list=FALSE)\n", | |
"\n", | |
"# rows picked by the partition train the model, the rest validate it\n", | |
"train_ = basic_df[train_index, ]\n", | |
"test_ = basic_df[-train_index, ]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Loading required package: randomForest\n", | |
"randomForest 4.6-12\n", | |
"Type rfNews() to see new features/changes/bug fixes.\n", | |
"\n", | |
"Attaching package: ‘randomForest’\n", | |
"\n", | |
"The following object is masked from ‘package:ggplot2’:\n", | |
"\n", | |
" margin\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"# 10-fold cross-validation repeated 20 times. The default summary\n", | |
"# (Accuracy/Kappa) is kept: twoClassSummary needs classProbs=TRUE, which\n", | |
"# caret rejects for class levels such as '0'/'1' that are not valid R names.\n", | |
"# allowParallel is a trainControl() option, so it is set here.\n", | |
"train_settings = trainControl(\n", | |
"    method = \"repeatedcv\", number=10, repeats=20,\n", | |
"    allowParallel=TRUE)\n", | |
"\n", | |
"# the argument is `trControl`; the misspelt `tfControl` was silently\n", | |
"# ignored, so train() fell back to its default 25 bootstrap resamples\n", | |
"model = train(Survived ~ ., data=train_,\n", | |
"              method=\"rf\", trControl=train_settings)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Random Forest \n", | |
"\n", | |
"625 samples\n", | |
" 6 predictors\n", | |
" 2 classes: '0', '1' \n", | |
"\n", | |
"No pre-processing\n", | |
"Resampling: Bootstrapped (25 reps) \n", | |
"Summary of sample sizes: 625, 625, 625, 625, 625, 625, ... \n", | |
"Resampling results across tuning parameters:\n", | |
"\n", | |
" mtry Accuracy Kappa \n", | |
" 2 0.8043944 0.5778023\n", | |
" 4 0.7962442 0.5662643\n", | |
" 6 0.7841991 0.5423094\n", | |
"\n", | |
"Accuracy was used to select the optimal model using the largest value.\n", | |
"The final value used for the model was mtry = 2.\n" | |
] | |
} | |
], | |
"source": [ | |
"print(model)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"'validation acc. = 84.6 %'" | |
], | |
"text/latex": [ | |
"'validation acc. = 84.6 \\%'" | |
], | |
"text/markdown": [ | |
"'validation acc. = 84.6 %'" | |
], | |
"text/plain": [ | |
"[1] \"validation acc. = 84.6 %\"" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# score the fitted model on the held-out validation rows\n", | |
"predictions = predict(object = model, newdata = test_)\n", | |
"\n", | |
"# accuracy = share of validation rows whose predicted class equals the label\n", | |
"val_acc = mean(predictions == test_$Survived)\n", | |
"sprintf(\"validation acc. = %.1f %%\", val_acc*100)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWgAAAFoCAMAAABNO5HnAAADAFBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19gYGBhYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29w\ncHBxcXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGC\ngoKDg4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OU\nlJSVlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWm\npqanp6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4\nuLi5ubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnK\nysrLy8vMzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc\n3Nzd3d3e3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u\n7u7v7+/w8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7////i\nsF19AAAACXBIWXMAABJ0AAASdAHeZh94AAAU1UlEQVR4nO2daWBURbaAT3azkI0lQJQACWtY\nBgLiiCCLC4YxghBUFhlhXBAejCzDG3ksylMhwQGGJ6KgPIEQJIALm4CKCiq8wLCJWyJRRCJL\nwhpCCN1nqm5vN4RO973d96Rfcr4fdW9X6lQVH911bzrVpwEZEqC6J1BbYNFEsGgiWDQRLJoI\nFk0EiyaCRRPBoolg0USwaCJYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUT\nwaKJYNFEsGgiWDQRLJoIFk0Eiyai+kSPBRs7Pe1qEEC5F2ZkKL4sOnfmzDy3utIl2u3evUO1\nin5spkKBkxavA2x1q6tJHTtqF+12796hWkVvrLqFsSpqp+gfhjSP6vmaWZyZc+5qENpi7K+I\nfeWy0g3jIUVUbwRYipgC8ZdH1vlV1VzhfghBbAXJ394T1eUf5pw7w9u8KX4mAn9Mi+k4Q3my\nl878Y2STB7bJU0snrSy9O4aT9cUjm9QbWCAbFT3bMazV2LMV5+YJviH6k3BlsX5Y/GsWWNbt\nlhdxRD2AhoNuFJ0O8KuquYJVdGysrB3oJ8scKbpZc3naz4R4to2l2+fR1klvS++O4UR9bIpy\nXoZY0FipvrWowtw8wQcuhouwvC002bjvcYANWB4BCfPX3APwge3FXVF0IDTo8KujuQWraOiY\n0U/01ynjPoA0GQh1Zs8TxtYjjhFms+dHg98BWyeFSu/q4YTluJkvN1D+/x8DeGrDOIAx6rl5\nhE+I3gawBrE8Hnri8f79xb9oO8C8m4uG6SZ1cwtW0YG/45UICD6NJRHQXhG9Gf
EIwL1Y4g+t\nxAqyA2C0rRNL7+rhRP1hxBUAr2KxeB0gmlpAs8qD6aRaRQ+ZJvka/ylepImJibdAA1Fv2vf6\nM00AMp08o8XrukJziVV0c3GaCC2VMlkG1pUv9z9AEzwE8II4NcfBXbZObBdDx3ApUF883gsw\nG/cBLBDnvx05Yq40mE58Yo2ebL+lvoTZceIQ41R04xuaK1hFt0KrYptoeYqpEGjepEQjdoZb\nbZ1Ye1cNJ9ZuUZ0rRb8H8M5N5uYJPiE6Q3nRKnwXAK1e/253BdGdRP0628WwYnMLzkQrz+hO\n4vVve0Y3hDtsnVh6Vw+nEv21XD8qzc0jfEK0eAItE4dDu/bgEoC1iPNsorcgtoDYq0pru2hV\ncwvORMs1+iBAqlyjW5cra/ETatFbKgynEn1GXDvNaO4X39lcaTCd+IToknhovGHP/GCh4X8B\ner6bEa78y5cCvFQuXvrwwMIRoBKtam7BqejI/85sCPAR4jMAfdaIuw7//XbRSu/q4VSiMU38\nl7wrbjWerDyYTnxCNK4PUlbB1r/h8Qh5Im4ApiF+ofxKsQWsNXbRquYWnImOC5XNhogF5Gxr\nyzo7A+2dKL2rh1OL/qG+0rxZUeXBdOIbovFfD94W1nZWsTj7/I6wjhnngyD6Cpr/s54Qjdkd\nwlPmf6gWrWqu4Ex0yva7Izu+bJJNSqd3q3Nb6sfy1NqJpXfVcGrReOovyaGtnyu+yWA6qcHv\nR1tuV3wFFk0EiyaCRRNRg0X7FiyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk0EiyaCRRPB\noolg0USwaCJYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIHol2JqMHH/ctMCgegR\nfdbWXEI/cNMChejRxo9RbUSyaBpYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsm\ngkUTwaKJYNFEsGgiWDQRRojelxYf2nZqkea5sGiJ+6L3BHVdnD0tNuWq1rmwaIn7oh9qWYoy\nW9k6rXOpkaLPWo8GiG4zWDlM/gjRvKxzWPtsxE9hB2JO4MGqA2ug6DVxEJetnBkgenjIXNv6\n/KrfuOzhsArxiRalFxr/l4vAmif6aDAABH8jTw0Qfbo/BPV8MdeMeClqqnj8SJJ4AdWfNaF1\nqYvA9AE/If5+oAYVs5UkmhPlw/Acr4tG/G3FyDjofhL3gvzP/MD/GmJWcOCXrsIeSS9EPJdf\ng4p/KKJfkA8j3M1I77ZoU6nM8Vm2MiJd5nJW+BnxWr3WJleRNW/pOBEjs3vLb7owYOnIg+3K\ncXQ87oZtByQliHPiQpa6iqx5ojH33vh7c5Uz74s2RafJ5+61Tr2wOPRtcbbkCTP+EPLujJhT\nLiJroGgHBlwMs6DD7GUvdwrciTg9bObaKf6z0dTjPvOVZiNcBLJoiYaL4c5+8SGJQ/aLM/OC\ndqGtF5hxcUge4ib4tOo4Fi3hN5U8g0UTwaKJYNFEsGgiWDQRLJoIFk0EiyaCRRPBoolg0USw\naCJYNBEGiJ5g/QPWDq1zYdESDaLDlyto/gJFFi3RILquvqmwaAUdoi880zi42SwTYsLq6bF5\ntl1LzvFZ0Su7JD51xsM+jBAdc0ZSgqNjZmSNgSwhuk/i+CL7riWn+KroDfKKc7eHnRh3MZyJ\nA+QGg/ZThOimJapdS055NP13xPP5Plc8YN2c4lEv3t9AgxPq7JD8JM9PZoVMEqInonrXkjMG\nD8hHLDzgc8VdiuijnvViwJYw+xp9KLVh/f5xUvQcVO9acoavLh0L5LyTrnvWiYEXwwthow6a\nsasUnYnqXUvO8FXRpvFB0Omwh50YKHo7HEc8E24V7di15BRfFY1Y5uk9h6GijwWkrVjYMjrl\noCLavmvJOb4r2gsYeR+dlRR+55ZtjTItom27lpzDoiX8ppJnsGgiWDQRLJoIFk0E
iyaCRRPB\noolg0USwaCJYNBEsmggWTQSLJsIY0X1gqo65sGiJFtEn/SOaVfXGsxNYtESL6AWBr8Ne7XNh\n0RItorullYQ9J0/KJjZpMu21ZEdypSqpPtHvDx2x1eAhjBD9E6zFR+JlLonHIue9k5KQrEqu\nVBXVJnqR3E/wtrFjGCH6pcgruB52IR6F9YjnI5Pd2aYkGDr0ImLpKfqisRTdwtgx6rzvfdHt\nhl66dDpsLOKyYLnrJD3ZnW1Kgof7H0b8+Qvy4liQFB1u7EDh73pd9BHLlqS46zgzXj6ekOzO\nNiWsxqVD2fPVz9gxDFg6nr9l565du+bAJ7gkRFmok93ZpoTVKPqbpmLlyDd2DO+LNjd/WB4u\nBj+J+0EsTJdik93ZpoTVeddR+tmuMoOH8L7oPbBaOf6p7jV8KOrVld3atnNnmxLyfbQFt0VP\nCL6gHN+GrVjydFzLhfN6urNNCVm0BT1vKpVfkmv0xMFuNmfREj2i8+AzRFMLFyuGHRYt0SPa\nfFfbLZ89Euzu5+BYtETX+9GFw+pHdd/jbmsWLeE3/j2DRRPBoolg0USwaCJYNBEsmggWTQSL\nJoJFE2GA6EnyD0NhXdV/jIxb5E4gi5ZoEB26fPnyzGT4xFHFog0RHSXLc7EDHFUs2jjR2DcZ\nsXx6y4ieey2i7RmWDt4fHTvwuONoxxDR32fM/8WAbjVjmOjrzXsjjo7IWNMjplARbcuwVBLX\n7Y159e+1Hx0YIfqDIHG92OX9fjVjhOg6hYWF3zwF/8Q8/9WI50PfUkTbMizlwpeIq0aZbUdH\n4PA/lyOarni1SJRX5ju93Kmewqi7DoAnynGVn/wj/oli2xqtZFg6FXb7+xfFA9vRwYDU/Yj5\nn3qzKPOTU4nxbqe6ijBXWzx1iA7bunXrdvmVZ6/YMndI0fYMS7t7+Qc8vMdxtGPEMzpJiu5e\nQ5/RUbaz5QFiCDx2Qop2ZFhCLN7QOyBPdbRixBq9KQQg/Cvv96sZQ0UfhhzE0vpzpWh7hqV1\nSUVyZ+9m29ERaMhdR/78RScM6FYzhorGIXUyc1Ijf5Gi7RmW8oL7LnujV/QZ29ERyPfREl2i\nr05qHnG39T7anmHpw85hsfflov1oh0VL+E0lz2DRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFE\nsGgiWDQRLJoIFk0EiybCCNH70uJD204tQrwEIzEPDlSqdwKLlrgvek9Q18XZ02JTruKVvnNV\noh31TmDREvdFP9SyVJS5sE555BBdsf4muCv6+03H3J6Mz2CA6DaWz35P/kj5pps8+HhQbIuZ\n5er6Ajg6pG6rl0w3BLopeixAwAy3Z+MrGCB6eMhc2zqsiG4wMmc8TFDXF0BS+spn4e83BLon\nerOyPUdHtrfqxQDRp/tDUM8Xc+VeL0X0QHE2Oei0qr4A+olyYlhxxcCBMqdSwRcuihmK6IUu\n2/lYYUBOJcTfVoyMg+4nraLXo8yztFNVX6DkTjkMuyuGDR16SebUclEsUUSvc9nOxwrvZwkz\nlcq1t2xlRLpV9NfiYRGsUdUXwBfitBjWVIx0b+k411R47lDq7nR8Be8vHXmwXTmOjlc9ow/C\n16r6AuXO4xv4vGKkmxfD09MGvXje3dn4DN4XbYpOk0/da516WUU/JNbjcVEXVfUFMEicTQ45\nWzGS76Ml7q/RWdBh9rKXOwXutIq+bejKp2COur4AIh5fNRb+dkMgi5ZouBju7BcfkjhkP1pF\nH30wuu1Cs7q+ADYNiEmafeOXi7JoiTffVCqA3JtVs2gJi/YMFk0EiyaC3/gngkUTwaKJYNFE\nsGgiWDQRLJoIFk0EiyaCRRNBk7yqIgmZN69n0RKPkldVhEVXhUfJqyqiX/TV392eha9hbPIq\ne8KqhNXTY/NsqawSMl5IjBxUebOjK9GmCYHQdp/b8/AtDE5eZU1YhQl9EscX2VJZJT
QeuHFW\n0LhKga5E/49c/JtU/cVmPouhyavsCaswoWmJI5VVQlvxHB92Z6XAwQPyEQsPOC36KFtnDlTV\nxHeL8Bzvi7Ynr5IoCaswYSI6UlklTJGtulUKfDRdLMHn850WaUq/P1bVxHeLiA3eF21PXuVI\nWJUwR5XKKiHj5qJdLR3K9sYeOr412BcwNAONI2GVvNGwp7JS7jp0iMacP7Z4+rTb8/AtDBVt\nT1iluLWnstIt+v8zhoq2J6yyuLWlsmLRVaEreZU9YZXi1pbKikVXBb+p5BksmggWTQSLJoJF\nE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk2ET4nuMFVhcIs79NG0\ni7649q30Djh+qruE+JDo1fdYuDUwVh8QqS8uNEhfXAyk3OMuqe7m6iMQbWNO5T+Qu4ffp/ri\nnr9PX9wF2K8vsCpYdGVYtEZYtEZYtDZYNBEsmggWTQSLJmJe5Y+3uEfgF/ripj+gL+4yHNIX\nWBWEos/9oDNw340pC93kzE86B/w/Az7nQSi6dsOiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJ\nYNFEsGgi6ETndIvqc9OM6c65/mqbsHaLrusJNvUZpmfQIw/GNnnRpGdAV5CJ3uQ/dGmPqDxN\nMa/4jV89xu8FPcGLYZiOQb+vl7psjPxyGT2zrRoy0b37mPFio6laQswxT4ryP0LLtQcfi4gY\npmPQJ3uKl8+kQbpm6wIq0WfhTVE+laQl5gTItJE5cExzsKn3iO7DtA96tU4WXtc7WxdQiT4E\nX4pyQYiWt9RLv7siyr8Gl2gOfq1hkRStNe4nyOoRHD+rXNdsXUAlejt8K8oVcEFr4PKACZqD\nj4W/j1K01rjdEDZ584zAGfpn6xw60d+J8h0467JlBU4MhqFlWoNNvYVki2htg26C50T517By\nnbOtCirRB+ErUS4M1vZiXBvdfIP24Ldi8s+d65Z+7prWQffCDlF+CD/qm22V0F0Ml4tybHNN\nQTkwqlRHsDUVIrynddDjsFGU6+GkrtlWDdntXa8/iat60ylaQsoajDbrCc7fKWh3z84zWgc1\nd3hMlMNvNeuZrQvofmHxm7hpQFS+lpBPYEym5LKe4O7DdAyaBY+vGgWrdM3WBXS/gr/bJbK3\ntuTFb1qXgEI9wYpozXEru0Z0ztET6BJ+U4kIFk0EiyaCRRPBoolg0USwaCJYNBEsmggWTQSL\nJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk2Eb4lOgYXK8WwA\nHKmq3Vi5Uywh/aiHw33Zv+4trZ7OQ7wEIx21FR54DV8TfZdyfAtciI5cvnzp5KYRH3s0WkZA\nx78vmdwo4iu80neuo7rCA6/hY6I7+P0mj6kdXIiOk2Vxl9Y601opfA4TrsleOrah+E5yHxM9\nqskicTgf9KIQbV7WOax9tnh44ZnGwc1mmRATVma2iUwvsonGbfAhOpqVT28Z0VN+yfDq6bF5\nlYMP3h8dO/C4qv3tyeVKL9+uvKx8MbG9d+Vbir2Nj4kePfFucVhVb6cQ/arfuOzhcrPy6JgZ\nWWMgSxjonrZ5UfCzdtHXA19GVbOIjDU9YgoxoU/i+KJKwSVx3d6YV/9eR/uLyvXg8hnBVYto\nW++1QvTXfoWIA5/cBUcuRcnPUz6ShDhgqThpP0UYkEvFyNvtorHpX9DeLM9/tXgthL6FCU1L\nsHJwrvxE26pRZvtPcmG7OJkmL6vZFtG23muFaNOti/Fy6EdC9F74RtR84C/XUTyZFTJJGJCf\ndZiaUkG0vdkqvzJxcqIYEyYiVg4+FXb7+xdR9ZNd8Ik4yduxY7u/VbSt91ohGif0xpzoMiF6\nnXW//894KLVh/f5xUnQGVhAtlw57s1fqWjtJmIN4k+DdvfwDHt7j+MkZeN3S/hfbM9rW
e+0Q\nvcv/1KMjUYjeDdsOSEouhI06aMauk6wGVKLlxdDebHmAvLYdO6G0ukmwuL/Y0Dsgz/4TbNbF\npPTyhk20rffaIdrUaEHERim6OPRtUbPkCfN2EPcKZ8Iri1Zu7+zNDkMOYmn9uUqrysHrkork\nZ5A323+Ca+F5eXf4Xb3aKRrHRde5KkXj9LCZa6f4z8ZjAWkrFraMTjmoEi1+YVn2t6bh8vOX\ntmY4pE5mTmrkL5ZWlYLzgvsue6NX9BlHe/MYSHl+8TPhL8TXTtGfwWOoiDYvaBfaeoH4ZSIr\nKfzOLdsaZapEy1X2tkHysuZodnVS84i791o9VQ7+sHNY7H25qp8gvte/UVzaJkyrfaJrMCya\nCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk0EiyaCRRPBoolg0USwaCJYNBEsmggWTQSLJoJF\nE8GiiWDRRLBoIlg0Ef8GLuE1zr6Ek9kAAAAASUVORK5CYII=", | |
"text/plain": [ | |
"Plot with title “Feature importance”" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"options(repr.plot.width=3, repr.plot.height=3)\n", | |
"\n", | |
"varImpPlot(model$finalModel,\n", | |
" pch=20, main=\"Feature importance\", cex=0.8)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"'F1 score = 0.87'" | |
], | |
"text/latex": [ | |
"'F1 score = 0.87'" | |
], | |
"text/markdown": [ | |
"'F1 score = 0.87'" | |
], | |
"text/plain": [ | |
"[1] \"F1 score = 0.87\"" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"y = test_$Survived\n", | |
"\n", | |
"# F1 score: harmonic mean of precision and recall for the `positive` class.\n", | |
"#   predictions: factor of predicted classes\n", | |
"#   true_vals:   factor of observed classes\n", | |
"#   positive:    level scored as the positive class. Defaults to the FIRST\n", | |
"#                level of `true_vals` (caret's own default), so with levels\n", | |
"#                '0'/'1' the default reports F1 for class '0'; pass\n", | |
"#                positive='1' to score the survivors instead.\n", | |
"# Returns NaN when precision + recall is 0.\n", | |
"f1_score = function(predictions, true_vals, positive = levels(true_vals)[1]) {\n", | |
"    precision = caret::posPredValue(predictions, true_vals, positive = positive)\n", | |
"    recall = caret::sensitivity(predictions, true_vals, positive = positive)\n", | |
"    f1 = (2 * precision * recall) / (precision + recall)\n", | |
"    return(f1)\n", | |
"}\n", | |
"\n", | |
"f1 = f1_score(predictions, y)\n", | |
"sprintf(\"F1 score = %.2f\", f1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## More feature engineering\n", | |
"\n", | |
"Turn `Embarked` into one-hot encoded variables." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<thead><tr><th scope=col>PassengerId</th><th scope=col>Survived</th><th scope=col>Pclass</th><th scope=col>Name</th><th scope=col>Sex</th><th scope=col>Age</th><th scope=col>SibSp</th><th scope=col>Parch</th><th scope=col>Ticket</th><th scope=col>Fare</th><th scope=col>Cabin</th><th scope=col>dataset</th><th scope=col>Embarked.C</th><th scope=col>Embarked.Q</th><th scope=col>Embarked.S</th></tr></thead>\n", | |
"<tbody>\n", | |
"\t<tr><td>1 </td><td>0 </td><td>3 </td><td>Braund, Mr. Owen Harris </td><td>1 </td><td>22 </td><td>1 </td><td>0 </td><td>A/5 21171 </td><td> 7.2500 </td><td>NA </td><td>train </td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>2 </td><td>1 </td><td>1 </td><td>Cumings, Mrs. John Bradley (Florence Briggs Thayer)</td><td>0 </td><td>38 </td><td>1 </td><td>0 </td><td>PC 17599 </td><td>71.2833 </td><td>C85 </td><td>train </td><td>1 </td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>3 </td><td>1 </td><td>3 </td><td>Heikkinen, Miss. Laina </td><td>0 </td><td>26 </td><td>0 </td><td>0 </td><td>STON/O2. 3101282 </td><td> 7.9250 </td><td>NA </td><td>train </td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>4 </td><td>1 </td><td>1 </td><td>Futrelle, Mrs. Jacques Heath (Lily May Peel) </td><td>0 </td><td>35 </td><td>1 </td><td>0 </td><td>113803 </td><td>53.1000 </td><td>C123 </td><td>train </td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>5 </td><td>0 </td><td>3 </td><td>Allen, Mr. William Henry </td><td>1 </td><td>35 </td><td>0 </td><td>0 </td><td>373450 </td><td> 8.0500 </td><td>NA </td><td>train </td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>6 </td><td>0 </td><td>3 </td><td>Moran, Mr. James </td><td>1 </td><td>28 </td><td>0 </td><td>0 </td><td>330877 </td><td> 8.4583 </td><td>NA </td><td>train </td><td>0 </td><td>1 </td><td>0 </td></tr>\n", | |
"</tbody>\n", | |
"</table>\n" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|lllllllllllllll}\n", | |
" PassengerId & Survived & Pclass & Name & Sex & Age & SibSp & Parch & Ticket & Fare & Cabin & dataset & Embarked.C & Embarked.Q & Embarked.S\\\\\n", | |
"\\hline\n", | |
"\t 1 & 0 & 3 & Braund, Mr. Owen Harris & 1 & 22 & 1 & 0 & A/5 21171 & 7.2500 & NA & train & 0 & 0 & 1 \\\\\n", | |
"\t 2 & 1 & 1 & Cumings, Mrs. John Bradley (Florence Briggs Thayer) & 0 & 38 & 1 & 0 & PC 17599 & 71.2833 & C85 & train & 1 & 0 & 0 \\\\\n", | |
"\t 3 & 1 & 3 & Heikkinen, Miss. Laina & 0 & 26 & 0 & 0 & STON/O2. 3101282 & 7.9250 & NA & train & 0 & 0 & 1 \\\\\n", | |
"\t 4 & 1 & 1 & Futrelle, Mrs. Jacques Heath (Lily May Peel) & 0 & 35 & 1 & 0 & 113803 & 53.1000 & C123 & train & 0 & 0 & 1 \\\\\n", | |
"\t 5 & 0 & 3 & Allen, Mr. William Henry & 1 & 35 & 0 & 0 & 373450 & 8.0500 & NA & train & 0 & 0 & 1 \\\\\n", | |
"\t 6 & 0 & 3 & Moran, Mr. James & 1 & 28 & 0 & 0 & 330877 & 8.4583 & NA & train & 0 & 1 & 0 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/markdown": [ | |
"\n", | |
"PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | dataset | Embarked.C | Embarked.Q | Embarked.S | \n", | |
"|---|---|---|---|---|---|\n", | |
"| 1 | 0 | 3 | Braund, Mr. Owen Harris | 1 | 22 | 1 | 0 | A/5 21171 | 7.2500 | NA | train | 0 | 0 | 1 | \n", | |
"| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | 0 | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | train | 1 | 0 | 0 | \n", | |
"| 3 | 1 | 3 | Heikkinen, Miss. Laina | 0 | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NA | train | 0 | 0 | 1 | \n", | |
"| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 0 | 35 | 1 | 0 | 113803 | 53.1000 | C123 | train | 0 | 0 | 1 | \n", | |
"| 5 | 0 | 3 | Allen, Mr. William Henry | 1 | 35 | 0 | 0 | 373450 | 8.0500 | NA | train | 0 | 0 | 1 | \n", | |
"| 6 | 0 | 3 | Moran, Mr. James | 1 | 28 | 0 | 0 | 330877 | 8.4583 | NA | train | 0 | 1 | 0 | \n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
" PassengerId Survived Pclass\n", | |
"1 1 0 3 \n", | |
"2 2 1 1 \n", | |
"3 3 1 3 \n", | |
"4 4 1 1 \n", | |
"5 5 0 3 \n", | |
"6 6 0 3 \n", | |
" Name Sex Age SibSp Parch\n", | |
"1 Braund, Mr. Owen Harris 1 22 1 0 \n", | |
"2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) 0 38 1 0 \n", | |
"3 Heikkinen, Miss. Laina 0 26 0 0 \n", | |
"4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35 1 0 \n", | |
"5 Allen, Mr. William Henry 1 35 0 0 \n", | |
"6 Moran, Mr. James 1 28 0 0 \n", | |
" Ticket Fare Cabin dataset Embarked.C Embarked.Q Embarked.S\n", | |
"1 A/5 21171 7.2500 NA train 0 0 1 \n", | |
"2 PC 17599 71.2833 C85 train 1 0 0 \n", | |
"3 STON/O2. 3101282 7.9250 NA train 0 0 1 \n", | |
"4 113803 53.1000 C123 train 0 0 1 \n", | |
"5 373450 8.0500 NA train 0 0 1 \n", | |
"6 330877 8.4583 NA train 0 1 0 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# One-hot encode Embarked. The cell removes the column it reads, so it is\n", | |
"# guarded: running it a second time is a no-op instead of an error.\n", | |
"if (\"Embarked\" %in% colnames(orig_all_data)) {\n", | |
"    dummy_var = dummyVars(\"~ Embarked\", data=orig_all_data, na.action=na.pass)\n", | |
"    out = predict(dummy_var, newdata=orig_all_data)\n", | |
"    # replace NA values with 0 (a missing Embarked becomes all-zero indicators)\n", | |
"    out[is.na(out)] = 0\n", | |
"\n", | |
"    # swap the original column for its indicator columns\n", | |
"    orig_all_data$Embarked = NULL\n", | |
"    orig_all_data = cbind(orig_all_data, out)\n", | |
"}\n", | |
"\n", | |
"head(orig_all_data)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table>\n", | |
"<thead><tr><th scope=col>Survived</th><th scope=col>Pclass</th><th scope=col>Sex</th><th scope=col>Age</th><th scope=col>SibSp</th><th scope=col>Parch</th><th scope=col>Fare</th><th scope=col>Embarked.C</th><th scope=col>Embarked.Q</th><th scope=col>Embarked.S</th></tr></thead>\n", | |
"<tbody>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>22 </td><td>1 </td><td>0 </td><td> 7.2500</td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>38 </td><td>1 </td><td>0 </td><td>71.2833</td><td>1 </td><td>0 </td><td>0 </td></tr>\n", | |
"\t<tr><td>1 </td><td>3 </td><td>0 </td><td>26 </td><td>0 </td><td>0 </td><td> 7.9250</td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>1 </td><td>1 </td><td>0 </td><td>35 </td><td>1 </td><td>0 </td><td>53.1000</td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>35 </td><td>0 </td><td>0 </td><td> 8.0500</td><td>0 </td><td>0 </td><td>1 </td></tr>\n", | |
"\t<tr><td>0 </td><td>3 </td><td>1 </td><td>28 </td><td>0 </td><td>0 </td><td> 8.4583</td><td>0 </td><td>1 </td><td>0 </td></tr>\n", | |
"</tbody>\n", | |
"</table>\n" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|llllllllll}\n", | |
" Survived & Pclass & Sex & Age & SibSp & Parch & Fare & Embarked.C & Embarked.Q & Embarked.S\\\\\n", | |
"\\hline\n", | |
"\t 0 & 3 & 1 & 22 & 1 & 0 & 7.2500 & 0 & 0 & 1 \\\\\n", | |
"\t 1 & 1 & 0 & 38 & 1 & 0 & 71.2833 & 1 & 0 & 0 \\\\\n", | |
"\t 1 & 3 & 0 & 26 & 0 & 0 & 7.9250 & 0 & 0 & 1 \\\\\n", | |
"\t 1 & 1 & 0 & 35 & 1 & 0 & 53.1000 & 0 & 0 & 1 \\\\\n", | |
"\t 0 & 3 & 1 & 35 & 0 & 0 & 8.0500 & 0 & 0 & 1 \\\\\n", | |
"\t 0 & 3 & 1 & 28 & 0 & 0 & 8.4583 & 0 & 1 & 0 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/markdown": [ | |
"\n", | |
"Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked.C | Embarked.Q | Embarked.S | \n", | |
"|---|---|---|---|---|---|\n", | |
"| 0 | 3 | 1 | 22 | 1 | 0 | 7.2500 | 0 | 0 | 1 | \n", | |
"| 1 | 1 | 0 | 38 | 1 | 0 | 71.2833 | 1 | 0 | 0 | \n", | |
"| 1 | 3 | 0 | 26 | 0 | 0 | 7.9250 | 0 | 0 | 1 | \n", | |
"| 1 | 1 | 0 | 35 | 1 | 0 | 53.1000 | 0 | 0 | 1 | \n", | |
"| 0 | 3 | 1 | 35 | 0 | 0 | 8.0500 | 0 | 0 | 1 | \n", | |
"| 0 | 3 | 1 | 28 | 0 | 0 | 8.4583 | 0 | 1 | 0 | \n", | |
"\n", | |
"\n" | |
], | |
"text/plain": [ | |
" Survived Pclass Sex Age SibSp Parch Fare Embarked.C Embarked.Q Embarked.S\n", | |
"1 0 3 1 22 1 0 7.2500 0 0 1 \n", | |
"2 1 1 0 38 1 0 71.2833 1 0 0 \n", | |
"3 1 3 0 26 0 0 7.9250 0 0 1 \n", | |
"4 1 1 0 35 1 0 53.1000 0 0 1 \n", | |
"5 0 3 1 35 0 0 8.0500 0 0 1 \n", | |
"6 0 3 1 28 0 0 8.4583 0 1 0 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# identifier / free-text columns that are not used as model features\n", | |
"cols_to_remove = c(\"PassengerId\", \"Name\", \"Ticket\", \"Cabin\")\n", | |
"cols_to_keep = setdiff(colnames(orig_all_data), cols_to_remove)\n", | |
"\n", | |
"# keep the labelled (\"train\") rows and the feature columns in one step,\n", | |
"# then drop the bookkeeping column that was only needed for the filter\n", | |
"train_ = orig_all_data[orig_all_data$dataset == \"train\", cols_to_keep]\n", | |
"train_$dataset = NULL\n", | |
"\n", | |
"head(train_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# split into training and validation sets before training model\n", | |
"# NOTE(review): `train_idx` is not defined in the cells visible here (the\n", | |
"# earlier split cell creates `train_index`) - confirm it exists after a\n", | |
"# kernel restart\n", | |
"X_train = train_[train_idx, ]\n", | |
"X_val = train_[-train_idx, ]\n", | |
"\n", | |
"\n", | |
"# 10-fold cross-validation repeated 20 times. The default summary\n", | |
"# (Accuracy/Kappa) is kept: twoClassSummary needs classProbs=TRUE, which\n", | |
"# caret only accepts when the class levels are valid R names (Survived is\n", | |
"# coded 0/1 here). allowParallel is a trainControl() option.\n", | |
"train_settings = trainControl(\n", | |
"    method = \"repeatedcv\", number=10, repeats=20,\n", | |
"    allowParallel=TRUE)\n", | |
"\n", | |
"# the argument is `trControl`; a misspelt `tfControl` is silently ignored\n", | |
"# and train() then uses its default bootstrap resampling\n", | |
"model = train(Survived ~ ., data=X_train,\n", | |
"              method=\"rf\", trControl=train_settings)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWgAAAFoCAMAAABNO5HnAAAC+lBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19hYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBx\ncXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKD\ng4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OUlJSV\nlZWWlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqan\np6eoqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5\nubm6urq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnLy8vM\nzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e\n3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w\n8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7///8EITIBAAAA\nCXBIWXMAABJ0AAASdAHeZh94AAAX90lEQVR4nO2deXwUxbaADwESMtkjGCFC2LcoXA2CCzuK\nXNDIInABkUW9IDzzJChXEVG4IhAUkHdFBM1F0LC7sAmoyEVkU0FAHkbMEwSEQJR9CZnz+72q\n7ullmOnJTE93pSPn+6O7p6uruubLTHVPqqsOICEEKOsKXC+QaEGQaEGQaEGQaEGQaEGQaEGQ\naEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQaEGQ\naEGQaEGQaEGQaEGQaEGQaEE4S/QIUNgYblG9AIqtqJJVlDfRO8aPzw+qKFOigy49dBwnut94\niQKDI2YDrA2qqOzmzUMXHXTpoeM40SsDH2GjCntLd7DoA33qJrT9l5ttuZe2vjG6wYjDiJ14\ns9IKUyGD7V4JMBcxA1LPDYo7rDtc4n6IQmwE6T/cm9DidffSu2OavM3SWMYfM5Oavyh92C+O\nvyu+1l/X8U25kEZy6drp+P6iQbWq9ijgB516srmr0YiT3nULFueK/jxGaqx7snczQ263G57B\ngVUBbup1rejeAId1h0t4RCcn8709KvDlUi66Tl2+2aUE8WQTudjnUSmkg1y6djq2PzlD2r6M\nWFBD2n3zKa+6BYvjRMvMwuKmUGvlzkcBVmBxLKRNX3QvwMfKl9tbdCW4sdlh7XAZj2hoPrUL\nK++2qZ0BMnlGiJs4jRlbjjicmc2bnggVvlMKOSaVrj8ds5wyftKN0t+/H8ATK0YCDNfXLWgc\nK3odwCLE4lRoi4e6dWPvaD3ANP+iYVyJ/nAZj+hKv+GFWIg8gedj4VZJ9GrEPQ
D34fkIaMRa\nkA0AQ5VC5NL1p2P7v0d8D+A1LGLfA8SSBlDH92RB4DjRfcZytuIb7Etar169KnAj21+yc/aw\nWgA5Bp9o9r32OpzjEV2XbdaDhtIynWe8gX/d/wK1cDfAy2zTnQKtlUKUi6F2ugyoxl5vB5iI\nOwFmsO0je/a4fU4WBI4TrbTRo9Vb6rOYl8JWSYaia1xzuIRHdCP0KFZE803sCpXcq6TciLfD\nzUohntJ1p2NtN9u9g4v+EGC+n7oFi2NFT5W+tBL7K0Kj2fu/8hJ9G9u/TLkYeh8uYyRa+kTf\nxr7/yif6JrhTKUQuXX86neitvP3wqVvQOFY0+wDNY6vdm7fhWwBLEKcpotcgNoDkS9LRqmjd\n4TJGonkbvQugK2+jGxdLbfFgveg1XqfTiS5k1043uruk3u72OVkQOFb0+VSosWLb9Eim4d8A\nbRdPjZHe+VyAV4rZVx/+OnMg6ETrDpcxFB3/z5ybAD5FHAbQcRG764j4RhUtla4/nU40ZrI/\nyWJ2q/G478mCwLGicXllqRVsfAQPxfINdgMwFvE/0k+KNeDZo4rWHS5jJDolmh/WhzUgJxvL\n7eyLqBYila4/nV70gWrS4XVO+Z4sCJwrGr99sKar6UtFbGvTna7mU/+oDIkX0P2Pqkw05jWL\nyZj+iV607nAJI9EZ69vFN59Uwg+5OK5VXM2un/FNTyFy6brT6UXj8cfSoxs/XeTnZEHgLNG2\nI9+ulAUkWhAkWhAkWhDXmeiyg0QLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQL\ngkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLgkQLwkGiX0kqh6R8G+S7\nc5DogR2XlD+iPw7y3TlJ9NCyroEJ4km0GEi0IEi0IEi0IEi0IEi0IEi0IEi0IEi0IEi0IOwQ\nvTMzNbrpmFOm6hMMJFpmW+U73swbm5xxyVyNSodEyzzU8CLyGRWWmapQEJBomSYPS6vRnyK6\n593uujUP8QvYgLi00i4TFfRDeRJ9/umbaz3Lv9s2iH4kaorSPr9WYWTeI7AQcXCDi6drvBBq\nLQ0oT6Kf5BPQjEJbRJ/oBpXbTtjhRjybMIa97lsf8WS1l7IaXzRTUz/0ffgIYtH/lofF/kQu\nOoVtxS4P8t2Fcnt35L1BKXDPUdwOe9mrjyOuIL4fWWmLCad+6d2zAPH43vKw2CNNu5TEtmKX\nBvnughZdcpHPQ3R5QWxvPt+cxP8hXqnauCTYEkqjPDUdA/j7fwztaDryYb20HpqKX8G67zjn\nESenRM01UU+/lCfRv/eJjHzkDNohuiQxk392r9zWHoui32Vbbw1244GoxS8mHTdRUX+UJ9FM\nhDzfug0Xw/eh2cR5k26rtBFxnGv8kmciJmJJm87uC3UGhlxJ/5Qv0R7s+Am+sUtqVL0+37At\n94xbohvPcOObUfmIq+ALExX0A4kWBIkWBIkWBIkWBIkWBIkWBImWyfL88N5grkalQ6JlsmJy\nJUKYnDo0SLRM1g3mqhI0JFpGFX16WI3IOi+VIKZ9MC45X+ltCR/7RW/Lzt5qcZF2iE4q5JzH\noUkvvj8c3meiO9Z76pTa2xI2totezK4wFRdZW6Z9F8Px2J3/Y/TWZ5jo2ud1vS1hM+CRC4hX\niuxbSMHdGlpbaNxHQb67EETHbeAc5NtH34/KZqJHob63JVx6dP0W8eAX9i2iuOhIt6WFuoL9\nhphoo3d3valatxQuejLqe1vCxfam4y5e0TutLdPGi+Fp15BdbryDi85BfW9LuNgu+puqAFV3\nWlumjaLXwyHEwhiPaK23JWzsv+soWroklNApwWCj6J8rZr43s2Fixi5JtNrbEj50Hy2jttHv\n14+5e8266jmyaKW3JXxItCBItCBItCBItCBItCBItCBItCBItEpHGGOiLsFCohWO
RsTWseKX\niQEkWmFGpdmw3UxtgsNK0Ve2b7Vt9JgXtohulXne9TTfuDyqVq2x/0rXRg1ZgoWiC5oA1N1n\nWXEBsEP0QViCfVP5Q9L94qfNz0hL140asgILRT/A//N8j2XFBcAO0a/EX8DlsBlxHyxH/CM+\n3cp+LEbPB/ci/rLVioU0lqfSZsvKM17ELA7y3YUg+pb+Z8+ecI1AnBd5lb3snW5lPxajf//T\niBeOW7FoIg2asqSoUhY29BnukfusUq7ieClYeVa6lf1YaGnTMY/XKsey4gJgQ9PxfJWNmzdv\nngyf41tRUkOdbmU/Flp71/FJn4ctfq7AAOtFu+v25KszkY/jN8C+L2eT063sx0K6j/awDT6Q\n1g/ccAUfSnhtQaumt1jZj4Uk2kNW5Glp/S6sxfN/T2k4c1pbK/uxkET7UnyWt9GjHjaRNRAk\n+lry4UvEkgYWtRgqJPpa3K2brvmyb6TVD0qTaB+ODaiWcM82MzkDQaIFQaIFQaJlsvnPWtcd\n+t/2KbNCqlQpkGiZ7Ojc3NycdPYTXIVE2yI6gS9/T+6u7SLR9onGTunsF8u4hrFtt8ui1aFD\nu+5PTO5xSFuHTBiiZzWqPvA307nDwTbRV+t2QBwaO3VRm6Rjkmhl6ND5lFZzplW7T12HjnnR\n8/nlo63Z3GFhh+i4Y8eO7X0C3sD8iA8Q/4h+RxKtDB3aAVsQFw5xK+vQ69y391HEop9MLLpI\n/xU/bCpvmAsbpmPLlv/JP7gYF1a4zF7/WqS00dLQoeOulh/x2ZyUdeiYn46to1Szg2UxMZv1\n07Fhtmvt2rXrD7OtV5VH0rlodejQV+0jKvbcpq1DxnzTMYN7bmbjIyfG2HcxZORW5DNk/fwr\nF60NHWLfoxUdKubr1iFiXnTJs66ItibOaAG2iv4e2PflYrUpXLQ6dGhZ/VP8iYTVyjrUGod3\ne1dywXzesLBVNPaJy1naNf4XLlodOpQf2WnenPaJhco6xAoj3Ucr6ERfyq4b285zH60OHfrk\ndldy5x2orkOGRAuCRAuCRAuCRAuCRAuCRAuCRHtQA96chUGYD9/57A8TEi2jBby50GmKTrRl\ngXBItIx3wBtNtGWBcEi0jC7gTVoOE/1Zr+QG44v1+wtgX58bGr1iNrpCKaLPvv7k7Msmi7YP\newPeSKJvHLT0KcjS7y+A+r0XPAnPhVRTjcCiz/Gn+NsWmyzbNmwNeCOL7sG2Rlc+odtfAF3Y\ncpTL5HQ6Ax5lzfyVMwaL2dI/95cFOqQsFjYMrdAC3siieR/OHtio218gPUL9PXwVsmOJwNOx\nSUGT4CmbZ2xzwHRsuoA3smg++eQpWKTbXwD/YZtFYHJQQ+CmY4kk+mtzRduHrQFvtE/0Ltiq\n218g3XnshU0hVtZDYNElvZjn/zJXso3YGvBGFv0Qa49HJpzR7S+AXmxrdNRJMzUu/fZuc67F\nc9ZZgb0BbyTRNfsveAIm6/cXQOyjC0fAs+aqTPfRClrAG0n0vgcTm8506/cXwKruSfUnXjVR\nXw6JDpYCMNODpUKig4VEB4BEhweJFgT9418QJFoQJFoQYgYLeZMW5gQZJFrGz2Ahb0h0IMIa\nLOSNnaKLNh0Ir3C7sHewkHdsIWXoUNrUl+vF9zLfGx5AdF4cQG/H9a5wbB4s5BVbSBk6lFaj\nx8qXKo80U10JY9GFLn59sHSsnVXYOljIO7aQOnQorSn7jA+421R9OcbTsb0h/dO/i/2Tqzli\nOjZtsBBHiy2kDh1Ke4Yf1SpkwQrG07F9KZ15mP2TqzliOjZtsJB3bCF16FDaVAxLtHHTUdKZ\n31n+YLpkG7H1iX/v2ELq0CHprsMW0Xj+la5D95ou2E5sFe0dW0gdOmSjaOdiq+hrYgspQ4dI\ndCBMDRbyji2kDB0i0YGgfyqFB4kWBIkWBIkWBIkWBIkWhH3hqmGs927dqCE9hdIDvT
7MVn6v\nu5e0S0houUA/OoBEy2TF5Ep8473bpOiR0HnqjG5efzUSLaOGQvXGnOiN8Cp/bC8bdB0nJFpG\nJ7pe7n/XqDv7t8zEtMW6UUNe/S5bmeir/VJ+1GLinHk8NXXETE8hbW+RGo3COnO0E/gRPbdp\n9QFHg65hWWCH6KRCictMdM2sdQ9BrYmf3hFzSRs15NXvcgA2uocl7tZi4rjbuybm3pUgi74c\nMdn3BL6ipaf87yyTuZKCxcaL4Vom+n7+YD9zuxx+1EYNefW7sKbjH64tqMXE2ciHA1ysKYs+\nAEt8T9C39zE+yZlu0V064U9e+xy2sGE6tqy4DRKFTPTLiMWwmA8W2q8fNaTrdymETGjK2hM1\nJk5OHG8ssmXRu8HPB8F3Oraukuh9ZTHNWhlOx6Zvo1/loj/0iPaMGvLudymE6CnwP6jFxMlu\nxHO+LhdyDl6XS8r9t3YC36ZDGvLWyOyD7UKw+WLoJdozasi736UQ5uPfkk9pMXFelz7Rz3gK\naSK3vJdiRmgn8BXtfs4FdzmyB0tFpGjPqCHvfhd+e3coeoQWE2czb6MvpXkKWQyvcdMTPEO6\nJPzd3pVYE7TIPmz8wfLxtaI9o4a8+12k++gJEd+rMXHcnVwT57euw0RP63QW3Y/BfZPf6AmD\ndCeg+2gZ5a6j+bWilVFDXv0ukugLtTu41Zg4Zx+/ufrw1Uz0UPidFZfXrXrCHW/r218SLQgS\nLQgSLQgSLQgSLQgSLQgSLWNtD4ufOdxItIylPSz+5nAj0TKW9rD4m8ONRMtY2sOim9tNxVf0\nqckjF5id3E0QDu9h0c/tpuIj+kQquyT0DrqCZYLDe1j0c7upPMKnYys+oy1ekM73ndc+py1s\nGFphZQ8L6uZ2U+nOp2P76QttkSmJXu61z2kLV571oi3sYdHP7abi03RMl7sMg65hWeDwHhb9\n3G4qPqIvtQGoaHWsZotxeA+Lfm43Fd+7jpJ17+4Jun5lg9N7WHRzu6nQfbSMtT0s2txuKiRa\nECRaECRaECRaECRaECRaECRaECRaxtquLNzYt35Cu/H6CKYkWsbSriz3cxVajJ05qEpj3bOi\nJFrG0q6sFTCWP3a3J62F9vSdTvSVnTuvBF2xMsXhXVklaXfL//NfCdqT6Jrog00AGpdN+OlQ\ncXhX1kEpYgvDXXuwegJNdBd+Lbg36JqVJQ7vyloJuzyldmupnqDng/v4JGdsEcPPU2WL8tLJ\nCxumY7OyK2sZKBfB7i3UE/Tv9wfihSNsUZeLrnlEeenkhR19hhZ2Zf0AKzxFNfTXdMziOaYH\nXbOyxOFdWVeSH+Crj37b6PdiiEt69jAZB0o0Du/Kwrkww43uXqn17tYekKH7aBlLu7JKhkKb\nl2b0rQDjtROQaBmLBwuteqh2dOMX3o7UmmISbSebf1Y3SbQgSLQgSLQgSLQgSLQgSLQgSLQg\nSLQgSLQgSLQgSLQgBjYbw2mScacRDZsZJt3S2DDptrqGSS1rtTJMqxOgHj3HKESVQ9Ef3CsR\nEZdsROVow6QqkYZJsRGGSQmQZJhWIUA9atyr0PXn0t+YhINEewjwZWw/3jAp+0HDpIWphkn7\n4Zhhmmu1YVKbCYZJhpBoI0i0X0h0yJBoQZBoQZBoQZBoQZBoQSQZv8FOxgPDnzUM7oyLahkm\n5cMJw7S4dYZJ7ScZJhniPNHfXTZMyjcO0Xy8wDDp4m7DJPd243p8a/yE9o9FxtmMcJ7oPykk\nWhAkWhAkWhAkWhAkWhAkWhAkWhAkWhAkWhAkWhAk2oeLBZxCi0t1muilrRI67vCXMDo7cLof\nzmTVdv0lL9RcuFwaqjM01Gyl4DDRqyL6z22T4Ge8fX5ydsB0fwxImJLXC9aEmAtzanzI+DbU\nbKXgMNEdOrrxTPUx1+7e1LoSZAdI98vvMBvxav0BoeVCHN4pcGXM4SzRJ+Fttnyi/rX7
9+Tk\nSJ9oo3S/HGi3ny079gotF+L9j2NxoMqYw1mid8MWtpwR5SfMbL3swOl+KT6RVyUv1FwN2jWs\nUCfnasgnC4yzRK+XZj14D077JkmiA6T75VWAEe4Qc12tnDxr1XCYEPLJAuM00fzbPh9O+iZ5\nRBum++XopklRY0LMdSmPTw0+xHU11JMFxlmid8HXbDkz0qjpCJBuxJjIKyZy4Qr40Uw2Y5wl\n+iTksuWIun6S6mUHTvclL51PjDUXzoaUCw+t5RMufAS/hZatNJwlGts/wL67tZ/xkyKJDpDu\nyxb4gi0H1A4tF/vWrGTLoWnu0LKVhsNEr6owalX3BH/xE2TRxum+lNx10/S8IfxjGUoudHdJ\nzvlgMJ9dJKRspeEw0bi4RXyHnf4SZNHG6X74bVBqTIvF7hBz4e/Da8TctSbUk5WG00T/aSHR\ngiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDRgiDR\ngiDRgiDRgiDRgiDRgiDRgiDRgnC+6AyYKa1PVoQ9gY4bwcdSpfXeF+bptnS7oUqjv+cjnoVB\n2l6vF6YoD6JbS+t3oBTR8bm5c0fXjv0srLNNrdj8ubdGV4/9Gi90mqLt9nphinIgulmFI3zd\ntVkpolP4sqhF46uBjiqFTZDF56wqat7EmsfPNcqB6CG1ZrHVH5UnMNFqqGAtkvCCnCbxvU8p\nonEdfILaYcXjGsa23S7HG873zbzr/sTkHod0x7dMlwZk4Q8Lzkmx1dTS+YuwKAeih45qx1YL\nq25kopVQwbpIwvdkrp4V+aQq+mqlSag7LHbqojZJx6R4w6d8Mp9PaTVnWrX7tOPPSNeDczyG\n8SVZtFL6dSF6a4VjiD0e3wx71FDBukjCvKkY1FIVjbUf0yIK50d8wL4L0e9I8YZ9M+/gA9wW\nDnGrKTtgPdsYyy+rebJopfTrQnTJzW/iuehPmWg1VDBP8EQS5kMfxmR4iVYPW1iBz1b4a5EU\nb9g383FXy4/OoC5lM3zONvI3bFgf4RGtlH5diMasDrg08TITrYYK1kUSnopeonnToR72qhK+\nlccb9pP5q/YRFXtu01IK+ahmzi/KJ1op/foQvTni+N8GIROthgr2jiSsF80vhuphuRX5te3n\nX6Wj/GRm9xcrOlTMV1OwTgs5Bu4cRbRS+vUhuqT6jNiVXLQaKtg7krBOtHR7px72PSxFvFht\ninSUb+Zl9dn9xEFYrabgEnie3x3ur3p9isaRiXGXuGg1VLB3JGFZNPvBMu/Z2jEb2GvlMOwT\nl7O0a/wv8lE+mfMjO82b0z6xUDvePRwynn9zWMzLqden6C+hH0qi1VDBXpGEZdG8la3Zi1/W\ntMMuZdeNbbfd48k38ye3u5I779ClIH7YrXpK5irMvP5E/0kg0YIg0YIg0YIg0YIg0YIg0YIg\n0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YIg0YL4\nf1/0eSq9vbMnAAAAAElFTkSuQmCC", | |
"text/plain": [ | |
"Plot with title “Feature importance”" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"varImpPlot(model$finalModel,\n", | |
" cex=0.8, pch=20, main=\"Feature importance\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"'F1 score = 0.86'" | |
], | |
"text/latex": [ | |
"'F1 score = 0.86'" | |
], | |
"text/markdown": [ | |
"'F1 score = 0.86'" | |
], | |
"text/plain": [ | |
"[1] \"F1 score = 0.86\"" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"predictions = predict(model, X_val)\n", | |
"\n", | |
"y = X_val$Survived\n", | |
"\n", | |
"f1 = f1_score(predictions, y)\n", | |
"sprintf(\"F1 score = %.2f\", f1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"It doesn't look like the `Embarked.X` features are actually useful in the random forest model, so the next model is refit without them." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"'F1 score = 0.85'" | |
], | |
"text/latex": [ | |
"'F1 score = 0.85'" | |
], | |
"text/markdown": [ | |
"'F1 score = 0.85'" | |
], | |
"text/plain": [ | |
"[1] \"F1 score = 0.85\"" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"cols_to_remove = c(\"PassengerId\", \"Name\", \"Ticket\", \"Cabin\", \"SibSp\",\n", | |
" \"Parch\", \"Embarked.S\", \"Embarked.Q\", \"Embarked.C\")\n", | |
"cols_to_keep = setdiff(colnames(orig_all_data), cols_to_remove)\n", | |
"\n", | |
"train_ = orig_all_data[, cols_to_keep]\n", | |
"train_ = train_[train_$dataset == \"train\", ]\n", | |
"train_$dataset = NULL\n", | |
"\n", | |
"X_train = train_[train_idx, ]\n", | |
"X_val = train_[-train_idx, ]\n", | |
"\n", | |
"\n", | |
"# Resampling used to tune the forest: 10-fold cross-validation, repeated 20 times.\n", | |
"# `allowParallel` is a trainControl() option, so it is set here rather than in train().\n", | |
"# NOTE(review): `summaryFunction=twoClassSummary` is intentionally not set: it needs\n", | |
"# `classProbs=TRUE`, which caret only accepts when the class levels are valid R\n", | |
"# names; Survived looks like it is coded 0/1 here -- confirm before re-adding it.\n", | |
"train_settings = trainControl(\n", | |
"    method = \"repeatedcv\", number=10, repeats=20,\n", | |
"    allowParallel=TRUE)\n", | |
"\n", | |
"# Mind the spelling: train() takes `trControl`. Any other argument name is swallowed\n", | |
"# by `...` and the resampling settings above are silently ignored.\n", | |
"model = train(Survived ~ ., data=X_train,\n", | |
"              method=\"rf\", trControl=train_settings)\n", | |
"\n", | |
"predictions = predict(model, X_val)\n", | |
"\n", | |
"y = X_val$Survived\n", | |
"\n", | |
"precision = posPredValue(predictions, y)\n", | |
"recall = sensitivity(predictions, y)\n", | |
"\n", | |
"F1 = (2 * precision * recall) / (precision + recall)\n", | |
"sprintf(\"F1 score = %.2f\", F1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAWgAAAFoCAMAAABNO5HnAAAC+lBMVEUAAAABAQECAgIDAwME\nBAQFBQUGBgYHBwcICAgJCQkKCgoLCwsMDAwNDQ0ODg4PDw8QEBARERESEhITExMUFBQVFRUW\nFhYXFxcYGBgZGRkaGhobGxscHBwdHR0eHh4fHx8gICAhISEiIiIjIyMkJCQlJSUmJiYnJyco\nKCgpKSkqKiorKyssLCwtLS0uLi4vLy8wMDAxMTEyMjIzMzM0NDQ1NTU2NjY3Nzc4ODg5OTk6\nOjo7Ozs8PDw9PT0+Pj4/Pz9AQEBBQUFCQkJDQ0NERERFRUVGRkZHR0dISEhJSUlKSkpLS0tM\nTExNTU1OTk5PT09QUFBRUVFSUlJTU1NUVFRVVVVWVlZXV1dYWFhZWVlaWlpbW1tcXFxdXV1e\nXl5fX19hYWFiYmJjY2NkZGRlZWVmZmZnZ2doaGhpaWlqampra2tsbGxtbW1ubm5vb29wcHBx\ncXFycnJzc3N0dHR1dXV2dnZ3d3d4eHh5eXl6enp7e3t8fHx9fX1+fn5/f3+AgICBgYGCgoKD\ng4OEhISFhYWGhoaHh4eIiIiJiYmKioqLi4uMjIyNjY2Ojo6Pj4+QkJCRkZGSkpKTk5OVlZWW\nlpaXl5eYmJiZmZmampqbm5ucnJydnZ2enp6fn5+goKChoaGioqKjo6OkpKSlpaWmpqanp6eo\nqKipqamqqqqrq6usrKytra2urq6vr6+wsLCxsbGysrKzs7O0tLS1tbW2tra3t7e4uLi5ubm6\nurq7u7u8vLy9vb2+vr6/v7/AwMDBwcHCwsLDw8PExMTFxcXGxsbHx8fIyMjJycnKysrLy8vM\nzMzNzc3Ozs7Pz8/Q0NDR0dHS0tLT09PU1NTV1dXW1tbX19fY2NjZ2dna2trb29vc3Nzd3d3e\n3t7f39/g4ODh4eHi4uLj4+Pk5OTl5eXm5ubn5+fo6Ojp6enq6urr6+vs7Ozt7e3u7u7v7+/w\n8PDx8fHy8vLz8/P09PT19fX29vb39/f4+Pj5+fn6+vr7+/v8/Pz9/f3+/v7///+YKFYwAAAA\nCXBIWXMAABJ0AAASdAHeZh94AAASo0lEQVR4nO2deVhV1drAXwYhDodRERUTccABhyuo9JWZ\nYxkWOfs55ZCl18pyuPblp1g2mOgXZg5pcVNxQi3nxGvW1W5yMcP06i3MyolMwQEREM95n+db\na5+Ri8jenHNezgPv74+1h7Ped+398+y19zmexQJkSIDqPoDaAosmgkUTwaKJYNFEsGgiWDQR\nLJoIFk0EiyaCRRPBoolg0USwaCJYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsm\ngkUTwaKJYNFEsGgiWDQR1Sd6Clg46GiqwQClTjgil+LOorOSknJUpaqSaNXZnUO1ih6RpPBL\nBTWWA+xVlWp6x47aRavO7hyqVfTO+9dwrYraKfrHYc2Cun9oFGvG9G71/VpOOY/YW3Yr8RgB\ncWL3ToBViHEQcWtswHm76gpPgC9iK4g51Seo8/8Z0x/2b/OReE0E/pQY0nGu8mYvSvqvwCZP\n7pOrpiStTNltzcn9+WOb1Bv4i6yU9+eOulZTrpY9NkdwD9EH/JXOepA4mxRTvx19E8fUA2gw\n+D9FDwU4b1ddwSw6NFTuHeghy3QpOqqZXO1nQLzaxpT2dbQk6WnKbmtO7A+NU9ZLEH9ppOxu\nnFfm2BzBDW6GS7G0LTTZefRZgG1YqofI9zf2AdhuubjLivaG+h3O26qbMIuGjgv7iXydFj4O\nkCgDIWD+ImFsK+JkYXbD+8Hg8b0lSa6S3b45YTk86Z36yr//CIDnt70IMNn+2BzCLUTvA9iI\nWBoB3fFc//7ijDIAFt1bNMwx2Fc3YRbt/Tve1oPPH1ioh/aK6N2IJwD6YqEntB
I9yH6ACZYk\npuz2zYn9PyCuAViM+eI6QDS0hKjyjVWRahU9bLbkW/xAXKTNmzd/AOqL/Yajyyc1AUiu4B0t\nrusy1SVm0c3EanOIVsoYGVhXXu5/giZ4HOANsWoMh26WJJaboa25OAgT25kA8/EoQIpYv3ji\nhLFcY1XELfroGdZH6gLcEC4WIRWKbvQf1RXMoluhWbFFtFzFBPA27lKiEWOhsSWJObtdc6Lv\nFruzpOjPAD69x7E5gluIXqhctAqnvaDV8tOHy4juJPZvsdwMy1Y3UZFo5R3dSVz/lnd0A3jI\nksSU3b45O9Hfyv6j3LE5hFuIFm+g1WJx/NARXAGwGXGRRfQexJYQWqzUtoq2q26iItGyj84G\nSJB9dOtSpS8eZy96T5nm7ERfEfdOIxr7RcQayzVWRdxCdGEENNp25H0foeGvAN03LfRXznwV\nwNul4tKHJ5eMATvRdtVNVCg68K3kBgBfIE4C6LVRPHV4fmcVrWS3b85ONCaKf5JN4lFjYvnG\nqohbiMatdZResPVFPKeXK+IBYDbi35WPFHvAvMcq2q66iYpEh/vJasNEB3K1tamfnYvWJEp2\n++bsRf8YplSPyivfWBVxD9F47OkHdW3n5Yu1rx/SdVx4vQ4E30bja/WEaNzQwT/u/R32ou2q\nK1QkOi7jscCO7xhklaI58QEPJvxNrpqTmLLbNWcvGi8/F+PX+tX8ezRWRWrw99GmxxV3gUUT\nwaKJYNFE1GDR7gWLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoI\nFk0EiyaCRRPBoolg0USwaCJYNBEsmggWTQSLJoJFE0Eg+u2QGkz4MZUWCESP6bW55uK3XaUF\nCtETXN9GtRHIomlg0USwaCJYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUT\nwaKJYNFEsGgiWDQRLJoIFk0EiyaCRRPBoolg0USwaCJYNBGuEH00McKv7aw8zcfCoiXqRR+p\n02XZhtmhccVaj4VFS9SLfia6COXfsd6i9VhYtES96DZDlMWMLxCNq2N17Tcgfgn7EdO9s+8f\nWANFF6ZMSilU1lwgerTve5b+ebHHixtGwzrEcS2LbjT630oCa57o4lg5U5TSh7pA9B/9oU73\nN7OMiAVBs8T28BaIV8PmTW1dVEngqNG3Ee/k16BinTK9wiq5GfC500UjXlwzNhweuYSZcFJs\nbfe8g5jm4/1NZWEDE44h/vxlDSr+ooieIDd1G50t2lAkZ38oWasfKmf5UfhV/IvWa22oLLLm\ndR37lNNXpp10fteRAxnKckIEHoZ930vE7WBBuO+qyiJrnmgcD2CeAMf5og3BifK9e6dTD8z3\n+0SsrRhnxB99N80NuVxJZA0UjZl/zTStuOBmmAYd5q9+p5P3QcQ5uqTNMz3no+HRx423o8ZU\nElgTRVtxxUfwg/0ifJsP+06sGVPa+bVOMeIy3xzEXfDl/eNYtIS/VHIMFk0EiyaCRRPBoolg\n0USwaCJYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFEsGgiWDQR\nLJoIF4ieav4N6X6tx8KiJRpE+6cqXNR6LCxaokF03aodCotWqILoG5Ma+UTNMyBGrp8TmmMZ\nOFQx7it6e2zYk6ccS+EK0SFXJIU4IWRu2mRIE6J7NX85zzpwqELcVvSROuKWE3nToRyuuxkm\n4QD5G//2M4XopoV2A4cqZPjQS4j5Z9yvmGEaIeFQFv1W54sO2C/5Wa5fSvOdLkRPQ/uBQxUx\ndNAviJdPul/xsiJ6u0NZ9Okq9VWhjz6e0CCsf7gUvQDtBw5VhNt2HfvlgdfTPrjdHhfeDG/o\nxmcbsYsUnYz2A4cqwm1F47K6EHPIsRQuFJ0B5xCv+JtF2wYOVYj7ika85WgCF4o+
65W4Zkl0\ncFy2Ito6cKhi3Fm0w7jyOTqthf/De/Y1TDaJtgwcqhgWLeEvlRyDRRPBoolg0USwaCJYNBEs\nmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk0EiybC\nNaJ7wawqHAuLlmgRfclTH3W/335VAIuWaBGd4r0cMrUfS/WLLvnnP0tclNolouMTC3WvypWS\naU2azP4wxjbF0H2pdtE5rQBa5bgmtytE/wybcXiEnFFhROCiT+MiY+ymGLof1S66r/xlfx/X\n5HaF6LcDb+NWOIT4L9iKeD0wRs1IIcGgp08i/vZt9RX+UrTfry5J77/J+aLbjSwo+EM3BXG1\nz12xOTRGzUghwciRNxBvX66+IkqKjnRNehdM4XTCNCoo/C4mRcjtqTFqRgqhG3QdS+UxfuCa\n3C7oOl5/4OChQ4cWwAFc4at01DFqRgqhG4jGzUMGb3ZRaueLNjYbJBc3fSbidyCul4LQGDUj\nhdAdRLsQ54s+AuuV5VN17+AzQYvXxrdtp2akELJoE6pFT/W5oSw/gb1Y+EJ49JJF3dWMFEIW\nbaIqXyqVFsg+etoQldVZtKQqonPgK0RDy0p6DCssWlIV0cZubfd8NdxH7Z+iYdGSKn0fnTsq\nLOiRI2prs2gJf/HvGCyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk0EiyaCRRPBoolg0USw\naCJYNBEsmggWTQSLJsIFoqfLnwrqutj/ejJ8qZpAFi3RINovNTU1OQYO2HaxaJeIDpLltdAB\ntl0s2nWisXcMYumcaH33TJNo6/Q32U8Ehw48Z1tacaJo46aX3vrdadmcgctE323WE3GCfuHG\nR0NyFdGW6W8Kw+NXLgrra13acKLo5+Q0E+cqr0eHK0QH5ObmnnwePsAcz/WI1/0+VkRbpr/J\ngm8Q1403Wpa2wNHPFouL4KYTirPK+IKpzkjlrMIFQyumm4ZRjCvFdR5y1N6FfEsfrUx/c1nX\n9XM5G5JlaWNAwjHEM186oTigHEEfZ6RyVqGrbPBfFUTr9u7dm3FerL1rmVZBirZOf3O4h6fX\noCO2pRXndR2/y8mtYJ6z0jkD190MBalepaI8e0GKtk1/g5i/radXjt3SjBP76BRvgLgCp6Vz\nAi4V/QOkIxaFvSdFW6e/2dIiT4753G1Z2gKd+Xh3OnV3qfOyOQGXisZhAcnpCYG/SdHW6W9y\nfHqvXtkj+IplaQvk52hJlUQXT2+mf8z8HG2d/mZHrC708Sy0Lq2waAl/qeQYLJoIFk0EiyaC\nRRPBoolg0USwaCJYNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFE\nsGgiWDQRLJoIFk0EiyaCRRPBoolwK9EdZikMafmQVuKiNIfEN+mqOSYqVnNI9DOmk/J1I9Hr\n+5ho7B2qlQAPzSHBEKQ5xkOvOaROQ9NJJZxVaYFAtIUF8ZpD9vhpDrkAP2mOqbtFc0ivuRoD\nWDSyaBZdNVg0ESyaCBZNBIsmgkUTsehhzSEZes0huaD2I4SN+mr/FoSNvm9oDCAUfe1HzSF3\njmlvpgoTamcXaw45c1VjAKHo2g2LJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJoBOd\nHh/UK6vyaibuLm6ja7f0rtYwQ69R2po68XRokzcNWkJKk6N1nTZpa0VCJnqX58hVjwblVF5R\n4V2Pl9dP9nhDa9gyGKWpqX/XS1g9GRZoCUnymrl2FOzUemR0onv2MuLNhrPUVTaGTBTlS36l\n2sLO6vWjNDU1sbu4aKYP1hLS6AVxeJ0HaDwhOtFX4SNRPt9CXe0LIL+KT4ezmsIMPcc8MkpL\nU8UBaXhX29EZ680R5VP9NJ4Qnejj8u93Y4qvsdKakqLTt0X5ik+hprAPG+RJ0epjfo
a0R30i\n5pVqObpXwv9+fWOdtRpPiE50BpwS5Rq4oT4k1WuqprCz/p+jFK0+5jDoZuye6z1XSzOGbgAw\nTfsJ0Yk+LcpPQfX/AF0YAiNLtIQZegrJJtFqY3bBq6J8RVeqPsT4QoOVB98OWKH5hKhEZ8M/\nRLnER+2Vtjm42TZtYR+HnLl2LX7otTvqYzJhvyh3wE/qQ76GfaJ8y+e61hOiuxmminJKM5XV\n02F8kcYw8xwE8Jn6mHPyMQ23wiX1IZ+C/MvYe+CUxhOie7zr8ZS4yzedqa5ySf0JRq1hZw4K\n2vU5eEV9jLHDCFGObmxUH5IFW0X5mk+JthOi/MDiMW3XgKAz6iofgMnJklvawkx9tIaYNHh2\n3XhYpyHE+GTgWxtf8pqr8YQoP4Jv6hzY86jKuh+Zu4FcbWFm0Rpi1nbRx6ZrCil4raVfuw/v\nampFwl8qEcGiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJYNFEsGgiWDQRLJoIFk0EiyaCRRPB\noolg0USwaCJYNBEsmggWTQSLJoJFE8GiiXAv0XGwRFle9YIT96s3Rf5eLHLovxxs7pv+dR9o\n9UIOYgGMte0ts+E03E10N2X5MVQiOjA1ddWMpvq/OdTaQq+O/7NiRkP9P/B27/dsu8tsOA03\nE93B46JcJnSoRHS4LPM7t77rQGNfw9Q7MkvHNmp/TO4IbiZ6fJOlYnG9zptCtHF1rK79BrF5\nY1Ijn6h5BsTItcltAofmWUTjPtiBtmqlc6L13TNFrfVzQnPKB2c/ERw68Jxd/a4xpUqWU2tv\nYWSyXXa54XTcTPSEaY+Jxbp6B4XoxR4vbhgtf7w8IWRu2mRIEwYeSdy91OfPVtF3vd9Bu2r6\nhRsfDcnFyF7NX84rF1wYHr9yUVhfW/2byv3g1hVBsUm0JXutEP2tRy7iwImH4ERBkBwsObwF\n4oBVYqX9TGFAdhVju1pFY9Pn0Fotx3O9uBb8PsbIpoVYPjhLDldbN95ofSULMsTKbHlb3WAS\nbcleK0QbGi/DW35fCNGZcFLs2e4p+1G8lOY7XRiQAxlmxZURba22zqNErFzIx8hpiOWDL+u6\nfn4T7V45BAfESs7+/RmeZtGW7LVCNE7tienBJUL0FvOv/n/F4wkNwvqHS9ELsYxo2XVYq71b\n15wkcgHiPYIP9/D0GnTE9soVWG6q/5vlHW3JXjtEH/K8/N9jUYg+DPu+lxTe0I3PNmKX6WYD\ndqLlzdBaLdVL3tvOXlBq3SNYPF9s6+mVY30FozoblCwrLaIt2WuHaEPDFP1OKTrf7xOxZ8U4\nYwaIZ4Ur/uVFK4931mo/QDpiUdh7Sq3ywVta5Mkxybutr+BmeF0+HZ6uVztF44vBAcVSNM7R\nJW2e6Tkfz3olrlkSHRyXbSdafGBZ/Zem/nI8pqUaDgtITk8I/M1Uq1xwjk/v1St7BF+x1TdO\nhrjXl03yfyOidor+CkagItqY0s6vdYr4MJHWwv/hPfsaJtuJlr3sg4Plbc1WrXh6M/1jmWZP\n5YN3xOpCH8+yewXxs/4NwxN3YWLtE12DYdFEsGgiWDQRLJoIFk0EiyaCRRPBoolg0USwaCJY\nNBEsmggWTQSLJoJFE8GiiWDRRLBoIlg0ESyaCBZNBIsmgkUTwaKJ+H9A27Dlt2K9EwAAAABJ\nRU5ErkJggg==", | |
"text/plain": [ | |
"Plot with title “Feature importance”" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"varImpPlot(model$finalModel, cex=0.8, main=\"Feature importance\", pch=20)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"test_dat = orig_all_data[orig_all_data$dataset == \"test\", ]\n", | |
"passenger_id = test_dat$PassengerId\n", | |
"test_dat = test_dat[, cols_to_keep]\n", | |
"\n", | |
"test_dat$dataset = NULL\n", | |
"\n", | |
"predictions = predict(model, test_dat)\n", | |
"\n", | |
"prediction_df = data.frame(\"PassengerId\" = passenger_id,\n", | |
" \"Survived\" = predictions)\n", | |
"\n", | |
"write.csv(prediction_df, \"pred_df_caret_2.csv\", row.names=FALSE)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Kaggle submission score 0.785**\n", | |
"\n", | |
"Basic random forest using the features `Sex`, `Fare`, `Age` and `Pclass`." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "R", | |
"language": "R", | |
"name": "ir" | |
}, | |
"language_info": { | |
"codemirror_mode": "r", | |
"file_extension": ".r", | |
"mimetype": "text/x-r-source", | |
"name": "R", | |
"pygments_lexer": "r", | |
"version": "3.4.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment