Created
January 27, 2014 17:38
-
-
Save mickaellegal/8653451 to your computer and use it in GitHub Desktop.
iPython Notebook: 50onRed test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Importing the libraries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Importing the libraries\n", | |
"import pandas as pd\n", | |
"import numpy as np" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 55 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import warnings\n", | |
"warnings.filterwarnings(\"ignore\")" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 39 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Loading and formatting the data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Loading the training set \n", | |
"train_data = pd.read_csv(\"training-data-set.csv\", sep=\" \")\n", | |
"\n", | |
"train_data.head(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cat1</th>\n", | |
" <th>cat2</th>\n", | |
" <th>cat3</th>\n", | |
" <th>cat4</th>\n", | |
" <th>cat5</th>\n", | |
" <th>cat6</th>\n", | |
" <th>cat7</th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> b</td>\n", | |
" <td> 1.053900</td>\n", | |
" <td>-0.062460</td>\n", | |
" <td> 0.508648</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> y</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> d</td>\n", | |
" <td>-0.575898</td>\n", | |
" <td> 1.053315</td>\n", | |
" <td> 2.100263</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> y</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td> 0.392731</td>\n", | |
" <td>-0.395918</td>\n", | |
" <td> 1.813869</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> c</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> d</td>\n", | |
" <td> 1.255048</td>\n", | |
" <td> 0.812365</td>\n", | |
" <td> 0.115558</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td>-0.848028</td>\n", | |
" <td> 1.575932</td>\n", | |
" <td> 0.407990</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> c</td>\n", | |
" <td>-2.000425</td>\n", | |
" <td> 0.168658</td>\n", | |
" <td> 1.089865</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> c</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> d</td>\n", | |
" <td> 1.986990</td>\n", | |
" <td> 0.100123</td>\n", | |
" <td>-0.156572</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> c</td>\n", | |
" <td> a</td>\n", | |
" <td> 0.179694</td>\n", | |
" <td>-0.207595</td>\n", | |
" <td> 0.150446</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> b</td>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td>-0.287543</td>\n", | |
" <td> 1.227005</td>\n", | |
" <td> 1.037588</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> b</td>\n", | |
" <td> a</td>\n", | |
" <td> 0.018208</td>\n", | |
" <td>-0.942384</td>\n", | |
" <td>-0.494788</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
"0 n n a n n a b 1.053900 -0.062460 0.508648\n", | |
"1 n n a y y a d -0.575898 1.053315 2.100263\n", | |
"2 y n a n n a a 0.392731 -0.395918 1.813869\n", | |
"3 n y c n n a d 1.255048 0.812365 0.115558\n", | |
"4 n y a n n a a -0.848028 1.575932 0.407990\n", | |
"5 n n a n y a c -2.000425 0.168658 1.089865\n", | |
"6 n y c n n a d 1.986990 0.100123 -0.156572\n", | |
"7 n n a n n c a 0.179694 -0.207595 0.150446\n", | |
"8 n y b n y a a -0.287543 1.227005 1.037588\n", | |
"9 n y a n y b a 0.018208 -0.942384 -0.494788" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Collecting the distinct categorical values before converting them into numerical variables\n", | |
"categorical_values = set()\n", | |
"for i in train_data['cat1']:\n", | |
" categorical_values.add(i)\n", | |
"for j in train_data['cat7']:\n", | |
" categorical_values.add(j)\n", | |
" \n", | |
"print categorical_values " | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"set(['a', 'c', 'b', 'd', 'n', 'y'])\n" | |
] | |
} | |
], | |
"prompt_number": 30 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We assign a numerical value to each of the categorical values\n", | |
"train_data = train_data.replace(['a','b','c','d','n','y'], [1,2,3,4,5,6])\n", | |
"\n", | |
"train_data.head(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cat1</th>\n", | |
" <th>cat2</th>\n", | |
" <th>cat3</th>\n", | |
" <th>cat4</th>\n", | |
" <th>cat5</th>\n", | |
" <th>cat6</th>\n", | |
" <th>cat7</th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 1.053900</td>\n", | |
" <td>-0.062460</td>\n", | |
" <td> 0.508648</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 6</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 4</td>\n", | |
" <td>-0.575898</td>\n", | |
" <td> 1.053315</td>\n", | |
" <td> 2.100263</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 6</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.392731</td>\n", | |
" <td>-0.395918</td>\n", | |
" <td> 1.813869</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 3</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 4</td>\n", | |
" <td> 1.255048</td>\n", | |
" <td> 0.812365</td>\n", | |
" <td> 0.115558</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td>-0.848028</td>\n", | |
" <td> 1.575932</td>\n", | |
" <td> 0.407990</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 3</td>\n", | |
" <td>-2.000425</td>\n", | |
" <td> 0.168658</td>\n", | |
" <td> 1.089865</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 3</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 4</td>\n", | |
" <td> 1.986990</td>\n", | |
" <td> 0.100123</td>\n", | |
" <td>-0.156572</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 3</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.179694</td>\n", | |
" <td>-0.207595</td>\n", | |
" <td> 0.150446</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 2</td>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td>-0.287543</td>\n", | |
" <td> 1.227005</td>\n", | |
" <td> 1.037588</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 2</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.018208</td>\n", | |
" <td>-0.942384</td>\n", | |
" <td>-0.494788</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 31, | |
"text": [ | |
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
"0 5 5 1 5 5 1 2 1.053900 -0.062460 0.508648\n", | |
"1 5 5 1 6 6 1 4 -0.575898 1.053315 2.100263\n", | |
"2 6 5 1 5 5 1 1 0.392731 -0.395918 1.813869\n", | |
"3 5 6 3 5 5 1 4 1.255048 0.812365 0.115558\n", | |
"4 5 6 1 5 5 1 1 -0.848028 1.575932 0.407990\n", | |
"5 5 5 1 5 6 1 3 -2.000425 0.168658 1.089865\n", | |
"6 5 6 3 5 5 1 4 1.986990 0.100123 -0.156572\n", | |
"7 5 5 1 5 5 3 1 0.179694 -0.207595 0.150446\n", | |
"8 5 6 2 5 6 1 1 -0.287543 1.227005 1.037588\n", | |
"9 5 6 1 5 6 2 1 0.018208 -0.942384 -0.494788" | |
] | |
} | |
], | |
"prompt_number": 31 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Loading the training labels\n", | |
"\n", | |
"train_labels = pd.read_csv(\"training-data-labels.csv\")\n", | |
"\n", | |
"train_labels.head(5)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
" label\n", | |
"0 0\n", | |
"1 1\n", | |
"2 1\n", | |
"3 0\n", | |
"4 1" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Loading the test set\n", | |
"\n", | |
"test_data = pd.read_csv(\"test-data-set.csv\", sep=\" \")\n", | |
"\n", | |
"test_data.head(5)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cat1</th>\n", | |
" <th>cat2</th>\n", | |
" <th>cat3</th>\n", | |
" <th>cat4</th>\n", | |
" <th>cat5</th>\n", | |
" <th>cat6</th>\n", | |
" <th>cat7</th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td> 0.171982</td>\n", | |
" <td>-0.022455</td>\n", | |
" <td> 0.668533</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> n</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td> 0.301511</td>\n", | |
" <td> 0.119037</td>\n", | |
" <td>-0.292068</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> y</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td>-0.441025</td>\n", | |
" <td> 1.052455</td>\n", | |
" <td> 0.820292</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> c</td>\n", | |
" <td> a</td>\n", | |
" <td> 0.421350</td>\n", | |
" <td> 0.223962</td>\n", | |
" <td>-0.187951</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> n</td>\n", | |
" <td> n</td>\n", | |
" <td> a</td>\n", | |
" <td> y</td>\n", | |
" <td> y</td>\n", | |
" <td> a</td>\n", | |
" <td> a</td>\n", | |
" <td>-0.390083</td>\n", | |
" <td> 0.556335</td>\n", | |
" <td>-1.434217</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
"0 n n a n n a a 0.171982 -0.022455 0.668533\n", | |
"1 n y a n n a a 0.301511 0.119037 -0.292068\n", | |
"2 n n a y y a a -0.441025 1.052455 0.820292\n", | |
"3 n n a n n c a 0.421350 0.223962 -0.187951\n", | |
"4 n n a y y a a -0.390083 0.556335 -1.434217" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# We assign a numerical value to each of the categorical values\n", | |
"test_data = test_data.replace(['a','b','c','d','n','y'], [1,2,3,4,5,6])\n", | |
"\n", | |
"test_data.head(10)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>cat1</th>\n", | |
" <th>cat2</th>\n", | |
" <th>cat3</th>\n", | |
" <th>cat4</th>\n", | |
" <th>cat5</th>\n", | |
" <th>cat6</th>\n", | |
" <th>cat7</th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.171982</td>\n", | |
" <td>-0.022455</td>\n", | |
" <td> 0.668533</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.301511</td>\n", | |
" <td> 0.119037</td>\n", | |
" <td>-0.292068</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 6</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td>-0.441025</td>\n", | |
" <td> 1.052455</td>\n", | |
" <td> 0.820292</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 3</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.421350</td>\n", | |
" <td> 0.223962</td>\n", | |
" <td>-0.187951</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 6</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td>-0.390083</td>\n", | |
" <td> 0.556335</td>\n", | |
" <td>-1.434217</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 6</td>\n", | |
" <td> 6</td>\n", | |
" <td> 2</td>\n", | |
" <td> 1</td>\n", | |
" <td>-0.207254</td>\n", | |
" <td> 0.405312</td>\n", | |
" <td> 0.185214</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td> 5</td>\n", | |
" <td> 6</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.320936</td>\n", | |
" <td> 1.232641</td>\n", | |
" <td>-0.661283</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td>-0.718697</td>\n", | |
" <td> 0.905296</td>\n", | |
" <td> 0.838255</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 0.391449</td>\n", | |
" <td> 0.013134</td>\n", | |
" <td> 0.559273</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 3</td>\n", | |
" <td> 5</td>\n", | |
" <td> 5</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 1.173640</td>\n", | |
" <td> 0.860782</td>\n", | |
" <td>-1.237148</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 74, | |
"text": [ | |
" cat1 cat2 cat3 cat4 cat5 cat6 cat7 num1 num2 num3\n", | |
"0 5 5 1 5 5 1 1 0.171982 -0.022455 0.668533\n", | |
"1 5 6 1 5 5 1 1 0.301511 0.119037 -0.292068\n", | |
"2 5 5 1 6 6 1 1 -0.441025 1.052455 0.820292\n", | |
"3 5 5 1 5 5 3 1 0.421350 0.223962 -0.187951\n", | |
"4 5 5 1 6 6 1 1 -0.390083 0.556335 -1.434217\n", | |
"5 5 5 1 6 6 2 1 -0.207254 0.405312 0.185214\n", | |
"6 5 6 1 5 5 1 1 0.320936 1.232641 -0.661283\n", | |
"7 5 5 1 5 5 1 1 -0.718697 0.905296 0.838255\n", | |
"8 5 5 1 5 5 1 2 0.391449 0.013134 0.559273\n", | |
"9 5 5 3 5 5 1 2 1.173640 0.860782 -1.237148" | |
] | |
} | |
], | |
"prompt_number": 74 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Testing different classification models" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Testing different classifiers from the scikit-learn libraries\n", | |
"# Importing the different libraries\n", | |
"\n", | |
"from sklearn.ensemble import RandomForestClassifier\n", | |
"from sklearn.neighbors import KNeighborsClassifier\n", | |
"from sklearn.svm import SVC\n", | |
"from sklearn.metrics import roc_auc_score" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# I split the training set into a sub-training set (75%) and a test set (25%)\n", | |
"\n", | |
"from sklearn.cross_validation import train_test_split\n", | |
"x_train, x_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.25)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 33 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"1 - SVM Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create and train a classifier\n", | |
"SVM = SVC(gamma=0.001)\n", | |
"\n", | |
"# Fit the model\n", | |
"SVM.fit(x_train, y_train)\n", | |
"\n", | |
"# Return the accuracy of the model \n", | |
"accuracy = SVM.score(x_test, y_test)\n", | |
"print \"The accuracy score for the SVM model is:\" \n", | |
"print accuracy \n", | |
"\n", | |
"# Get the prediction\n", | |
"preds = SVM.predict(x_test)\n", | |
"\n", | |
"# Return the ROC AUC score\n", | |
"print \"The Area Under the Curve is:\" \n", | |
"roc_auc_score(y_test, preds)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"The accuracy score for the SVM model is:\n", | |
"0.6232\n", | |
"The Area Under the Curve is:" | |
] | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 67, | |
"text": [ | |
"0.62780636827753056" | |
] | |
} | |
], | |
"prompt_number": 67 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"2 - Random Forest Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create and train a classifier\n", | |
"RandomForest = RandomForestClassifier()\n", | |
"\n", | |
"# Fit the model\n", | |
"RandomForest.fit(x_train, y_train)\n", | |
"\n", | |
"# Return the accuracy of the model\n", | |
"accuracy = RandomForest.score(x_test, y_test)\n", | |
"print \"The accuracy of the Random Forest Model is:\"\n", | |
"print accuracy\n", | |
"\n", | |
"# Get the predictions\n", | |
"preds = RandomForest.predict(x_test)\n", | |
"\n", | |
"# Return the ROC AUC score\n", | |
"print \"The Area Under the Curve is:\" \n", | |
"roc_auc_score(y_test, preds)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"The accuracy of the Random Forest Model is:\n", | |
"0.7992\n", | |
"The Area Under the Curve is:\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 72, | |
"text": [ | |
"0.80056076107682528" | |
] | |
} | |
], | |
"prompt_number": 72 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 2, | |
"metadata": {}, | |
"source": [ | |
"3 - K-Nearest Neighbor Model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# create and train a classifier\n", | |
"NearestNeighbor = KNeighborsClassifier()\n", | |
"\n", | |
"# Fit the model\n", | |
"NearestNeighbor.fit(x_train, y_train)\n", | |
"\n", | |
"# Return the accuracy of the model \n", | |
"accuracy = NearestNeighbor.score(x_test, y_test)\n", | |
"print \"The accuracy of the K-Nearest Neighbor Model is:\"\n", | |
"print accuracy\n", | |
"\n", | |
"# Get the predictions\n", | |
"preds = NearestNeighbor.predict(x_test)\n", | |
"\n", | |
"# Return the ROC AUC score\n", | |
"print \"The Area Under the Curve is:\" \n", | |
"roc_auc_score(y_test, preds)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"The accuracy of the K-Nearest Neighbor Model is:\n", | |
"0.7792\n", | |
"The Area Under the Curve is:\n" | |
] | |
}, | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 71, | |
"text": [ | |
"0.77948865150800661" | |
] | |
} | |
], | |
"prompt_number": 71 | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 1, | |
"metadata": {}, | |
"source": [ | |
"Making predictions on the test set" | |
] | |
}, | |
{ | |
"cell_type": "heading", | |
"level": 3, | |
"metadata": {}, | |
"source": [ | |
"The Random Forest model is the one providing the highest prediction accuracy. \n", | |
"I will therefore use this model to make the predictions on the test set. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Predictions made on the test set\n", | |
"final_preds = RandomForest.predict(test_data)\n", | |
"\n", | |
"# Dumping the results into a text file\n", | |
"np.savetxt('predictions_test_set.txt', final_preds, fmt='%i')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 84 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment