Created
April 8, 2015 22:53
-
-
Save Aerlinger/d6610fdaef7ebdbb0003 to your computer and use it in GitHub Desktop.
OneHotEncoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:973f629a9825a5e2ab8a547b36f84aa20289633b121d810ca166ed9acdeebc6f" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"\n", | |
"# `plt` must be the pyplot interface; the top-level matplotlib package has no plot()/hist().\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"%matplotlib inline" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# My notebook is running from the DAT_20_NYC folder so it's a simple relative path\n", | |
"local_path_to_file = 'Data/bikeshare.csv'\n", | |
"\n", | |
"bikeshare = pd.read_csv(local_path_to_file)\n", | |
"\n", | |
"bikeshare.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>instant</th>\n", | |
" <th>dteday</th>\n", | |
" <th>season</th>\n", | |
" <th>yr</th>\n", | |
" <th>mnth</th>\n", | |
" <th>hr</th>\n", | |
" <th>holiday</th>\n", | |
" <th>weekday</th>\n", | |
" <th>workingday</th>\n", | |
" <th>weathersit</th>\n", | |
" <th>temp</th>\n", | |
" <th>atemp</th>\n", | |
" <th>hum</th>\n", | |
" <th>windspeed</th>\n", | |
" <th>casual</th>\n", | |
" <th>registered</th>\n", | |
" <th>cnt</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.24</td>\n", | |
" <td> 0.2879</td>\n", | |
" <td> 0.81</td>\n", | |
" <td> 0</td>\n", | |
" <td> 3</td>\n", | |
" <td> 13</td>\n", | |
" <td> 16</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 2</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.22</td>\n", | |
" <td> 0.2727</td>\n", | |
" <td> 0.80</td>\n", | |
" <td> 0</td>\n", | |
" <td> 8</td>\n", | |
" <td> 32</td>\n", | |
" <td> 40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 3</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.22</td>\n", | |
" <td> 0.2727</td>\n", | |
" <td> 0.80</td>\n", | |
" <td> 0</td>\n", | |
" <td> 5</td>\n", | |
" <td> 27</td>\n", | |
" <td> 32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 4</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 3</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.24</td>\n", | |
" <td> 0.2879</td>\n", | |
" <td> 0.75</td>\n", | |
" <td> 0</td>\n", | |
" <td> 3</td>\n", | |
" <td> 10</td>\n", | |
" <td> 13</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 5</td>\n", | |
" <td> 2011-01-01</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 4</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0.24</td>\n", | |
" <td> 0.2879</td>\n", | |
" <td> 0.75</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
" instant dteday season yr mnth hr holiday weekday workingday \\\n", | |
"0 1 2011-01-01 1 0 1 0 0 6 0 \n", | |
"1 2 2011-01-01 1 0 1 1 0 6 0 \n", | |
"2 3 2011-01-01 1 0 1 2 0 6 0 \n", | |
"3 4 2011-01-01 1 0 1 3 0 6 0 \n", | |
"4 5 2011-01-01 1 0 1 4 0 6 0 \n", | |
"\n", | |
" weathersit temp atemp hum windspeed casual registered cnt \n", | |
"0 1 0.24 0.2879 0.81 0 3 13 16 \n", | |
"1 1 0.22 0.2727 0.80 0 8 32 40 \n", | |
"2 1 0.22 0.2727 0.80 0 5 27 32 \n", | |
"3 1 0.24 0.2879 0.75 0 3 10 13 \n", | |
"4 1 0.24 0.2879 0.75 0 0 1 1 " | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Remove the `instant` and `dteday` columns, then any rows with missing values.\n", | |
"# Rebinding `bikeshare` (instead of inplace=True) and only dropping columns that are\n", | |
"# still present makes this cell safe to run more than once.\n", | |
"unwanted_columns = [col for col in ['instant', 'dteday'] if col in bikeshare.columns]\n", | |
"\n", | |
"bikeshare = bikeshare.drop(unwanted_columns, axis=1).dropna()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# what does bikeshare.weathersit look like?\n", | |
"# print(...) with parentheses runs on both Python 2 and Python 3 kernels.\n", | |
"print(bikeshare.weathersit.value_counts())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"1 11413\n", | |
"2 4544\n", | |
"3 1419\n", | |
"4 3\n", | |
"dtype: int64\n" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from sklearn.preprocessing import OneHotEncoder\n", | |
"\n", | |
"# The `sparse=False` keyword was renamed in newer scikit-learn releases, so keep the\n", | |
"# default sparse output and densify explicitly; this works on old and new versions.\n", | |
"ohe = OneHotEncoder()\n", | |
"\n", | |
"\n", | |
"encoded_weathersit = ohe.fit_transform(bikeshare[['weathersit']]).toarray()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"I ran into a snag in class where I ran `ohe.fit_transform(bikeshare.weathersit)` instead of `ohe.fit_transform(bikeshare[['weathersit']])`\n", | |
"\n", | |
"\n", | |
"*What's the difference between these two?* **Hint: Always pay attention to types!**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"encoded_weathersit" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"array([[ 1., 0., 0., 0.],\n", | |
" [ 1., 0., 0., 0.],\n", | |
" [ 1., 0., 0., 0.],\n", | |
" ..., \n", | |
" [ 1., 0., 0., 0.],\n", | |
" [ 1., 0., 0., 0.],\n", | |
" [ 1., 0., 0., 0.]])" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Just a sanity check, these values should be the same as `bikeshare.weathersit.value_counts()`\n", | |
"# (builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24)\n", | |
"encoded_weathersit.sum(axis=0).astype(int)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"array([11413, 4544, 1419, 3])" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"### How to get these encoded values back into the bikeshare dataframe?\n", | |
"\n", | |
"# Simple conversion, but let's be mindful to keep our labels consistent. \n", | |
"weathersit_factors = bikeshare.weathersit.unique()\n", | |
"\n", | |
"## I strongly recommend you rename your column labels to something that's comprehensible! \n", | |
"# The encoder emits one column per category in ascending order, so build the labels from the sorted factors.\n", | |
"weather_labels = ['Weather Label %d' % factor for factor in sorted(weathersit_factors)]\n", | |
"\n", | |
"# Reuse bikeshare's index so the rows stay aligned when the two frames are combined later.\n", | |
"encoded_dataframe = pd.DataFrame(encoded_weathersit, columns=weather_labels, index=bikeshare.index)\n", | |
"\n", | |
"encoded_dataframe.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Weather Label 1</th>\n", | |
" <th>Weather Label 2</th>\n", | |
" <th>Weather Label 3</th>\n", | |
" <th>Weather Label 4</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
" Weather Label 1 Weather Label 2 Weather Label 3 Weather Label 4\n", | |
"0 1 0 0 0\n", | |
"1 1 0 0 0\n", | |
"2 1 0 0 0\n", | |
"3 1 0 0 0\n", | |
"4 1 0 0 0" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### We'll want to merge the encoded dataframe back into the original" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Row counts must match before the frames are joined column-wise.\n", | |
"# print(...) with parentheses runs on both Python 2 and Python 3 kernels.\n", | |
"print(encoded_dataframe.shape)\n", | |
"\n", | |
"print(bikeshare.shape)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"(17379, 4)\n", | |
"(17379, 15)\n" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# concat(axis=1) matches rows by index label, so give the encoded frame bikeshare's index first;\n", | |
"# otherwise rows removed upstream (e.g. by dropna) would shift the alignment and introduce NaNs.\n", | |
"aligned_encoded = encoded_dataframe.set_index(bikeshare.index)\n", | |
"\n", | |
"# The raw categorical column is redundant once its one-hot columns are attached.\n", | |
"encoded_bikeshare = pd.concat([bikeshare, aligned_encoded], axis=1).drop(['weathersit'], axis=1)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"encoded_bikeshare.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>season</th>\n", | |
" <th>yr</th>\n", | |
" <th>mnth</th>\n", | |
" <th>hr</th>\n", | |
" <th>holiday</th>\n", | |
" <th>weekday</th>\n", | |
" <th>workingday</th>\n", | |
" <th>temp</th>\n", | |
" <th>atemp</th>\n", | |
" <th>hum</th>\n", | |
" <th>windspeed</th>\n", | |
" <th>casual</th>\n", | |
" <th>registered</th>\n", | |
" <th>cnt</th>\n", | |
" <th>Weather Label 1</th>\n", | |
" <th>Weather Label 2</th>\n", | |
" <th>Weather Label 3</th>\n", | |
" <th>Weather Label 4</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.24</td>\n", | |
" <td> 0.2879</td>\n", | |
" <td> 0.81</td>\n", | |
" <td> 0</td>\n", | |
" <td> 3</td>\n", | |
" <td> 13</td>\n", | |
" <td> 16</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.22</td>\n", | |
" <td> 0.2727</td>\n", | |
" <td> 0.80</td>\n", | |
" <td> 0</td>\n", | |
" <td> 8</td>\n", | |
" <td> 32</td>\n", | |
" <td> 40</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.22</td>\n", | |
" <td> 0.2727</td>\n", | |
" <td> 0.80</td>\n", | |
" <td> 0</td>\n", | |
" <td> 5</td>\n", | |
" <td> 27</td>\n", | |
" <td> 32</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 3</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.24</td>\n", | |
" <td> 0.2879</td>\n", | |
" <td> 0.75</td>\n", | |
" <td> 0</td>\n", | |
" <td> 3</td>\n", | |
" <td> 10</td>\n", | |
" <td> 13</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 4</td>\n", | |
" <td> 0</td>\n", | |
" <td> 6</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0.24</td>\n", | |
" <td> 0.2879</td>\n", | |
" <td> 0.75</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
" season yr mnth hr holiday weekday workingday temp atemp hum \\\n", | |
"0 1 0 1 0 0 6 0 0.24 0.2879 0.81 \n", | |
"1 1 0 1 1 0 6 0 0.22 0.2727 0.80 \n", | |
"2 1 0 1 2 0 6 0 0.22 0.2727 0.80 \n", | |
"3 1 0 1 3 0 6 0 0.24 0.2879 0.75 \n", | |
"4 1 0 1 4 0 6 0 0.24 0.2879 0.75 \n", | |
"\n", | |
" windspeed casual registered cnt Weather Label 1 Weather Label 2 \\\n", | |
"0 0 3 13 16 1 0 \n", | |
"1 0 8 32 40 1 0 \n", | |
"2 0 5 27 32 1 0 \n", | |
"3 0 3 10 13 1 0 \n", | |
"4 0 0 1 1 1 0 \n", | |
"\n", | |
" Weather Label 3 Weather Label 4 \n", | |
"0 0 0 \n", | |
"1 0 0 \n", | |
"2 0 0 \n", | |
"3 0 0 \n", | |
"4 0 0 " | |
] | |
} | |
], | |
"prompt_number": 11 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment