Created
February 27, 2018 14:31
-
-
Save Erlemar/c54e509ee92359d2bc25d742e0a5d40c to your computer and use it in GitHub Desktop.
An example of mean-encoding.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Mean encoding\n", | |
"\n", | |
"Mean encoding is a technique to replace categorical values with some figures. In this example I use simple means for each category. Advanced techniques use some kind of regularization.\n", | |
"\n", | |
"Let's imagine that we have a dataset where we need to predict shop values based on some features. One of features is shop_id. Using one hot encoding may lead to huge amount of features. Mean-encoding creates a single feature instead." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>shop_id</th>\n", | |
" <th>target</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>23</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>56</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>78</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>89</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>1</td>\n", | |
" <td>34</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>5</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>6</td>\n", | |
" <td>45</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>7</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>8</td>\n", | |
" <td>12</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>9</td>\n", | |
" <td>34</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>0</td>\n", | |
" <td>90</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>4</td>\n", | |
" <td>55</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" shop_id target\n", | |
"0 1 11\n", | |
"1 1 23\n", | |
"2 2 56\n", | |
"3 3 78\n", | |
"4 4 89\n", | |
"5 5 45\n", | |
"6 6 12\n", | |
"7 1 34\n", | |
"8 5 12\n", | |
"9 6 45\n", | |
"10 7 12\n", | |
"11 8 12\n", | |
"12 9 34\n", | |
"13 0 90\n", | |
"14 4 55" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame({'shop_id': [1, 1, 2, 3, 4, 5, 6, 1, 5, 6, 7, 8, 9, 0, 4],\n", | |
" 'target': [11, 23, 56, 78, 89, 45, 12, 34, 12, 45, 12, 12, 34, 90, 55]})\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>shop_id</th>\n", | |
" <th>target</th>\n", | |
" <th>encoded</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>11</td>\n", | |
" <td>22.666667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>23</td>\n", | |
" <td>22.666667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2</td>\n", | |
" <td>56</td>\n", | |
" <td>56.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>3</td>\n", | |
" <td>78</td>\n", | |
" <td>78.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>4</td>\n", | |
" <td>89</td>\n", | |
" <td>72.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>5</td>\n", | |
" <td>45</td>\n", | |
" <td>28.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>6</td>\n", | |
" <td>12</td>\n", | |
" <td>28.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>1</td>\n", | |
" <td>34</td>\n", | |
" <td>22.666667</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>5</td>\n", | |
" <td>12</td>\n", | |
" <td>28.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>6</td>\n", | |
" <td>45</td>\n", | |
" <td>28.500000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>7</td>\n", | |
" <td>12</td>\n", | |
" <td>12.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>8</td>\n", | |
" <td>12</td>\n", | |
" <td>12.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>9</td>\n", | |
" <td>34</td>\n", | |
" <td>34.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>0</td>\n", | |
" <td>90</td>\n", | |
" <td>90.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>4</td>\n", | |
" <td>55</td>\n", | |
" <td>72.000000</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" shop_id target encoded\n", | |
"0 1 11 22.666667\n", | |
"1 1 23 22.666667\n", | |
"2 2 56 56.000000\n", | |
"3 3 78 78.000000\n", | |
"4 4 89 72.000000\n", | |
"5 5 45 28.500000\n", | |
"6 6 12 28.500000\n", | |
"7 1 34 22.666667\n", | |
"8 5 12 28.500000\n", | |
"9 6 45 28.500000\n", | |
"10 7 12 12.000000\n", | |
"11 8 12 12.000000\n", | |
"12 9 34 34.000000\n", | |
"13 0 90 90.000000\n", | |
"14 4 55 72.000000" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"shop_mean = df.groupby('shop_id').target.mean()\n", | |
"df['encoded'] = df['shop_id'].map(shop_mean)\n", | |
"df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment