Last active
April 9, 2019 17:44
-
-
Save mariushelf/a0c3ef39516274480e6fb78e158ff937 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import re\n", | |
"from sklearn.preprocessing import OrdinalEncoder\n", | |
"from sklearn_pandas import DataFrameMapper, CategoricalImputer" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Init DF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>foo_1</th>\n", | |
" <th>foo_2</th>\n", | |
" <th>foo_3</th>\n", | |
" <th>bar_1</th>\n", | |
" <th>bar_2</th>\n", | |
" <th>bar_4</th>\n", | |
" <th>baz</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>13</td>\n", | |
" <td>15</td>\n", | |
" <td>17</td>\n", | |
" <td>a</td>\n", | |
" <td>c</td>\n", | |
" <td>e</td>\n", | |
" <td>val1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>14</td>\n", | |
" <td>16</td>\n", | |
" <td>18</td>\n", | |
" <td>b</td>\n", | |
" <td>d</td>\n", | |
" <td>f</td>\n", | |
" <td>val2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" foo_1 foo_2 foo_3 bar_1 bar_2 bar_4 baz\n", | |
"0 13 15 17 a c e val1\n", | |
"1 14 16 18 b d f val2" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame(\n", | |
" {\n", | |
" 'foo_1': [13, 14],\n", | |
" 'foo_2': [15, 16],\n", | |
" 'foo_3': [17, 18],\n", | |
" 'bar_1': ['a', 'b'],\n", | |
" 'bar_2': ['c', 'd'],\n", | |
" 'bar_4': ['e', 'f'],\n", | |
" 'baz': ['val1', 'val2'],\n", | |
" }\n", | |
")\n", | |
"\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Stack function and helper function" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def stack_by_regex(df, regex, col_name, id):\n", | |
" col_tuples = [(re.sub(regex, col_name, c), re.sub(regex, id, c)) for c in df.columns if re.match(regex, c)]\n", | |
" other_cols = [c for c in df.columns if not re.match(regex, c)]\n", | |
"\n", | |
" df = df.drop(other_cols, axis=1)\n", | |
" df.columns = pd.MultiIndex.from_tuples(col_tuples)\n", | |
"\n", | |
" df = df.stack()\n", | |
" return df\n", | |
"\n", | |
"def multi_index_to_single(df, levels=None, sep='_'):\n", | |
" if levels is None:\n", | |
" levels = [0, 1]\n", | |
" level_values = [df.columns.get_level_values(l) for l in levels]\n", | |
" df.columns = [sep.join(names) for names in zip(*level_values)]\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Demo Example" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Stack" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>bar</th>\n", | |
" <th>foo</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">0</th>\n", | |
" <th>1</th>\n", | |
" <td>a</td>\n", | |
" <td>13.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>c</td>\n", | |
" <td>15.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>NaN</td>\n", | |
" <td>17.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>e</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">1</th>\n", | |
" <th>1</th>\n", | |
" <td>b</td>\n", | |
" <td>14.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>d</td>\n", | |
" <td>16.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>NaN</td>\n", | |
" <td>18.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>f</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" bar foo\n", | |
"0 1 a 13.0\n", | |
" 2 c 15.0\n", | |
" 3 NaN 17.0\n", | |
" 4 e NaN\n", | |
"1 1 b 14.0\n", | |
" 2 d 16.0\n", | |
" 3 NaN 18.0\n", | |
" 4 f NaN" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"regex = r'(.+)_(.+)'\n", | |
"col_name = r'\\1'\n", | |
"id = r'\\2'\n", | |
"stacked_df = stack_by_regex(df, regex, col_name, id)\n", | |
"stacked_df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Do transformations" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"mapper = DataFrameMapper(\n", | |
" [\n", | |
" (['bar'], [CategoricalImputer(strategy='constant', fill_value='NA'),\n", | |
" OrdinalEncoder()]),\n", | |
" ],\n", | |
" df_out=True,\n", | |
" default=None,\n", | |
")\n", | |
"stacked_df = mapper.fit_transform(stacked_df)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Unstack" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead tr th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th colspan=\"4\" halign=\"left\">bar</th>\n", | |
" <th colspan=\"4\" halign=\"left\">foo</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.0</td>\n", | |
" <td>3.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>5.0</td>\n", | |
" <td>13.0</td>\n", | |
" <td>15.0</td>\n", | |
" <td>17.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2.0</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>6.0</td>\n", | |
" <td>14.0</td>\n", | |
" <td>16.0</td>\n", | |
" <td>18.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" bar foo \n", | |
" 1 2 3 4 1 2 3 4\n", | |
"0 1.0 3.0 0.0 5.0 13.0 15.0 17.0 NaN\n", | |
"1 2.0 4.0 0.0 6.0 14.0 16.0 18.0 NaN" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"unstacked_df = stacked_df.unstack()\n", | |
"unstacked_df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Fix Column Names" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>bar_1</th>\n", | |
" <th>bar_2</th>\n", | |
" <th>bar_3</th>\n", | |
" <th>bar_4</th>\n", | |
" <th>foo_1</th>\n", | |
" <th>foo_2</th>\n", | |
" <th>foo_3</th>\n", | |
" <th>foo_4</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1.0</td>\n", | |
" <td>3.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>5.0</td>\n", | |
" <td>13.0</td>\n", | |
" <td>15.0</td>\n", | |
" <td>17.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2.0</td>\n", | |
" <td>4.0</td>\n", | |
" <td>0.0</td>\n", | |
" <td>6.0</td>\n", | |
" <td>14.0</td>\n", | |
" <td>16.0</td>\n", | |
" <td>18.0</td>\n", | |
" <td>NaN</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" bar_1 bar_2 bar_3 bar_4 foo_1 foo_2 foo_3 foo_4\n", | |
"0 1.0 3.0 0.0 5.0 13.0 15.0 17.0 NaN\n", | |
"1 2.0 4.0 0.0 6.0 14.0 16.0 18.0 NaN" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"multi_index_to_single(unstacked_df)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment