Skip to content

Instantly share code, notes, and snippets.

@mariushelf
Last active April 9, 2019 17:44
Show Gist options
  • Save mariushelf/a0c3ef39516274480e6fb78e158ff937 to your computer and use it in GitHub Desktop.
Save mariushelf/a0c3ef39516274480e6fb78e158ff937 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re\n",
"from sklearn.preprocessing import OrdinalEncoder\n",
"from sklearn_pandas import DataFrameMapper, CategoricalImputer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Init DF"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>foo_1</th>\n",
" <th>foo_2</th>\n",
" <th>foo_3</th>\n",
" <th>bar_1</th>\n",
" <th>bar_2</th>\n",
" <th>bar_4</th>\n",
" <th>baz</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>13</td>\n",
" <td>15</td>\n",
" <td>17</td>\n",
" <td>a</td>\n",
" <td>c</td>\n",
" <td>e</td>\n",
" <td>val1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>14</td>\n",
" <td>16</td>\n",
" <td>18</td>\n",
" <td>b</td>\n",
" <td>d</td>\n",
" <td>f</td>\n",
" <td>val2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" foo_1 foo_2 foo_3 bar_1 bar_2 bar_4 baz\n",
"0 13 15 17 a c e val1\n",
"1 14 16 18 b d f val2"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(\n",
" {\n",
" 'foo_1': [13, 14],\n",
" 'foo_2': [15, 16],\n",
" 'foo_3': [17, 18],\n",
" 'bar_1': ['a', 'b'],\n",
" 'bar_2': ['c', 'd'],\n",
" 'bar_4': ['e', 'f'],\n",
" 'baz': ['val1', 'val2'],\n",
" }\n",
")\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Stack function and helper function"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def stack_by_regex(df, regex, col_name, id):\n",
" col_tuples = [(re.sub(regex, col_name, c), re.sub(regex, id, c)) for c in df.columns if re.match(regex, c)]\n",
" other_cols = [c for c in df.columns if not re.match(regex, c)]\n",
"\n",
" df = df.drop(other_cols, axis=1)\n",
" df.columns = pd.MultiIndex.from_tuples(col_tuples)\n",
"\n",
" df = df.stack()\n",
" return df\n",
"\n",
"def multi_index_to_single(df, levels=None, sep='_'):\n",
" if levels is None:\n",
" levels = [0, 1]\n",
" level_values = [df.columns.get_level_values(l) for l in levels]\n",
" df.columns = [sep.join(names) for names in zip(*level_values)]\n",
" return df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Demo Example"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Stack"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>bar</th>\n",
" <th>foo</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">0</th>\n",
" <th>1</th>\n",
" <td>a</td>\n",
" <td>13.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>c</td>\n",
" <td>15.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>17.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>e</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"4\" valign=\"top\">1</th>\n",
" <th>1</th>\n",
" <td>b</td>\n",
" <td>14.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>d</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>NaN</td>\n",
" <td>18.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>f</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bar foo\n",
"0 1 a 13.0\n",
" 2 c 15.0\n",
" 3 NaN 17.0\n",
" 4 e NaN\n",
"1 1 b 14.0\n",
" 2 d 16.0\n",
" 3 NaN 18.0\n",
" 4 f NaN"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"regex = r'(.+)_(.+)'\n",
"col_name = r'\\1'\n",
"id = r'\\2'\n",
"stacked_df = stack_by_regex(df, regex, col_name, id)\n",
"stacked_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Do transformations"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"mapper = DataFrameMapper(\n",
" [\n",
" (['bar'], [CategoricalImputer(strategy='constant', fill_value='NA'),\n",
" OrdinalEncoder()]),\n",
" ],\n",
" df_out=True,\n",
" default=None,\n",
")\n",
"stacked_df = mapper.fit_transform(stacked_df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unstack"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"4\" halign=\"left\">bar</th>\n",
" <th colspan=\"4\" halign=\"left\">foo</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>5.0</td>\n",
" <td>13.0</td>\n",
" <td>15.0</td>\n",
" <td>17.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>14.0</td>\n",
" <td>16.0</td>\n",
" <td>18.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bar foo \n",
" 1 2 3 4 1 2 3 4\n",
"0 1.0 3.0 0.0 5.0 13.0 15.0 17.0 NaN\n",
"1 2.0 4.0 0.0 6.0 14.0 16.0 18.0 NaN"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"unstacked_df = stacked_df.unstack()\n",
"unstacked_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fix Column Names"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>bar_1</th>\n",
" <th>bar_2</th>\n",
" <th>bar_3</th>\n",
" <th>bar_4</th>\n",
" <th>foo_1</th>\n",
" <th>foo_2</th>\n",
" <th>foo_3</th>\n",
" <th>foo_4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>5.0</td>\n",
" <td>13.0</td>\n",
" <td>15.0</td>\n",
" <td>17.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>14.0</td>\n",
" <td>16.0</td>\n",
" <td>18.0</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" bar_1 bar_2 bar_3 bar_4 foo_1 foo_2 foo_3 foo_4\n",
"0 1.0 3.0 0.0 5.0 13.0 15.0 17.0 NaN\n",
"1 2.0 4.0 0.0 6.0 14.0 16.0 18.0 NaN"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multi_index_to_single(unstacked_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment