Forked from sureshsarda/ExplodingAColumnIntoMultipleCells.ipynb
Created
September 10, 2019 10:28
-
-
Save namanjh/9c7c4d82f52774b18c1bf16b6bce5e8d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>City</th>\n", | |
" <th>EmployeeId</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Mumbai|Bangalore</td>\n", | |
" <td>001</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Pune|Mumbai|Delhi</td>\n", | |
" <td>002</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Mumbai|Bangalore</td>\n", | |
" <td>003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Mumbai|Pune</td>\n", | |
" <td>004</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Bangalore</td>\n", | |
" <td>005</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" City EmployeeId\n", | |
"0 Mumbai|Bangalore 001\n", | |
"1 Pune|Mumbai|Delhi 002\n", | |
"2 Mumbai|Bangalore 003\n", | |
"3 Mumbai|Pune 004\n", | |
"4 Bangalore 005" | |
] | |
}, | |
"execution_count": 1, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import pandas as pd\n", | |
"\n", | |
"df = pd.DataFrame({\n", | |
" 'EmployeeId': ['001', '002', '003', '004', '005'],\n", | |
" 'City': ['Mumbai|Bangalore', 'Pune|Mumbai|Delhi', 'Mumbai|Bangalore', 'Mumbai|Pune', 'Bangalore'] \n", | |
"})\n", | |
"\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"EmployeeId \n", | |
"001 0 Mumbai\n", | |
" 1 Bangalore\n", | |
"002 0 Pune\n", | |
" 1 Mumbai\n", | |
" 2 Delhi\n", | |
"003 0 Mumbai\n", | |
" 1 Bangalore\n", | |
"004 0 Mumbai\n", | |
" 1 Pune\n", | |
"005 0 Bangalore\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Exploding into multiple cells\n", | |
"\n", | |
"# We start with creating a new dataframe from the series with EmployeeId as the index\n", | |
"new_df = pd.DataFrame(df.City.str.split('|').tolist(), index=df.EmployeeId).stack()\n", | |
"new_df\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>EmployeeId</th>\n", | |
" <th>0</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>001</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>001</td>\n", | |
" <td>Bangalore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>002</td>\n", | |
" <td>Pune</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>002</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>002</td>\n", | |
" <td>Delhi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>003</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>003</td>\n", | |
" <td>Bangalore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>004</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>004</td>\n", | |
" <td>Pune</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>005</td>\n", | |
" <td>Bangalore</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" EmployeeId 0\n", | |
"0 001 Mumbai\n", | |
"1 001 Bangalore\n", | |
"2 002 Pune\n", | |
"3 002 Mumbai\n", | |
"4 002 Delhi\n", | |
"5 003 Mumbai\n", | |
"6 003 Bangalore\n", | |
"7 004 Mumbai\n", | |
"8 004 Pune\n", | |
"9 005 Bangalore" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# We now want to get rid of the secondary index\n", | |
"# To do this, we will make EmployeeId as a column (it can't be an index since the values will be duplicate)\n", | |
"new_df = new_df.reset_index([0, 'EmployeeId'])\n", | |
"new_df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>EmployeeId</th>\n", | |
" <th>City</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>001</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>001</td>\n", | |
" <td>Bangalore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>002</td>\n", | |
" <td>Pune</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>002</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>002</td>\n", | |
" <td>Delhi</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>003</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>003</td>\n", | |
" <td>Bangalore</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>004</td>\n", | |
" <td>Mumbai</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>004</td>\n", | |
" <td>Pune</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>005</td>\n", | |
" <td>Bangalore</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" EmployeeId City\n", | |
"0 001 Mumbai\n", | |
"1 001 Bangalore\n", | |
"2 002 Pune\n", | |
"3 002 Mumbai\n", | |
"4 002 Delhi\n", | |
"5 003 Mumbai\n", | |
"6 003 Bangalore\n", | |
"7 004 Mumbai\n", | |
"8 004 Pune\n", | |
"9 005 Bangalore" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# The final step is to set the column names as we want them\n", | |
"new_df.columns = ['EmployeeId', 'City']\n", | |
"new_df" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment