Created
June 1, 2020 22:28
-
-
Save hsteinshiromoto/19a87b5821760dbace62b4d42900fe2d to your computer and use it in GitHub Desktop.
Cumulative Sum with Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Import" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Modules" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:03:37.493917Z", | |
"start_time": "2020-05-16T00:03:36.539411Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Problem Statement" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Calculate cumulative sum column to the data frame so that:\n", | |
"```\n", | |
"name | day | no\n", | |
"-----|-----------|----\n", | |
"Jack | Monday | 10\n", | |
"Jack | Tuesday | 20\n", | |
"Jack | Tuesday | 10\n", | |
"Jack | Wednesday | 50\n", | |
"Jill | Monday | 40\n", | |
"Jill | Wednesday | 110\n", | |
"```\n", | |
"becomes\n", | |
"```\n", | |
"Jack | Monday | 10 | 10\n", | |
"Jack | Tuesday | 30 | 40\n", | |
"Jack | Wednesday | 50 | 90\n", | |
"Jill | Monday | 40 | 40\n", | |
"Jill | Wednesday | 110 | 150\n", | |
"```" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Solution" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create the data frame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:10:23.546515Z", | |
"start_time": "2020-05-16T00:10:23.532150Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>day</th>\n", | |
" <th>no</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Monday</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>20</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Wednesday</td>\n", | |
" <td>50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Jill</td>\n", | |
" <td>Monday</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>Jill</td>\n", | |
" <td>Wednesday</td>\n", | |
" <td>110</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name day no\n", | |
"0 Jack Monday 10\n", | |
"1 Jack Tuesday 20\n", | |
"2 Jack Tuesday 10\n", | |
"3 Jack Wednesday 50\n", | |
"4 Jill Monday 40\n", | |
"5 Jill Wednesday 110" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame.from_dict(\n", | |
" {\n", | |
" \"name\": [\"Jack\", \"Jack\", \"Jack\", \"Jack\", \"Jill\", \"Jill\"]\n", | |
" ,\"day\": [\"Monday\", \"Tuesday\", \"Tuesday\", \"Wednesday\", \"Monday\", \"Wednesday\"]\n", | |
" ,\"no\": [10, 20, 10, 50, 40, 110]\n", | |
" }\n", | |
")\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Use groupby twice" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:09:52.551515Z", | |
"start_time": "2020-05-16T00:09:52.503778Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>day</th>\n", | |
" <th>no</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Monday</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Wednesday</td>\n", | |
" <td>90</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Jill</td>\n", | |
" <td>Monday</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Jill</td>\n", | |
" <td>Wednesday</td>\n", | |
" <td>150</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name day no\n", | |
"0 Jack Monday 10\n", | |
"1 Jack Tuesday 40\n", | |
"2 Jack Wednesday 90\n", | |
"3 Jill Monday 40\n", | |
"4 Jill Wednesday 150" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.groupby(['name', 'day']).sum().groupby(level=0).cumsum().reset_index()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Explanation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:10:39.577110Z", | |
"start_time": "2020-05-16T00:10:39.555506Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>no</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>name</th>\n", | |
" <th>day</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">Jack</th>\n", | |
" <th>Monday</th>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Tuesday</th>\n", | |
" <td>30</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Wednesday</th>\n", | |
" <td>50</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Jill</th>\n", | |
" <th>Monday</th>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Wednesday</th>\n", | |
" <td>110</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" no\n", | |
"name day \n", | |
"Jack Monday 10\n", | |
" Tuesday 30\n", | |
" Wednesday 50\n", | |
"Jill Monday 40\n", | |
" Wednesday 110" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# sum per name/day\n", | |
"df.groupby(['name', 'day']).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:10:56.892031Z", | |
"start_time": "2020-05-16T00:10:56.872325Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>no</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>name</th>\n", | |
" <th>day</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">Jack</th>\n", | |
" <th>Monday</th>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Tuesday</th>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Wednesday</th>\n", | |
" <td>90</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"2\" valign=\"top\">Jill</th>\n", | |
" <th>Monday</th>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Wednesday</th>\n", | |
" <td>150</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" no\n", | |
"name day \n", | |
"Jack Monday 10\n", | |
" Tuesday 40\n", | |
" Wednesday 90\n", | |
"Jill Monday 40\n", | |
" Wednesday 150" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# cumulative sum per name/day\n", | |
"df.groupby(['name', 'day']).sum().groupby(level=0).cumsum()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The dataframe resulting from the first sum is indexed by `name` and by `day`. You can see it by printing " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:11:45.889529Z", | |
"start_time": "2020-05-16T00:11:45.873520Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"MultiIndex([('Jack', 'Monday'),\n", | |
" ('Jack', 'Tuesday'),\n", | |
" ('Jack', 'Wednesday'),\n", | |
" ('Jill', 'Monday'),\n", | |
" ('Jill', 'Wednesday')],\n", | |
" names=['name', 'day'])" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.groupby(['name', 'day']).sum().index " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"When computing the cumulative sum, you want to do so by `name`, corresponding to the first index (level 0).\n", | |
"\n", | |
"Finally, use `reset_index` to have the names repeated." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:12:28.438021Z", | |
"start_time": "2020-05-16T00:12:28.414139Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>day</th>\n", | |
" <th>no</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Monday</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Tuesday</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Jack</td>\n", | |
" <td>Wednesday</td>\n", | |
" <td>90</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Jill</td>\n", | |
" <td>Monday</td>\n", | |
" <td>40</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Jill</td>\n", | |
" <td>Wednesday</td>\n", | |
" <td>150</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name day no\n", | |
"0 Jack Monday 10\n", | |
"1 Jack Tuesday 40\n", | |
"2 Jack Wednesday 90\n", | |
"3 Jill Monday 40\n", | |
"4 Jill Wednesday 150" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.groupby(['name', 'day']).sum().groupby(level=0).cumsum().reset_index()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# References" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"https://stackoverflow.com/questions/22650833/pandas-groupby-cumulative-sum" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment