Last active
July 23, 2020 04:56
-
-
Save hsteinshiromoto/58b448e8398fd18ad374c940fab1436b to your computer and use it in GitHub Desktop.
Data Manipulation with Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Import" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Modules" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:37:40.829704Z", | |
"start_time": "2020-07-23T04:37:40.825110Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:40:51.038860Z", | |
"start_time": "2020-07-23T04:40:51.027013Z" | |
} | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.DataFrame.from_dict({\"date\": pd.date_range('2015-02-24', periods=10, freq='D'),\n", | |
" \"col1\": np.random.randint(5, size=10), \n", | |
" \"col2\": 100*np.random.randint(5, size=10),\n", | |
" \"col3\": np.random.randn(1, 10).squeeze()})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:40:51.257785Z", | |
"start_time": "2020-07-23T04:40:51.243952Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>col1</th>\n", | |
" <th>col2</th>\n", | |
" <th>col3</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2015-02-24</td>\n", | |
" <td>4</td>\n", | |
" <td>200</td>\n", | |
" <td>0.966712</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>2015-02-25</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>-1.239591</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>2015-02-26</td>\n", | |
" <td>4</td>\n", | |
" <td>0</td>\n", | |
" <td>-2.138567</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2015-02-27</td>\n", | |
" <td>3</td>\n", | |
" <td>400</td>\n", | |
" <td>1.792204</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2015-02-28</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1.726700</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>2015-03-01</td>\n", | |
" <td>2</td>\n", | |
" <td>200</td>\n", | |
" <td>0.903985</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>2015-03-02</td>\n", | |
" <td>3</td>\n", | |
" <td>400</td>\n", | |
" <td>-0.255214</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>2015-03-03</td>\n", | |
" <td>0</td>\n", | |
" <td>100</td>\n", | |
" <td>1.247975</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>2015-03-04</td>\n", | |
" <td>4</td>\n", | |
" <td>200</td>\n", | |
" <td>-0.313888</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>2015-03-05</td>\n", | |
" <td>4</td>\n", | |
" <td>300</td>\n", | |
" <td>-0.027013</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date col1 col2 col3\n", | |
"0 2015-02-24 4 200 0.966712\n", | |
"1 2015-02-25 1 0 -1.239591\n", | |
"2 2015-02-26 4 0 -2.138567\n", | |
"3 2015-02-27 3 400 1.792204\n", | |
"4 2015-02-28 3 0 1.726700\n", | |
"5 2015-03-01 2 200 0.903985\n", | |
"6 2015-03-02 3 400 -0.255214\n", | |
"7 2015-03-03 0 100 1.247975\n", | |
"8 2015-03-04 4 200 -0.313888\n", | |
"9 2015-03-05 4 300 -0.027013" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Problem Statement" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Aggregate the previous data set according to day, week and month" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-05-16T00:25:08.376981Z", | |
"start_time": "2020-05-16T00:25:08.374132Z" | |
} | |
}, | |
"source": [ | |
"# Solution" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Set the date as the index of the data frame" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:40:56.403511Z", | |
"start_time": "2020-07-23T04:40:56.380339Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>col1</th>\n", | |
" <th>col2</th>\n", | |
" <th>col3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>date</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2015-02-24</th>\n", | |
" <td>2015-02-24</td>\n", | |
" <td>4</td>\n", | |
" <td>200</td>\n", | |
" <td>0.966712</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-25</th>\n", | |
" <td>2015-02-25</td>\n", | |
" <td>1</td>\n", | |
" <td>0</td>\n", | |
" <td>-1.239591</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-26</th>\n", | |
" <td>2015-02-26</td>\n", | |
" <td>4</td>\n", | |
" <td>0</td>\n", | |
" <td>-2.138567</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-27</th>\n", | |
" <td>2015-02-27</td>\n", | |
" <td>3</td>\n", | |
" <td>400</td>\n", | |
" <td>1.792204</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-28</th>\n", | |
" <td>2015-02-28</td>\n", | |
" <td>3</td>\n", | |
" <td>0</td>\n", | |
" <td>1.726700</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-01</th>\n", | |
" <td>2015-03-01</td>\n", | |
" <td>2</td>\n", | |
" <td>200</td>\n", | |
" <td>0.903985</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-02</th>\n", | |
" <td>2015-03-02</td>\n", | |
" <td>3</td>\n", | |
" <td>400</td>\n", | |
" <td>-0.255214</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-03</th>\n", | |
" <td>2015-03-03</td>\n", | |
" <td>0</td>\n", | |
" <td>100</td>\n", | |
" <td>1.247975</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-04</th>\n", | |
" <td>2015-03-04</td>\n", | |
" <td>4</td>\n", | |
" <td>200</td>\n", | |
" <td>-0.313888</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-05</th>\n", | |
" <td>2015-03-05</td>\n", | |
" <td>4</td>\n", | |
" <td>300</td>\n", | |
" <td>-0.027013</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date col1 col2 col3\n", | |
"date \n", | |
"2015-02-24 2015-02-24 4 200 0.966712\n", | |
"2015-02-25 2015-02-25 1 0 -1.239591\n", | |
"2015-02-26 2015-02-26 4 0 -2.138567\n", | |
"2015-02-27 2015-02-27 3 400 1.792204\n", | |
"2015-02-28 2015-02-28 3 0 1.726700\n", | |
"2015-03-01 2015-03-01 2 200 0.903985\n", | |
"2015-03-02 2015-03-02 3 400 -0.255214\n", | |
"2015-03-03 2015-03-03 0 100 1.247975\n", | |
"2015-03-04 2015-03-04 4 200 -0.313888\n", | |
"2015-03-05 2015-03-05 4 300 -0.027013" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.set_index(pd.to_datetime(df[\"date\"].dt.date), inplace=True)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Agregate by day" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:41:03.791639Z", | |
"start_time": "2020-07-23T04:41:03.732275Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>col1</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>date</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2015-02-24</th>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-25</th>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-26</th>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-27</th>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-02-28</th>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-01</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-02</th>\n", | |
" <td>3</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-03</th>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-04</th>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-05</th>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" col1\n", | |
"date \n", | |
"2015-02-24 4\n", | |
"2015-02-25 1\n", | |
"2015-02-26 4\n", | |
"2015-02-27 3\n", | |
"2015-02-28 3\n", | |
"2015-03-01 2\n", | |
"2015-03-02 3\n", | |
"2015-03-03 0\n", | |
"2015-03-04 4\n", | |
"2015-03-05 4" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['col1'].resample('D').sum().to_frame()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Agregate by week" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:41:14.326604Z", | |
"start_time": "2020-07-23T04:41:14.297969Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>col1</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>date</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2015-03-01</th>\n", | |
" <td>17</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-08</th>\n", | |
" <td>11</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" col1\n", | |
"date \n", | |
"2015-03-01 17\n", | |
"2015-03-08 11" | |
] | |
}, | |
"execution_count": 15, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['col1'].resample('W').sum().to_frame()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Agregate by month" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:41:17.634954Z", | |
"start_time": "2020-07-23T04:41:17.605153Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>col1</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>date</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2015-02-28</th>\n", | |
" <td>15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2015-03-31</th>\n", | |
" <td>13</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" col1\n", | |
"date \n", | |
"2015-02-28 15\n", | |
"2015-03-31 13" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df['col1'].resample('M').sum().to_frame()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Agregate by week and by `co11`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"ExecuteTime": { | |
"end_time": "2020-07-23T04:44:26.353672Z", | |
"start_time": "2020-07-23T04:44:26.328205Z" | |
} | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>date</th>\n", | |
" <th>col2</th>\n", | |
" <th>col3</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>date</th>\n", | |
" <th>col1</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th rowspan=\"4\" valign=\"top\">2015-03-01</th>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th rowspan=\"3\" valign=\"top\">2015-03-08</th>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" date col2 col3\n", | |
"date col1 \n", | |
"2015-03-01 1 1 1 1\n", | |
" 2 1 1 1\n", | |
" 3 2 2 2\n", | |
" 4 2 2 2\n", | |
"2015-03-08 0 1 1 1\n", | |
" 3 1 1 1\n", | |
" 4 2 2 2" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.groupby([pd.Grouper(freq='W'), 'col1']).count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
}, | |
"toc": { | |
"base_numbering": 1, | |
"nav_menu": {}, | |
"number_sections": true, | |
"sideBar": true, | |
"skip_h1_title": false, | |
"title_cell": "Table of Contents", | |
"title_sidebar": "Contents", | |
"toc_cell": false, | |
"toc_position": {}, | |
"toc_section_display": true, | |
"toc_window_display": false | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment