Skip to content

Instantly share code, notes, and snippets.

@vaclavdekanovsky
Last active October 3, 2020 20:11
Show Gist options
  • Select an option

  • Save vaclavdekanovsky/5bd2fd13fd6928a9d8c2aa31dbaf4651 to your computer and use it in GitHub Desktop.

Select an option

Save vaclavdekanovsky/5bd2fd13fd6928a9d8c2aa31dbaf4651 to your computer and use it in GitHub Desktop.
Up-sampling to average for several groups must be done separately per group
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Putting it all together\n",
"Upsample multiple groups with an average"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CAT</th>\n",
" <th>VALUE</th>\n",
" <th>count</th>\n",
" <th>daily_average</th>\n",
" </tr>\n",
" <tr>\n",
" <th>DATE</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1900-01-02</th>\n",
" <td>abc</td>\n",
" <td>20</td>\n",
" <td>2.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-03</th>\n",
" <td>abc</td>\n",
" <td>20</td>\n",
" <td>2.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-04</th>\n",
" <td>abc</td>\n",
" <td>15</td>\n",
" <td>3.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-05</th>\n",
" <td>abc</td>\n",
" <td>15</td>\n",
" <td>3.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-06</th>\n",
" <td>abc</td>\n",
" <td>15</td>\n",
" <td>3.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-02</th>\n",
" <td>efg</td>\n",
" <td>40</td>\n",
" <td>4.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-03</th>\n",
" <td>efg</td>\n",
" <td>40</td>\n",
" <td>4.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-04</th>\n",
" <td>efg</td>\n",
" <td>40</td>\n",
" <td>4.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-05</th>\n",
" <td>efg</td>\n",
" <td>40</td>\n",
" <td>4.0</td>\n",
" <td>10.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1900-01-06</th>\n",
" <td>efg</td>\n",
" <td>12</td>\n",
" <td>1.0</td>\n",
" <td>12.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CAT VALUE count daily_average\n",
"DATE \n",
"1900-01-02 abc 20 2.0 10.0\n",
"1900-01-03 abc 20 2.0 10.0\n",
"1900-01-04 abc 15 3.0 5.0\n",
"1900-01-05 abc 15 3.0 5.0\n",
"1900-01-06 abc 15 3.0 5.0\n",
"1900-01-02 efg 40 4.0 10.0\n",
"1900-01-03 efg 40 4.0 10.0\n",
"1900-01-04 efg 40 4.0 10.0\n",
"1900-01-05 efg 40 4.0 10.0\n",
"1900-01-06 efg 12 1.0 12.0"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data=\"\"\"\n",
"CAT|DATE|VALUE\n",
"abc|0101|10\n",
"abc|0103|20\n",
"abc|0106|15\n",
"efg|0101|10\n",
"efg|0105|40\n",
"efg|0106|12\"\"\"\n",
"df = pd.read_csv(io.StringIO(data), skiprows=1, sep=\"|\", dtype={\"DATE\":\"str\"})\n",
"df[\"DATE\"] = pd.to_datetime(df[\"DATE\"], format=\"%m%d\")\n",
"\n",
"updated_df = []\n",
"for gr in df[\"CAT\"].unique():\n",
" \n",
" subdf = df[df[\"CAT\"]==gr].set_index(\"DATE\").resample(\"1D\", label=\"right\")\n",
" \n",
" # back fill the data\n",
" res = subdf.bfill()\n",
" \n",
" # Calculate the number of hours in each group\n",
" resampled_groups = subdf.groups\n",
" df_groups = pd.DataFrame(resampled_groups, index=[\"group\"]).T\n",
" df_groups\n",
" \n",
" s = df_groups.groupby(\"group\").size()\n",
" s.name = \"count\"\n",
" s_counts = df_groups.join(s, on=\"group\")[\"count\"]\n",
" \n",
" res = res.join(s_counts)\n",
" res[\"daily_average\"] = res[\"VALUE\"]/res[\"count\"]\n",
" \n",
" # add to list\n",
" updated_df.append(res)\n",
" \n",
"pd.concat(updated_df).dropna()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment