Last active
October 3, 2020 20:11
-
-
Save vaclavdekanovsky/5bd2fd13fd6928a9d8c2aa31dbaf4651 to your computer and use it in GitHub Desktop.
Up-sampling to average for several groups must be done separately per group
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Putting it all together\n", | |
| "Upsample multiple groups with an average" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import io" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>CAT</th>\n", | |
| " <th>VALUE</th>\n", | |
| " <th>count</th>\n", | |
| " <th>daily_average</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>DATE</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>1900-01-02</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>20</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>10.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-03</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>20</td>\n", | |
| " <td>2.0</td>\n", | |
| " <td>10.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-04</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>15</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-05</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>15</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-06</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>15</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-02</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>40</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>10.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-03</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>40</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>10.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-04</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>40</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>10.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-05</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>40</td>\n", | |
| " <td>4.0</td>\n", | |
| " <td>10.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1900-01-06</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>12</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " CAT VALUE count daily_average\n", | |
| "DATE \n", | |
| "1900-01-02 abc 20 2.0 10.0\n", | |
| "1900-01-03 abc 20 2.0 10.0\n", | |
| "1900-01-04 abc 15 3.0 5.0\n", | |
| "1900-01-05 abc 15 3.0 5.0\n", | |
| "1900-01-06 abc 15 3.0 5.0\n", | |
| "1900-01-02 efg 40 4.0 10.0\n", | |
| "1900-01-03 efg 40 4.0 10.0\n", | |
| "1900-01-04 efg 40 4.0 10.0\n", | |
| "1900-01-05 efg 40 4.0 10.0\n", | |
| "1900-01-06 efg 12 1.0 12.0" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data=\"\"\"\n", | |
| "CAT|DATE|VALUE\n", | |
| "abc|0101|10\n", | |
| "abc|0103|20\n", | |
| "abc|0106|15\n", | |
| "efg|0101|10\n", | |
| "efg|0105|40\n", | |
| "efg|0106|12\"\"\"\n", | |
| "df = pd.read_csv(io.StringIO(data), skiprows=1, sep=\"|\", dtype={\"DATE\":\"str\"})\n", | |
| "df[\"DATE\"] = pd.to_datetime(df[\"DATE\"], format=\"%m%d\")\n", | |
| "\n", | |
| "updated_df = []\n", | |
| "for gr in df[\"CAT\"].unique():\n", | |
| " \n", | |
| " subdf = df[df[\"CAT\"]==gr].set_index(\"DATE\").resample(\"1D\", label=\"right\")\n", | |
| " \n", | |
| " # back fill the data\n", | |
| " res = subdf.bfill()\n", | |
| " \n", | |
| " # Calculate the number of hours in each group\n", | |
| " resampled_groups = subdf.groups\n", | |
| " df_groups = pd.DataFrame(resampled_groups, index=[\"group\"]).T\n", | |
| " df_groups\n", | |
| " \n", | |
| " s = df_groups.groupby(\"group\").size()\n", | |
| " s.name = \"count\"\n", | |
| " s_counts = df_groups.join(s, on=\"group\")[\"count\"]\n", | |
| " \n", | |
| " res = res.join(s_counts)\n", | |
| " res[\"daily_average\"] = res[\"VALUE\"]/res[\"count\"]\n", | |
| " \n", | |
| " # add to list\n", | |
| " updated_df.append(res)\n", | |
| " \n", | |
| "pd.concat(updated_df).dropna()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment