Created
October 3, 2020 19:39
-
-
Save vaclavdekanovsky/7714cb8b36f341947bd716adf0488019 to your computer and use it in GitHub Desktop.
Applying upsampling with average to the stock market fundamental data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Upsampling Stock Market Example" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import io" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "data=\"\"\"\n", | |
| "COM|DATE |VALUE\n", | |
| "abc|20200131|100\n", | |
| "abc|20200430|80\n", | |
| "abc|20200731|125\n", | |
| "abc|20201031|110\n", | |
| "efg|20200331|25\n", | |
| "efg|20200930|75\n", | |
| "ijk|20200331|15\n", | |
| "ijk|20200630|45\n", | |
| "ijk|20200930|50\"\"\"\n", | |
| "\n", | |
| "# read the data into dataframe\n", | |
| "df = pd.read_csv(io.StringIO(data), skiprows=1, sep=\"|\", dtype={\"DATE \":\"str\"}).rename(columns={\"DATE \":\"DATE\"})\n", | |
| "df[\"DATE\"] = pd.to_datetime(df[\"DATE\"], format=\"%Y%m%d\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>COM</th>\n", | |
| " <th>VALUE</th>\n", | |
| " <th>count</th>\n", | |
| " <th>monthly_average</th>\n", | |
| " <th>3months</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>DATE</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>2020-02-29</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>80</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>26.666667</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-03-31</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>80</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>26.666667</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-04-30</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>80</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>26.666667</td>\n", | |
| " <td>80.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-05-31</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>125</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>41.666667</td>\n", | |
| " <td>95.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-06-30</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>125</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>41.666667</td>\n", | |
| " <td>110.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-07-31</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>125</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>41.666667</td>\n", | |
| " <td>125.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-08-31</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>110</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>36.666667</td>\n", | |
| " <td>120.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-09-30</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>110</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>36.666667</td>\n", | |
| " <td>115.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-10-31</th>\n", | |
| " <td>abc</td>\n", | |
| " <td>110</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>36.666667</td>\n", | |
| " <td>110.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-04-30</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>75</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>12.500000</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-05-31</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>75</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>12.500000</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-06-30</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>75</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>12.500000</td>\n", | |
| " <td>37.500000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-07-31</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>75</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>12.500000</td>\n", | |
| " <td>37.500000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-08-31</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>75</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>12.500000</td>\n", | |
| " <td>37.500000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-09-30</th>\n", | |
| " <td>efg</td>\n", | |
| " <td>75</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>12.500000</td>\n", | |
| " <td>37.500000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-04-30</th>\n", | |
| " <td>ijk</td>\n", | |
| " <td>45</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>15.000000</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-05-31</th>\n", | |
| " <td>ijk</td>\n", | |
| " <td>45</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>15.000000</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-06-30</th>\n", | |
| " <td>ijk</td>\n", | |
| " <td>45</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>15.000000</td>\n", | |
| " <td>45.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-07-31</th>\n", | |
| " <td>ijk</td>\n", | |
| " <td>50</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>16.666667</td>\n", | |
| " <td>46.666667</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-08-31</th>\n", | |
| " <td>ijk</td>\n", | |
| " <td>50</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>16.666667</td>\n", | |
| " <td>48.333333</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-09-30</th>\n", | |
| " <td>ijk</td>\n", | |
| " <td>50</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>16.666667</td>\n", | |
| " <td>50.000000</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " COM VALUE count monthly_average 3months\n", | |
| "DATE \n", | |
| "2020-02-29 abc 80 3.0 26.666667 NaN\n", | |
| "2020-03-31 abc 80 3.0 26.666667 NaN\n", | |
| "2020-04-30 abc 80 3.0 26.666667 80.000000\n", | |
| "2020-05-31 abc 125 3.0 41.666667 95.000000\n", | |
| "2020-06-30 abc 125 3.0 41.666667 110.000000\n", | |
| "2020-07-31 abc 125 3.0 41.666667 125.000000\n", | |
| "2020-08-31 abc 110 3.0 36.666667 120.000000\n", | |
| "2020-09-30 abc 110 3.0 36.666667 115.000000\n", | |
| "2020-10-31 abc 110 3.0 36.666667 110.000000\n", | |
| "2020-04-30 efg 75 6.0 12.500000 NaN\n", | |
| "2020-05-31 efg 75 6.0 12.500000 NaN\n", | |
| "2020-06-30 efg 75 6.0 12.500000 37.500000\n", | |
| "2020-07-31 efg 75 6.0 12.500000 37.500000\n", | |
| "2020-08-31 efg 75 6.0 12.500000 37.500000\n", | |
| "2020-09-30 efg 75 6.0 12.500000 37.500000\n", | |
| "2020-04-30 ijk 45 3.0 15.000000 NaN\n", | |
| "2020-05-31 ijk 45 3.0 15.000000 NaN\n", | |
| "2020-06-30 ijk 45 3.0 15.000000 45.000000\n", | |
| "2020-07-31 ijk 50 3.0 16.666667 46.666667\n", | |
| "2020-08-31 ijk 50 3.0 16.666667 48.333333\n", | |
| "2020-09-30 ijk 50 3.0 16.666667 50.000000" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "updated_df = []\n", | |
| "for gr in df[\"COM\"].unique():\n", | |
| " \n", | |
| " subdf = df[df[\"COM\"]==gr].set_index(\"DATE\").resample(\"M\", label=\"right\", closed=\"left\")\n", | |
| " \n", | |
| " # back fill the data\n", | |
| " res = subdf.bfill()\n", | |
| " \n", | |
| " # Calculate the number of hours in each group\n", | |
| " resampled_groups = subdf.groups\n", | |
| " df_groups = pd.DataFrame(resampled_groups, index=[\"group\"]).T\n", | |
| " df_groups\n", | |
| " \n", | |
| " s = df_groups.groupby(\"group\").size()\n", | |
| " s.name = \"count\"\n", | |
| " s_counts = df_groups.join(s, on=\"group\")[\"count\"]\n", | |
| " \n", | |
| " res = res.join(s_counts)\n", | |
| " res[\"monthly_average\"] = res[\"VALUE\"]/res[\"count\"]\n", | |
| " \n", | |
| " # add to list\n", | |
| " updated_df.append(res)\n", | |
| " \n", | |
| "av_df = pd.concat(updated_df).dropna().reset_index().set_index(\"DATE\")\n", | |
| "av_df[\"3months\"] = av_df.groupby(\"COM\")[\"monthly_average\"].rolling(3).sum().values\n", | |
| "av_df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th>COM</th>\n", | |
| " <th>abc</th>\n", | |
| " <th>efg</th>\n", | |
| " <th>ijk</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>DATE</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>2020-06-30</th>\n", | |
| " <td>110.0</td>\n", | |
| " <td>37.5</td>\n", | |
| " <td>45.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2020-09-30</th>\n", | |
| " <td>115.0</td>\n", | |
| " <td>37.5</td>\n", | |
| " <td>50.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| "COM abc efg ijk\n", | |
| "DATE \n", | |
| "2020-06-30 110.0 37.5 45.0\n", | |
| "2020-09-30 115.0 37.5 50.0" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# for your model you might want only the quartely data\n", | |
| "av_df\\\n", | |
| ".loc[[pd.to_datetime(\"2020-06-30\"), pd.to_datetime(\"2020-09-30\")],[\"COM\",\"3months\"]]\\\n", | |
| ".pivot(columns=\"COM\" ,values=\"3months\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment