Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save vaclavdekanovsky/7714cb8b36f341947bd716adf0488019 to your computer and use it in GitHub Desktop.

Select an option

Save vaclavdekanovsky/7714cb8b36f341947bd716adf0488019 to your computer and use it in GitHub Desktop.
Applying upsampling with average to the stock market fundamental data
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upsampling Stock Market Example"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data=\"\"\"\n",
"COM|DATE |VALUE\n",
"abc|20200131|100\n",
"abc|20200430|80\n",
"abc|20200731|125\n",
"abc|20201031|110\n",
"efg|20200331|25\n",
"efg|20200930|75\n",
"ijk|20200331|15\n",
"ijk|20200630|45\n",
"ijk|20200930|50\"\"\"\n",
"\n",
"# read the data into dataframe\n",
"df = pd.read_csv(io.StringIO(data), skiprows=1, sep=\"|\", dtype={\"DATE \":\"str\"}).rename(columns={\"DATE \":\"DATE\"})\n",
"df[\"DATE\"] = pd.to_datetime(df[\"DATE\"], format=\"%Y%m%d\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>COM</th>\n",
" <th>VALUE</th>\n",
" <th>count</th>\n",
" <th>monthly_average</th>\n",
" <th>3months</th>\n",
" </tr>\n",
" <tr>\n",
" <th>DATE</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2020-02-29</th>\n",
" <td>abc</td>\n",
" <td>80</td>\n",
" <td>3.0</td>\n",
" <td>26.666667</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-03-31</th>\n",
" <td>abc</td>\n",
" <td>80</td>\n",
" <td>3.0</td>\n",
" <td>26.666667</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-04-30</th>\n",
" <td>abc</td>\n",
" <td>80</td>\n",
" <td>3.0</td>\n",
" <td>26.666667</td>\n",
" <td>80.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-05-31</th>\n",
" <td>abc</td>\n",
" <td>125</td>\n",
" <td>3.0</td>\n",
" <td>41.666667</td>\n",
" <td>95.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-06-30</th>\n",
" <td>abc</td>\n",
" <td>125</td>\n",
" <td>3.0</td>\n",
" <td>41.666667</td>\n",
" <td>110.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-07-31</th>\n",
" <td>abc</td>\n",
" <td>125</td>\n",
" <td>3.0</td>\n",
" <td>41.666667</td>\n",
" <td>125.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-08-31</th>\n",
" <td>abc</td>\n",
" <td>110</td>\n",
" <td>3.0</td>\n",
" <td>36.666667</td>\n",
" <td>120.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-09-30</th>\n",
" <td>abc</td>\n",
" <td>110</td>\n",
" <td>3.0</td>\n",
" <td>36.666667</td>\n",
" <td>115.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-10-31</th>\n",
" <td>abc</td>\n",
" <td>110</td>\n",
" <td>3.0</td>\n",
" <td>36.666667</td>\n",
" <td>110.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-04-30</th>\n",
" <td>efg</td>\n",
" <td>75</td>\n",
" <td>6.0</td>\n",
" <td>12.500000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-05-31</th>\n",
" <td>efg</td>\n",
" <td>75</td>\n",
" <td>6.0</td>\n",
" <td>12.500000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-06-30</th>\n",
" <td>efg</td>\n",
" <td>75</td>\n",
" <td>6.0</td>\n",
" <td>12.500000</td>\n",
" <td>37.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-07-31</th>\n",
" <td>efg</td>\n",
" <td>75</td>\n",
" <td>6.0</td>\n",
" <td>12.500000</td>\n",
" <td>37.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-08-31</th>\n",
" <td>efg</td>\n",
" <td>75</td>\n",
" <td>6.0</td>\n",
" <td>12.500000</td>\n",
" <td>37.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-09-30</th>\n",
" <td>efg</td>\n",
" <td>75</td>\n",
" <td>6.0</td>\n",
" <td>12.500000</td>\n",
" <td>37.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-04-30</th>\n",
" <td>ijk</td>\n",
" <td>45</td>\n",
" <td>3.0</td>\n",
" <td>15.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-05-31</th>\n",
" <td>ijk</td>\n",
" <td>45</td>\n",
" <td>3.0</td>\n",
" <td>15.000000</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-06-30</th>\n",
" <td>ijk</td>\n",
" <td>45</td>\n",
" <td>3.0</td>\n",
" <td>15.000000</td>\n",
" <td>45.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-07-31</th>\n",
" <td>ijk</td>\n",
" <td>50</td>\n",
" <td>3.0</td>\n",
" <td>16.666667</td>\n",
" <td>46.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-08-31</th>\n",
" <td>ijk</td>\n",
" <td>50</td>\n",
" <td>3.0</td>\n",
" <td>16.666667</td>\n",
" <td>48.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-09-30</th>\n",
" <td>ijk</td>\n",
" <td>50</td>\n",
" <td>3.0</td>\n",
" <td>16.666667</td>\n",
" <td>50.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" COM VALUE count monthly_average 3months\n",
"DATE \n",
"2020-02-29 abc 80 3.0 26.666667 NaN\n",
"2020-03-31 abc 80 3.0 26.666667 NaN\n",
"2020-04-30 abc 80 3.0 26.666667 80.000000\n",
"2020-05-31 abc 125 3.0 41.666667 95.000000\n",
"2020-06-30 abc 125 3.0 41.666667 110.000000\n",
"2020-07-31 abc 125 3.0 41.666667 125.000000\n",
"2020-08-31 abc 110 3.0 36.666667 120.000000\n",
"2020-09-30 abc 110 3.0 36.666667 115.000000\n",
"2020-10-31 abc 110 3.0 36.666667 110.000000\n",
"2020-04-30 efg 75 6.0 12.500000 NaN\n",
"2020-05-31 efg 75 6.0 12.500000 NaN\n",
"2020-06-30 efg 75 6.0 12.500000 37.500000\n",
"2020-07-31 efg 75 6.0 12.500000 37.500000\n",
"2020-08-31 efg 75 6.0 12.500000 37.500000\n",
"2020-09-30 efg 75 6.0 12.500000 37.500000\n",
"2020-04-30 ijk 45 3.0 15.000000 NaN\n",
"2020-05-31 ijk 45 3.0 15.000000 NaN\n",
"2020-06-30 ijk 45 3.0 15.000000 45.000000\n",
"2020-07-31 ijk 50 3.0 16.666667 46.666667\n",
"2020-08-31 ijk 50 3.0 16.666667 48.333333\n",
"2020-09-30 ijk 50 3.0 16.666667 50.000000"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"updated_df = []\n",
"for gr in df[\"COM\"].unique():\n",
" \n",
" subdf = df[df[\"COM\"]==gr].set_index(\"DATE\").resample(\"M\", label=\"right\", closed=\"left\")\n",
" \n",
" # back fill the data\n",
" res = subdf.bfill()\n",
" \n",
" # Calculate the number of hours in each group\n",
" resampled_groups = subdf.groups\n",
" df_groups = pd.DataFrame(resampled_groups, index=[\"group\"]).T\n",
" df_groups\n",
" \n",
" s = df_groups.groupby(\"group\").size()\n",
" s.name = \"count\"\n",
" s_counts = df_groups.join(s, on=\"group\")[\"count\"]\n",
" \n",
" res = res.join(s_counts)\n",
" res[\"monthly_average\"] = res[\"VALUE\"]/res[\"count\"]\n",
" \n",
" # add to list\n",
" updated_df.append(res)\n",
" \n",
"av_df = pd.concat(updated_df).dropna().reset_index().set_index(\"DATE\")\n",
"av_df[\"3months\"] = av_df.groupby(\"COM\")[\"monthly_average\"].rolling(3).sum().values\n",
"av_df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>COM</th>\n",
" <th>abc</th>\n",
" <th>efg</th>\n",
" <th>ijk</th>\n",
" </tr>\n",
" <tr>\n",
" <th>DATE</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2020-06-30</th>\n",
" <td>110.0</td>\n",
" <td>37.5</td>\n",
" <td>45.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2020-09-30</th>\n",
" <td>115.0</td>\n",
" <td>37.5</td>\n",
" <td>50.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"COM abc efg ijk\n",
"DATE \n",
"2020-06-30 110.0 37.5 45.0\n",
"2020-09-30 115.0 37.5 50.0"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# for your model you might want only the quartely data\n",
"av_df\\\n",
".loc[[pd.to_datetime(\"2020-06-30\"), pd.to_datetime(\"2020-09-30\")],[\"COM\",\"3months\"]]\\\n",
".pivot(columns=\"COM\" ,values=\"3months\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment