Last active
February 3, 2021 02:28
-
-
Save phsamuel/feae431821d76e93f281b237992c67e1 to your computer and use it in GitHub Desktop.
Stock prediction exercise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "speaking-water", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: yfinance in /home/phsamuel/p3/lib/python3.8/site-packages (0.1.55)\n", | |
"Requirement already satisfied: multitasking>=0.0.7 in /home/phsamuel/p3/lib/python3.8/site-packages (from yfinance) (0.0.9)\n", | |
"Requirement already satisfied: pandas>=0.24 in /home/phsamuel/p3/lib/python3.8/site-packages (from yfinance) (1.2.1)\n", | |
"Requirement already satisfied: requests>=2.20 in /home/phsamuel/p3/lib/python3.8/site-packages (from yfinance) (2.22.0)\n", | |
"Requirement already satisfied: lxml>=4.5.1 in /home/phsamuel/p3/lib/python3.8/site-packages (from yfinance) (4.6.2)\n", | |
"Requirement already satisfied: numpy>=1.15 in /home/phsamuel/p3/lib/python3.8/site-packages (from yfinance) (1.20.0)\n", | |
"Requirement already satisfied: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas>=0.24->yfinance) (2019.3)\n", | |
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas>=0.24->yfinance) (2.7.3)\n", | |
"Requirement already satisfied: pandas in /home/phsamuel/p3/lib/python3.8/site-packages (1.2.1)\n", | |
"Requirement already satisfied: pytz>=2017.3 in /usr/lib/python3/dist-packages (from pandas) (2019.3)\n", | |
"Requirement already satisfied: numpy>=1.16.5 in /home/phsamuel/p3/lib/python3.8/site-packages (from pandas) (1.20.0)\n", | |
"Requirement already satisfied: python-dateutil>=2.7.3 in /usr/lib/python3/dist-packages (from pandas) (2.7.3)\n" | |
] | |
} | |
], | |
"source": [ | |
"# !pip install yfinance # uncomment these to install missing packages if they are not already installed\n", | |
"# !pip install pandas \n", | |
"\n", | |
"import yfinance as yf\n", | |
"import pandas as pd\n", | |
"\n", | |
"def get_price(tick,start='2020-10-01',end=None):\n", | |
" return yf.Ticker(tick).history(start=start,end=end)['Close']\n", | |
"\n", | |
"def get_prices(tickers,start='2020-10-01',end=None):\n", | |
" df=pd.DataFrame()\n", | |
" for s in tickers:\n", | |
" df[s]=get_price(s,start,end)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "quick-cartoon", | |
"metadata": {}, | |
"source": [ | |
"# Prepare training and testing data sets" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "designed-storm", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"feature_stocks=['tsla','fb','twtr','amzn','nflx','gbtc','gdx','intc','dal','c']\n", | |
"predict_stock='msft'\n", | |
"\n", | |
"# training set\n", | |
"start_date_train='2020-10-01'\n", | |
"end_date_train='2020-12-31'\n", | |
"\n", | |
"X_train=get_prices(feature_stocks,start=start_date_train,end=end_date_train)\n", | |
"y_train=get_prices([predict_stock],start=start_date_train,end=end_date_train)\n", | |
"\n", | |
"# testing set\n", | |
"start_date_test='2021-01-01' # end date omitted; it defaults to today\n", | |
"X_test=get_prices(feature_stocks,start=start_date_test)\n", | |
"y_test=get_prices([predict_stock],start=start_date_test)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "former-sodium", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>tsla</th>\n", | |
" <th>fb</th>\n", | |
" <th>twtr</th>\n", | |
" <th>amzn</th>\n", | |
" <th>nflx</th>\n", | |
" <th>gbtc</th>\n", | |
" <th>gdx</th>\n", | |
" <th>intc</th>\n", | |
" <th>dal</th>\n", | |
" <th>c</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Date</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2020-10-01</th>\n", | |
" <td>448.160004</td>\n", | |
" <td>266.630005</td>\n", | |
" <td>46.700001</td>\n", | |
" <td>3221.260010</td>\n", | |
" <td>527.510010</td>\n", | |
" <td>10.870000</td>\n", | |
" <td>39.364471</td>\n", | |
" <td>51.862778</td>\n", | |
" <td>31.100000</td>\n", | |
" <td>42.545544</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-02</th>\n", | |
" <td>415.089996</td>\n", | |
" <td>259.940002</td>\n", | |
" <td>46.119999</td>\n", | |
" <td>3125.000000</td>\n", | |
" <td>503.059998</td>\n", | |
" <td>10.860000</td>\n", | |
" <td>38.767590</td>\n", | |
" <td>50.641655</td>\n", | |
" <td>31.750000</td>\n", | |
" <td>42.761013</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-05</th>\n", | |
" <td>425.679993</td>\n", | |
" <td>264.649994</td>\n", | |
" <td>47.310001</td>\n", | |
" <td>3199.199951</td>\n", | |
" <td>520.650024</td>\n", | |
" <td>11.280000</td>\n", | |
" <td>39.364471</td>\n", | |
" <td>51.316746</td>\n", | |
" <td>32.000000</td>\n", | |
" <td>43.985275</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-06</th>\n", | |
" <td>413.980011</td>\n", | |
" <td>258.660004</td>\n", | |
" <td>45.599998</td>\n", | |
" <td>3099.959961</td>\n", | |
" <td>505.869995</td>\n", | |
" <td>10.845000</td>\n", | |
" <td>37.912056</td>\n", | |
" <td>50.999058</td>\n", | |
" <td>31.059999</td>\n", | |
" <td>43.495571</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-07</th>\n", | |
" <td>425.299988</td>\n", | |
" <td>258.119995</td>\n", | |
" <td>45.869999</td>\n", | |
" <td>3195.689941</td>\n", | |
" <td>534.659973</td>\n", | |
" <td>10.970000</td>\n", | |
" <td>38.150806</td>\n", | |
" <td>52.289669</td>\n", | |
" <td>32.150002</td>\n", | |
" <td>43.916718</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-23</th>\n", | |
" <td>645.979980</td>\n", | |
" <td>268.109985</td>\n", | |
" <td>54.299999</td>\n", | |
" <td>3185.270020</td>\n", | |
" <td>514.479980</td>\n", | |
" <td>28.879999</td>\n", | |
" <td>35.919998</td>\n", | |
" <td>46.570000</td>\n", | |
" <td>40.240002</td>\n", | |
" <td>60.266277</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-24</th>\n", | |
" <td>661.770020</td>\n", | |
" <td>267.399994</td>\n", | |
" <td>53.970001</td>\n", | |
" <td>3172.689941</td>\n", | |
" <td>513.969971</td>\n", | |
" <td>27.350000</td>\n", | |
" <td>36.029999</td>\n", | |
" <td>47.070000</td>\n", | |
" <td>39.730000</td>\n", | |
" <td>60.058056</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-28</th>\n", | |
" <td>663.690002</td>\n", | |
" <td>277.000000</td>\n", | |
" <td>54.430000</td>\n", | |
" <td>3283.959961</td>\n", | |
" <td>519.119995</td>\n", | |
" <td>30.450001</td>\n", | |
" <td>35.689999</td>\n", | |
" <td>47.070000</td>\n", | |
" <td>40.150002</td>\n", | |
" <td>60.613323</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-29</th>\n", | |
" <td>665.989990</td>\n", | |
" <td>276.779999</td>\n", | |
" <td>54.360001</td>\n", | |
" <td>3322.000000</td>\n", | |
" <td>530.869995</td>\n", | |
" <td>30.080000</td>\n", | |
" <td>35.740002</td>\n", | |
" <td>49.389999</td>\n", | |
" <td>40.029999</td>\n", | |
" <td>60.395180</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-30</th>\n", | |
" <td>694.780029</td>\n", | |
" <td>271.869995</td>\n", | |
" <td>54.330002</td>\n", | |
" <td>3285.850098</td>\n", | |
" <td>524.590027</td>\n", | |
" <td>32.900002</td>\n", | |
" <td>36.560001</td>\n", | |
" <td>48.750000</td>\n", | |
" <td>40.560001</td>\n", | |
" <td>60.345604</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>63 rows × 10 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" tsla fb twtr amzn nflx \\\n", | |
"Date \n", | |
"2020-10-01 448.160004 266.630005 46.700001 3221.260010 527.510010 \n", | |
"2020-10-02 415.089996 259.940002 46.119999 3125.000000 503.059998 \n", | |
"2020-10-05 425.679993 264.649994 47.310001 3199.199951 520.650024 \n", | |
"2020-10-06 413.980011 258.660004 45.599998 3099.959961 505.869995 \n", | |
"2020-10-07 425.299988 258.119995 45.869999 3195.689941 534.659973 \n", | |
"... ... ... ... ... ... \n", | |
"2020-12-23 645.979980 268.109985 54.299999 3185.270020 514.479980 \n", | |
"2020-12-24 661.770020 267.399994 53.970001 3172.689941 513.969971 \n", | |
"2020-12-28 663.690002 277.000000 54.430000 3283.959961 519.119995 \n", | |
"2020-12-29 665.989990 276.779999 54.360001 3322.000000 530.869995 \n", | |
"2020-12-30 694.780029 271.869995 54.330002 3285.850098 524.590027 \n", | |
"\n", | |
" gbtc gdx intc dal c \n", | |
"Date \n", | |
"2020-10-01 10.870000 39.364471 51.862778 31.100000 42.545544 \n", | |
"2020-10-02 10.860000 38.767590 50.641655 31.750000 42.761013 \n", | |
"2020-10-05 11.280000 39.364471 51.316746 32.000000 43.985275 \n", | |
"2020-10-06 10.845000 37.912056 50.999058 31.059999 43.495571 \n", | |
"2020-10-07 10.970000 38.150806 52.289669 32.150002 43.916718 \n", | |
"... ... ... ... ... ... \n", | |
"2020-12-23 28.879999 35.919998 46.570000 40.240002 60.266277 \n", | |
"2020-12-24 27.350000 36.029999 47.070000 39.730000 60.058056 \n", | |
"2020-12-28 30.450001 35.689999 47.070000 40.150002 60.613323 \n", | |
"2020-12-29 30.080000 35.740002 49.389999 40.029999 60.395180 \n", | |
"2020-12-30 32.900002 36.560001 48.750000 40.560001 60.345604 \n", | |
"\n", | |
"[63 rows x 10 columns]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X_train" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "round-stadium", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>msft</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Date</th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2020-10-01</th>\n", | |
" <td>211.905228</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-02</th>\n", | |
" <td>205.651596</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-05</th>\n", | |
" <td>209.830658</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-06</th>\n", | |
" <td>205.372330</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-10-07</th>\n", | |
" <td>209.282089</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-23</th>\n", | |
" <td>221.020004</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-24</th>\n", | |
" <td>222.750000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-28</th>\n", | |
" <td>224.960007</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-29</th>\n", | |
" <td>224.149994</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2020-12-30</th>\n", | |
" <td>221.679993</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>63 rows × 1 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" msft\n", | |
"Date \n", | |
"2020-10-01 211.905228\n", | |
"2020-10-02 205.651596\n", | |
"2020-10-05 209.830658\n", | |
"2020-10-06 205.372330\n", | |
"2020-10-07 209.282089\n", | |
"... ...\n", | |
"2020-12-23 221.020004\n", | |
"2020-12-24 222.750000\n", | |
"2020-12-28 224.960007\n", | |
"2020-12-29 224.149994\n", | |
"2020-12-30 221.679993\n", | |
"\n", | |
"[63 rows x 1 columns]" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"y_train" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "written-antibody", | |
"metadata": {}, | |
"source": [ | |
"# Convert training and testing data into NumPy arrays" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "hired-findings", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"X_train=np.array(X_train)\n", | |
"y_train=np.array(y_train)\n", | |
"X_test=np.array(X_test)\n", | |
"y_test=np.array(y_test)\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "helpful-metadata", | |
"metadata": {}, | |
"source": [ | |
"# Use linear regression to predict the MSFT stock price from the other stocks' prices" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "southeast-equivalent", | |
"metadata": {}, | |
"source": [ | |
"## 1. Append a dummy feature to both X_train and X_test" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "connected-nursing", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Your solution here" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "romance-pharmaceutical", | |
"metadata": {}, | |
"source": [ | |
"## 2. Find the best linear regression model based on your training data ($w=(X X')^{-1} X y$)\n", | |
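"### Note that the formula assumes each column of $X$ is one sample, whereas `X_train` stores one sample per row, so you may need to transpose the matrices to make things work\n", | |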
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "emotional-vacation", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Your solution here" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "understood-vacation", | |
"metadata": {}, | |
"source": [ | |
"## 3. Report your training and testing error\n", | |
"### How far is your prediction from the actual price? Compute the mean squared error for both the training and testing sets." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "crude-breeding", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Your solution here" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "stainless-circuit", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "sticky-portland", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment