Last active
February 13, 2018 03:28
-
-
Save wassname/e0d8fad125dcd7702091390e9d5f45f0 to your computer and use it in GitHub Desktop.
starter colab jupyter notebook for the hydrosaver competition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "hydrosaver.ipynb", | |
"version": "0.3.2", | |
"views": {}, | |
"default_view": {}, | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"metadata": { | |
"id": "ouWjqOFAxk3G", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 4 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 600 | |
}, | |
"outputId": "7507890c-e73e-45bd-a287-dfa012032d9b", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517110247838, | |
"user_tz": -480, | |
"elapsed": 6020, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl \n", | |
"!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl \n", | |
"!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: torch==0.3.0.post4 from http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from torch==0.3.0.post4)\n", | |
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch==0.3.0.post4)\n", | |
"Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: tpot in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: pandas-profiling in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: seaborn in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages\n", | |
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost)\n", | |
"Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost)\n", | |
"Requirement already satisfied: deap>=1.0 in /usr/local/lib/python3.6/dist-packages (from tpot)\n", | |
"Requirement already satisfied: update-checker>=0.16 in /usr/local/lib/python3.6/dist-packages (from tpot)\n", | |
"Requirement already satisfied: scikit-learn>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from tpot)\n", | |
"Requirement already satisfied: stopit>=1.1.1 in /usr/local/lib/python3.6/dist-packages (from tpot)\n", | |
"Requirement already satisfied: pandas>=0.20.2 in /usr/local/lib/python3.6/dist-packages (from tpot)\n", | |
"Requirement already satisfied: matplotlib>=1.4 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n", | |
"Requirement already satisfied: jinja2>=2.8 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n", | |
"Requirement already satisfied: six>=1.9 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n", | |
"Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchvision)\n", | |
"Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision)\n", | |
"Requirement already satisfied: requests>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from update-checker>=0.16->tpot)\n", | |
"Requirement already satisfied: python-dateutil>=2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.20.2->tpot)\n", | |
"Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas>=0.20.2->tpot)\n", | |
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling)\n", | |
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling)\n", | |
"Requirement already satisfied: MarkupSafe in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.8->pandas-profiling)\n", | |
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from torch->torchvision)\n", | |
"Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n", | |
"Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n", | |
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "iCu7iIqOOBwg", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "cOUoN4Iytsl6", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "e5662991-d80f-42c4-998c-ee969542f125", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517111898602, | |
"user_tz": -480, | |
"elapsed": 984, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"%pylab inline\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import seaborn as sn\n", | |
"import os\n", | |
"from tqdm import tqdm" | |
], | |
"execution_count": 2, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "MYHMWSswtpMW", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"eps = 1e-6\n", | |
"seed = 42\n", | |
"np.random.seed(seed)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "P5QboV3_vVbt", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "DG73KWmRvvqD", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"# Download data" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "i_S6eqlBu0Ti", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# from https://stackoverflow.com/a/39225039/221742\n", | |
"import requests\n", | |
"\n", | |
"def download_file_from_google_drive(id, destination):\n", | |
" def get_confirm_token(response):\n", | |
" for key, value in response.cookies.items():\n", | |
" if key.startswith('download_warning'):\n", | |
" return value\n", | |
"\n", | |
" return None\n", | |
"\n", | |
" def save_response_content(response, destination):\n", | |
" CHUNK_SIZE = 32768\n", | |
"\n", | |
" with open(destination, \"wb\") as f:\n", | |
" for chunk in response.iter_content(CHUNK_SIZE):\n", | |
" if chunk: # filter out keep-alive new chunks\n", | |
" f.write(chunk)\n", | |
"\n", | |
" URL = \"https://docs.google.com/uc?export=download\"\n", | |
"\n", | |
" session = requests.Session()\n", | |
"\n", | |
" response = session.get(URL, params = { 'id' : id }, stream = True)\n", | |
" token = get_confirm_token(response)\n", | |
"\n", | |
" if token:\n", | |
" params = { 'id' : id, 'confirm' : token }\n", | |
" response = session.get(URL, params = params, stream = True)\n", | |
"\n", | |
" save_response_content(response, destination) \n", | |
" \n", | |
"if not os.path.isdir('data/original'):\n", | |
" os.makedirs('data/original')\n", | |
"download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')\n", | |
"download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')\n" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Wgr_5JJDv9A0", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"# Load data" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "MjzCmx7ZtzRN", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# So we have some unique NaN values: 'No Data', 'Bad Input', etc. We also have date index col\n", | |
"df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])\n", | |
"df_train_val = df_train_val.dropna(axis=1, how='all') # drop the columns that are all NaN's\n", | |
"df_train_val = df_train_val.resample('1T').first()\n", | |
"df_train_val = df_train_val.drop('DIC88023.PV', 1)\n", | |
"\n", | |
"df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])\n", | |
"df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's\n", | |
"\n", | |
"y_train_val = df_train_val.target\n", | |
"x_train_val = df_train_val.drop('target', 1) # We don't want the answer to be in the input data\n", | |
"\n", | |
"x_test = df_test\n", | |
"\n", | |
"# normalize the input columns\n", | |
"x_mean = x_train_val.mean()\n", | |
"x_std = x_train_val.mean()\n", | |
"\n", | |
"x_train_val = (x_train_val - x_mean)/(x_std + eps)\n", | |
"x_test = (x_test - x_mean)/(x_std + eps)\n", | |
"\n", | |
"# TODO I may want to normalize y too\n", | |
"\n", | |
"print('mean', x_mean)\n", | |
"print('std', x_std)\n", | |
"\n", | |
"# TPOT wont accept NaNs, so we either replace or drop\n", | |
"# Another approach would be to use unique numbers or extra columns for this\n", | |
"# Since we've normalized it, 0 is the nothing value. So let's use that\n", | |
"\n", | |
"\n", | |
"x_train_val = x_train_val.replace(np.nan, 0)\n", | |
"y_train_val = y_train_val.replace(np.nan, 0)\n", | |
"x_test = x_test.replace(np.nan, 0)\n", | |
"\n", | |
"# since it's a timeseries the validation will be in the future\n", | |
"val_split_in = int(len(df_train_val.index)*0.85)\n", | |
"x_val = x_train_val[val_split_in:]\n", | |
"x_train = x_train_val[:val_split_in]\n", | |
"y_val = y_train_val[val_split_in:]\n", | |
"y_train = y_train_val[:val_split_in]\n", | |
"\n", | |
"# convert to numpy\n", | |
"X_train = x_train.as_matrix()\n", | |
"y_train = y_train.as_matrix()\n", | |
"X_val = x_val.as_matrix()\n", | |
"y_val = y_val.as_matrix()\n", | |
"X_test = x_test.as_matrix()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "DOW31bu1LmCZ", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "4B2eXef2LmTq", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"# Have look into the data" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "eK3sF_pewzCe", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"df_train_val.info()" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "ixLBMdpYtzlr", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 333 | |
}, | |
"outputId": "677c9e2a-7995-448d-d8bb-97260d6bcf29", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517110268232, | |
"user_tz": -480, | |
"elapsed": 1591, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"df_train_val.describe()" | |
], | |
"execution_count": 10, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>WQI8100XCL1.CPV</th>\n", | |
" <th>XI84201.PV</th>\n", | |
" <th>XI84202.PV</th>\n", | |
" <th>XI84123.PV</th>\n", | |
" <th>XI84124.PV</th>\n", | |
" <th>XI84125.PV</th>\n", | |
" <th>FX87211.CPV1</th>\n", | |
" <th>FIC87211.PV</th>\n", | |
" <th>FIC87211.SV</th>\n", | |
" <th>FX87211.P01</th>\n", | |
" <th>...</th>\n", | |
" <th>NIC88002.PV</th>\n", | |
" <th>PIC88007.PV</th>\n", | |
" <th>LIC88006.PV</th>\n", | |
" <th>AIC88055.PV</th>\n", | |
" <th>FIC88022.PV</th>\n", | |
" <th>DIC88023.PV</th>\n", | |
" <th>SI88033.PV</th>\n", | |
" <th>SI88034.PV</th>\n", | |
" <th>MQI88024.CPV</th>\n", | |
" <th>target</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>count</th>\n", | |
" <td>567467.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568872.000000</td>\n", | |
" <td>568868.000000</td>\n", | |
" <td>568864.000000</td>\n", | |
" <td>568748.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568748.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>568748.000000</td>\n", | |
" <td>568748.000000</td>\n", | |
" <td>568748.000000</td>\n", | |
" <td>568747.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" <td>568748.000000</td>\n", | |
" <td>568873.000000</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mean</th>\n", | |
" <td>702.185281</td>\n", | |
" <td>105.143377</td>\n", | |
" <td>78.833709</td>\n", | |
" <td>0.046374</td>\n", | |
" <td>4.729106</td>\n", | |
" <td>23.870133</td>\n", | |
" <td>701.753521</td>\n", | |
" <td>4099.623820</td>\n", | |
" <td>4369.277083</td>\n", | |
" <td>21.063556</td>\n", | |
" <td>...</td>\n", | |
" <td>24.874992</td>\n", | |
" <td>8.484779</td>\n", | |
" <td>59.188180</td>\n", | |
" <td>35.818553</td>\n", | |
" <td>828.305065</td>\n", | |
" <td>52.464238</td>\n", | |
" <td>39.967696</td>\n", | |
" <td>23.986799</td>\n", | |
" <td>688.014555</td>\n", | |
" <td>52.463664</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>std</th>\n", | |
" <td>223.248174</td>\n", | |
" <td>39.179601</td>\n", | |
" <td>15.586560</td>\n", | |
" <td>0.020848</td>\n", | |
" <td>2.002726</td>\n", | |
" <td>10.889995</td>\n", | |
" <td>222.411906</td>\n", | |
" <td>1270.383514</td>\n", | |
" <td>623.180916</td>\n", | |
" <td>2.717285</td>\n", | |
" <td>...</td>\n", | |
" <td>7.449179</td>\n", | |
" <td>5.059790</td>\n", | |
" <td>31.757723</td>\n", | |
" <td>32.317187</td>\n", | |
" <td>241.930273</td>\n", | |
" <td>12.142946</td>\n", | |
" <td>34.175055</td>\n", | |
" <td>33.253739</td>\n", | |
" <td>212.166525</td>\n", | |
" <td>12.142832</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>min</th>\n", | |
" <td>-431.185300</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>-319.424500</td>\n", | |
" <td>-141.249900</td>\n", | |
" <td>0.000000</td>\n", | |
" <td>15.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>-0.305536</td>\n", | |
" <td>-0.660624</td>\n", | |
" <td>0.100289</td>\n", | |
" <td>-0.185952</td>\n", | |
" <td>-8.621460</td>\n", | |
" <td>-1.073746</td>\n", | |
" <td>-0.091523</td>\n", | |
" <td>-0.042978</td>\n", | |
" <td>-19.506880</td>\n", | |
" <td>-1.073746</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25%</th>\n", | |
" <td>704.654800</td>\n", | |
" <td>85.875800</td>\n", | |
" <td>76.717600</td>\n", | |
" <td>0.043750</td>\n", | |
" <td>4.615610</td>\n", | |
" <td>23.181300</td>\n", | |
" <td>705.026125</td>\n", | |
" <td>3963.361000</td>\n", | |
" <td>4000.860000</td>\n", | |
" <td>19.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>20.495800</td>\n", | |
" <td>4.414151</td>\n", | |
" <td>34.129130</td>\n", | |
" <td>-0.042471</td>\n", | |
" <td>823.644000</td>\n", | |
" <td>53.834340</td>\n", | |
" <td>0.061580</td>\n", | |
" <td>0.043636</td>\n", | |
" <td>684.447975</td>\n", | |
" <td>53.834050</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>50%</th>\n", | |
" <td>771.509900</td>\n", | |
" <td>104.959000</td>\n", | |
" <td>80.717500</td>\n", | |
" <td>0.053122</td>\n", | |
" <td>5.664090</td>\n", | |
" <td>27.982300</td>\n", | |
" <td>771.734550</td>\n", | |
" <td>4365.759000</td>\n", | |
" <td>4366.150000</td>\n", | |
" <td>21.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>25.944700</td>\n", | |
" <td>9.353214</td>\n", | |
" <td>45.527020</td>\n", | |
" <td>64.933850</td>\n", | |
" <td>893.697500</td>\n", | |
" <td>54.826150</td>\n", | |
" <td>61.135900</td>\n", | |
" <td>0.058294</td>\n", | |
" <td>748.692300</td>\n", | |
" <td>54.824650</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>75%</th>\n", | |
" <td>813.349800</td>\n", | |
" <td>117.191000</td>\n", | |
" <td>86.680900</td>\n", | |
" <td>0.059375</td>\n", | |
" <td>5.875010</td>\n", | |
" <td>30.394600</td>\n", | |
" <td>813.457500</td>\n", | |
" <td>4711.450000</td>\n", | |
" <td>4711.220000</td>\n", | |
" <td>23.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>30.657177</td>\n", | |
" <td>12.259470</td>\n", | |
" <td>99.866600</td>\n", | |
" <td>64.951930</td>\n", | |
" <td>945.023000</td>\n", | |
" <td>55.838750</td>\n", | |
" <td>69.629300</td>\n", | |
" <td>66.137960</td>\n", | |
" <td>795.153125</td>\n", | |
" <td>55.838450</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>max</th>\n", | |
" <td>1172.474000</td>\n", | |
" <td>726.869000</td>\n", | |
" <td>100.000000</td>\n", | |
" <td>0.200000</td>\n", | |
" <td>15.812500</td>\n", | |
" <td>51.659300</td>\n", | |
" <td>1146.185000</td>\n", | |
" <td>15645.750000</td>\n", | |
" <td>14000.000000</td>\n", | |
" <td>35.000000</td>\n", | |
" <td>...</td>\n", | |
" <td>58.597800</td>\n", | |
" <td>36.658300</td>\n", | |
" <td>100.127200</td>\n", | |
" <td>64.983190</td>\n", | |
" <td>1303.840000</td>\n", | |
" <td>77.728490</td>\n", | |
" <td>99.626400</td>\n", | |
" <td>98.255230</td>\n", | |
" <td>1399.242000</td>\n", | |
" <td>77.728490</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>8 rows × 23 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" WQI8100XCL1.CPV XI84201.PV XI84202.PV XI84123.PV \\\n", | |
"count 567467.000000 568873.000000 568873.000000 568872.000000 \n", | |
"mean 702.185281 105.143377 78.833709 0.046374 \n", | |
"std 223.248174 39.179601 15.586560 0.020848 \n", | |
"min -431.185300 0.000000 0.000000 0.000000 \n", | |
"25% 704.654800 85.875800 76.717600 0.043750 \n", | |
"50% 771.509900 104.959000 80.717500 0.053122 \n", | |
"75% 813.349800 117.191000 86.680900 0.059375 \n", | |
"max 1172.474000 726.869000 100.000000 0.200000 \n", | |
"\n", | |
" XI84124.PV XI84125.PV FX87211.CPV1 FIC87211.PV \\\n", | |
"count 568868.000000 568864.000000 568748.000000 568873.000000 \n", | |
"mean 4.729106 23.870133 701.753521 4099.623820 \n", | |
"std 2.002726 10.889995 222.411906 1270.383514 \n", | |
"min 0.000000 0.000000 -319.424500 -141.249900 \n", | |
"25% 4.615610 23.181300 705.026125 3963.361000 \n", | |
"50% 5.664090 27.982300 771.734550 4365.759000 \n", | |
"75% 5.875010 30.394600 813.457500 4711.450000 \n", | |
"max 15.812500 51.659300 1146.185000 15645.750000 \n", | |
"\n", | |
" FIC87211.SV FX87211.P01 ... NIC88002.PV \\\n", | |
"count 568873.000000 568748.000000 ... 568748.000000 \n", | |
"mean 4369.277083 21.063556 ... 24.874992 \n", | |
"std 623.180916 2.717285 ... 7.449179 \n", | |
"min 0.000000 15.000000 ... -0.305536 \n", | |
"25% 4000.860000 19.000000 ... 20.495800 \n", | |
"50% 4366.150000 21.000000 ... 25.944700 \n", | |
"75% 4711.220000 23.000000 ... 30.657177 \n", | |
"max 14000.000000 35.000000 ... 58.597800 \n", | |
"\n", | |
" PIC88007.PV LIC88006.PV AIC88055.PV FIC88022.PV \\\n", | |
"count 568748.000000 568748.000000 568747.000000 568873.000000 \n", | |
"mean 8.484779 59.188180 35.818553 828.305065 \n", | |
"std 5.059790 31.757723 32.317187 241.930273 \n", | |
"min -0.660624 0.100289 -0.185952 -8.621460 \n", | |
"25% 4.414151 34.129130 -0.042471 823.644000 \n", | |
"50% 9.353214 45.527020 64.933850 893.697500 \n", | |
"75% 12.259470 99.866600 64.951930 945.023000 \n", | |
"max 36.658300 100.127200 64.983190 1303.840000 \n", | |
"\n", | |
" DIC88023.PV SI88033.PV SI88034.PV MQI88024.CPV \\\n", | |
"count 568873.000000 568873.000000 568873.000000 568748.000000 \n", | |
"mean 52.464238 39.967696 23.986799 688.014555 \n", | |
"std 12.142946 34.175055 33.253739 212.166525 \n", | |
"min -1.073746 -0.091523 -0.042978 -19.506880 \n", | |
"25% 53.834340 0.061580 0.043636 684.447975 \n", | |
"50% 54.826150 61.135900 0.058294 748.692300 \n", | |
"75% 55.838750 69.629300 66.137960 795.153125 \n", | |
"max 77.728490 99.626400 98.255230 1399.242000 \n", | |
"\n", | |
" target \n", | |
"count 568873.000000 \n", | |
"mean 52.463664 \n", | |
"std 12.142832 \n", | |
"min -1.073746 \n", | |
"25% 53.834050 \n", | |
"50% 54.824650 \n", | |
"75% 55.838450 \n", | |
"max 77.728490 \n", | |
"\n", | |
"[8 rows x 23 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 10 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "LSicdcLVLp7i", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# You can use pandas profiling to get an overview of the data\n", | |
"import pandas_profiling\n", | |
"profile = pandas_profiling.ProfileReport(df_train_val[:2000])\n", | |
"profile.to_file(outputfile=\"/tmp/myoutputfile.html\")\n", | |
"profile" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "X9gXoF9CxTst", | |
"colab_type": "text" | |
}, | |
"cell_type": "markdown", | |
"source": [ | |
"# TPOT!\n", | |
"\n", | |
"TPOT is an automatic machine learning library that uses genetic algorithms to try different generations of scikit-learn algorihtms.\n", | |
"\n", | |
"link: https://epistasislab.github.io/tpot/" | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "_26g069T5KCG", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "eb67d081-78f6-42e4-c8e3-35d923022724", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517110277832, | |
"user_tz": -480, | |
"elapsed": 779, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Check data for TPOT compatability\n", | |
"from tpot.base import check_X_y\n", | |
"check_X_y(X_train, y_train, accept_sparse=True)\n", | |
"check_X_y(X_val, y_val, accept_sparse=True)\n", | |
"'ok'" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"'ok'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "pwEFjEGKYHEV", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# Ensure the it respects causality, by only giving each sample access to a window of past data\n", | |
"# make padded sequences, we need to make the data in shape (batch, window_of_timesteps, features)\n", | |
"\n", | |
"def timeseries_to_seq(x, window=3):\n", | |
" \"\"\"\n", | |
" Inputs:\n", | |
" - x: shape (timeseries, features)\n", | |
" - window: e.g. 3\n", | |
" Outputs:\n", | |
" - y: shape shape (window, batch, features)\n", | |
" \"\"\"\n", | |
" x_pad = np.pad(x, [[window,0],[0,0]], mode='constant')\n", | |
" y = np.stack([x_pad[i:i+window] for i in range(len(x))], axis=1)\n", | |
" return y" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "mijUpEIFYOza", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "ef055268-8e12-4b3c-a06c-cc6d19a7949e", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517110279696, | |
"user_tz": -480, | |
"elapsed": 644, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# For now I will just run on a subset of the data, for speed!\n", | |
"subset = 200\n", | |
"window=60*3\n", | |
"x=X_train[:subset]\n", | |
"y_stacked=y_train[:subset]\n", | |
"print(x.shape)\n", | |
"X_train_stacked = timeseries_to_seq(x, window=window).reshape((x.shape[0], -1))" | |
], | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"(200, 22)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "s1h_9IETxU0d", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"from tpot import TPOTRegressor\n", | |
"# A quick run of TPOT with small population and short number of generation\n", | |
"# About 25 minutes to run\n", | |
"tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)\n", | |
"tpot.fit(X_train_stacked, y_stacked)\n", | |
"tpot.export('tpot_hydrosaver_export.py')" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "lpw8PhAS59EC", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "18c73a74-802d-48ae-ce6c-ae46ce8a694d", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517111729035, | |
"user_tz": -480, | |
"elapsed": 810, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"tpot.export('tpot_hydrosaver_export.py')" | |
], | |
"execution_count": 16, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 16 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "UFeZKx9j27Fx", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 299 | |
}, | |
"outputId": "73c3978c-e876-45cd-9317-1e42b2ff3c56", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517111731056, | |
"user_tz": -480, | |
"elapsed": 1665, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# What's the pipeline it saved?\n", | |
"# In this case it found that LassoLarsCV(normalize=False) performed best\n", | |
"!cat tpot_hydrosaver_export.py" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"import numpy as np\r\n", | |
"import pandas as pd\r\n", | |
"from sklearn.linear_model import LassoLarsCV\r\n", | |
"from sklearn.model_selection import train_test_split\r\n", | |
"\r\n", | |
"# NOTE: Make sure that the class is labeled 'target' in the data file\r\n", | |
"tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\r\n", | |
"features = tpot_data.drop('target', axis=1).values\r\n", | |
"training_features, testing_features, training_target, testing_target = \\\r\n", | |
" train_test_split(features, tpot_data['target'].values, random_state=42)\r\n", | |
"\r\n", | |
"# Score on the training set was:-0.00011788279235816052\r\n", | |
"exported_pipeline = LassoLarsCV(normalize=False)\r\n", | |
"\r\n", | |
"exported_pipeline.fit(training_features, training_target)\r\n", | |
"results = exported_pipeline.predict(testing_features)\r\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "u8hGttzUwU3a", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# final score\n", | |
"def rmse(y_pred, y_true):\n", | |
" sqloss = (y_true-y_pred)**2\n", | |
" return np.sqrt(sqloss.mean())" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Ypq2uShmIb3B", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
}, | |
"output_extras": [ | |
{ | |
"item_id": 1 | |
} | |
], | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "6ba65187-aaa7-4e2d-acb2-63fb8f7e4b88", | |
"executionInfo": { | |
"status": "ok", | |
"timestamp": 1517111877742, | |
"user_tz": -480, | |
"elapsed": 2417, | |
"user": { | |
"displayName": "Mike C", | |
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg", | |
"userId": "110113503404408134511" | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))\n", | |
"y_pred = tpot.predict(X_val_stacked)\n", | |
"score = rmse(y_pred, y_val)\n", | |
"score" | |
], | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"3.8885540109338006" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 27 | |
} | |
] | |
}, | |
{ | |
"metadata": { | |
"id": "RfqDTdZCIb0X", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))\n", | |
"y_pred = tpot.predict(X_test_stacked)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "qhCNUxNExifF", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# save\n", | |
"s = pd.Series(y_submit, name='target')\n", | |
"assert len(s)==439140\n", | |
"\n", | |
"import datetime\n", | |
"ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')\n", | |
"\n", | |
"submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)\n", | |
"s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')\n", | |
"print('upload file', submission_file)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Fh1ceEwmiv3h", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"# and download\n", | |
"import google\n", | |
"google.colab.files.download(submission_file)" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "mvKjdQsK5mOz", | |
"colab_type": "code", | |
"colab": { | |
"autoexec": { | |
"startup": false, | |
"wait_interval": 0 | |
} | |
} | |
}, | |
"cell_type": "code", | |
"source": [ | |
"" | |
], | |
"execution_count": 0, | |
"outputs": [] | |
} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
"""hydrosaver.ipynb | |
Automatically generated by Colaboratory. | |
Original file is located at | |
https://colab.research.google.com/notebook#fileId=1gs18AtviN2Y3jSsVF2rgprAtCA8Jnt_8 | |
""" | |
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl | |
#!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl | |
#!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm | |
# %pylab inline | |
import numpy as np | |
import pandas as pd | |
import seaborn as sn | |
import os | |
from tqdm import tqdm | |
# Reproducibility: fix the seed and apply it to NumPy's global RNG once, up front.
seed = 42
np.random.seed(seed)

# Small offset added to the denominator when normalising, to avoid dividing by zero.
eps = 1e-6
"""# Download data""" | |
# from https://stackoverflow.com/a/39225039/221742 | |
import requests | |
def download_file_from_google_drive(id, destination):
    """Download a Google Drive file by id and write it to ``destination``.

    A first request is made with only the file id. If that response carries a
    cookie whose name starts with ``download_warning``, the cookie value is
    sent back as the ``confirm`` parameter in a second request, and that second
    response is the one saved.

    Args:
        id: Google Drive file id. (The name shadows the builtin ``id``; it is
            kept so existing positional/keyword callers keep working.)
        destination: Local path the response body is written to (binary mode).

    Raises:
        requests.HTTPError: if the final response has an error status. Raising
            here prevents an HTTP error page from being saved under the data
            file's name and failing later in a confusing way.
    """
    URL = "https://docs.google.com/uc?export=download"
    CHUNK_SIZE = 32768

    def get_confirm_token(response):
        """Return the value of the ``download_warning*`` cookie, or None."""
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        """Stream the response body to disk in CHUNK_SIZE pieces."""
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    # The context manager releases the session's pooled connections on exit,
    # including when an exception is raised.
    with requests.Session() as session:
        response = session.get(URL, params={'id': id}, stream=True)
        token = get_confirm_token(response)
        if token:
            # Release the unread first (streamed) response before re-requesting.
            response.close()
            response = session.get(URL, params={'id': id, 'confirm': token}, stream=True)
        response.raise_for_status()
        save_response_content(response, destination)
# Create the target directory; exist_ok makes this a no-op when it is already
# there and avoids the check-then-create race of a separate isdir() test.
os.makedirs('data/original', exist_ok=True)
# Fetch the training set and the published test set into that directory.
download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')
download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')
"""# Load data""" | |
# So we have some unique NaN values: 'No Data', 'Bad Input', etc. We also have a date index col.
# Both CSVs are read the same way: first column parsed as dates, 'timestamp' as the index,
# and every sentinel string below mapped to NaN.
NA_VALUES = ['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout']

df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=NA_VALUES)
df_train_val = df_train_val.dropna(axis=1, how='all')  # drop the columns that are all NaN's
# Put the training data on a regular 1-minute grid, keeping the first reading in each minute.
# ('1min' is the same frequency as the older '1T' alias.)
df_train_val = df_train_val.resample('1min').first()
# axis is passed by keyword: the positional form drop(label, 1) is not accepted by newer pandas.
df_train_val = df_train_val.drop('DIC88023.PV', axis=1)

df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=NA_VALUES)
df_test = df_test.dropna(axis=1, how='all')  # drop the columns that are all NaN's

y_train_val = df_train_val.target
x_train_val = df_train_val.drop('target', axis=1)  # We don't want the answer to be in the input data
# Give the test set exactly the training feature columns, in the same order.
# Without this, `x_test - x_mean` aligns on the union of column labels, so any
# column present in only one of the two frames would change the number of test
# features relative to what the model is trained on. Columns missing from the
# test file become NaN here and are filled with 0 below.
x_test = df_test.reindex(columns=x_train_val.columns)

# normalize the input columns using statistics from the training data only
x_mean = x_train_val.mean()
# Scale by the per-column standard deviation. (Using the mean here would divide
# each column by its own mean instead of producing unit-variance features.)
x_std = x_train_val.std()
x_train_val = (x_train_val - x_mean)/(x_std + eps)
x_test = (x_test - x_mean)/(x_std + eps)
# TODO I may want to normalize y too
print('mean', x_mean)
print('std', x_std)

# TPOT won't accept NaNs, so we either replace or drop
# Another approach would be to use unique numbers or extra columns for this
# Since we've normalized it, 0 is the nothing value. So let's use that
x_train_val = x_train_val.fillna(0)
# NOTE(review): missing targets are also set to 0, which is not a neutral value
# for an un-normalized target — confirm this is intended.
y_train_val = y_train_val.fillna(0)
x_test = x_test.fillna(0)

# since it's a timeseries the validation will be in the future:
# the first 85% of rows are used for training, the last 15% for validation
val_split_in = int(len(df_train_val.index)*0.85)
x_val = x_train_val[val_split_in:]
x_train = x_train_val[:val_split_in]
y_val = y_train_val[val_split_in:]
y_train = y_train_val[:val_split_in]

# convert to numpy (.values is available in both old and new pandas; as_matrix() was removed)
X_train = x_train.values
y_train = y_train.values
X_val = x_val.values
y_val = y_val.values
X_test = x_test.values
"""# Have look into the data""" | |
# Column dtypes and non-null counts, followed by per-column summary statistics.
df_train_val.info()
df_train_val.describe()

# You can use pandas profiling to get an overview of the data.
# Only the first 2000 rows are profiled; the report is also written out as HTML.
import pandas_profiling
profile_rows = df_train_val[:2000]
profile = pandas_profiling.ProfileReport(profile_rows)
profile.to_file(outputfile="/tmp/myoutputfile.html")
profile
"""# TPOT! | |
TPOT is an automatic machine learning library that uses genetic algorithms to try different generations of scikit-learn algorithms.
link: https://epistasislab.github.io/tpot/ | |
""" | |
# Check data for TPOT compatibility
from tpot.base import check_X_y

# Run the same check on the training and the validation split, so problems
# surface here rather than during the TPOT fit.
for features, target in ((X_train, y_train), (X_val, y_val)):
    check_X_y(features, target, accept_sparse=True)
'ok'
# Ensure it respects causality by only giving each sample access to a window of past data
# make padded sequences, we need to make the data in shape (batch, window_of_timesteps, features) | |
def timeseries_to_seq(x, window=3, batch_first=False):
    """
    Build, for every timestep, a window of the rows that precede it.

    ``x`` is zero-padded with ``window`` rows at the start, and sample ``i`` is
    rows ``i .. i+window-1`` of the padded array, i.e. the ``window`` rows
    strictly before row ``i`` of ``x`` (row ``i`` itself is not included).
    The earliest samples therefore consist partly or wholly of zeros.

    Inputs:
    - x: array of shape (timeseries, features)
    - window: number of past rows per sample, e.g. 3
    - batch_first: if True, return (batch, window, features) instead of the
      default layout. Defaults to False so existing callers are unaffected.

    Outputs:
    - y: shape (window, batch, features) by default, or
      (batch, window, features) when ``batch_first`` is True;
      batch == len(x)

    Note: to flatten each sample into one row, use the batch-first layout
    (``batch_first=True`` or ``.transpose(1, 0, 2)``) before
    ``reshape((batch, -1))``. Reshaping the default window-first array directly
    mixes values from different samples into the same row.
    """
    x_pad = np.pad(x, [[window, 0], [0, 0]], mode='constant')
    # One (window, features) slice per timestep; the stacking axis decides the layout.
    stack_axis = 0 if batch_first else 1
    y = np.stack([x_pad[i:i + window] for i in range(len(x))], axis=stack_axis)
    return y
# For now I will just run on a subset of the data, for speed!
subset = 200
window = 60*3
x = X_train[:subset]
y_stacked = y_train[:subset]
print(x.shape)


def _stack_windows(features, window):
    """Return one flat row per sample holding its window of past rows.

    Output shape is (len(features), window * n_features).
    """
    seq = timeseries_to_seq(features, window=window)  # (window, batch, features)
    # Move the batch axis to the front BEFORE flattening. Reshaping the
    # window-first array straight to (batch, -1) would fill each row with
    # values taken from many different samples (including later timesteps),
    # which breaks the "past data only" intent.
    return seq.transpose(1, 0, 2).reshape((features.shape[0], -1))


X_train_stacked = _stack_windows(x, window)

from tpot import TPOTRegressor
# A quick run of TPOT with small population and short number of generation
# About 25 minutes to run
# random_state makes the genetic search repeatable between runs.
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3, random_state=seed)
tpot.fit(X_train_stacked, y_stacked)
tpot.export('tpot_hydrosaver_export.py')
# What's the pipeline it saved?
# The author's run reported LassoLarsCV(normalize=False) as best; a new run may differ.
#!cat tpot_hydrosaver_export.py


# final score
def rmse(y_pred, y_true):
    """Root-mean-squared error between predictions and targets (array inputs)."""
    sqloss = (y_true - y_pred)**2
    return np.sqrt(sqloss.mean())


# Score on the held-out validation split, using the same windowing as training.
X_val_stacked = _stack_windows(X_val, window)
y_pred = tpot.predict(X_val_stacked)
score = rmse(y_pred, y_val)
score

# Predict the test set; note that y_pred now holds the TEST predictions.
X_test_stacked = _stack_windows(X_test, window)
y_pred = tpot.predict(X_test_stacked)
# save
# y_pred holds the test-set predictions computed just above. (The name
# `y_submit` is never defined in this script and would raise NameError.)
s = pd.Series(y_pred, name='target')
# presumably the number of rows the competition expects in a submission — confirm
assert len(s)==439140, 'unexpected number of predictions: %d' % len(s)

import datetime
# UTC timestamp so each submission gets a distinct, sortable file name.
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d_%H-%M-%S')

submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)
# '%.9g' writes up to 9 significant digits and keeps the exponent. The earlier
# '%2.9s' cut the *string* form to 9 characters, so e.g. 1.2345678e-05 would be
# written as 1.2345678.
s.to_csv(submission_file, index=False, header=True, float_format='%.9g')
print('upload file', submission_file)
# and download
# Import the submodule explicitly: a bare `import google` does not itself load
# `google.colab`, so attribute access only works if something else already
# imported it.
from google.colab import files
files.download(submission_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment