Starter Colab Jupyter notebook for the hydrosaver competition
# -*- coding: utf-8 -*-
"""hydrosaver.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/notebook#fileId=1gs18AtviN2Y3jSsVF2rgprAtCA8Jnt_8
"""

# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
# !pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
# !pip install xgboost tpot pandas-profiling seaborn torchvision tqdm

# %pylab inline
import os

import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

eps = 1e-6
seed = 42
np.random.seed(seed)

"""# Download data"""

# from https://stackoverflow.com/a/39225039/221742
import requests

def download_file_from_google_drive(id, destination):
    def get_confirm_token(response):
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)

    # Large files trigger Drive's virus-scan confirmation page; resend the
    # request with the confirmation token if one was set.
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)

    save_response_content(response, destination)

if not os.path.isdir('data/original'):
    os.makedirs('data/original')

download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')
download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')

"""# Load data"""

# The CSVs use several sentinel strings for missing values ('No Data',
# 'Bad Input', etc.), and the first column is a datetime index.
df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_train_val = df_train_val.dropna(axis=1, how='all')  # drop the columns that are all NaN's
df_train_val = df_train_val.resample('1T').first()  # resample to a regular 1-minute index
df_train_val = df_train_val.drop('DIC88023.PV', axis=1)

df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_test = df_test.dropna(axis=1, how='all')  # drop the columns that are all NaN's

y_train_val = df_train_val.target
x_train_val = df_train_val.drop('target', axis=1)  # we don't want the answer to be in the input data
x_test = df_test
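# A small guard, not in the original notebook: dropna(axis=1) was applied to
# each file independently, so train and test could end up with different
# feature columns. Warn (rather than crash) if they diverge:
if list(x_train_val.columns) != list(x_test.columns):
    print('warning: train and test feature columns differ')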
# normalize the input columns using statistics from the training set only
x_mean = x_train_val.mean()
x_std = x_train_val.std()
x_train_val = (x_train_val - x_mean) / (x_std + eps)
x_test = (x_test - x_mean) / (x_std + eps)
# TODO I may want to normalize y too

print('mean', x_mean)
print('std', x_std)
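# If you do normalize y (per the TODO above), remember to invert the transform
# before scoring or submitting. A minimal sketch, not applied below:
# y_mean, y_std = y_train_val.mean(), y_train_val.std()
# y_train_val = (y_train_val - y_mean) / (y_std + eps)
# and after predicting: y_pred = y_pred * (y_std + eps) + y_mean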
# TPOT won't accept NaNs, so we either replace or drop them.
# Another approach would be to use sentinel numbers or indicator columns (see sketch below).
# Since the features are normalized, 0 is the "nothing" value, so let's use that.
x_train_val = x_train_val.replace(np.nan, 0)
y_train_val = y_train_val.replace(np.nan, 0)
x_test = x_test.replace(np.nan, 0)
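# The indicator-column alternative mentioned above: keep a flag per feature so
# the model can tell "missing" apart from a true zero. A sketch that would have
# to run *before* the replace above, while the NaNs are still present:
# nan_flags = x_train_val.isnull().astype(float).add_suffix('_was_nan')
# x_train_val = pd.concat([x_train_val.fillna(0), nan_flags], axis=1)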
# since it's a timeseries, the validation set will be in the future
val_split_in = int(len(df_train_val.index) * 0.85)
x_val = x_train_val[val_split_in:]
x_train = x_train_val[:val_split_in]
y_val = y_train_val[val_split_in:]
y_train = y_train_val[:val_split_in]

# convert to numpy
X_train = x_train.values
y_train = y_train.values
X_val = x_val.values
y_val = y_val.values
X_test = x_test.values

"""# Have a look at the data"""

df_train_val.info()

df_train_val.describe()

# You can use pandas-profiling to get an overview of the data
import pandas_profiling
profile = pandas_profiling.ProfileReport(df_train_val[:2000])
profile.to_file(outputfile="/tmp/myoutputfile.html")
profile

"""# TPOT!

TPOT is an automated machine learning library that uses genetic programming to
evolve generations of scikit-learn pipelines.

link: https://epistasislab.github.io/tpot/
"""

# Check the data for TPOT compatibility (TPOT validates inputs with sklearn's check_X_y)
from sklearn.utils import check_X_y
check_X_y(X_train, y_train, accept_sparse=True)
check_X_y(X_val, y_val, accept_sparse=True)
'ok'
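# Also sanity-check that the train/val splits line up (simple asserts, safe to
# remove; an addition, not part of the original notebook):
assert X_train.shape[0] == y_train.shape[0]
assert X_val.shape[0] == y_val.shape[0]
print('train', X_train.shape, 'val', X_val.shape, 'test', X_test.shape)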
# Ensure the model respects causality by only giving each sample access to a window of past data.
# Make padded sequences: we need the data in shape (batch, window_of_timesteps, features).
def timeseries_to_seq(x, window=3):
    """
    Inputs:
    - x: shape (timesteps, features)
    - window: e.g. 3
    Outputs:
    - y: shape (batch, window, features)
    """
    # pad `window` rows of zeros at the start so early samples still get a full window
    x_pad = np.pad(x, [[window, 0], [0, 0]], mode='constant')
    y = np.stack([x_pad[i:i + window] for i in range(len(x))], axis=0)
    return y
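# A quick shape check of the windowing on toy data (a hypothetical example,
# not part of the pipeline): 5 timesteps of 2 features with window=3 should
# give (batch=5, window=3, features=2).
_demo = timeseries_to_seq(np.arange(10).reshape(5, 2), window=3)
assert _demo.shape == (5, 3, 2)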
# For now I will just run on a subset of the data, for speed!
subset = 200
window = 60 * 3  # 3 hours of 1-minute samples
x = X_train[:subset]
y_stacked = y_train[:subset]
print(x.shape)
X_train_stacked = timeseries_to_seq(x, window=window).reshape((x.shape[0], -1))
from tpot import TPOTRegressor

# A quick run of TPOT with a small population and a short number of generations.
# Takes about 25 minutes to run.
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)
tpot.fit(X_train_stacked, y_stacked)

tpot.export('tpot_hydrosaver_export.py')
# What's the pipeline it saved?
# In this case it found that LassoLarsCV(normalize=False) performed best.
# !cat tpot_hydrosaver_export.py
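# The winning pipeline can also be refit directly without TPOT. A sketch
# assuming the LassoLarsCV result above (your own TPOT run may well find a
# different pipeline):
# from sklearn.linear_model import LassoLarsCV
# model = LassoLarsCV(normalize=False)
# model.fit(X_train_stacked, y_stacked)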
# final score
def rmse(y_pred, y_true):
    """Root-mean-square error."""
    sqloss = (y_true - y_pred) ** 2
    return np.sqrt(sqloss.mean())

X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))
y_pred = tpot.predict(X_val_stacked)
score = rmse(y_pred, y_val)
score
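# For context, compare against a naive baseline that always predicts the
# training mean (a small sanity check, not in the original notebook):
baseline = rmse(np.full_like(y_val, y_train.mean()), y_val)
print('model RMSE: %.3f, mean-baseline RMSE: %.3f' % (score, baseline))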
X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))
y_pred = tpot.predict(X_test_stacked)

# save the submission
s = pd.Series(y_pred, name='target')
assert len(s) == 439140

import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
submission_file = 'submission_%s_score_%2.2f.csv' % (ts, score)
s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')  # %2.9s truncates each formatted value to 9 characters
print('upload file', submission_file)

# and download it from Colab
from google.colab import files
files.download(submission_file)