# -*- coding: utf-8 -*-
"""hydrosaver.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/notebook#fileId=1gs18AtviN2Y3jSsVF2rgprAtCA8Jnt_8
"""
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm
# %pylab inline
import numpy as np
import pandas as pd
import seaborn as sn
import os
from tqdm import tqdm
eps = 1e-6
seed = 42
np.random.seed(seed)
"""# Download data"""
# from https://stackoverflow.com/a/39225039/221742
import requests
def download_file_from_google_drive(id, destination):
    def get_confirm_token(response):
        for key, value in response.cookies.items():
            if key.startswith('download_warning'):
                return value
        return None

    def save_response_content(response, destination):
        CHUNK_SIZE = 32768
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)

    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)
if not os.path.isdir('data/original'):
    os.makedirs('data/original')
download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')
download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')
"""# Load data"""
# The data uses several sentinel strings for missing values ('No Data', 'Bad Input', etc.) and has a date index column
df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_train_val = df_train_val.dropna(axis=1, how='all')  # drop the columns that are all NaN's
df_train_val = df_train_val.resample('1T').first()  # resample onto a regular 1-minute grid
df_train_val = df_train_val.drop('DIC88023.PV', axis=1)
df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's
y_train_val = df_train_val.target
x_train_val = df_train_val.drop('target', axis=1) # We don't want the answer to be in the input data
x_test = df_test
# normalize the input columns
x_mean = x_train_val.mean()
x_std = x_train_val.std()
x_train_val = (x_train_val - x_mean)/(x_std + eps)
x_test = (x_test - x_mean)/(x_std + eps)
# TODO I may want to normalize y too
print('mean', x_mean)
print('std', x_std)
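# Hedged sketch of the TODO above: normalize y the same way as x. Variable
# names here are illustrative; the rest of this notebook keeps y un-normalized.
y_mean, y_std = y_train_val.mean(), y_train_val.std()
y_train_val_norm = (y_train_val - y_mean) / (y_std + eps)
# any model trained on the normalized target needs its predictions inverted:
# y_pred = y_pred_norm * (y_std + eps) + y_mean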
# TPOT won't accept NaNs, so we either replace or drop them
# Another approach would be sentinel numbers or extra indicator columns (sketched below)
# Since we've normalized the inputs, 0 corresponds to the training mean, so let's fill with that
x_train_val = x_train_val.replace(np.nan, 0)
y_train_val = y_train_val.replace(np.nan, 0)
x_test = x_test.replace(np.nan, 0)
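# Sketch of the indicator-column alternative mentioned above (not used below):
# one extra 0/1 column per feature, marking where a value was missing.
# `nan_flags` and the `_was_nan` suffix are illustrative names.
nan_flags = df_train_val.drop('target', axis=1).isnull().add_suffix('_was_nan').astype(float)
# x_train_val = pd.concat([x_train_val, nan_flags], axis=1)  # x_test would need the same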
# since it's a time series, the validation set must come from the future (the end of the data)
val_split_in = int(len(df_train_val.index)*0.85)
x_val = x_train_val[val_split_in:]
x_train = x_train_val[:val_split_in]
y_val = y_train_val[val_split_in:]
y_train = y_train_val[:val_split_in]
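# For proper cross-validation rather than a single holdout, scikit-learn's
# TimeSeriesSplit gives the same future-only guarantee (sketch, not used below)
from sklearn.model_selection import TimeSeriesSplit
for tr_idx, va_idx in TimeSeriesSplit(n_splits=3).split(x_train_val):
    assert tr_idx.max() < va_idx.min()  # validation always strictly after training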
# convert to numpy (using `.values`; `DataFrame.as_matrix` is deprecated in newer pandas)
X_train = x_train.values
y_train = y_train.values
X_val = x_val.values
y_val = y_val.values
X_test = x_test.values
"""# Have look into the data"""
df_train_val.info()
df_train_val.describe()
# You can use pandas profiling to get an overview of the data
import pandas_profiling
profile = pandas_profiling.ProfileReport(df_train_val[:2000])
profile.to_file(outputfile="/tmp/myoutputfile.html")
profile
"""# TPOT!
TPOT is an automatic machine learning library that uses genetic algorithms to try different generations of scikit-learn algorihtms.
link: https://epistasislab.github.io/tpot/
"""
# Check data for TPOT compatibility
from tpot.base import check_X_y
check_X_y(X_train, y_train, accept_sparse=True)
check_X_y(X_val, y_val, accept_sparse=True)
'ok'
# Ensure it respects causality, by only giving each sample access to a window of past data
# make padded sequences: we need the data in shape (batch, window_of_timesteps, features)
def timeseries_to_seq(x, window=3):
    """
    Inputs:
    - x: shape (timesteps, features)
    - window: e.g. 3
    Outputs:
    - y: shape (batch, window, features)
    """
    x_pad = np.pad(x, [[window, 0], [0, 0]], mode='constant')
    # sample i sees only rows i-window .. i-1 of x (zero-padded at the start),
    # and stacking on axis=0 keeps each sample's window contiguous for the
    # per-sample reshape below
    y = np.stack([x_pad[i:i + window] for i in range(len(x))], axis=0)
    return y
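# quick sanity check of the window shapes with toy numbers
_toy = np.arange(10).reshape(5, 2)  # 5 timesteps, 2 features
_seq = timeseries_to_seq(_toy, window=3)
assert _seq.shape == (5, 3, 2)  # (batch, window, features)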
# For now I will just run on a subset of the data, for speed!
subset = 200
window = 60 * 3  # three hours of one-minute samples
x = X_train[:subset]
y_stacked = y_train[:subset]
print(x.shape)
X_train_stacked = timeseries_to_seq(x, window=window).reshape((x.shape[0], -1))
from tpot import TPOTRegressor
# A quick run of TPOT with a small population and a small number of generations
# About 25 minutes to run
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)
tpot.fit(X_train_stacked, y_stacked)
tpot.export('tpot_hydrosaver_export.py')
# What's the pipeline it saved?
# In this case it found that LassoLarsCV(normalize=False) performed best
#!cat tpot_hydrosaver_export.py
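# Hedged sketch of re-fitting the found pipeline directly instead of re-running
# TPOT. LassoLarsCV is what this particular run exported; yours may differ, and
# newer scikit-learn versions drop the `normalize` argument.
from sklearn.linear_model import LassoLarsCV
pipeline = LassoLarsCV(normalize=False)
pipeline.fit(X_train_stacked, y_stacked)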
# final score
def rmse(y_pred, y_true):
    sqloss = (y_true - y_pred)**2
    return np.sqrt(sqloss.mean())
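# toy check: predictions off by a constant 1.0 should give an RMSE of exactly 1
assert abs(rmse(np.zeros(4), np.ones(4)) - 1.0) < 1e-9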
X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))
y_pred = tpot.predict(X_val_stacked)
score = rmse(y_pred, y_val)
score
X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))
y_submit = tpot.predict(X_test_stacked)
# save
s = pd.Series(y_submit, name='target')
assert len(s)==439140
import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)
s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')
print('upload file', submission_file)
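# quick self-check of the submission file before uploading (assumes the
# expected 439140-row, single-column format from the assert above)
check = pd.read_csv(submission_file)
assert list(check.columns) == ['target'] and len(check) == 439140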
# and download the submission from Colab to your local machine
from google.colab import files
files.download(submission_file)