Keras autoencoder for timeseries
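Streams the PLAsTiCC Kaggle light curves from CSV in fixed-size chunks, trains an LSTM autoencoder on the padded per-object series, and writes the learned per-object encodings to HDF5 for downstream use.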
import itertools

from keras import layers as l
from keras import models as m
from keras import preprocessing as p
import numpy as np
import pandas as pd
def generate_series(paths, prepare, chunk_size=10 ** 6):
    """Streams one (object_id, prepare(series)) pair at a time from CSV files."""
    # Read the data in chunks
    chunks = itertools.chain(*[pd.read_csv(path, chunksize=chunk_size) for path in paths])
    orphans = pd.DataFrame()

    for chunk in chunks:
        # Add the previous orphans to the chunk
        chunk = pd.concat((orphans, chunk))
        # The last object in the chunk may spill over into the next chunk, so
        # its rows are treated as orphans
        last_val = chunk['object_id'].iloc[-1]
        is_orphan = chunk['object_id'] == last_val
        # Put the new orphans aside
        chunk, orphans = chunk[~is_orphan], chunk[is_orphan]
        # Yield one series per object
        for object_id, g in chunk.groupby('object_id'):
            yield object_id, prepare(g)

    # Flush the very last object, which no later chunk will ever complete
    for object_id, g in orphans.groupby('object_id'):
        yield object_id, prepare(g)
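# Illustrative smoke test (not part of the original pipeline): write a tiny
# synthetic CSV with the same columns as the PLAsTiCC files and stream it
# back. The file name and values are made up.
_demo = pd.DataFrame({
    'object_id': [1, 1, 2, 2, 2],
    'flux': [0.1, 0.2, 0.3, 0.4, 0.5],
    'flux_err': [0.01] * 5,
    'passband': [0, 1, 0, 1, 2],
    'mjd': [59580.0, 59581.0, 59580.0, 59581.0, 59582.0]
})
_demo.to_csv('demo.csv', index=False)
for _object_id, _g in generate_series(['demo.csv'], prepare=lambda g: g):
    print(_object_id, len(_g))  # prints "1 2" then "2 3"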
def generate_batch_series(paths, prepare, batch_size=16, chunk_size=10 ** 6):
    """Groups the prepared series into (batch_size, ...) arrays for Keras.

    Note that a final partial batch is dropped; wrap the body in `while True`
    to cycle forever when training for more than one pass over the data.
    """
    xs = []
    ys = []
    for object_id, (x, y) in generate_series(paths, prepare, chunk_size=chunk_size):
        xs.append(x)
        ys.append(y)
        if len(xs) == batch_size:
            yield np.array(xs), np.array(ys)
            xs = []
            ys = []
timesteps = 180
n_features = 4
encoding_dim = 16

# Encoder: normalize the raw inputs, then squash each series into a
# fixed-size vector
inputs = l.Input(shape=(timesteps, n_features))
encoded = l.BatchNormalization()(inputs)
encoded = l.LSTM(units=encoding_dim)(encoded)

# Decoder: repeat the encoding at every timestep and reconstruct the flux
decoded = l.RepeatVector(timesteps)(encoded)
decoded = l.LSTM(1, return_sequences=True)(decoded)

autoencoder = m.Model(inputs, decoded)
encoder = m.Model(inputs, encoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
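# Illustrative shape check on random data (not in the original gist): the
# autoencoder reconstructs one channel per timestep, while the encoder
# returns a single fixed-size vector per series.
_dummy = np.random.rand(2, timesteps, n_features)
print(autoencoder.predict(_dummy).shape)  # (2, 180, 1)
print(encoder.predict(_dummy).shape)      # (2, 16)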
def log(x):
    """Signed log transform: log1p of the magnitude, with the sign restored."""
    y = np.log1p(np.abs(x))
    return np.where(x < 0, -y, y)
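# For instance (illustrative): the transform is symmetric around zero and
# compresses large magnitudes.
print(log(np.array([-10.0, 0.0, 10.0])))  # ~[-2.398  0.     2.398]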
def prepare_series(g):
    """Pads/truncates one object's series to `timesteps` rows and log-scales it."""
    features = g[['flux', 'flux_err', 'passband', 'mjd']]
    # pad_sequences expects a list of sequences, hence the double transpose
    features = p.sequence.pad_sequences(
        features.T.values,
        maxlen=timesteps,
        dtype='float64',
        padding='pre',
        truncating='pre',
        value=0.0
    ).T
    x = log(features)
    # The reconstruction target is the (log-scaled) flux alone
    y = x[:, 0].reshape(-1, 1)
    return x, y
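# Illustrative check with synthetic values: a series of any length comes out
# as a (timesteps, 4) matrix with a (timesteps, 1) reconstruction target.
_g = pd.DataFrame({
    'flux': [1.0, 2.0], 'flux_err': [0.1, 0.1],
    'passband': [0, 1], 'mjd': [59580.0, 59581.0]
})
_x, _y = prepare_series(_g)
print(_x.shape, _y.shape)  # (180, 4) (180, 1)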
batch_size = 16

generator = generate_batch_series(
    paths=[
        '~/projects/kaggle-plasticc-astro-classification/data/kaggle/training_set.csv',
        #'~/projects/kaggle-plasticc-astro-classification/data/kaggle/test_set.csv'
    ],
    prepare=prepare_series,
    batch_size=batch_size
)

# steps_per_epoch * batch_size has to stay within the number of objects the
# generator can yield, otherwise it runs dry mid-epoch
autoencoder.fit_generator(generator, steps_per_epoch=489, epochs=1)
generator = generate_series(
    paths=[
        '~/projects/kaggle-plasticc-astro-classification/data/kaggle/training_set.csv',
        '~/projects/kaggle-plasticc-astro-classification/data/kaggle/test_set.csv'
    ],
    prepare=prepare_series
)

features = {}
ids = []
xs = []

for object_id, (x, y) in generator:
    ids.append(object_id)
    xs.append(x)
    if len(xs) == batch_size:
        encoded = encoder.predict(np.array(xs))
        for i, enc in zip(ids, encoded):
            features[i] = enc
        ids = []
        xs = []

# Encode the leftover objects so the final partial batch isn't silently dropped
if xs:
    encoded = encoder.predict(np.array(xs))
    for i, enc in zip(ids, encoded):
        features[i] = enc

pd.DataFrame.from_dict(features, orient='index').add_prefix('encoded_').to_hdf('data/features.h5', 'encoded')
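# The encodings can then be loaded back for downstream modelling (same path
# and key as written above):
encodings = pd.read_hdf('data/features.h5', 'encoded')
print(encodings.shape)  # (n_objects, encoding_dim)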