@MaxHalford
Created November 28, 2018 10:20
Keras autoencoder for timeseries
import itertools
from keras import layers as l
from keras import models as m
from keras import preprocessing as p
import numpy as np
import pandas as pd

def generate_series(paths, prepare, chunk_size=int(1e6)):
    # Read the data in chunks, going through the given paths back to back
    chunks = itertools.chain(*[pd.read_csv(path, chunksize=chunk_size) for path in paths])
    orphans = pd.DataFrame()

    for chunk in chunks:

        # Add the previous orphans to the chunk
        chunk = pd.concat((orphans, chunk))

        # Determine which rows are orphans; the last object in a chunk may be
        # truncated, so its rows are carried over to the next chunk
        last_val = chunk['object_id'].iloc[-1]
        is_orphan = chunk['object_id'] == last_val

        # Put the new orphans aside
        chunk, orphans = chunk[~is_orphan], chunk[is_orphan]

        # Yield one series per object
        for object_id, g in chunk.groupby('object_id'):
            yield object_id, prepare(g)

    # Don't forget the very last object, which is still sitting in `orphans`
    if not orphans.empty:
        for object_id, g in orphans.groupby('object_id'):
            yield object_id, prepare(g)

def generate_batch_series(paths, prepare, batch_size=16, chunk_size=int(1e6)):
    # Wrap the body in `while True:` to loop forever, which is what Keras's
    # fit_generator expects when training for more than one pass
    xs = []
    ys = []

    for object_id, (x, y) in generate_series(paths, prepare, chunk_size=chunk_size):
        xs.append(x)
        ys.append(y)
        if len(xs) == batch_size:
            yield np.array(xs), np.array(ys)
            xs = []
            ys = []

timesteps = 180
n_features = 4
encoding_dim = 16

inputs = l.Input(shape=(timesteps, n_features))
encoded = l.BatchNormalization()(inputs)
encoded = l.LSTM(units=encoding_dim)(encoded)
decoded = l.RepeatVector(timesteps)(encoded)
decoded = l.LSTM(1, return_sequences=True)(decoded)

# The autoencoder reconstructs the flux sequence, while the encoder exposes
# the fixed-size intermediate representation
autoencoder = m.Model(inputs, decoded)
encoder = m.Model(inputs, encoded)

autoencoder.compile(optimizer='adam', loss='mean_squared_error')
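
# Quick shape sanity check (a sketch added for illustration, not part of the
# original gist): the encoder maps a batch of (timesteps, n_features) series
# to vectors of size `encoding_dim`
assert encoder.predict(np.zeros((1, timesteps, n_features))).shape == (1, encoding_dim)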

def log(x):
    # Signed log transform: compresses large magnitudes while preserving sign
    y = np.log1p(np.abs(x))
    return np.where(x < 0, -y, y)
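
# Illustrative check (not in the original gist): the transform is symmetric
# around zero, e.g. log(-100) == -log(100)
assert np.allclose(log(np.array([-100.0, 0.0, 100.0])), [-np.log1p(100), 0.0, np.log1p(100)])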

def prepare_series(g):
    # Pad/truncate each series so that they all span `timesteps` rows
    features = g[['flux', 'flux_err', 'passband', 'mjd']]
    features = p.sequence.pad_sequences(
        features.T.values,
        maxlen=timesteps,
        dtype='float64',
        padding='pre',
        truncating='pre',
        value=0.0
    ).T
    x = log(features)
    # The reconstruction target is the (transformed) flux column
    y = x[:, 0].reshape(-1, 1)
    return x, y
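
# Illustrative check with made-up data (hypothetical, not in the original
# gist): a short series is pre-padded with zeros up to `timesteps` rows
_dummy = pd.DataFrame({
    'flux': [1.0, 2.0],
    'flux_err': [0.1, 0.2],
    'passband': [0, 1],
    'mjd': [59000.0, 59001.0]
})
_x, _y = prepare_series(_dummy)
assert _x.shape == (timesteps, n_features) and _y.shape == (timesteps, 1)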

batch_size = 16

generator = generate_batch_series(
    paths=[
        '~/projects/kaggle-plasticc-astro-classification/data/kaggle/training_set.csv',
        #'~/projects/kaggle-plasticc-astro-classification/data/kaggle/test_set.csv'
    ],
    prepare=prepare_series,
    batch_size=batch_size
)

# steps_per_epoch should match the number of batches the generator yields
autoencoder.fit_generator(generator, steps_per_epoch=489, epochs=1)

generator = generate_series(
    paths=[
        '~/projects/kaggle-plasticc-astro-classification/data/kaggle/training_set.csv',
        '~/projects/kaggle-plasticc-astro-classification/data/kaggle/test_set.csv'
    ],
    prepare=prepare_series
)

features = {}
ids = []
xs = []

# Encode the series batch by batch and store each object's representation
for object_id, (x, y) in generator:
    ids.append(object_id)
    xs.append(x)
    if len(xs) == batch_size:
        encoded = encoder.predict(np.array(xs))
        for i, enc in zip(ids, encoded):
            features[i] = enc
        ids = []
        xs = []

# Don't forget the last, possibly incomplete, batch
if xs:
    encoded = encoder.predict(np.array(xs))
    for i, enc in zip(ids, encoded):
        features[i] = enc

# Save the encoded features to an HDF5 file, one row per object
pd.DataFrame.from_dict(features, orient='index').add_prefix('encoded_').to_hdf('data/features.h5', 'encoded')
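
# The encodings can later be loaded back with pandas (assuming the same file
# path and key as above)
encoded_features = pd.read_hdf('data/features.h5', 'encoded')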