-
-
Save hnykda/c362f0ad488e3b289394 to your computer and use it in GitHub Desktop.
from random import random

import numpy as np
import pandas as pd

# Triangle wave: one period ramps up 1..9 and back down 10..2 (18 samples),
# repeated 100 times for 1800 samples in total.
flow = (list(range(1, 10, 1)) + list(range(10, 1, -1))) * 100
pdata = pd.DataFrame({"a": flow, "b": flow})
# Column "b" is column "a" delayed by 9 steps; the shift leaves NaN in the
# first 9 rows of "b".
pdata.b = pdata.b.shift(9)
# Drop the first 10 rows (which include the NaNs) and rescale the rest by a
# single random factor in [0, 1). This is one uniform scale for the whole
# frame, not independent per-sample noise.
data = pdata.iloc[10:] * random()
def _load_data(data, n_prev=100):
    """Slice a time series into sliding windows and next-step targets.

    data should be a pd.DataFrame with one row per time step.

    Returns a tuple (X, y) of NumPy arrays: X[i] holds the n_prev
    consecutive rows starting at row i, and y[i] is the row that
    immediately follows that window. When data has n_prev rows or fewer,
    no window fits and both arrays are empty.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas versions.
    values = data.values
    n_windows = len(values) - n_prev
    docX = [values[i:i + n_prev] for i in range(n_windows)]
    docY = [values[i + n_prev] for i in range(n_windows)]
    return np.array(docX), np.array(docY)
def train_test_split(df, test_size=0.1, n_prev=100):
    """Split a time series chronologically into training and testing parts.

    The first (1 - test_size) fraction of the rows of df is used for
    training and the remaining rows for testing. Each part is then cut
    into windows of n_prev rows by _load_data.

    Returns ((X_train, y_train), (X_test, y_test)).
    """
    # round() returns a float on Python 2, which .iloc rejects as a slice
    # bound, so convert to int explicitly.
    ntrn = int(round(len(df) * (1 - test_size)))
    X_train, y_train = _load_data(df.iloc[0:ntrn], n_prev=n_prev)
    X_test, y_test = _load_data(df.iloc[ntrn:], n_prev=n_prev)
    return (X_train, y_train), (X_test, y_test)
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM

in_out_neurons = 2
hidden_neurons = 50

# NOTE: the layer constructors below are called in the Keras 0.2 style, with
# the input size first and the output size second. Later Keras releases
# changed this API.
model = Sequential()
model.add(LSTM(in_out_neurons, hidden_neurons, return_sequences=False))
model.add(Dense(hidden_neurons, in_out_neurons))
model.add(Activation("linear"))
model.compile(loss="mean_squared_error", optimizer="rmsprop")

(X_train, y_train), (X_test, y_test) = train_test_split(data)  # retrieve data
model.fit(X_train, y_train, batch_size=700, nb_epoch=10, validation_split=0.05)

predicted = model.predict(X_test)
# Root-mean-square error, averaged over samples (one value per output column).
rmse = np.sqrt(((predicted - y_test) ** 2).mean(axis=0))

# and maybe plot it
pd.DataFrame(predicted).to_csv("predicted.csv")
pd.DataFrame(y_test).to_csv("test_data.csv")
def train_test_split(df, test_size=0.1):
ntrn = int(round(len(df) * (1 - test_size)))
ntrn should be an integer; otherwise I get an error: cannot do slice indexing on <class 'pandas.indexes.range.RangeIndex'> with these indexers…
I changed the Keras 0.2 code above to work on Keras 1.0.3 with TensorFlow:
import pandas as pd
from random import random
# One period ramps up 1..9 and back down 10..2; tile it 100 times.
flow = 100 * (list(range(1, 10)) + list(range(10, 1, -1)))
pdata = pd.DataFrame(dict(a=flow, b=flow))
# "b" lags "a" by 9 steps; the shift leaves NaN in its first 9 rows.
pdata["b"] = pdata["b"].shift(9)
# Discard the first 10 rows and scale the remainder by one random factor
# (a single uniform scale for the whole frame).
data = random() * pdata.iloc[10:]
import numpy as np
def _load_data(data, n_prev=100):
    """Slice a time series into sliding windows and next-step targets.

    data should be a pd.DataFrame with one row per time step.

    Returns a tuple (X, y) of NumPy arrays: X[i] holds the n_prev
    consecutive rows starting at row i, and y[i] is the row that
    immediately follows that window. When data has n_prev rows or fewer,
    no window fits and both arrays are empty.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on both old and new pandas versions.
    values = data.values
    n_windows = len(values) - n_prev
    docX = [values[i:i + n_prev] for i in range(n_windows)]
    docY = [values[i + n_prev] for i in range(n_windows)]
    return np.array(docX), np.array(docY)
def train_test_split(df, test_size=0.1, n_prev=100):
    """Split a time series chronologically into training and testing parts.

    The first (1 - test_size) fraction of the rows of df is used for
    training and the remaining rows for testing. Each part is then cut
    into windows of n_prev rows by _load_data.

    Returns ((X_train, y_train), (X_test, y_test)).
    """
    # .iloc needs an integer slice bound; round() alone yields a float on
    # Python 2, hence the explicit int().
    ntrn = int(round(len(df) * (1 - test_size)))
    X_train, y_train = _load_data(df.iloc[0:ntrn], n_prev=n_prev)
    X_test, y_test = _load_data(df.iloc[ntrn:], n_prev=n_prev)
    return (X_train, y_train), (X_test, y_test)
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
in_out_neurons = 2
hidden_neurons = 50

# Load the data before building the network so the input shape can be read
# from it instead of being hard-coded.
(X_train, y_train), (X_test, y_test) = train_test_split(data)

model = Sequential()
# Each sample is one window of shape (timesteps, features); with the default
# window length this is (100, 2).
model.add(LSTM(hidden_neurons, input_shape=X_train.shape[1:]))
model.add(Dense(in_out_neurons))
model.add(Activation("linear"))
# No 'accuracy' metric: this is a regression trained on mean squared error,
# and classification accuracy is not meaningful for continuous targets.
model.compile(loss="mean_squared_error", optimizer="rmsprop")

# NOTE(review): the test set is also used as validation data here, so the
# validation loss shown during training is not an independent estimate.
model.fit(X_train, y_train, batch_size=700, nb_epoch=50,
          validation_data=(X_test, y_test), verbose=1)

# With no extra metrics compiled in, evaluate() returns the scalar test loss.
score = model.evaluate(X_test, y_test, verbose=0)
print('Test score:', score)

predicted = model.predict(X_test, batch_size=700)

# and maybe plot it
pd.DataFrame(predicted).to_csv("predicted.csv")
pd.DataFrame(y_test).to_csv("test_data.csv")
Produces some pretty rubbish results at 10 epochs, but gets pretty good with 20.
I'm still a beginner, so I'm not sure whether this constitutes overfitting or not.
There is new code for Keras 2, written by my friend, at the beginning of the original blog post: http://danielhnyk.cz/predicting-sequences-vectors-keras-using-rnn-lstm/
Hi, does anyone have an idea about how to apply this to categorical data? I have sequential data: the first column is the event code, which is categorical, and the second column is its sequence number.
Any help?
Your original code (as modified for Keras 0.2) works with Keras 1.0, but only with the Theano backend, so there is still a problem somewhere. The error with the TensorFlow backend is shown below (again, it works with the Theano backend):
Traceback (most recent call last):
File "new_model.py", line 70, in <module>
model.add(LSTM(hidden_neurons, input_dim=in_out_neurons, return_sequences=False)) # orig
File "/Users/erlebach/anaconda/envs/tensorflow/lib/python2.7/site-packages/keras/models.py", line 107, in add
layer.create_input_layer(batch_input_shape, input_dtype)
File "/Users/erlebach/anaconda/envs/tensorflow/lib/python2.7/site-packages/keras/engine/topology.py", line 341, in create_input_layer
self(x)
File "/Users/erlebach/anaconda/envs/tensorflow/lib/python2.7/site-packages/keras/engine/topology.py", line 485, in call
self.add_inbound_node(inbound_layers, node_indices, tensor_indices)
File "/Users/erlebach/anaconda/envs/tensorflow/lib/python2.7/site-packages/keras/engine/topology.py", line 543, in add_inbound_node
Node.create_node(self, inbound_layers, node_indices, tensor_indices)
File "/Users/erlebach/anaconda/envs/tensorflow/lib/python2.7/site-packages/keras/engine/topology.py", line 148, in create_node
output_tensors = to_list(outbound_layer.call(input_tensors[0], mask=input_masks[0]))
File "/Users/erlebach/anaconda/envs/tensorflow/lib/python2.7/site-packages/keras/layers/recurrent.py", line 213, in call
': ' + str(input_shape))
Exception: When using TensorFlow, you should define explicitly the number of timesteps of your sequences.
If your first layer is an Embedding, make sure to pass it an "input_length" argument. Otherwise, make sure the first layer has an "input_shape" or "batch_input_shape" argument, including the time axis. Found input shape at layer lstm_1: (None, None, 2)