# Adapted from https://github.com/fchollet/keras/blob/master/examples/imdb_lstm.py
from __future__ import print_function
from __future__ import division

from keras.preprocessing import sequence
from keras.datasets import imdb
from keras.layers.core import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn import isotonic
from scipy.stats import mannwhitneyu
import numpy as np
import tensorflow as tf

# `get_minibatches_idx` lives in a local utils module that is not included in
# this gist; a stand-in consistent with how it is used below is defined next.
try:
    from utils import get_minibatches_idx
except ImportError:
    get_minibatches_idx = None
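
# Assumption (not part of the original gist): if utils.py is unavailable, the
# helper below is a guess at get_minibatches_idx, based only on how it is
# called in Model.fit -- it must return one iterable of row indices per minibatch.
if get_minibatches_idx is None:
    def get_minibatches_idx(n, minibatch_size, shuffle=False):
        # Split range(n) into consecutive chunks of at most minibatch_size
        # indices, optionally shuffling the example order first.
        idx = np.arange(n, dtype='int64')
        if shuffle:
            np.random.shuffle(idx)
        return [idx[i:i + minibatch_size] for i in range(0, n, minibatch_size)]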

max_features = 20000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 128
num_lstm_layers = 2
rnn_size = 512
max_epochs = 50
eps = 1e-8


def early_stop(losses, sig_level=0.1):
    """Decide whether training should stop, based on per-batch validation losses.

    The losses must come from randomly ordered validation data.
    """
    losses = np.array(losses)

    # Insufficient sample size to run the test
    if losses.shape[0] <= 40:
        return False

    iterations = np.arange(losses.shape[0] - 1)
    change_in_losses = losses[1:] - losses[:-1]

    # Fit a non-decreasing curve to the successive loss differences
    reg = isotonic.IsotonicRegression()
    reg.fit(iterations, change_in_losses)
    pred = reg.predict(iterations)

    # If the fitted differences never become positive, the loss is still falling
    if np.max(pred) <= 0.0:
        return False

    # Find the point where pred crosses zero
    transition_point = np.min(np.where(np.greater(pred, 0)))
    group_1 = change_in_losses[transition_point:]
    group_2 = group_1 - pred[transition_point:]

    # Insufficient sample size to run the test
    if len(group_1) <= 20 or len(group_2) <= 20:
        return False

    # Note: correlation between successive samples may reduce the effective
    # sample size.  TODO: check whether the alternative should be 'less'.
    _, p_val = mannwhitneyu(group_1, group_2, alternative='greater')
    return p_val < sig_level
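

# Quick illustration (not part of the original gist): how early_stop behaves on
# two synthetic validation-loss curves, one still decreasing and one that has
# already plateaued.  The exact outcome depends on the noise level, so nothing
# is asserted here; call _early_stop_demo() by hand to inspect the behaviour.
def _early_stop_demo(num_points=200, noise=0.01, seed=0):
    rng = np.random.RandomState(seed)
    decreasing = 1.0 / np.arange(1, num_points + 1) + noise * rng.randn(num_points)
    plateaued = 0.5 + noise * rng.randn(num_points)
    print('still decreasing ->', early_stop(decreasing))
    print('plateaued        ->', early_stop(plateaued))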


class Model():
    def __init__(self):
        self.X = tf.placeholder(tf.int32, [None, None], 'x')
        self.y = tf.placeholder(tf.int32, [None, 2], 'y')
        self.seq_lengths = tf.placeholder(tf.int32, [None], 'seq_lengths')

        with tf.variable_scope('rnnlm'):
            self.embedding = tf.get_variable('embedding', [max_features, rnn_size])
            h = tf.nn.embedding_lookup(self.embedding, self.X)

        cells = [tf.contrib.rnn.LSTMCell(num_units=rnn_size) for i in range(num_lstm_layers)]
        cells = tf.contrib.rnn.MultiRNNCell(cells)

        outputs, _ = tf.nn.dynamic_rnn(
            cells,
            inputs=h,
            sequence_length=self.seq_lengths,
            time_major=False,
            dtype=tf.float32)

        # Pool over the time dimension
        h = tf.reduce_mean(outputs, 1)

        h = Dense(300, activation='relu')(h)
        h = Dense(2, activation='softmax')(h)
        self.pred = tf.clip_by_value(h, eps, 1 - eps)

        # Cross-entropy loss, averaged over both the batch and the two classes
        y = tf.to_float(self.y)
        loss = -y * tf.log(self.pred)
        self.loss = tf.reduce_mean(loss)

        correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.pred, 1))
        self.acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        optimizer = tf.train.AdamOptimizer()
        self.train_step = optimizer.minimize(self.loss)

        self.init = tf.global_variables_initializer()

    def fit(self, x, y, sess):
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

        # Per-batch validation losses are accumulated across the whole run so
        # that the early-stopping test can see the long-term trend.
        val_losses = []

        for epoch in range(max_epochs):
            print("\nEpoch %d" % (epoch + 1))
            train_indices = get_minibatches_idx(len(x_train), batch_size, shuffle=True)
            val_indices = get_minibatches_idx(len(x_val), batch_size, shuffle=True)

            train_accs = []
            val_accs = []

            # zip pairs one validation batch with each training batch and stops
            # at the shorter list, so not every training batch is used per epoch.
            for it, iv in zip(train_indices, val_indices):
                # Training step
                X_batch = [x_train[k] for k in it]
                y_batch = [y_train[k] for k in it]
                seq_lengths = [maxlen for i in X_batch]
                feed_dict = {self.X: X_batch,
                             self.y: y_batch,
                             self.seq_lengths: seq_lengths}
                _, loss, acc = sess.run([self.train_step, self.loss, self.acc], feed_dict)
                train_accs.append(acc)

                # Validation step
                X_batch = [x_val[k] for k in iv]
                y_batch = [y_val[k] for k in iv]
                seq_lengths = [maxlen for i in X_batch]
                feed_dict = {self.X: X_batch,
                             self.y: y_batch,
                             self.seq_lengths: seq_lengths}
                loss, acc = sess.run([self.loss, self.acc], feed_dict)
                val_losses.append(loss)
                val_accs.append(acc)

                # Stop training once the validation loss appears to have plateaued
                if early_stop(val_losses):
                    print("Early stopping")
                    return

            print("Training accuracy: %.3f" % np.mean(train_accs))
            print("Validation accuracy: %.3f" % np.mean(val_accs))


def main():
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    # Two-column one-hot labels: column 0 is the original label, column 1 its complement
    y_train_one_hot = np.stack([y_train, 1 - y_train], axis=1)
    y_test_one_hot = np.stack([y_test, 1 - y_test], axis=1)

    m = Model()
    with tf.Session() as sess:
        sess.run(m.init)
        m.fit(x_train, y_train_one_hot, sess)
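
        # Sketch (not part of the original gist): evaluate the trained model on
        # the held-out test set in minibatches, reusing the same feed structure
        # as Model.fit.
        test_indices = get_minibatches_idx(len(x_test), batch_size, shuffle=False)
        test_accs = []
        for it in test_indices:
            feed_dict = {m.X: [x_test[k] for k in it],
                         m.y: [y_test_one_hot[k] for k in it],
                         m.seq_lengths: [maxlen for _ in it]}
            test_accs.append(sess.run(m.acc, feed_dict))
        print("Test accuracy: %.3f" % np.mean(test_accs))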


if __name__ == "__main__":
    main()