# Adapted from https://github.com/fchollet/keras/blob/master/examples/imdb_lstm.py
from __future__ import print_function
from __future__ import division
from keras.preprocessing import sequence
from keras.datasets import imdb
from keras.layers.core import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn import isotonic
from scipy.stats import mannwhitneyu
import numpy as np
import tensorflow as tf
from utils import get_minibatches_idx
max_features = 20000
maxlen = 80 # cut texts after this number of words (among top max_features most common words)
batch_size = 128
num_lstm_layers = 2
rnn_size = 512
max_epochs = 50
eps = 1e-8
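
# Early stopping is decided with a statistical test on the validation losses:
# fit an isotonic (non-decreasing) regression to the successive changes in the
# loss, find the point where the fitted changes first cross zero, and then use
# a one-sided Mann-Whitney U test to check whether the post-transition changes
# are significantly greater than their residuals about the fit. Training stops
# once that test is significant at sig_level.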
def early_stop(losses, sig_level=0.1):
    """Decide whether training should stop, given recent validation losses.

    The losses must be from randomly ordered validation data.
    """
    losses = np.array(losses)

    # Insufficient sample size to run the test
    if losses.shape[0] <= 40:
        return False

    iterations = np.arange(losses.shape[0] - 1)
    change_in_losses = losses[1:] - losses[0:-1]

    # Fit an isotonic (non-decreasing) regression to the successive changes in the loss
    reg = isotonic.IsotonicRegression()
    reg.fit(iterations, change_in_losses)
    pred = reg.predict(iterations)

    # If the fitted changes never become positive, the loss is still decreasing
    if np.max(pred) <= 0.0:
        return False

    # Find the point where the fitted changes cross zero
    transition_point = np.min(np.where(np.greater(pred, 0)))
    group_1 = change_in_losses[transition_point:]
    group_2 = group_1 - pred[transition_point:]

    # Insufficient sample size to run the test
    if len(group_1) <= 20 or len(group_2) <= 20:
        return False

    ### Impact of correlation between samples - reduces the effective sample size?
    _, p_val = mannwhitneyu(group_1, group_2, alternative='greater')  ### 'less'?
    return p_val < sig_level
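

# Illustrative helper, not called anywhere in this script: the function name and
# the synthetic curves below are made up purely to show how early_stop is meant
# to be used. On the smoothly decreasing curve the fitted loss changes never
# cross zero, so early_stop returns False; on the noisy plateau the outcome
# depends on the noise and on sig_level.
def demo_early_stop():
    rng = np.random.RandomState(0)
    # Validation loss that is still clearly decreasing
    decreasing = 1.0 / np.arange(1.0, 101.0)
    # Validation loss that decreases and then plateaus around 0.2
    plateaued = np.concatenate([np.linspace(1.0, 0.2, 50),
                                0.2 + 0.01 * rng.randn(50)])
    print('still decreasing:', early_stop(decreasing))
    print('plateaued:', early_stop(plateaued))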
class Model():
    def __init__(self):
        self.X = tf.placeholder(tf.int32, [None, None], 'x')
        self.y = tf.placeholder(tf.int32, [None, 2], 'y')
        self.seq_lengths = tf.placeholder(tf.int32, [None], 'seq_lengths')

        with tf.variable_scope('rnnlm'):
            self.embedding = tf.get_variable('embedding', [max_features, rnn_size])
            h = tf.nn.embedding_lookup(self.embedding, self.X)

        # Stacked LSTM over the embedded sequence
        cells = [tf.contrib.rnn.LSTMCell(num_units=rnn_size) for i in range(num_lstm_layers)]
        cells = tf.contrib.rnn.MultiRNNCell(cells)

        outputs, _ = tf.nn.dynamic_rnn(
            cells,
            inputs=h,
            sequence_length=self.seq_lengths,
            time_major=False,
            dtype=tf.float32)

        # Pool over the time dimension
        h = tf.reduce_mean(outputs, 1)

        # Classifier head: two-class softmax, clipped for numerical stability
        h = Dense(300, activation='relu')(h)
        h = Dense(2, activation='softmax')(h)
        self.pred = tf.clip_by_value(h, eps, 1 - eps)

        # Cross-entropy loss, averaged over both the batch and class dimensions
        y = tf.to_float(self.y)
        loss = -y * tf.log(self.pred)
        self.loss = tf.reduce_mean(loss)

        correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.pred, 1))
        self.acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        optimizer = tf.train.AdamOptimizer()
        self.train_step = optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()
    def fit(self, x, y, sess):
        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

        for epoch in range(max_epochs):
            print("\nEpoch %d" % (epoch + 1))
            train_indices = get_minibatches_idx(len(x_train), batch_size, shuffle=True)
            val_indices = get_minibatches_idx(len(x_val), batch_size, shuffle=True)

            val_losses = []
            train_accs = []
            val_accs = []

            for it, iv in zip(train_indices, val_indices):
                # Training step on one minibatch
                X_batch = [x_train[k] for k in it]
                y_batch = [y_train[k] for k in it]
                seq_lengths = [maxlen for i in X_batch]
                feed_dict = {self.X: X_batch,
                             self.y: y_batch,
                             self.seq_lengths: seq_lengths}
                _, loss, acc = sess.run([self.train_step, self.loss, self.acc], feed_dict)
                train_accs.append(acc)

                # Evaluate on one validation minibatch
                X_batch = [x_val[k] for k in iv]
                y_batch = [y_val[k] for k in iv]
                seq_lengths = [maxlen for i in X_batch]
                feed_dict = {self.X: X_batch,
                             self.y: y_batch,
                             self.seq_lengths: seq_lengths}
                loss, acc = sess.run([self.loss, self.acc], feed_dict)
                val_losses.append(loss)
                val_accs.append(acc)

                # Stop training once the validation loss has stopped decreasing
                if early_stop(val_losses):
                    print("Early stopping")
                    print("Training accuracy: %.3f" % np.mean(train_accs))
                    print("Validation accuracy: %.3f" % np.mean(val_accs))
                    return

            print("Training accuracy: %.3f" % np.mean(train_accs))
            print("Validation accuracy: %.3f" % np.mean(val_accs))
def main():
    print('Loading data...')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences')
    print(len(x_test), 'test sequences')

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    # Convert the binary labels to two-column one-hot vectors
    y_train_one_hot = np.stack([y_train, 1 - y_train], axis=1)
    y_test_one_hot = np.stack([y_test, 1 - y_test], axis=1)

    m = Model()
    with tf.Session() as sess:
        sess.run(m.init)
        m.fit(x_train, y_train_one_hot, sess)


if __name__ == "__main__":
    main()