Load the preprocessed IMDB dataset with Keras and train a simple LSTM classifier over it in PyTorch.
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.datasets import imdb
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import random
from torch.autograd import Variable
max_features = 20000
maxlen = 80  # cut texts after this number of words (among the top max_features most common words)
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

class LSTMClassifier(nn.Module):
    def __init__(self):
        super(LSTMClassifier, self).__init__()
        self.word_embeddings = nn.Embedding(20000, 128)  # (max_features, embedding_dim)
        self.lstm = nn.LSTM(128, 128)                    # (embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(128, 2)            # (hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # the first is the hidden state h, the second is the cell state c,
        # each of shape (num_layers, batch_size, hidden_dim) = (1, 1, 128)
        return (autograd.Variable(torch.zeros(1, 1, 128)),
                autograd.Variable(torch.zeros(1, 1, 128)))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        # reshape to (seq_len, batch=1, embedding_dim) as expected by nn.LSTM
        x = embeds.view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(lstm_out[-1])  # use the output at the last time step
        log_probs = F.log_softmax(y, dim=1)
        return log_probs

model = LSTMClassifier()
# sanity-check forward pass on a single padded review
model(Variable(torch.from_numpy(x_train[0]).long()))

loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
import numpy as np

def evaluate(model, data_x, data_y, loss_function):
    model.eval()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    for rep in range(len(data_x)):
        y = np.ndarray(1)
        y[0] = data_y[rep]
        truth_res.append(y[0])
        y = Variable(torch.from_numpy(y).long())
        # re-initialise the hidden state so each review starts from scratch,
        # detaching it from the history of the previous instance
        model.hidden = model.init_hidden()
        pred = model(Variable(torch.from_numpy(data_x[rep]).long()))
        pred_label = pred.data.max(1)[1].numpy()[0]
        pred_res.append(pred_label)
        loss = loss_function(pred, y)
        avg_loss += loss.data[0]
    avg_loss /= len(data_x)
    acc = get_accuracy(truth_res, pred_res)
    print('eval avg_loss:%g acc:%g' % (avg_loss, acc))
    return acc

def get_accuracy(truth, pred):
    assert len(truth) == len(pred)
    right = 0
    for i in range(len(truth)):
        if truth[i] == pred[i]:
            right += 1.0
    return right / len(truth)
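
# The training loop below processes one review at a time (effectively batch size 1,
# despite batch_size = 32 being defined above): the hidden state is re-initialised
# for every sample, the loss is backpropagated per review, and a running loss is
# printed every 200 iterations.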
def train_epoch(model, train_data_x, train_data_y, loss_function, optimizer, i):
    model.train()
    avg_loss = 0.0
    count = 0
    truth_res = []
    pred_res = []
    prev_loss = 0.0
    for rep in range(len(train_data_x)):
        y = np.ndarray(1)
        y[0] = train_data_y[rep]
        truth_res.append(y[0])
        y = Variable(torch.from_numpy(y).long())
        # re-initialise the hidden state, detaching it from the history of the previous instance
        model.hidden = model.init_hidden()
        pred = model(Variable(torch.from_numpy(train_data_x[rep]).long()))
        pred_label = pred.data.max(1)[1].numpy()[0]
        pred_res.append(pred_label)
        model.zero_grad()
        loss = loss_function(pred, y)
        avg_loss += loss.data[0]
        prev_loss += loss.data[0]
        count += 1
        if count % 200 == 0:
            print('epoch: %d iterations: %d loss (last 200): %g' % (i, count, prev_loss))
            prev_loss = 0.0
        loss.backward()
        optimizer.step()
    avg_loss /= len(train_data_x)
    print('epoch: %d done!\n train avg_loss:%g , acc:%g' % (i, avg_loss, get_accuracy(truth_res, pred_res)))

EPOCH = 3
for i in range(EPOCH):
    print('epoch: %d start!' % i)
    train_epoch(model, x_train, y_train, loss_function, optimizer, i)
    # print('now best dev acc:', best_dev_acc)
    # dev_acc = evaluate(model, dev_data, loss_function, word_to_ix, label_to_ix, 'dev')
    # test_acc = evaluate(model, test_data, loss_function, word_to_ix, label_to_ix, 'test')
    # if dev_acc > best_dev_acc:
    #     best_dev_acc = dev_acc
    #     os.system('rm mr_best_model_acc_*.model')
    #     print('New Best Dev!!!')
    #     torch.save(model.state_dict(), 'best_models/mr_best_model_acc_' + str(int(test_acc*10000)) + '.model')
    #     no_up = 0
    # else:
    #     no_up += 1
    #     if no_up >= 10:
    #         exit()

evaluate(model, x_test, y_test, loss_function)
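
# A minimal usage sketch (not part of the original gist): score a single padded
# review with the trained model. `review` is assumed to be one row of x_test;
# the returned 0/1 corresponds to the IMDB negative/positive label.
def predict_sentiment(model, review):
    model.eval()
    model.hidden = model.init_hidden()  # fresh hidden/cell state for this review
    log_probs = model(Variable(torch.from_numpy(review).long()))
    pred_label = log_probs.data.max(1)[1].numpy()  # index of the larger log-probability
    return int(pred_label.flatten()[0])

# e.g. print(predict_sentiment(model, x_test[0]), y_test[0])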