# Created December 18, 2019 13:38
import operator
import os
import pickle

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tqdm
from torch.utils.data import DataLoader, TensorDataset, Dataset

import data_loader
# Important setting
# ------------------------------------------- Constants ----------------------------------------
# Fixed number of tokens per sentence for the W2V_SEQUENCE representation
# (passed as seq_len to sentence_to_embedding by DataManager).
SEQ_LEN = 52

# Sentence representations accepted as DataManager's data_type.
ONEHOT_AVERAGE, W2V_AVERAGE, W2V_SEQUENCE = (
    "onehot_average",
    "w2v_average",
    "w2v_sequence",
)

# Names of the dataset splits, used as dictionary keys by DataManager.
TRAIN, VAL, TEST = "train", "val", "test"
# ------------------------------------------ Helper methods and classes --------------------------
def get_available_device():
    """
    Allows training on GPU if available. Can help with running things faster when a GPU with cuda is
    available, but it is not a must.
    Given a device, one can use module.to(device) and criterion.to(device) so that all the
    computations will be done on the GPU.
    :return: torch.device('cuda') when CUDA is available, otherwise torch.device('cpu')
    """
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def save_pickle(obj, path):
    """
    Serializes an object to disk with pickle.
    :param obj: any picklable python object
    :param path: path of the file to write (overwritten if it already exists)
    """
    with open(path, "wb") as f:
        pickle.dump(obj, f)
def load_pickle(path):
    """
    Loads an object which was serialized to disk with pickle.
    :param path: path of the pickle file to read
    :return: the un-pickled python object
    """
    # NOTE: unpickling can execute arbitrary code - only use on files this project created itself.
    with open(path, "rb") as f:
        return pickle.load(f)
def save_model(model, path, epoch, optimizer):
    """
    Utility function for saving checkpoint of a model, so training or evaluation can be executed later on.
    :param model: torch module representing the model
    :param path: path to save the checkpoint into
    :param epoch: the epoch number to store alongside the weights
    :param optimizer: torch optimizer used for training the module
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, path)
def load(model, path, optimizer):
    """
    Loads the state (weights, parameters...) of a model which was saved with save_model
    :param model: should be the same model as the one which was saved in the path
    :param path: path to the saved checkpoint
    :param optimizer: should be the same optimizer as the one which was saved in the path
    :return: tuple of (model, optimizer, epoch) - the given model and optimizer objects, updated
            in place with the saved state, and the epoch number stored in the checkpoint
    """
    checkpoint = torch.load(path)
    # restore the saved state into the given objects; without this the checkpoint is read but ignored
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    return model, optimizer, epoch
# ------------------------------------------ Data utilities ----------------------------------------
def load_word2vec():
    """ Load Word2Vec Vectors
        Return:
            wv_from_bin: All 3 million embeddings, each length 300
    """
    # imported lazily so gensim is only required when the full embeddings are actually needed
    import gensim.downloader as api
    wv_from_bin = api.load("word2vec-google-news-300")
    # gensim >= 4.0 replaced the `vocab` attribute with `key_to_index`; support both versions
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
def create_or_load_slim_w2v(words_list, cache_w2v=True):
    """
    returns word2vec dict only for words which appear in the dataset.
    Note that when the cache file already exists it is loaded as is, regardless of words_list.
    :param words_list: list of words to use for the w2v dict
    :param cache_w2v: whether to save locally the small w2v dictionary
    :return: dictionary which maps the known words to their vectors
    """
    w2v_path = "w2v_dict.pkl"
    if not os.path.exists(w2v_path):
        # no cache yet: build the slim dictionary from the full embeddings
        full_w2v = load_word2vec()
        w2v_emb_dict = {k: full_w2v[k] for k in words_list if k in full_w2v}
        if cache_w2v:
            save_pickle(w2v_emb_dict, w2v_path)
    else:
        w2v_emb_dict = load_pickle(w2v_path)
    return w2v_emb_dict
def get_w2v_average(sent, word_to_vec, embedding_dim):
    """
    This method gets a sentence and returns the average word embedding of the words consisting
    the sentence. Words which do not appear in word_to_vec are ignored; if none of the words is
    known (or the sentence is empty) the zero vector is returned.
    :param sent: the sentence object (its tokens are read from sent.text)
    :param word_to_vec: a dictionary mapping words to their vector embeddings
    :param embedding_dim: the dimension of the word embedding vectors
    :return The average embedding vector as numpy ndarray.
    """
    vec = np.zeros(embedding_dim)
    known_words = 0
    for word in sent.text:
        if word in word_to_vec:
            vec += word_to_vec[word]
            known_words += 1
    if known_words > 0:
        vec /= known_words
    return vec
def get_one_hot(size, ind):
    """
    this method returns a one-hot vector of the given size, where the 1 is placed in the ind entry.
    :param size: the size of the vector
    :param ind: the entry index to turn to 1
    :return: numpy ndarray which represents the one-hot vector
    """
    rv = np.zeros(size)
    rv[ind] = 1
    return rv
def average_one_hots(sent, word_to_ind):
    """
    this method gets a sentence, and a mapping between words to indices, and returns the average
    one-hot embedding of the tokens in the sentence.
    :param sent: a sentence object (its tokens are read from sent.text).
    :param word_to_ind: a mapping between words to indices
    :return: numpy ndarray of size len(word_to_ind); the zero vector for an empty sentence
    :raises KeyError: if a token of the sentence does not appear in word_to_ind
    """
    size = len(word_to_ind)
    rv = np.zeros(size)
    n_tokens = len(sent.text)
    if n_tokens == 0:
        # avoid dividing by zero (which would yield a vector of NaNs)
        return rv
    for w in sent.text:
        # adding a one-hot vector is the same as incrementing a single entry
        rv[word_to_ind[w]] += 1
    return rv / n_tokens
def get_word_to_ind(words_list):
    """
    this function gets a list of words, and returns a mapping between
    words to their index. Indices are assigned by order of first appearance, starting at 0,
    so repeated words do not leave gaps.
    :param words_list: a list of words
    :return: the dictionary mapping words to the index
    """
    d = dict()
    for word in words_list:
        # len(d) is evaluated before the insertion, i.e. it is the next free index
        d.setdefault(word, len(d))
    return d
def sentence_to_embedding(sent, word_to_vec, seq_len, embedding_dim=300):
    """
    this method gets a sentence and a word to vector mapping, and returns a list containing the
    words embeddings of the tokens in the sentence.
    Sentences longer than seq_len are truncated, shorter ones are padded with zero vectors, and
    words missing from word_to_vec are represented by the zero vector.
    :param sent: a sentence object (its tokens are read from sent.text)
    :param word_to_vec: a word to vector mapping.
    :param seq_len: the fixed length for which the sentence will be mapped to.
    :param embedding_dim: the dimension of the w2v embedding
    :return: numpy ndarray of shape (seq_len, embedding_dim) with the representation of the sentence
    """
    # start from all zeros so padding rows and unknown words need no special handling
    rv = np.zeros((seq_len, embedding_dim))
    for i, word in enumerate(sent.text[:seq_len]):
        emb = word_to_vec.get(word)
        if emb is not None:
            rv[i] = emb
    return rv
class OnlineDataset(Dataset):
    """
    A pytorch dataset which generates model inputs on the fly from sentences of SentimentTreeBank
    """

    def __init__(self, sent_data, sent_func, sent_func_kwargs):
        """
        :param sent_data: list of sentences from SentimentTreeBank
        :param sent_func: Function which converts a sentence to an input datapoint
        :param sent_func_kwargs: fixed keyword arguments for the sent_func
        """
        self.data = sent_data
        self.sent_func = sent_func
        self.sent_func_kwargs = sent_func_kwargs

    def __len__(self):
        """
        :return: number of sentences in the dataset
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        :param idx: index of the requested sentence
        :return: tuple of (model input computed by sent_func, the sentence's sentiment_class label)
        """
        sent = self.data[idx]
        sent_emb = self.sent_func(sent, **self.sent_func_kwargs)
        sent_label = sent.sentiment_class
        return sent_emb, sent_label
class DataManager():
    """
    Utility class for handling all data management task. Can be used to get iterators for training and
    evaluation.
    """

    def __init__(self, data_type=ONEHOT_AVERAGE, use_sub_phrases=True, dataset_path="stanfordSentimentTreebank", batch_size=50,
                 embedding_dim=None):
        """
        builds the data manager used for training and evaluation.
        :param data_type: one of ONEHOT_AVERAGE, W2V_AVERAGE and W2V_SEQUENCE
        :param use_sub_phrases: if true, training data will include all sub-phrases plus the full sentences
        :param dataset_path: path to the dataset directory
        :param batch_size: number of examples per batch
        :param embedding_dim: relevant only for the W2V data types.
        :raises ValueError: if data_type is not one of the supported representations
        """
        # load the dataset
        self.sentiment_dataset = data_loader.SentimentTreeBank(dataset_path, split_words=True)

        # map data splits to sentences lists
        self.sentences = {}
        if use_sub_phrases:
            self.sentences[TRAIN] = self.sentiment_dataset.get_train_set_phrases()
        else:
            self.sentences[TRAIN] = self.sentiment_dataset.get_train_set()
        self.sentences[VAL] = self.sentiment_dataset.get_validation_set()
        self.sentences[TEST] = self.sentiment_dataset.get_test_set()

        # map data splits to sentence input preparation functions
        words_list = list(self.sentiment_dataset.get_word_counts().keys())
        if data_type == ONEHOT_AVERAGE:
            self.sent_func = average_one_hots
            self.sent_func_kwargs = {"word_to_ind": get_word_to_ind(words_list)}
        elif data_type == W2V_SEQUENCE:
            self.sent_func = sentence_to_embedding
            self.sent_func_kwargs = {"seq_len": SEQ_LEN,
                                     "word_to_vec": create_or_load_slim_w2v(words_list),
                                     "embedding_dim": embedding_dim
                                     }
        elif data_type == W2V_AVERAGE:
            self.sent_func = get_w2v_average
            self.sent_func_kwargs = {"word_to_vec": create_or_load_slim_w2v(words_list),
                                     "embedding_dim": embedding_dim
                                     }
        else:
            raise ValueError("invalid data_type: {}".format(data_type))

        # map data splits to torch datasets and iterators
        self.torch_datasets = {k: OnlineDataset(sentences, self.sent_func, self.sent_func_kwargs) for
                               k, sentences in self.sentences.items()}
        # only the training iterator is shuffled
        self.torch_iterators = {k: DataLoader(dataset, batch_size=batch_size, shuffle=k == TRAIN)
                                for k, dataset in self.torch_datasets.items()}

    def get_torch_iterator(self, data_subset=TRAIN):
        """
        :param data_subset: one of TRAIN VAL and TEST
        :return: torch batches iterator for this part of the dataset
        """
        return self.torch_iterators[data_subset]

    def get_labels(self, data_subset=TRAIN):
        """
        :param data_subset: one of TRAIN VAL and TEST
        :return: numpy array with the labels of the requested part of the dataset, in the same order of
                the sentences stored for that part (note that the TRAIN iterator itself is shuffled).
        """
        return np.array([sent.sentiment_class for sent in self.sentences[data_subset]])

    def get_input_shape(self):
        """
        :return: the shape of a single example from this dataset (only of x, ignoring y the label).
        """
        return self.torch_datasets[TRAIN][0][0].shape
# ------------------------------------ Models ----------------------------------------------------
class LSTM(nn.Module):
    """
    An LSTM for sentiment analysis with architecture as described in the exercise description.
    A bidirectional LSTM reads the sentence, and the final hidden states of both directions are
    concatenated and fed to a linear layer producing a single logit.
    """

    def __init__(self, embedding_dim, hidden_dim, n_layers, dropout):
        """
        :param embedding_dim: dimension of each input word embedding
        :param hidden_dim: hidden state dimension of each LSTM direction
        :param n_layers: number of stacked LSTM layers
        :param dropout: dropout probability between stacked LSTM layers
        """
        # nn.Module must be initialized before sub-modules can be assigned
        super().__init__()
        # nn.LSTM only applies dropout between layers; passing it with a single layer only
        # triggers a warning, so it is dropped in that case.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout if n_layers > 1 else 0.0,
                            bidirectional=True, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_dim * 2, out_features=1)

    def forward(self, x):
        """
        :param x: batch of embedded sentences; batch_first=True, so the batch is the first dimension
        :return: tensor of shape (batch, 1) holding the logit of every sentence
        """
        # nn.LSTM returns (output, (h_n, c_n)) - the hidden state comes first
        _, (h_n, _) = self.lstm(x)
        # last two entries of h_n are the final layer's forward and backward directions
        h = torch.cat((h_n[-2], h_n[-1]), 1)
        return self.linear(h)

    def predict(self, x):
        """
        :param x: batch of embedded sentences, as accepted by forward
        :return: tensor of shape (batch, 1) with the sigmoid of the logits
        """
        return torch.sigmoid(self.forward(x))
class LogLinear(nn.Module):
    """
    general class for the log-linear models for sentiment analysis.
    """

    def __init__(self, embedding_dim):
        """
        :param embedding_dim: dimension of the input vector of a single example
        """
        # nn.Module must be initialized before sub-modules can be assigned
        super().__init__()
        # Initialize Model
        self.linear1 = nn.Linear(in_features=embedding_dim, out_features=1)

    def forward(self, x):
        """
        :param x: batch of input vectors whose last dimension is embedding_dim
        :return: the logits, one per example, with a last dimension of size 1
        """
        return self.linear1(x)

    def predict(self, x):
        """
        :param x: batch of input vectors, as accepted by forward
        :return: the sigmoid of the logits
        """
        return torch.sigmoid(self.forward(x))
# ------------------------- training functions -------------
def binary_accuracy(preds, y):
    """
    This method returns tha accuracy of the predictions, relative to the labels.
    A prediction counts as positive when it is >= 0.5, so preds are expected to be
    probabilities (e.g. after a sigmoid) and not raw logits.
    :param preds: predictions, either a vector of shape (N,) or a matrix of shape (N, 1)
    :param y: a vector of true labels
    :return: scalar value - (<number of accurate predictions> / <number of examples>),
            or 0.0 when there are no examples
    """
    preds = torch.as_tensor(preds)
    y = torch.as_tensor(y)
    if preds.dim() > 1:
        preds = preds[:, 0]
    n_examples = y.shape[0]
    if n_examples == 0:
        return 0.0
    hits = (preds >= 0.5) == y
    return (hits.sum(dtype=torch.float64) / n_examples).item()
def train_epoch(model, data_iterator, optimizer, criterion):
    """
    This method operates one epoch (pass over the whole train set) of training of the given model,
    and returns the accuracy and loss for this epoch
    :param model: the model we're currently training
    :param data_iterator: an iterator, iterating over the training data for the model.
    :param optimizer: the optimizer object for the training process.
    :param criterion: the criterion object for the training process.
    :return: tuple of (accuracy, loss), each averaged over the batches of the epoch
    """
    model.train()
    batches = 0
    acc = 0.
    total_loss = 0.
    for x, y in data_iterator:
        # numpy inputs are collated as float64 while the model weights are float32
        x, y = x.float(), y.float()
        batches += 1
        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred[:, 0], y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # the model outputs logits; binary_accuracy thresholds probabilities at 0.5
        acc += binary_accuracy(torch.sigmoid(pred.detach()), y)
    if batches == 0:
        return 0., 0.
    return acc / batches, total_loss / batches
def evaluate(model, data_iterator, criterion):
    """
    evaluate the model performance on the given data
    :param model: one of our models..
    :param data_iterator: torch data iterator for the relevant subset
    :param criterion: the loss criterion used for evaluation
    :return: tuple of (accuracy, loss), each averaged over the batches of the iterator
            (accuracy first, the order the caller in train_model unpacks)
    """
    model.eval()
    batches = 0
    acc = 0.
    total_loss = 0.
    # no gradients are needed for evaluation
    with torch.no_grad():
        for x, y in data_iterator:
            # numpy inputs are collated as float64 while the model weights are float32
            x, y = x.float(), y.float()
            batches += 1
            pred = model(x)
            total_loss += criterion(pred[:, 0], y).item()
            # the model outputs logits; binary_accuracy thresholds probabilities at 0.5
            acc += binary_accuracy(torch.sigmoid(pred), y)
    if batches == 0:
        return 0., 0.
    return acc / batches, total_loss / batches
def get_predictions_for_data(model, data_iter):
    """
    This function should iterate over all batches of examples from data_iter and return all of the models
    predictions as a numpy ndarray or torch tensor (or list if you prefer). the prediction should be in the
    same order of the examples returned by data_iter.
    :param model: one of the models you implemented in the exercise
    :param data_iter: torch iterator as given by the DataManager (yields (x, y) batches)
    :return: 1-dimensional torch tensor with the predicted probability (sigmoid of the model's
            output) of every example, in the order given by data_iter
    """
    model.eval()
    batch_preds = []
    # no gradients are needed for inference
    with torch.no_grad():
        for x, _ in data_iter:
            # numpy inputs are collated as float64 while the model weights are float32
            batch_preds.append(torch.sigmoid(model(x.float())).reshape(-1))
    if not batch_preds:
        return torch.empty(0)
    return torch.cat(batch_preds)
def train_model(model, data_manager, n_epochs, lr, weight_decay=0.):
    """
    Runs the full training procedure for the given model. The optimization should be done using the Adam
    optimizer with all parameters but learning rate and weight decay set to default.
    :param model: module of one of the models implemented in the exercise
    :param data_manager: the DataManager object
    :param n_epochs: number of times to go over the whole training set
    :param lr: learning rate to be used for optimization
    :param weight_decay: parameter for l2 regularization
    :return: [[train accuracies, train losses], [validation accuracies, validation losses]] where
            every inner list holds one value per epoch
    """
    train_acc_lst, train_loss_lst = [], []
    val_acc_lst, val_loss_lst = [], []
    optimizer = optim.Adam(params=model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()
    for i in range(n_epochs):
        train_acc, train_loss = train_epoch(model, data_manager.get_torch_iterator(TRAIN), optimizer, criterion)
        val_acc, val_loss = evaluate(model, data_manager.get_torch_iterator(VAL), criterion)
        # record the per-epoch history, otherwise the returned lists stay empty
        train_acc_lst.append(train_acc)
        train_loss_lst.append(train_loss)
        val_acc_lst.append(val_acc)
        val_loss_lst.append(val_loss)
        print(f'epoch {i} validatin acc {val_acc}')
    return [[train_acc_lst, train_loss_lst], [val_acc_lst, val_loss_lst]]
def train_log_linear_with_one_hot():
    """
    Trains and evaluates the log linear model with the average one-hot representation.
    :return: the training history as returned by train_model
    """
    n_epochs = 20
    batch_size = 64
    lr = 0.01
    weight_decay = 0.0001
    # DataManager defaults to the ONEHOT_AVERAGE representation
    data_manager = DataManager(batch_size=batch_size)
    model = LogLinear(data_manager.get_input_shape()[0])
    return train_model(model, data_manager, n_epochs, lr, weight_decay)
def train_log_linear_with_w2v():
    """
    Trains and evaluates the log linear model with the average word embedding representation.
    :return: the training history as returned by train_model
    """
    n_epochs = 20
    batch_size = 64
    lr = 0.01
    weight_decay = 0.0001
    data_manager = DataManager(batch_size=batch_size, data_type=W2V_AVERAGE, embedding_dim=300)
    model = LogLinear(data_manager.get_input_shape()[0])
    return train_model(model, data_manager, n_epochs, lr, weight_decay)
def train_lstm_with_w2v():
    """
    Trains and evaluates the LSTM model on sequences of word embeddings.
    :return: the training history as returned by train_model
    """
    # Training Params
    n_epochs = 20
    batch_size = 64
    lr = 0.01
    weight_decay = 0.0001
    # Model Params
    embedding_dim = 300
    hidden_dim = 100
    n_layers = 1
    dropout = 0.5
    data_manager = DataManager(batch_size=batch_size, data_type=W2V_SEQUENCE, embedding_dim=embedding_dim)
    model = LSTM(embedding_dim, hidden_dim, n_layers, dropout)
    return train_model(model, data_manager, n_epochs, lr, weight_decay)
if __name__ == '__main__':
# print(train_log_linear_with_one_hot())
# print(train_log_linear_with_w2v())
