PyTorch exercise: Continuous Bag-of-Words (CBOW) representation

""" | |
My proposal to the exercise in the tutorial about Deep Learning for NLP with Pytorch | |
This is one is about Word Embeddings that encodes Lexical Semantics. | |
Continuous Bag-of-Words model (CBOW) is model that tries to predict a word given the context | |
of a few words before and after the target. | |
.. _Source: | |
http://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#exercise-computing-word-embeddings-continuous-bag-of-words | |
""" | |
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)

# We will be considering two words before and after the target
CONTEXT_SIZE = 2
# This is the dimension of the embedding space: each word is represented by a
# 10-dimensional vector. This representation is what the CBOW model learns.
EMBEDDING_DIM = 10

# Initial data
raw_text = """We are about to study the idea of a computational process. | |
Computational processes are abstract beings that inhabit computers. | |
As they evolve, processes manipulate other abstract things called data. | |
The evolution of a process is directed by a pattern of rules | |
called a program. People create programs to direct processes. In effect, | |
we conjure the spirits of the computer with our spells.""".split() | |
vocab = set(raw_text) | |
vocab_size = len(vocab) | |
# 1. Pre-process the dataset | |
# We associate each word to a unique index | |
word_to_ix = {word: i for i, word in enumerate(vocab)} | |
# Building our training set | |
data = [] | |
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])
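# Given the raw text above, the first sample should pair the context
# ['We', 'are', 'to', 'study'] with the target 'about'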

# 2. Design of our model
class CBOW(nn.Module):
    """CBOW model

    Two-layer CBOW model.

    Attributes:
        embeddings: vocab_size x embedding_dim lookup table holding the word embeddings
        linear1: first layer, mapping the concatenated context embeddings to a 128-dimensional hidden vector
        linear2: final layer, mapping the 128-dimensional hidden vector to vocabulary-size scores
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size (int): Size of our vocabulary
            embedding_dim (int): Dimension of the embedding space
            context_size (int): Number of context words on each side of the target
        """
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The 2 * context_size context embeddings are concatenated, hence the input size
        self.linear1 = nn.Linear(embedding_dim * context_size * 2, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        # Look up the context embeddings and concatenate them into a single row vector
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
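
# A quick shape sanity check (a sketch; "untrained" is just an illustrative name):
# with CONTEXT_SIZE = 2 the model receives a tensor of 2 * CONTEXT_SIZE = 4 word
# indices and returns a (1, vocab_size) tensor of log-probabilities, e.g.
# untrained = CBOW(vocab_size, EMBEDDING_DIM, CONTEXT_SIZE)
# untrained(torch.tensor([0, 1, 2, 3], dtype=torch.long)).shape  # torch.Size([1, vocab_size])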

def make_context_vector(context, word_to_ix):
    """Generate, from the context, a tensor to be used as input to our model

    Args:
        context(list): List of words surrounding the target
        word_to_ix(dict): Dictionary with words as keys and indices as values

    Returns:
        torch.Tensor: A long tensor holding the indices of the context words
    """
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)

# Example of running make_context_vector
# make_context_vector(data[0][0], word_to_ix)
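# This returns a length-4 long tensor holding the indices of
# ['We', 'are', 'to', 'study']; the exact index values depend on the
# iteration order of the vocabulary set above, which varies between runs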

# Let's train!
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# We will go through the whole dataset 100 times
for epoch in range(100):
    total_loss = 0.0
    for context, target in data:
        # Prepare the input of the model:
        # turn the context words into a tensor of indices
        context_var = make_context_vector(context, word_to_ix)
        # Since PyTorch accumulates gradients, we zero them before every iteration
        model.zero_grad()
        # Forward pass
        log_probs = model(context_var)
        # Get the target as a tensor holding its index
        target_val = torch.tensor([word_to_ix[target]], dtype=torch.long)
        # Compute the loss, i.e. how far we are from being correct
        loss = loss_function(log_probs, target_val)
        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
print(losses)
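
# After training, the learned representation of a word can be read from the
# corresponding row of the embedding matrix, e.g. (any vocabulary word works):
# model.embeddings.weight[word_to_ix["computer"]]  # its 10-dimensional embedding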

def do_inference(context, word_to_ix, model):
    """Predict a word given its context.

    Args:
        context(list): List of words surrounding the target
        word_to_ix(dict): Dictionary with words as keys and indices as values
        model(CBOW): Trained model

    Returns:
        str: The predicted target word
    """
    print(f"Context: {context}")
    context_var = make_context_vector(context, word_to_ix)
    # Forward pass
    log_probs = model(context_var)
    # Find the index with the highest log-probability
    predicted_target_idx = log_probs.argmax().item()
    # Map the index back to the corresponding word
    predicted_target = [key for key, val in word_to_ix.items() if val == predicted_target_idx][0]
    return predicted_target
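
# Example of running do_inference with a context taken from the training data;
# after training, the prediction should ideally be "programs":
# do_inference(['People', 'create', 'to', 'direct'], word_to_ix, model)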