Code stub for a simple word2vec model
import torch
import torch.nn as nn


class SimpleW2V(nn.Module):
    def __init__(self, nwords, ncontexts, vec_size):
        super(SimpleW2V, self).__init__()
        # randomly initialized target and context vectors
        self.words_emb = nn.Embedding(nwords, vec_size)
        self.ctxt_emb = nn.Embedding(ncontexts, vec_size)

    def forward(self, target_idx, context_idx):
        # dot product between the target and context vectors, followed by the
        # sigmoid activation: returns log σ(w ⋅ c)
        w = self.words_emb(target_idx).squeeze(0)
        c = self.ctxt_emb(context_idx).squeeze(0)
        return torch.log(torch.sigmoid(torch.dot(w, c)))
#
#
# Here you can build the vocabulary based on the words present in the text
# (note the two vocabularies can actually be the same); a small sketch of one
# way to do this follows.
#
#
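# A minimal sketch, assuming a whitespace-tokenized corpus: `tokens` stands for
# a hypothetical list of strings loaded from your own data.
def build_vocab(tokens):
    # map each distinct token to a consecutive integer id
    return {w: i for i, w in enumerate(sorted(set(tokens)))}
# e.g.: words = build_vocab(tokens); ctxt = dict(words)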
words = {'love': 0, 'affection': 1, 'computer': 2, 'football': 3}
ctxt = {'feeling': 0, 'letter': 1, 'chip': 2, 'team': 3}
model = SimpleW2V(len(words), len(ctxt), vec_size=10)
optim = torch.optim.SGD(model.parameters(), lr=0.1)  # lr is word2vec's alpha constant
def wrap(idx):
    # wrap an integer index in a LongTensor so it can be fed to the model
    return torch.LongTensor([idx])
# "training" | |
print("log(σ(love ⋅ feeling)) (before) = {:.2f}".format( | |
model(wrap(words["love"]), wrap(ctxt["feeling"])).data[0])) | |
print("log(σ(love ⋅ team)) (before) = {:.2f}".format( | |
model(wrap(words["love"]), wrap(ctxt["team"])).data[0])) | |
#
#
# TODO: here you can add a for loop iterating over a large text file, where
# for each word you look at the 2 words to its left and the 2 words to its
# right, e.g. (a runnable sketch follows below):
#   for i, target in enumerate(tokens):
#       for context in tokens[max(0, i - CONTEXT_SIZE):i] + tokens[i + 1:i + 1 + CONTEXT_SIZE]:
#           fake_contexts = random.sample(list(ctxt), NUM_NEGATIVES)  # check that "context" is not sampled by chance
#
#
#
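# A minimal sketch of the loop in the TODO above, assuming `tokens` is a
# hypothetical list of corpus words, each present in both `words` and `ctxt`,
# and that CONTEXT_SIZE and NUM_NEGATIVES are small hyperparameters; it reuses
# `model`, `optim` and `wrap` from above.
import random

def train_epoch(tokens, CONTEXT_SIZE=2, NUM_NEGATIVES=2):
    for i, target in enumerate(tokens):
        window = tokens[max(0, i - CONTEXT_SIZE):i] + tokens[i + 1:i + 1 + CONTEXT_SIZE]
        for context in window:
            # sample negative ("fake") contexts, excluding the true context
            negatives = [c for c in random.sample(list(ctxt), NUM_NEGATIVES + 1)
                         if c != context][:NUM_NEGATIVES]
            obj = model(wrap(words[target]), wrap(ctxt[context])) \
                - sum(model(wrap(words[target]), wrap(ctxt[c])) for c in negatives)
            model.zero_grad()   # reset gradients
            (-obj).backward()   # maximize the objective by minimizing its negative
            optim.step()        # update the vectors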
target = 'love'
context = 'feeling'
fake_contexts = ['chip', 'team']
# convert to indices and wrap them in tensors
target_idx = wrap(words[target])
context_idx = wrap(ctxt[context])
fake_contexts_idx = [wrap(ctxt[fake_context])
                     for fake_context in fake_contexts]
# evaluate the objective
obj = model(target_idx, context_idx) \
    - sum(model(target_idx, fake_context_idx)
          for fake_context_idx in fake_contexts_idx)
# optimizers minimize a loss, so to maximize the objective we minimize its negative
loss = -obj
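# For reference: the standard word2vec negative-sampling objective is
#   log σ(w ⋅ c) + Σ_k log σ(−w ⋅ c_fake_k);
# this stub uses the simpler log σ(w ⋅ c) − Σ_k log σ(w ⋅ c_fake_k), which
# likewise pushes true pairs together and fake pairs apart.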
# optimize the objective (all automatically done for you!)
model.zero_grad()  # reset gradients
loss.backward()    # compute gradients
optim.step()       # update vectors
print("log(σ(love ⋅ feeling)) (after) = {:.2f}".format( | |
model(wrap(words["love"]), wrap(ctxt["feeling"])).data[0])) | |
print("log(σ(love ⋅ team)) (after) = {:.2f}".format( | |
model(wrap(words["love"]), wrap(ctxt["team"])).data[0])) |
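# A small follow-up sketch: after training, the learned vectors live in the
# embedding weight matrices and can be inspected directly, e.g. by comparing
# two words with cosine similarity (the word choices here are arbitrary).
love_vec = model.words_emb.weight[words["love"]]
football_vec = model.words_emb.weight[words["football"]]
print("cos(love, football) = {:.2f}".format(
    nn.functional.cosine_similarity(love_vec, football_vec, dim=0).item()))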