Code stub for a simple text classifier: a continuous bag-of-words (CBOW) model that sums word embeddings and passes the result through a linear layer.
# encoding=utf-8
# --- Adapted From ---
# Project: learn-pytorch
# Author: xingjunjie github: @gavinxing
# Create Time: 29/07/2017 11:58 AM on PyCharm
# Original code at: https://gist.github.com/GavinXing/9954ea846072e115bb07d9758892382c
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
class CBOW(nn.Module):
    def __init__(self, vocab_size, num_classes=2, embedding_size=100):
        super(CBOW, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.class_embeddings = nn.Linear(embedding_size, num_classes)

    def forward(self, inputs):
        # look up embeddings: (seq_len,) -> (seq_len, embedding_size)
        input_embeddings = self.word_embeddings(inputs)
        # sum word embeddings into one sentence embedding, keeping a batch dim: (1, embedding_size)
        sent_embedding = input_embeddings.sum(dim=0, keepdim=True)
        out = self.class_embeddings(sent_embedding)  # (1, num_classes)
        return F.log_softmax(out, dim=1)
def make_sentence_vector(context, word_to_ix):
    # look up each word's index and pack them into a LongTensor
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)
if __name__ == '__main__':
    EMBEDDING_SIZE = 10
    #
    # TODO: here is some example training data, but we need more.
    # Download some movie review data from http://ai.stanford.edu/~amaas/data/sentiment/
    # and load it so you can train the classifier! (a loading sketch follows below)
    #
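    # A minimal loading sketch for the TODO above (an assumption, not part of
    # the original gist): it expects the aclImdb archive from that URL to be
    # extracted next to this script, and the `load_reviews` helper is
    # hypothetical. Uncomment to replace the toy data below.
    # import os
    # def load_reviews(folder):
    #     sentences = []
    #     for name in os.listdir(folder):
    #         with open(os.path.join(folder, name), encoding='utf-8') as f:
    #             sentences.append(f.read().lower().split())
    #     return sentences
    # positive = load_reviews('aclImdb/train/pos')
    # negative = load_reviews('aclImdb/train/neg')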
positive = ["' stanley and iris ' show the triumph of the human spirit.".split(), | |
"what a fun movie !".split()] | |
negative = ["there are times when finishing a film one wishes to have a refund for the time just spent .".split(), | |
"this movie was so unrelentingly bad , I could hardly believe I was watching it .".split()] | |
    # label each sentence: "1" for positive, "0" for negative
    data = [(s, 1) for s in positive] + [(s, 0) for s in negative]
    # extract the vocabulary
    vocab = set(sum(positive, []) + sum(negative, []))
    vocab_size = len(vocab)
    word_to_ix = {word: i for i, word in enumerate(vocab)}
    # the model already applies log_softmax, so use NLLLoss here;
    # CrossEntropyLoss would apply a second log_softmax on top of it
    loss_func = nn.NLLLoss()
    net = CBOW(num_classes=2, embedding_size=EMBEDDING_SIZE, vocab_size=vocab_size)
    optimizer = optim.SGD(net.parameters(), lr=0.01)
    for epoch in range(200):
        total_loss = 0.0
        for sentence, label in data:
            # a vector with the index of each word in the sentence
            sentence_var = make_sentence_vector(sentence, word_to_ix)
            # compute predictions
            log_probs = net(sentence_var)
            # compute the error
            loss = loss_func(log_probs, torch.tensor([label]))
            net.zero_grad()   # reset gradients
            loss.backward()   # compute updates
            optimizer.step()  # update vectors
            total_loss += loss.item()
        print("loss =", total_loss)
    # Sanity check that we fitted the training set, but there is no glory in that!
    # We will need (much) more training data to generalize to new sentences
    sentence_var = make_sentence_vector("what a fun movie".split(), word_to_ix)
    print("Positive prediction: ", net(sentence_var).exp())
    sentence_var = make_sentence_vector("this movie was so unrelentingly bad".split(), word_to_ix)
    print("Negative prediction: ", net(sentence_var).exp())