Skip to content

Instantly share code, notes, and snippets.

@robotcator
Last active August 9, 2017 10:16
Show Gist options
  • Save robotcator/b5f90a6ce8ab4ceaf5e6b4f72703cb8e to your computer and use it in GitHub Desktop.
Save robotcator/b5f90a6ce8ab4ceaf5e6b4f72703cb8e to your computer and use it in GitHub Desktop.
doc2vec for sentiment analysis
# coding=utf-8
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
from collections import namedtuple
import numpy as np
import gensim
def read_sentimentDocs():
    """Load the sentiment corpus from ``alldata-id.txt`` and partition it.

    Every line of the file becomes one ``SentimentDocument`` namedtuple with
    the fields ``words``, ``tags``, ``split`` and ``sentiment``:

    * ``words``     -- all whitespace-separated tokens except the first one
      (presumably the first token is a document id -- confirm against the
      data file).
    * ``tags``      -- a one-element list holding the zero-based line number.
    * ``split``     -- chosen by 25000-line block: 'train', 'test', then
      'extra' for the following two blocks.
    * ``sentiment`` -- chosen by 12500-line block: 1.0, 0.0, 1.0, 0.0 for the
      first four blocks, ``None`` for the next four.

    The lookup tables cover 100000 lines; a longer file raises ``IndexError``.

    Returns:
        A ``(train_docs, test_docs, doc_list)`` tuple, where ``doc_list`` is a
        shallow copy of every document in file order.
    """
    SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

    # Lookup tables indexed by which fixed-size block of lines a document is in.
    split_by_block = ['train', 'test', 'extra', 'extra']  # 25000 lines per entry
    sentiment_by_block = [1.0, 0.0, 1.0, 0.0, None, None, None, None]  # 12500 lines per entry

    corpus = []  # every document, in original file order
    with gensim.utils.smart_open('alldata-id.txt', encoding='utf-8') as alldata:
        for line_no, line in enumerate(alldata):
            tokens = gensim.utils.to_unicode(line).split()
            corpus.append(SentimentDocument(
                words=tokens[1:],
                # `tags=[tokens[0]]` would also work at extra memory cost
                tags=[line_no],
                split=split_by_block[line_no // 25000],
                sentiment=sentiment_by_block[line_no // 12500],
            ))

    train_docs = []
    test_docs = []
    for document in corpus:
        if document.split == 'train':
            train_docs.append(document)
        elif document.split == 'test':
            test_docs.append(document)

    doc_list = list(corpus)  # separate list so callers can reshuffle per pass
    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
    return train_docs, test_docs, doc_list
def test_classifier_error(train, train_label, test, test_label):
    """Fit a logistic-regression classifier and report its score on a test set.

    Despite the name, the value printed and returned is the classifier's
    ``score`` on the test data, not an error rate.

    Args:
        train: Feature matrix used to fit the classifier.
        train_label: Labels for ``train``; a column vector of shape ``(n, 1)``
            is accepted and flattened.
        test: Feature matrix used for evaluation.
        test_label: Labels for ``test``; flattened the same way.

    Returns:
        The value of ``LogisticRegression.score(test, test_label)``.
    """
    classifier = LogisticRegression()
    # Labels are flattened to 1-D: scikit-learn expects shape (n_samples,) and
    # warns when handed an (n, 1) column vector.
    classifier.fit(train, np.ravel(train_label))
    score = classifier.score(test, np.ravel(test_label))
    # %-formatting prints the same text on Python 2 and Python 3, unlike the
    # Python 2-only `print "score :", score` statement.
    print("score : %s" % score)
    return score
# Load a previously trained Doc2Vec model from disk.
model2 = Doc2Vec.load("large_doc_50000_iter50.bin")

# Document-vector layout relied on below (matches the sentiment table in
# read_sentimentDocs): four consecutive runs of 12500 documents --
# train/label 1, train/label 0, test/label 1, test/label 0.
docs_per_class = 12500
docs_per_split = 2 * docs_per_class

# Take the vector width from the model itself instead of hard-coding 100, so
# a model trained with another dimensionality does not fail on assignment.
vector_dim = len(model2.docvecs[0])

train_array = np.zeros((docs_per_split, vector_dim))
train_label = np.zeros((docs_per_split, 1))
test_array = np.zeros((docs_per_split, vector_dim))
test_label = np.zeros((docs_per_split, 1))

for i in range(docs_per_class):
    # Training split: first run is labelled 1, second run is labelled 0.
    train_array[i] = model2.docvecs[i]
    train_label[i] = 1
    train_array[i + docs_per_class] = model2.docvecs[i + docs_per_class]
    train_label[i + docs_per_class] = 0
    # Test split: same layout, offset by the size of the training split.
    test_array[i] = model2.docvecs[i + docs_per_split]
    test_label[i] = 1
    test_array[i + docs_per_class] = model2.docvecs[i + docs_per_split + docs_per_class]
    test_label[i + docs_per_class] = 0

# Show the first row of each split. %-formatting prints the same text on
# Python 2 and Python 3, unlike the original Python 2-only print statements.
print('%s %s' % (train_array[0], train_label[0]))
print('%s %s' % (test_array[0], test_label[0]))
test_classifier_error(train_array, train_label, test_array, test_label)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment