Doc2Vec for sentiment analysis: document vectors from a trained Doc2Vec model, evaluated with a scikit-learn logistic-regression classifier.
# coding=utf-8
from collections import namedtuple

import numpy as np
import gensim
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression


def read_sentimentDocs():
    SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

    alldocs = []  # will hold all docs in original order
    # gensim.utils.smart_open/to_unicode come from the older gensim API this gist
    # targets; a plain open('alldata-id.txt', encoding='utf-8') works as well.
    with gensim.utils.smart_open('alldata-id.txt', encoding='utf-8') as alldata:
        for line_no, line in enumerate(alldata):
            tokens = gensim.utils.to_unicode(line).split()
            words = tokens[1:]
            tags = [line_no]  # `tags = [tokens[0]]` would also work at extra memory cost
            split = ['train', 'test', 'extra', 'extra'][line_no // 25000]  # 25k train, 25k test, 50k extra
            sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no // 12500]  # [12.5k pos, 12.5k neg]*2 then unknown
            alldocs.append(SentimentDocument(words, tags, split, sentiment))

    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    doc_list = alldocs[:]  # for reshuffling per pass

    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
    return train_docs, test_docs, doc_list


def test_classifier_error(train, train_label, test, test_label):
    classifier = LogisticRegression()
    classifier.fit(train, train_label.ravel())  # ravel to the 1-D label shape scikit-learn expects
    score = classifier.score(test, test_label.ravel())
    print("score :", score)
    return score


# Load the pre-trained Doc2Vec model with 100-dimensional document vectors.
model2 = Doc2Vec.load("large_doc_50000_iter50.bin")

train_array = np.zeros((25000, 100))
train_label = np.zeros((25000, 1))
test_array = np.zeros((25000, 100))
test_label = np.zeros((25000, 1))

# Document vectors are stored in corpus order:
# 0-12499 train-pos, 12500-24999 train-neg, 25000-37499 test-pos, 37500-49999 test-neg.
for i in range(12500):
    train_array[i] = model2.docvecs[i]
    train_label[i] = 1
    train_array[i + 12500] = model2.docvecs[i + 12500]
    train_label[i + 12500] = 0
    test_array[i] = model2.docvecs[i + 25000]
    test_label[i] = 1
    test_array[i + 12500] = model2.docvecs[i + 37500]
    test_label[i + 12500] = 0

print(train_array[0], train_label[0])
print(test_array[0], test_label[0])

test_classifier_error(train_array, train_label, test_array, test_label)
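
The script assumes a Doc2Vec model has already been trained and saved as large_doc_50000_iter50.bin; that training step is not part of this gist. The sketch below shows one plausible way to produce such a model with a recent gensim release from the documents returned by read_sentimentDocs(). The 100-dimensional vector size matches the arrays above, but every other hyperparameter and the output filename are illustrative placeholders, not the settings behind the original file.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

train_docs, test_docs, doc_list = read_sentimentDocs()

# Doc2Vec trains on TaggedDocument objects; the words/tags fields of the
# SentimentDocument namedtuples map onto them directly. All 100k documents
# are used here, since the unsupervised step does not need sentiment labels.
corpus = [TaggedDocument(doc.words, doc.tags) for doc in doc_list]

model = Doc2Vec(
    vector_size=100,  # must match the 100-dim arrays used above
    window=10,        # placeholder context window
    min_count=2,      # placeholder vocabulary cutoff
    workers=4,
    epochs=20,        # placeholder; the original filename hints at 50 iterations
)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)
model.save("doc2vec_sentiment_example.bin")  # hypothetical output name

Because each document is tagged with its integer line number, the trained vectors are indexable in corpus order, which is exactly what the model2.docvecs[i] arithmetic in the loop above relies on.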