Created
August 9, 2017 10:15
-
-
Save robotcator/5efa77d0c7cf49a386320e6e04061f0e to your computer and use it in GitHub Desktop.
Back-mapping learned document vectors for sentiment analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
from gensim.models import Doc2Vec | |
from sklearn.linear_model import LogisticRegression | |
from collections import namedtuple | |
import numpy as np | |
import gensim | |
def read_sentimentDocs():
    """Load the 'alldata-id.txt' corpus and split it into train/test docs.

    Each line of the file is one document: a doc-id token followed by the
    document's words.  The layout is assumed to be 25k train, 25k test and
    25k extra documents, each 25k block half positive / half negative
    (NOTE(review): confirm against the actual data file).

    Returns:
        (train_docs, test_docs, doc_list) where each element is a list of
        SentimentDocument namedtuples; doc_list is a shallow copy of all
        docs kept for reshuffling per training pass.
    """
    SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
    # Per-25k-block split names and per-12.5k-block sentiment labels.
    split_names = ['train', 'test', 'extra', 'extra']
    sentiment_labels = [1.0, 0.0, 1.0, 0.0, None, None, None, None]
    alldocs = []  # all docs, kept in original file order
    with gensim.utils.smart_open('alldata-id.txt', encoding='utf-8') as alldata:
        for line_no, line in enumerate(alldata):
            tokens = gensim.utils.to_unicode(line).split()
            # tokens[0] is the doc id; `[tokens[0]]` would also work as the
            # tag at extra memory cost, so the line number is used instead.
            alldocs.append(SentimentDocument(
                tokens[1:],                            # words
                [line_no],                             # tags
                split_names[line_no // 25000],         # 25k train, 25k test, 25k extra
                sentiment_labels[line_no // 12500],    # [12.5k pos, 12.5k neg]*2, then unknown
            ))
    train_docs = [d for d in alldocs if d.split == 'train']
    test_docs = [d for d in alldocs if d.split == 'test']
    doc_list = alldocs[:]  # shallow copy, for reshuffling per pass
    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
    return train_docs, test_docs, doc_list
def test_classifier_error(train, train_label, test, test_label):
    """Fit a logistic-regression classifier on `train` and score it on `test`.

    Args:
        train: array-like of shape (n_train, n_features) — training vectors.
        train_label: array-like of n_train labels.
        test: array-like of shape (n_test, n_features) — test vectors.
        test_label: array-like of n_test labels.

    Returns:
        float: mean accuracy of the fitted classifier on the test set.
    """
    classifier = LogisticRegression()
    classifier.fit(train, train_label)
    score = classifier.score(test, test_label)
    # Fix: the original used a Python-2 `print` statement here, which is a
    # syntax error under Python 3 and inconsistent with the print() call
    # already used elsewhere in this file.
    print("score :", score)
    return score
# Back-map document vectors from a large Doc2Vec model into the space of a
# small one via a least-squares linear map, then evaluate sentiment
# classification on the mapped vectors.  The first `l` docvecs of the two
# models are assumed to correspond to the same documents (TODO confirm
# against how the models were trained).
model1 = Doc2Vec.load("small_doc_15000_iter50.bin")
model2 = Doc2Vec.load("large_doc_50000_iter50.bin")

l = model1.docvecs.count   # docs present in the small model
l2 = model2.docvecs.count  # docs in the large model (presumably l2 >= l)

# Stack the shared docvecs into (l, dim) matrices.
m1 = np.array([model1.docvecs[i] for i in range(l)])
m2 = np.array([model2.docvecs[i] for i in range(l)])

# Solve for tm minimizing ||m2 @ tm - m1||; rcond=-1 keeps the legacy
# lstsq small-singular-value cutoff behaviour.
tm = np.linalg.lstsq(m2, m1, -1)[0]

# Back-map the docvecs that exist only in the large model and append them
# to m1.  Fix: building the extra block once and doing a single vstack
# replaces the original per-row np.vstack inside a loop, which copied the
# whole growing array every iteration (O(n^2)).
extra = np.array([model2.docvecs[i] for i in range(l, l2)])
if len(extra):
    m1 = np.vstack((m1, np.dot(extra, tm)))

# Assumed corpus layout (TODO confirm): rows 0..12499 positive train,
# 12500..24999 negative train, 25000..37499 positive test,
# 37500..49999 negative test; vectors are 100-dimensional.
train_array = np.zeros((25000, 100))
train_label = np.zeros((25000, 1))
test_array = np.zeros((25000, 100))
test_label = np.zeros((25000, 1))

# Slice assignment fills the same cells the original per-index loop did.
train_array[:12500] = m1[:12500]
train_label[:12500] = 1
train_array[12500:] = m1[12500:25000]
train_label[12500:] = 0
test_array[:12500] = m1[25000:37500]
test_label[:12500] = 1
test_array[12500:] = m1[37500:50000]
test_label[12500:] = 0

test_classifier_error(train_array, train_label, test_array, test_label)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment