Skip to content

Instantly share code, notes, and snippets.

@robotcator
Created August 9, 2017 10:15
Show Gist options
  • Save robotcator/5efa77d0c7cf49a386320e6e04061f0e to your computer and use it in GitHub Desktop.
Save robotcator/5efa77d0c7cf49a386320e6e04061f0e to your computer and use it in GitHub Desktop.
Back-mapping learned document vectors for sentiment analysis
# coding=utf-8
from gensim.models import Doc2Vec
from sklearn.linear_model import LogisticRegression
from collections import namedtuple
import numpy as np
import gensim
def read_sentimentDocs():
    """Load the sentiment corpus from ``alldata-id.txt`` in the working directory.

    Every line of the file is one document.  The first whitespace-separated
    token is dropped (presumably a document id -- confirm against the data
    file) and the remaining tokens become the document's words.  The split
    and the sentiment label are derived purely from the zero-based line
    number:

    * lines 0-24999      -> ``'train'``
    * lines 25000-49999  -> ``'test'``
    * lines 50000-99999  -> ``'extra'``

    Within the first 50000 lines, each consecutive run of 12500 lines
    alternates between sentiment ``1.0`` and ``0.0``; later lines get
    ``None``.  A file with more than 100000 lines raises ``IndexError``
    because the lookup tables below have no entry for it.

    Returns:
        A ``(train_docs, test_docs, doc_list)`` tuple of lists of
        ``SentimentDocument`` namedtuples (fields ``words``, ``tags``,
        ``split``, ``sentiment``).  ``doc_list`` is a shallow copy of all
        documents in file order.
    """
    # Local import: the input is a plain UTF-8 text file, so the standard
    # library is sufficient and we avoid gensim.utils.smart_open, which
    # newer gensim releases no longer provide.  io.open accepts `encoding`
    # on both Python 2 and Python 3.
    import io

    SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')

    # Lookup tables indexed by line_no // 25000 and line_no // 12500.
    splits = ['train', 'test', 'extra', 'extra']
    sentiments = [1.0, 0.0, 1.0, 0.0, None, None, None, None]

    alldocs = []  # will hold all docs in original order
    with io.open('alldata-id.txt', encoding='utf-8') as alldata:
        for line_no, line in enumerate(alldata):
            tokens = line.split()  # already unicode: the file is opened in text mode
            words = tokens[1:]
            tags = [line_no]  # `tags = [tokens[0]]` would also work at extra memory cost
            split = splits[line_no // 25000]
            sentiment = sentiments[line_no // 12500]
            alldocs.append(SentimentDocument(words, tags, split, sentiment))

    train_docs = [doc for doc in alldocs if doc.split == 'train']
    test_docs = [doc for doc in alldocs if doc.split == 'test']
    doc_list = list(alldocs)  # separate list so callers can reshuffle per pass

    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
    return train_docs, test_docs, doc_list
def test_classifier_error(train, train_label, test, test_label):
    """Fit a logistic-regression classifier and report its score on held-out data.

    Args:
        train: feature matrix used to fit the classifier.
        train_label: labels matching the rows of ``train``.
        test: feature matrix used for evaluation.
        test_label: labels matching the rows of ``test``.

    Returns:
        The value of ``LogisticRegression.score(test, test_label)``
        (mean accuracy, per the scikit-learn documentation).  The score is
        also printed to stdout.
    """
    classifier = LogisticRegression()
    classifier.fit(train, train_label)
    score = classifier.score(test, test_label)
    # %-formatting inside print(...) emits the same text on Python 2 and 3;
    # the former `print "score :", score` statement is a SyntaxError on 3.
    print("score : %s" % score)
    return score
# Load two trained Doc2Vec models.  Judging by the file names, model1 was
# trained on a smaller document set than model2 -- confirm against how the
# models were produced.
model1 = Doc2Vec.load("small_doc_15000_iter50.bin")
model2 = Doc2Vec.load("large_doc_50000_iter50.bin")

n_small = model1.docvecs.count
n_large = model2.docvecs.count

# Doc vectors of the first n_small documents, taken from each model.
m1 = np.array([model1.docvecs[i] for i in range(n_small)])
m2 = np.array([model2.docvecs[i] for i in range(n_small)])

# Least-squares linear map from model2's vector space into model1's:
# tm minimises ||m2.dot(tm) - m1||.
tm = np.linalg.lstsq(m2, m1, rcond=-1)[0]

# Back-map the doc vectors that only model2 has into model1's space and
# append them below model1's own vectors.  One batched product replaces a
# per-row np.vstack, which recopied the growing matrix on every iteration.
if n_large > n_small:
    extra = np.array([model2.docvecs[i] for i in range(n_small, n_large)])
    m1 = np.vstack((m1, np.dot(extra, tm)))

# Row layout assumed by the slicing below (all indices into m1):
#   [0, 12500)      train, label 1
#   [12500, 25000)  train, label 0
#   [25000, 37500)  test,  label 1
#   [37500, 50000)  test,  label 0
n_per_class = 12500
n_split = 2 * n_per_class
if m1.shape[0] < 2 * n_split:
    raise ValueError(
        "need at least %d document vectors, got %d" % (2 * n_split, m1.shape[0])
    )

# Slice instead of copying row by row; the vector width follows the model
# rather than being fixed at 100.  float64 matches the dtype the previous
# np.zeros-based arrays had.
train_array = np.asarray(m1[:n_split], dtype=np.float64)
test_array = np.asarray(m1[n_split:2 * n_split], dtype=np.float64)

# 1-D label vectors (scikit-learn expects shape (n_samples,), not a column).
train_label = np.concatenate((np.ones(n_per_class), np.zeros(n_per_class)))
test_label = train_label.copy()

test_classifier_error(train_array, train_label, test_array, test_label)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment