Created
August 9, 2017 10:15
-
-
Save robotcator/5efa77d0c7cf49a386320e6e04061f0e to your computer and use it in GitHub Desktop.
Back-mapping learned document vectors for sentiment analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
from gensim.models import Doc2Vec | |
from sklearn.linear_model import LogisticRegression | |
from collections import namedtuple | |
import numpy as np | |
import gensim | |
def read_sentimentDocs():
    """Load the 'alldata-id.txt' corpus and split it into train/test docs.

    Each line of the file is one document: a doc-id token followed by the
    document's words.  The layout is assumed to be 25k train, 25k test and
    25k extra documents, each 25k block half positive / half negative
    (NOTE(review): confirm against the actual data file).

    Returns:
        (train_docs, test_docs, doc_list) where each element is a list of
        SentimentDocument namedtuples; doc_list is a shallow copy of all
        docs kept for reshuffling per training pass.
    """
    SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
    # Per-25k-block split names and per-12.5k-block sentiment labels.
    split_names = ['train', 'test', 'extra', 'extra']
    sentiment_labels = [1.0, 0.0, 1.0, 0.0, None, None, None, None]
    alldocs = []  # all docs, kept in original file order
    with gensim.utils.smart_open('alldata-id.txt', encoding='utf-8') as alldata:
        for line_no, line in enumerate(alldata):
            tokens = gensim.utils.to_unicode(line).split()
            # tokens[0] is the doc id; `[tokens[0]]` would also work as the
            # tag at extra memory cost, so the line number is used instead.
            alldocs.append(SentimentDocument(
                tokens[1:],                            # words
                [line_no],                             # tags
                split_names[line_no // 25000],         # 25k train, 25k test, 25k extra
                sentiment_labels[line_no // 12500],    # [12.5k pos, 12.5k neg]*2, then unknown
            ))
    train_docs = [d for d in alldocs if d.split == 'train']
    test_docs = [d for d in alldocs if d.split == 'test']
    doc_list = alldocs[:]  # shallow copy, for reshuffling per pass
    print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))
    return train_docs, test_docs, doc_list
def test_classifier_error(train, train_label, test, test_label):
    """Fit a logistic-regression classifier on `train` and score it on `test`.

    Args:
        train: array-like of shape (n_train, n_features) — training vectors.
        train_label: array-like of n_train labels.
        test: array-like of shape (n_test, n_features) — test vectors.
        test_label: array-like of n_test labels.

    Returns:
        float: mean accuracy of the fitted classifier on the test set.
    """
    classifier = LogisticRegression()
    classifier.fit(train, train_label)
    score = classifier.score(test, test_label)
    # Fix: the original used a Python-2 `print` statement here, which is a
    # syntax error under Python 3 and inconsistent with the print() call
    # already used elsewhere in this file.
    print("score :", score)
    return score
# Back-map document vectors from a large Doc2Vec model into the space of a
# small one via a least-squares linear map, then evaluate sentiment
# classification on the mapped vectors.  The first `l` docvecs of the two
# models are assumed to correspond to the same documents (TODO confirm
# against how the models were trained).
model1 = Doc2Vec.load("small_doc_15000_iter50.bin")
model2 = Doc2Vec.load("large_doc_50000_iter50.bin")

l = model1.docvecs.count   # docs present in the small model
l2 = model2.docvecs.count  # docs in the large model (presumably l2 >= l)

# Stack the shared docvecs into (l, dim) matrices.
m1 = np.array([model1.docvecs[i] for i in range(l)])
m2 = np.array([model2.docvecs[i] for i in range(l)])

# Solve for tm minimizing ||m2 @ tm - m1||; rcond=-1 keeps the legacy
# lstsq small-singular-value cutoff behaviour.
tm = np.linalg.lstsq(m2, m1, -1)[0]

# Back-map the docvecs that exist only in the large model and append them
# to m1.  Fix: building the extra block once and doing a single vstack
# replaces the original per-row np.vstack inside a loop, which copied the
# whole growing array every iteration (O(n^2)).
extra = np.array([model2.docvecs[i] for i in range(l, l2)])
if len(extra):
    m1 = np.vstack((m1, np.dot(extra, tm)))

# Assumed corpus layout (TODO confirm): rows 0..12499 positive train,
# 12500..24999 negative train, 25000..37499 positive test,
# 37500..49999 negative test; vectors are 100-dimensional.
train_array = np.zeros((25000, 100))
train_label = np.zeros((25000, 1))
test_array = np.zeros((25000, 100))
test_label = np.zeros((25000, 1))

# Slice assignment fills the same cells the original per-index loop did.
train_array[:12500] = m1[:12500]
train_label[:12500] = 1
train_array[12500:] = m1[12500:25000]
train_label[12500:] = 0
test_array[:12500] = m1[25000:37500]
test_label[:12500] = 1
test_array[12500:] = m1[37500:50000]
test_label[12500:] = 0

test_classifier_error(train_array, train_label, test_array, test_label)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment