Skip to content

Instantly share code, notes, and snippets.

@lzbgt
Forked from balajikvijayan/gensim doc2vec tutorial
Created April 17, 2018 09:37
Show Gist options
  • Save lzbgt/2a6b4c3ab17f1ca347723974966ccb26 to your computer and use it in GitHub Desktop.
Save lzbgt/2a6b4c3ab17f1ca347723974966ccb26 to your computer and use it in GitHub Desktop.
from gensim import models
# Two tiny in-memory training documents. Each pairs a list of word tokens
# with a unique tag; the tags ("SENT_0", "SENT_1") are the keys used when
# querying the trained document vectors further down.
# TaggedDocument is used instead of the deprecated LabeledSentence alias;
# it takes the same ``words`` / ``tags`` fields.
sentence = models.doc2vec.TaggedDocument(
    words=[u'some', u'words', u'here'], tags=["SENT_0"])
sentence1 = models.doc2vec.TaggedDocument(
    words=[u'here', u'we', u'go'], tags=["SENT_1"])
sentences = [sentence, sentence1]
class LabeledLineSentence(object):
    """Stream one tagged document per line of a text file.

    Line ``i`` (0-based) is split on whitespace into words and tagged
    ``'SENT_<i>'``. The file is reopened on every iteration, so an instance
    can be iterated more than once.
    """

    def __init__(self, filename):
        """Store the path of the corpus file; nothing is read yet."""
        self.filename = filename

    def __iter__(self):
        """Yield a ``TaggedDocument`` for each line of ``self.filename``."""
        # Read the path stored on the instance (a bare ``filename`` is not
        # defined in this scope) and close the file once iteration ends.
        with open(self.filename) as handle:
            for uid, line in enumerate(handle):
                # Qualified through ``models`` (the only gensim name imported)
                # and using ``tags=``, matching the sample documents above.
                yield models.doc2vec.TaggedDocument(
                    words=line.split(), tags=['SENT_%s' % uid])
# Train a Doc2Vec model on the sample documents with a manually stepped
# learning rate: alpha == min_alpha means no decay inside a single train()
# call, and the loop lowers alpha by hand between passes instead.
model = models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1)
model.build_vocab(sentences)
for epoch in range(10):
    # Recent gensim releases require the corpus size and epoch count to be
    # passed explicitly. Each call is one pass; the loop supplies the ten.
    model.train(sentences, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

# Save the model, load it back, and query both copies by document tag.
model.save("my_model.doc2vec")
model_loaded = models.Doc2Vec.load('my_model.doc2vec')
# Parenthesised print works on both Python 2 and Python 3.
print(model.docvecs.most_similar(["SENT_0"]))
print(model_loaded.docvecs.most_similar(["SENT_1"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment