-
-
Save balajikvijayan/9f7ab00f9bfd0bf56b14 to your computer and use it in GitHub Desktop.
from gensim import models

# Hard-coded toy corpus: two tokenized documents, each carrying a unique
# string tag that is later used to look up its document vector.
sentence = models.doc2vec.LabeledSentence(
    tags=["SENT_0"],
    words=[u'so`bme', u'words', u'here'],
)
sentence1 = models.doc2vec.LabeledSentence(
    tags=["SENT_1"],
    words=[u'here', u'we', u'go'],
)
sentences = [sentence, sentence1]
class LabeledLineSentence(object):
    """Iterate over a text file, yielding one labeled sentence per line.

    Each line is split on whitespace into words and tagged ``SENT_<n>``,
    where ``<n>`` is the zero-based index of the line in the file.

    Args:
        filename: Path of the text file to read (one sentence per line).
    """

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        # The file is opened afresh on every call, so the same object can be
        # iterated more than once; ``with`` closes the handle when the
        # generator finishes or is discarded.
        with open(self.filename) as corpus:
            for uid, line in enumerate(corpus):
                # Use the ``tags`` keyword and the fully qualified class name,
                # consistent with how the example sentences in this file are
                # built (only ``models`` is imported at module level).
                yield models.doc2vec.LabeledSentence(
                    words=line.split(), tags=['SENT_%s' % uid])
# Train a Doc2Vec model on ``sentences``, save it, reload it, and query the
# most similar document vectors for each tag.
# alpha == min_alpha disables gensim's internal learning-rate decay; the rate
# is lowered by hand after every pass in the loop below.
model = models.Doc2Vec(alpha=.025, min_alpha=.025, min_count=1)
model.build_vocab(sentences)

for epoch in range(10):
    # Newer gensim releases raise ValueError unless train() is told the
    # corpus size and the number of passes. Each call here is one pass,
    # because the surrounding loop controls the epochs.
    model.train(sentences, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay

model.save("my_model.doc2vec")
model_loaded = models.Doc2Vec.load('my_model.doc2vec')

# print() with a single argument behaves the same on Python 2 and Python 3.
print(model.docvecs.most_similar(["SENT_0"]))
print(model_loaded.docvecs.most_similar(["SENT_1"]))
renaud
commented
Jul 19, 2016
•
A very basic question from a newbie:
I get a syntax error on the "print" line:
print model.docvecs.most_similar(["SENT_0"]) ^ SyntaxError: invalid syntax
any clue for fixing it ?
You are probably running Python 3, where print is a function rather than a statement. Wrap the argument of print in parentheses.
e.g. print("some string")
What is the point of:
class LabeledLineSentence(object):
def __init__(self, filename):
self.filename = filename
def __iter__(self):
for uid, line in enumerate(open(filename)):
yield LabeledSentence(words=line.split(), labels=['SENT_%s' % uid])
If you aren't using it?
@hswick You can use this class if you wish to read the sentences from a file. It is not used in the example since the example hard codes 2 sentences and uses them.
I have an error on line 21: model.train
"You must specify either total_examples or total_words, for proper alpha and progress calculations. "
ValueError: You must specify either total_examples or total_words, for proper alpha and progress calculations. The usual value is total_examples=model.corpus_count.
Please help me
How can I change "print model.docvecs.most_similar(["SENT_0"]) " if I am using the LabeledLineSentence class?