Last active
June 22, 2022 14:36
-
-
Save maxbellec/85d90d3d7f2f96589f1517e5c4567dc3 to your computer and use it in GitHub Desktop.
Create Word2Vec from wikipedia with gensim
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import inspect
import logging
import multiprocessing

from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import TfidfModel
from gensim.models.word2vec import Word2Vec
# Logging is important to follow the progress of the long-running gensim calls.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

WIKI_DUMP_PATH = 'data/enwiki-20170101-pages-articles-multistream.xml.bz2'

# Build the corpus without lemmatization. Newer gensim releases no longer
# accept the ``lemmatize`` argument, so retry without it only when the
# constructor rejects it; older releases keep the explicit ``lemmatize=False``.
try:
    wiki = WikiCorpus(WIKI_DUMP_PATH, lemmatize=False)
except (TypeError, NotImplementedError):
    wiki = WikiCorpus(WIKI_DUMP_PATH)
tfidf = TfidfModel(wiki)

# Save for persistence.
wiki.save('wiki.corpus')
tfidf.save('wiki.tfidf.model')
# word2vec
class MySentences(object):
    """Re-iterable stream of tokenized articles, one list of ``str`` per article.

    Each call to ``iter()`` starts a fresh pass over ``get_texts()``, so the
    same instance can be consumed more than once.

    Args:
        corpus: Any object exposing ``get_texts()`` that yields iterables of
            tokens. When omitted, the module-level ``wiki`` corpus is used.
    """

    def __init__(self, corpus=None):
        # ``None`` means "look up the module-level ``wiki`` at iteration time".
        self._corpus = corpus

    def __iter__(self):
        corpus = wiki if self._corpus is None else self._corpus
        for text in corpus.get_texts():
            # Tokens may arrive as bytes or as str depending on the gensim
            # version; decode only bytes so str tokens pass through unchanged
            # instead of failing on a missing ``.decode``.
            yield [
                word.decode() if isinstance(word, bytes) else word
                for word in text
            ]
sentences = MySentences()

# The embedding-dimension keyword is ``vector_size`` in newer gensim releases
# and ``size`` in older ones; use whichever the installed Word2Vec accepts.
if 'vector_size' in inspect.signature(Word2Vec).parameters:
    size_key = 'vector_size'
else:
    size_key = 'size'

params = {
    size_key: 300,
    'window': 10,
    'min_count': 40,
    # Use all cores but one, and never fewer than one worker.
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'sample': 1e-3,
}
word2vec = Word2Vec(sentences, **params)
word2vec.save('wiki.word2vec.model')
Also, with newer versions of Gensim, rename the `size` parameter to `vector_size` on line 23, and remove the `lemmatize` parameter on line 11.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
There is a typo on line 14: the closing quotation mark is missing.