Last active
June 22, 2022 14:36
-
-
Save maxbellec/85d90d3d7f2f96589f1517e5c4567dc3 to your computer and use it in GitHub Desktop.
Create Word2Vec from wikipedia with gensim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# logging is important to get the state of the functions
import logging
import multiprocessing

import gensim
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import TfidfModel
from gensim.models.word2vec import Word2Vec
# Show INFO-level log records with a timestamp so the long-running corpus and
# model steps report their progress.
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
logging.root.setLevel(level=logging.INFO)

# gensim 4 no longer accepts the `lemmatize` argument of WikiCorpus, so it is
# only passed (as False, exactly as before) to earlier releases.
wiki_kwargs = {} if int(gensim.__version__.split('.')[0]) >= 4 else {'lemmatize': False}
wiki = WikiCorpus('data/enwiki-20170101-pages-articles-multistream.xml.bz2', **wiki_kwargs)
tfidf = TfidfModel(wiki)

# save for persistence
wiki.save('wiki.corpus')
tfidf.save('wiki.tfidf.model')
# word2vec
class MySentences(object):
    """Restartable iterable yielding each corpus text as a list of str tokens.

    Every call to ``iter()`` starts a fresh pass over ``get_texts()``, so the
    object can be iterated more than once.

    Args:
        corpus: Object exposing ``get_texts()``. When omitted, the
            module-level ``wiki`` corpus is looked up at iteration time.
    """

    def __init__(self, corpus=None):
        self._corpus = corpus

    def __iter__(self):
        corpus = wiki if self._corpus is None else self._corpus
        for text in corpus.get_texts():
            # Tokens may arrive as UTF-8 bytes or already as str depending on
            # the corpus implementation; only bytes need decoding.
            yield [
                word.decode('utf-8') if isinstance(word, bytes) else word
                for word in text
            ]
# Stream the tokenized articles into Word2Vec and persist the trained model.
sentences = MySentences()

# gensim 4 renamed Word2Vec's `size` keyword to `vector_size`; pick the name
# the installed release understands.
dimension_key = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
params = {
    dimension_key: 300,
    'window': 10,
    'min_count': 40,
    # Leave one core free, but always use at least one worker.
    'workers': max(1, multiprocessing.cpu_count() - 1),
    'sample': 1e-3,
}
word2vec = Word2Vec(sentences, **params)
word2vec.save('wiki.word2vec.model')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Also, with newer versions of Gensim (4.0 and later), you should rename the `size` parameter to `vector_size` on line 23 and remove the `lemmatize` parameter on line 11.