Last active
October 18, 2020 01:35
-
-
Save piskvorky/85015d52254df77cd23ca43bc2e8322d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Radim Rehurek <[email protected]>

# NOTE: the module docstring doubles as the usage message printed below. It is a raw
# string so that the backslash-n inside the example command is shown literally
# (as it must be typed) instead of being turned into a real line break.
r"""
Help script (template) for benchmarking. Run with:
/usr/bin/time --format "%E elapsed\n%Mk peak RAM" python gensim_benchmark.py ~/gensim-data/text9/text9.txt
"""

import logging
import sys

from gensim.models.word2vec import Text8Corpus, LineSentence
from gensim.models import FastText, Word2Vec, Doc2Vec, Phrases
from gensim import __version__

logger = logging.getLogger(__name__)


if __name__ == "__main__":
    logging.basicConfig(
        format='%(asctime)s [%(processName)s/%(process)d] [%(levelname)s] %(name)s:%(lineno)d: %(message)s',
        level=logging.INFO,
    )

    # The corpus path is the only (mandatory) command-line argument.
    if len(sys.argv) < 2:
        # Print the usage text verbatim. It contains literal `%E` / `%M` sequences
        # (format codes meant for /usr/bin/time), so pushing it through
        # `%`-interpolation raises instead of printing the help.
        print(__doc__)
        sys.exit(1)

    corpus = Text8Corpus(sys.argv[1])  # text8/text9 format from http://mattmahoney.net/dc/textdata.html

    # Template knob: the model class to benchmark. The other imported classes are
    # presumably here to be swapped in -- confirm the corpus format suits them first.
    cls = FastText

    # Train for a single epoch, then save the model under /tmp; the file name
    # carries the model class name and the installed gensim version.
    cls(corpus, workers=12, epochs=1).save(f'/tmp/{cls.__name__}.gensim{__version__}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment