# -*- coding: utf-8 -*-
"""USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]

Build a gensim bag-of-words and TF-IDF corpus from a Japanese Wikipedia bz2 dump,
tokenizing article text with MeCab (nouns only)."""
import bz2
import logging
import os.path
import sys

import MeCab
from gensim.corpora.wikicorpus import filter_wiki
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel

# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000
ARTICLE_MIN_CHARS = 500  # minimum article length in characters (assumed value, matching older gensim defaults)

tagger = MeCab.Tagger()  # MeCab tagger used by jatokenize (assumes mecab-python with its default dictionary)


def jatokenize(text):
    # parseToNode() returns the BOS node; `.next` skips it (mecab-python, Python 2)
    node = tagger.parseToNode(text.encode('utf-8')).next
    while node:
        # keep only nouns (名詞), lowercased
        if node.feature.split(',')[0] == '名詞':
            yield node.surface.lower()
        node = node.next


def tokenize(content):
    return [token for token in jatokenize(content) if not token.startswith('_')]
class JaWikiCorpus(WikiCorpus):
    def getArticles(self, return_raw=False):
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'):
                # start of a new <text> element; keep everything after the opening tag
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>')  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result
        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" %
                    (articles, positions, articles_all))
        self.numDocs = articles  # cache corpus length
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = JaWikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)  # use the MeCab-based corpus defined above
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = JaWikiCorpus(inp, lemmatize=lemmatize)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)