# coding: utf-8
import logging
import os.path
import sys
import gensim.corpora.wikicorpus as wikicorpus
from gensim.corpora import Dictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
from gensim.utils import to_unicode
import MeCab
# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
# Single module-level MeCab tagger shared by every tokenize_ja() call.
tagger = MeCab.Tagger()


def tokenize_ja(text):
    """Yield the lower-cased surface form of each noun MeCab finds in *text*.

    *text* is decoded with ``to_unicode`` as UTF-8 (undecodable bytes are
    ignored) and parsed with the module-level ``tagger``.  The resulting
    node list is walked from the first node until ``node.next`` is falsy.

    A node is treated as a noun when the first comma-separated field of
    its ``feature`` string equals '名詞' ("noun").
    NOTE(review): this assumes an IPADIC-style feature layout where the
    first field is the part of speech -- confirm for the dictionary in use.
    """
    # Keep the decoded string bound to a local for the whole walk.
    # NOTE(review): some MeCab Python bindings are reported to hand back
    # corrupted ``surface`` values when the parsed string is a temporary
    # that gets collected while nodes are still in use; holding a reference
    # here is a cheap guard -- confirm whether the installed binding needs it.
    decoded = to_unicode(text, encoding='utf8', errors='ignore')
    node = tagger.parseToNode(decoded)
    while node:
        if node.feature.split(',')[0] == '名詞':
            yield node.surface.lower()
        # Advance to the next node; without this the loop never terminates.
        node = node.next
def tokenize(content):
    """Return the filtered list of noun tokens extracted from *content*.

    Tokens are produced by ``tokenize_ja``.  A token is kept only when its
    length is between 2 and 15 (inclusive) and it does not start with an
    underscore; each kept token is passed through ``to_unicode``.
    """
    return [
        to_unicode(token) for token in tokenize_ja(content)
        if 2 <= len(token) <= 15 and not token.startswith('_')
    ]
if __name__ == '__main__':
    # Build bag-of-words and TF-IDF corpora from a Wikipedia dump.
    #
    # Command line: <program> WIKI_DUMP OUTPUT_PREFIX
    # Files written (all paths start with OUTPUT_PREFIX):
    #   _bow.mm           bag-of-words matrix (plus the metadata file that
    #                     MmCorpus.serialize writes when metadata=True)
    #   _wordids.txt.bz2  id -> word mapping
    #   .tfidf_model      the fitted TfidfModel
    #   _tfidf.mm         TF-IDF matrix
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        # The module has no docstring, so print an explicit usage line
        # instead of formatting __doc__ (which would be None).
        print('usage: %s WIKI_DUMP OUTPUT_PREFIX' % program)
        sys.exit(1)
    src, dst = sys.argv[1], sys.argv[2]

    # Upper bound on the vocabulary size kept by filter_extremes below.
    # NOTE(review): 100000 mirrors the default used by gensim's own
    # make_wikicorpus script -- confirm this is the intended value.
    DEFAULT_DICT_SIZE = 100000

    # Replace gensim's tokenizer with the MeCab-based one defined in this
    # file before the corpus is constructed.
    # NOTE(review): this relies on WikiCorpus looking up
    # ``wikicorpus.tokenize`` at call time -- confirm for the installed
    # gensim version.
    wikicorpus.tokenize = tokenize
    wiki = WikiCorpus(src)

    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)

    # Serializing with metadata=True expects the corpus to yield
    # (bow, metadata) pairs, so switch metadata output on first.
    wiki.metadata = True

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(dst + '_bow.mm', wiki, progress_cnt=10000, metadata=True)
    wiki.dictionary.save_as_text(dst + '_wordids.txt.bz2')

    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(dst + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(dst + '_bow.mm')

    # build tfidf
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(dst + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # (a distinct file name, so the bag-of-words matrix is not overwritten)
    MmCorpus.serialize(dst + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info('finished running %s', program)
Jul 21, 2017

# Print the ten most probable words of the dominant LDA topic for a few
# Wikipedia article titles.
#
# NOTE(review): ``dst`` (path of a saved LdaModel) and ``prefix`` (the output
# prefix the corpus files were written with) are not defined in this snippet;
# they must be provided by the surrounding code -- confirm their origin.
from gensim.corpora import MmCorpus
from gensim.models import LdaModel
from gensim.utils import unpickle

lda = LdaModel.load(dst)

# load TFIDF (BoW) of each of all Wikipedia articles
# NOTE(review): the '_tfidf.mm' suffix follows gensim's make_wikicorpus
# naming -- confirm it matches the name the corpus was written under.
tfidf = MmCorpus(prefix + '_tfidf.mm')

# get title to document index mapping
# MmCorpus.serialize(..., metadata=True) stores the per-document metadata
# next to the matrix file with a '.metadata.cpickle' suffix.
docno2metadata = unpickle(prefix + '_bow.mm.metadata.cpickle')
title2docno = {tup_title[1]: int(docno) for docno, tup_title in docno2metadata.items()}

titles = ['ビール', 'カブトムシ', '海', '夏祭り']
for title in titles:
    # (topic_id, probability) pairs for the article's TF-IDF vector
    topics = lda[tfidf[title2docno[title]]]
    # pick the id of the most probable topic
    topic = max(topics, key=lambda t: t[1])[0]
    print('=== %s (topic %d) ===' % (title, topic))
    for word, p_word in lda.show_topic(topic, topn=10):
        print('%.5f\t%s' % (p_word, word))
=== ビール (topic 99) ===
0.04528 植物
0.02466 料理
0.02348 栽培
0.01843 品種
0.01610 ビール
0.01584 醸造
0.01410 ワイン
0.01373 kt
0.01318 生産
0.01272 農業
=== カブトムシ (topic 46) ===
0.00462 顕微鏡
0.00352 地震
0.00339 '''()
0.00303 障害
0.00268 生育
0.00248 哲学
0.00238 発生
0.00236 意味
0.00230 効果
0.00224 患者
=== 海 (topic 32) ===
0.02139 フェリー
0.01960 航路
0.01791 就航
0.01597 運航
0.01130 建造
0.01113 船舶
0.00976 諸島
0.00939 海洋
0.00835 造船
0.00803 ハワイ
=== 夏祭り (topic 62) ===
0.01825 寺院
0.01744 日蓮宗
0.01113 神社
0.00987 文化財
0.00772 大字
0.00706 古墳
0.00676 共編
0.00670 辞典
0.00647 学区
0.00625 角川

