Created
October 26, 2011 04:46
-
-
Save PyYoshi/1315461 to your computer and use it in GitHub Desktop.
Builds a corpus, TF-IDF, LSI and LDA models for Gensim from a Japanese Wikipedia dump. Quite slow, since the tokenization is implemented in pure Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import logging | |
| import sys | |
| import os.path | |
| import bz2 | |
| from gensim import utils | |
| from gensim.corpora import WikiCorpus, Dictionary | |
| from gensim.corpora.mmcorpus import MmCorpus | |
| from gensim.models import TfidfModel,LdaModel,LsiModel | |
| import MeCab | |
| from utils import Utils as ut | |
# Module-level logger for this corpus-building script.
logger = logging.getLogger('jawikicorpus')
logger.setLevel(logging.INFO)
# Shared MeCab tagger, created once at import time and used by jatokenize().
tagger = MeCab.Tagger()
# Default cap on dictionary size when no third CLI argument is given.
DEFAULT_DICT_SIZE = 100000
# Articles shorter than this many characters are pruned (mostly redirects).
ARTICLE_MIN_CHARS = 500
def filter_wiki(raw):
    """Decode *raw* bytes to unicode, resolve HTML entities, and strip wiki markup.

    NOTE(review): decode_htmlentities is applied twice, apparently on purpose —
    the original comment (' ' --> '\xa0') suggests the dump contains
    double-encoded entities; confirm before simplifying.
    """
    decoded = utils.to_unicode(raw, 'utf-8', errors='ignore')
    decoded = utils.decode_htmlentities(decoded)
    decoded = utils.decode_htmlentities(decoded)  # ' ' --> '\xa0'
    return ut.remove_markup(decoded)
def jatokenize(text):
    """Yield the lowercased surface form of every noun in *text*.

    Walks the linked list of morpheme nodes produced by the module-level
    MeCab ``tagger`` and keeps only nodes whose first feature field is
    the noun part-of-speech tag.
    """
    node = tagger.parseToNode(text.encode('utf-8')).next
    while node:
        part_of_speech = node.feature.split(',')[0]
        if part_of_speech == '名詞':
            yield node.surface.lower()
        node = node.next
def tokenize(content):
    """Return the noun tokens of *content*, dropping '_'-prefixed specials."""
    keep = lambda tok: not tok.startswith('_')
    return [tok for tok in jatokenize(content) if keep(tok)]
class JaWikiCorpus(WikiCorpus):
    """WikiCorpus subclass that tokenizes Japanese article text with MeCab."""

    def getArticles(self, return_raw=False):
        """Iterate over articles in the bz2-compressed wiki dump ``self.fname``.

        Parameters:
            return_raw: if True, yield each article's filtered plain text;
                otherwise yield its list of noun tokens (via tokenize()).

        Articles shorter than ARTICLE_MIN_CHARS characters (mostly
        redirects) are pruned. Side effect: caches the number of kept
        articles in ``self.numDocs``.
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        lines = []  # guard: defined even if the dump starts mid-<text>
        for line in bz2.BZ2File(self.fname):  # lineno was unused; enumerate dropped
            if line.startswith(' <text'):
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>')  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                # BUGFIX: original called filter_Wiki (undefined name, NameError
                # at runtime); the module-level function is filter_wiki.
                text = filter_wiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                    positions += len(result)
                    yield result
        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" %
                    (articles, positions, articles_all))
        self.numDocs = articles  # cache corpus length
if __name__ == '__main__':
    # Usage: script.py <wiki_dump.xml.bz2> <output_prefix> [keep_words]
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    # Build the dictionary and bag-of-words corpus from the wiki dump.
    wiki = JaWikiCorpus(input, keep_words=keep_words)
    wiki.dictionary.save_as_text(output + '_wordids.txt')
    wiki.save(output)
    MmCorpus.serialize(output + '_bow.mm', wiki, progress_cnt=10000)
    del wiki  # free memory before training the models below
    ### Example processing below ##
    id2token = Dictionary.load_from_text(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')
    # Build the TF/IDF model
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
    MmCorpus.save_corpus(output + '_tfidf.mm', tfidf[mm],progress_cnt=10000)
    # Build the LSI model
    lsi = LsiModel(mm,id2word=id2token,num_topics=500)
    lsi.save(output+'_lsi.model')
    # Build the LDA model (original comment repeated "LSI" here — typo)
    lda = LdaModel(mm,id2word=id2token,num_topics=300)
    lda.save(output+'_lda.model')
    logging.info("finished running %s" % program)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment