Skip to content

Instantly share code, notes, and snippets.

@PyYoshi
Created October 26, 2011 04:46
Show Gist options

  • Save PyYoshi/1315461 to your computer and use it in GitHub Desktop.
Gensim用のコーパス,TFIDF,LSIとLDAモデルの生成。Pythonによる実装なのでかなり遅い。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import sys
import os.path
import bz2
from gensim import utils
from gensim.corpora import WikiCorpus, Dictionary
from gensim.corpora.mmcorpus import MmCorpus
from gensim.models import TfidfModel,LdaModel,LsiModel
import MeCab
from utils import Utils as ut
# Module-level logger for this corpus-building script.
logger = logging.getLogger('jawikicorpus')
logger.setLevel(logging.INFO)
# Shared MeCab tagger instance used by jatokenize() for Japanese morphological analysis.
tagger = MeCab.Tagger()
# Vocabulary size kept when no explicit keep_words argument is given on the command line.
DEFAULT_DICT_SIZE = 100000
# Articles with fewer characters than this are pruned (mostly redirects).
ARTICLE_MIN_CHARS = 500
def filter_wiki(raw):
    """Decode raw wiki bytes to unicode, resolve HTML entities, and strip wiki markup."""
    unicode_text = utils.to_unicode(raw, 'utf-8', errors='ignore')
    # Entities are decoded twice so double-escaped forms (e.g. '&amp;nbsp;')
    # also resolve to their final character ('\xa0').
    decoded = utils.decode_htmlentities(utils.decode_htmlentities(unicode_text))
    return ut.remove_markup(decoded)
def jatokenize(text):
    """Yield the lowercased surface form of each noun ('名詞') morpheme in *text*.

    Uses the module-level MeCab tagger; the initial ``.next`` advances past the
    leading node returned by parseToNode (presumably the BOS marker — MeCab
    binding convention).
    """
    current = tagger.parseToNode(text.encode('utf-8')).next
    while current:
        # The first comma-separated field of `feature` is the part of speech.
        if current.feature.split(',')[0] == '名詞':
            yield current.surface.lower()
        current = current.next
def tokenize(content):
    """Return the noun tokens of *content*, excluding tokens that start with '_'."""
    kept = []
    for tok in jatokenize(content):
        if not tok.startswith('_'):
            kept.append(tok)
    return kept
class JaWikiCorpus(WikiCorpus):
    """WikiCorpus variant whose article text is tokenized for Japanese via MeCab."""

    def getArticles(self, return_raw=False):
        """Yield articles parsed from the bz2-compressed dump at ``self.fname``.

        Each yielded item is the filtered article text (when ``return_raw`` is
        True) or the token list produced by ``tokenize()``. Articles shorter
        than ARTICLE_MIN_CHARS (mostly redirects) are pruned. As a side effect,
        ``self.numDocs`` is set to the number of articles actually yielded.
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith(' <text'):
                intext = True
                line = line[line.find('>') + 1 : ]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>') # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                # BUG FIX: the original called filter_Wiki (capital W), which is
                # undefined and raised NameError; the helper is filter_wiki.
                text = filter_wiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text) # text into tokens here
                    positions += len(result)
                    yield result
        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" %
                    (articles, positions, articles_all))
        self.numDocs = articles # cache corpus length
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))
    program = os.path.basename(sys.argv[0])

    if len(sys.argv) < 3:
        # BUG FIX: the original printed globals()['__doc__'] % locals(), but this
        # module has no docstring, so __doc__ is None and '%' raised TypeError
        # instead of showing usage.
        print("Usage: %s <wiki-dump.xml.bz2> <output-prefix> [keep_words]" % program)
        sys.exit(1)

    # Renamed from 'input' to avoid shadowing the builtin.
    dump_path, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # Build the bag-of-words corpus and word-id mapping from the wiki dump.
    wiki = JaWikiCorpus(dump_path, keep_words=keep_words)
    wiki.dictionary.save_as_text(output + '_wordids.txt')
    wiki.save(output)
    MmCorpus.serialize(output + '_bow.mm', wiki, progress_cnt=10000)
    del wiki  # free memory

    ### Example downstream processing ###
    id2token = Dictionary.load_from_text(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')

    # Build the TF-IDF model.
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
    # Consistency: use serialize() as for the bag-of-words corpus above (the
    # original mixed serialize() and save_corpus(); serialize also writes the
    # document index alongside the .mm file).
    MmCorpus.serialize(output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    # Build the LSI model.
    lsi = LsiModel(mm, id2word=id2token, num_topics=500)
    lsi.save(output + '_lsi.model')

    # Build the LDA model (the original comment mislabeled this step as LSI).
    lda = LdaModel(mm, id2word=id2token, num_topics=300)
    lda.save(output + '_lda.model')

    logging.info("finished running %s" % program)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment