Created
October 26, 2011 04:46
-
-
Save PyYoshi/1315461 to your computer and use it in GitHub Desktop.
Builds a corpus, TF-IDF, LSI and LDA models for Gensim from a Japanese Wikipedia dump. Quite slow, since the tokenization is implemented in pure Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import logging | |
| import sys | |
| import os.path | |
| import bz2 | |
| from gensim import utils | |
| from gensim.corpora import WikiCorpus, Dictionary | |
| from gensim.corpora.mmcorpus import MmCorpus | |
| from gensim.models import TfidfModel,LdaModel,LsiModel | |
| import MeCab | |
| from utils import Utils as ut | |
# Module-level logger for this corpus-building script.
logger = logging.getLogger('jawikicorpus')
logger.setLevel(logging.INFO)
# Shared MeCab tagger, created once at import time and used by jatokenize().
tagger = MeCab.Tagger()
# Default cap on dictionary size when no third CLI argument is given.
DEFAULT_DICT_SIZE = 100000
# Articles shorter than this many characters are pruned (mostly redirects).
ARTICLE_MIN_CHARS = 500
def filter_wiki(raw):
    """Decode *raw* bytes to unicode, resolve HTML entities, and strip wiki markup.

    NOTE(review): decode_htmlentities is applied twice, apparently on purpose —
    the original comment (' ' --> '\xa0') suggests the dump contains
    double-encoded entities; confirm before simplifying.
    """
    decoded = utils.to_unicode(raw, 'utf-8', errors='ignore')
    decoded = utils.decode_htmlentities(decoded)
    decoded = utils.decode_htmlentities(decoded)  # ' ' --> '\xa0'
    return ut.remove_markup(decoded)
def jatokenize(text):
    """Yield the lowercased surface form of every noun in *text*.

    Walks the linked list of morpheme nodes produced by the module-level
    MeCab ``tagger`` and keeps only nodes whose first feature field is
    the noun part-of-speech tag.
    """
    node = tagger.parseToNode(text.encode('utf-8')).next
    while node:
        part_of_speech = node.feature.split(',')[0]
        if part_of_speech == '名詞':
            yield node.surface.lower()
        node = node.next
def tokenize(content):
    """Return the noun tokens of *content*, dropping '_'-prefixed specials."""
    keep = lambda tok: not tok.startswith('_')
    return [tok for tok in jatokenize(content) if keep(tok)]
class JaWikiCorpus(WikiCorpus):
    """WikiCorpus subclass that tokenizes Japanese article text with MeCab."""

    def getArticles(self, return_raw=False):
        """Iterate over articles in the bz2-compressed wiki dump ``self.fname``.

        Parameters:
            return_raw: if True, yield each article's filtered plain text;
                otherwise yield its list of noun tokens (via tokenize()).

        Articles shorter than ARTICLE_MIN_CHARS characters (mostly
        redirects) are pruned. Side effect: caches the number of kept
        articles in ``self.numDocs``.
        """
        articles, articles_all = 0, 0
        intext, positions = False, 0
        lines = []  # guard: defined even if the dump starts mid-<text>
        for line in bz2.BZ2File(self.fname):  # lineno was unused; enumerate dropped
            if line.startswith(' <text'):
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>')  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                # BUGFIX: original called filter_Wiki (undefined name, NameError
                # at runtime); the module-level function is filter_wiki.
                text = filter_wiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                    positions += len(result)
                    yield result
        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" %
                    (articles, positions, articles_all))
        self.numDocs = articles  # cache corpus length
if __name__ == '__main__':
    # Usage: script.py <wiki_dump.xml.bz2> <output_prefix> [keep_words]
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    input, output = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    # Build the dictionary and bag-of-words corpus from the wiki dump.
    wiki = JaWikiCorpus(input, keep_words=keep_words)
    wiki.dictionary.save_as_text(output + '_wordids.txt')
    wiki.save(output)
    MmCorpus.serialize(output + '_bow.mm', wiki, progress_cnt=10000)
    del wiki  # free memory before training the models below
    ### Example processing below ##
    id2token = Dictionary.load_from_text(output + '_wordids.txt')
    mm = MmCorpus(output + '_bow.mm')
    # Build the TF/IDF model
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
    MmCorpus.save_corpus(output + '_tfidf.mm', tfidf[mm],progress_cnt=10000)
    # Build the LSI model
    lsi = LsiModel(mm,id2word=id2token,num_topics=500)
    lsi.save(output+'_lsi.model')
    # Build the LDA model (original comment repeated "LSI" here — typo)
    lda = LdaModel(mm,id2word=id2token,num_topics=300)
    lda.save(output+'_lda.model')
    logging.info("finished running %s" % program)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment