A script that uses gensim to turn the Japanese Wikipedia dump data into a corpus (no longer maintained)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Since https://github.com/attardi/wikiextractor turned out to be easy to use and to
# handle Japanese properly, there are no plans to maintain this script any further.

import argparse
import codecs
import logging
import os.path
import re
import sys

from gensim.corpora import WikiCorpus
import gensim.corpora.wikicorpus
import MeCab

# The MeCab tagger is created in __main__ (optionally with a user dictionary directory).
tagger = None
re_beg_space = re.compile(r"^\s+", re.UNICODE)
re_end_space = re.compile(r"\s+$", re.UNICODE)

def ja_tokenize(text, lemmatize):
    # Split the text into sentences at the Japanese full stop and tokenize
    # each sentence with MeCab.  Returns a list of token lists.
    text = text.replace(u"。", u"。\n")
    data = []
    for line in text.split("\n"):
        line = re.sub(re_beg_space, '', line)
        line = re.sub(re_end_space, '', line)
        if len(line) == 0:
            continue
        line = line.encode('utf-8')
        node = tagger.parseToNode(line)
        tokens = []
        while node:
            if lemmatize:
                # IPADic feature fields: POS is the first field and the base
                # form (lemma) the seventh; '*' means no base form is available.
                features = node.feature.decode('utf-8').split(',')
                pos = features[0]
                lemma = features[6]
                if lemma == '*':
                    lemma = node.surface.decode('utf-8')
                tokens.append(lemma + '/' + pos)
            else:
                tokens.append(node.surface.decode('utf-8'))
            node = node.next
        # Drop the BOS/EOS pseudo-nodes that MeCab adds at both ends.
        if len(tokens) > 2:
            data.append(tokens[1:-1])
    return data

# Override gensim's file/image-link pattern so that the Japanese
# "ファイル:" and "画像:" prefixes are also removed.
gensim.corpora.wikicorpus.RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage|ファイル:|画像:)[^]]*(\]\])', re.UNICODE)
# Extra markup that gensim's filter_wiki() leaves behind on Japanese pages:
# {{lang|xx|...}} templates, <gallery> blocks, headings, emphasis, and list markers.
re_lang = re.compile(r"{{(?:(?:lang|interlang)\|)?[a-zA-Z][a-zA-Z]\|(.*?)}}", re.DOTALL | re.MULTILINE)
re_gallery = re.compile(r"<gallery[^>]*>.*?</gallery>", re.DOTALL)
re_head = re.compile(r"(=+)\s*(.*?)\s*\1", re.DOTALL | re.UNICODE)
re_emph = re.compile(r"'+(.*?)'+", re.DOTALL | re.UNICODE)
re_desc = re.compile(r"^[;:]\s*", re.UNICODE | re.MULTILINE)
re_item = re.compile(r"^[#\*]+\s*", re.UNICODE | re.MULTILINE)

def ja_filter_wiki(text):
    # Strip gallery blocks and language templates, run gensim's generic
    # wiki-markup filter, then remove headings, emphasis, and list markers.
    text = re.sub(re_gallery, '', text)
    text = re.sub(re_lang, '\\1', text)
    text = gensim.corpora.wikicorpus.filter_wiki(text)
    text = re.sub(re_head, '\\2', text)
    text = re.sub(re_emph, '\\1', text)
    text = re.sub(re_desc, '', text)
    text = re.sub(re_item, '', text)
    return text

def ja_process_article(args):
    text, lemmatize, title, pageid = args
    text = ja_filter_wiki(text)
    return ja_tokenize(text, lemmatize), title, pageid

# Monkey-patch gensim so that WikiCorpus uses the Japanese-aware
# filtering and tokenization defined above.
gensim.corpora.wikicorpus.process_article = ja_process_article

def parse_args():
    p = argparse.ArgumentParser(description='Extract corpus from Japanese Wikipedia dump data')
    p.add_argument('input', help='[in] dump data')
    p.add_argument('output', help='[out] corpus file')
    p.add_argument('-l', '--lemmatize', action='store_true', help='lemmatize words')
    p.add_argument('-d', '--dicdir', help='path of MeCab dictionary')
    return p.parse_args()

if __name__ == '__main__':
    args = parse_args()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    if args.dicdir:
        tagger = MeCab.Tagger('-d ' + args.dicdir)
    else:
        tagger = MeCab.Tagger()
    i = 0
    space = ' '
    wiki = WikiCorpus(args.input, lemmatize=args.lemmatize, dictionary={})
    # Write one sentence per line, tokens separated by single spaces.
    with codecs.open(args.output, 'w', 'utf-8') as fp:
        for text in wiki.get_texts():
            for line in text:
                fp.write(space.join(line) + "\n")
            i = i + 1
            if (i % 10000 == 0):
                logger.info("Saved " + str(i) + " articles")
    logger.info("Finished saving " + str(i) + " articles")