@tsuchm
Last active February 4, 2019 06:35
Script that converts the Japanese Wikipedia dump data into a corpus using gensim (no longer maintained)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# https://github.com/attardi/wikiextractor turned out to be easy to use and to handle Japanese properly,
# so there are no plans to maintain this script any further.
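# Example invocation (the script and output file names below are only illustrative):
#   python wikicorpus_ja.py jawiki-latest-pages-articles.xml.bz2 jawiki-corpus.txt --lemmatize
# The output is a plain-text corpus, one sentence per line, tokens separated by spaces.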
import argparse
import codecs
import logging
import os.path
import re
import sys
from gensim.corpora import WikiCorpus
import gensim.corpora.wikicorpus
import MeCab
tagger = None
re_beg_space = re.compile(r"^\s+", re.UNICODE)
re_end_space = re.compile(r"\s+$", re.UNICODE)
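# Tokenize text with MeCab: sentences are split at the Japanese full stop (。),
# each line is parsed, and either surface forms or lemma/POS pairs are collected.
# The first and last nodes (MeCab's BOS/EOS markers) are dropped via tokens[1:-1].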
def ja_tokenize(text, lemmatize):
    text = text.replace(u"。", u"。\n")
    data = []
    for line in text.split("\n"):
        line = re.sub(re_beg_space, '', line)
        line = re.sub(re_end_space, '', line)
        if len(line) == 0:
            continue
        line = line.encode('utf-8')
        node = tagger.parseToNode(line)
        tokens = []
        while node:
            if lemmatize:
                features = node.feature.decode('utf-8').split(',')
                pos = features[0]
                lemma = features[6]
                if lemma == '*':
                    lemma = node.surface.decode('utf-8')
                tokens.append(lemma + '/' + pos)
            else:
                tokens.append(node.surface.decode('utf-8'))
            node = node.next
        if len(tokens) > 2:
            data.append(tokens[1:-1])
    return data
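# Override gensim's RE_P15 so that Japanese file/image links (ファイル:, 画像:) are
# removed as well, and define extra patterns for markup that filter_wiki() leaves
# behind: {{lang|..}} templates, <gallery> blocks, headings, emphasis, and list markers.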
gensim.corpora.wikicorpus.RE_P15 = re.compile('\[\[([fF]ile:|[iI]mage|ファイル:|画像:)[^]]*(\]\])', re.UNICODE)
re_lang = re.compile(r"{{(?:(?:lang|interlang)\|)?[a-zA-Z][a-zA-Z]\|(.*?)}}", re.DOTALL | re.MULTILINE)
re_gallery = re.compile(r"<gallery[^>]*>.*?</gallery>", re.DOTALL)
re_head = re.compile(r"(=+)\s*(.*?)\s*\1", re.DOTALL | re.UNICODE)
re_emph = re.compile(r"'+(.*?)'+", re.DOTALL | re.UNICODE)
re_desc = re.compile(r"^[;:]\s*", re.UNICODE | re.MULTILINE)
re_item = re.compile(r"^[#\*]+\s*", re.UNICODE | re.MULTILINE)
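# Strip wiki markup from an article: drop <gallery> blocks, unwrap {{lang|..}}
# templates, run gensim's generic filter_wiki(), then remove the leftover heading,
# emphasis, definition-list and list markers.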
def ja_filter_wiki(text):
    text = re.sub(re_gallery, '', text)
    text = re.sub(re_lang, '\\1', text)
    text = gensim.corpora.wikicorpus.filter_wiki(text)
    text = re.sub(re_head, '\\2', text)
    text = re.sub(re_emph, '\\1', text)
    text = re.sub(re_desc, '', text)
    text = re.sub(re_item, '', text)
    return text
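# Replacement for gensim.corpora.wikicorpus.process_article with the same interface,
# but using the Japanese-aware filtering and MeCab tokenization defined above.
# Monkey-patching it below makes WikiCorpus apply this pipeline to every article.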
def ja_process_article(args):
    text, lemmatize, title, pageid = args
    text = ja_filter_wiki(text)
    return ja_tokenize(text, lemmatize), title, pageid
gensim.corpora.wikicorpus.process_article = ja_process_article
def parse_args():
    p = argparse.ArgumentParser(description='Extract corpus from Japanese Wikipedia Dump Data')
    p.add_argument('input', help='[in] dump data')
    p.add_argument('output', help='[out] corpus file')
    p.add_argument('-l', '--lemmatize', action='store_true', help='lemmatize words')
    p.add_argument('-d', '--dicdir', help='path of MeCab dictionary')
    return p.parse_args()
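# Build a WikiCorpus over the dump and write the tokenized articles to the output
# file, one sentence per line with space-separated tokens.  Note that this relies
# on the older gensim WikiCorpus API (recent gensim releases no longer accept the
# lemmatize= keyword), so a matching gensim version is assumed.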
if __name__ == '__main__':
    args = parse_args()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    if args.dicdir:
        tagger = MeCab.Tagger('-d ' + args.dicdir)
    else:
        tagger = MeCab.Tagger()
    i = 0
    space = ' '
    wiki = WikiCorpus(args.input, lemmatize=args.lemmatize, dictionary={})
    with codecs.open(args.output, 'w', 'utf-8') as fp:
        for text in wiki.get_texts():
            for line in text:
                fp.write(space.join(line) + "\n")
            i = i + 1
            if (i % 10000 == 0):
                logger.info("Saved " + str(i) + " articles")
    logger.info("Finished: saved " + str(i) + " articles")