Skip to content

Instantly share code, notes, and snippets.

@knok
Created January 29, 2020 07:38
Show Gist options
  • Save knok/9bcd8c743af6d67cdfe090a81b66b6b7 to your computer and use it in GitHub Desktop.
Save knok/9bcd8c743af6d67cdfe090a81b66b6b7 to your computer and use it in GitHub Desktop.
Parse the CirrusSearch jawiki content dump with Sudachi via a pipe
# refer to https://github.com/yagays/pretrained_doc2vec_ja/blob/master/src/parse_cirrus.py
#
import json
import gzip
import bz2
import subprocess
from tqdm import tqdm
# Long-lived Sudachi tokenizer process shared by every wakati_parse() call.
# Text is written to its stdin one line at a time and the analysis is read
# back from its stdout.
#
# Default (buffered) pipes are used on purpose: with bufsize=0 the stdout
# object is a raw stream and each readline() degrades to one read() system
# call per byte. Callers must flush stdin after writing a request.
p = subprocess.Popen(
    [
        "java",
        "-jar",
        "sudachi-0.3.2.jar",
        "-s",
        '{"systemDict":"./system_core.dic"}',
    ],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
)
def wakati_parse(text, proc=None):
    """Tokenize *text* with the piped tokenizer and return space-joined tokens.

    The text is sent to the tokenizer process as a single line; output lines
    are then read until the ``EOS`` marker. The first tab-separated field of
    each output line is taken as the token.

    Args:
        text: The text to tokenize.
        proc: Object exposing binary ``stdin``/``stdout`` streams (such as a
            ``subprocess.Popen``). Defaults to the module-level ``p``.

    Returns:
        The tokens joined by single spaces.

    Raises:
        RuntimeError: If the tokenizer's output ends before ``EOS`` is seen
            (for example because the process exited).
    """
    if proc is None:
        proc = p

    # The protocol is one request line -> one EOS-terminated response.
    # Embedded line breaks would be seen as several requests and leave
    # unread responses in the pipe, desynchronizing every later call.
    single_line = text.replace("\r", " ").replace("\n", " ")
    proc.stdin.write(single_line.encode("utf-8"))
    proc.stdin.write(b"\n")
    proc.stdin.flush()

    ret = []
    while True:
        raw = proc.stdout.readline()
        if not raw:
            # EOF: without this check the loop would spin forever on b"".
            raise RuntimeError("tokenizer output ended before EOS")
        # Strip only the line terminator; a full strip() would eat a
        # whitespace token and shift the next column into its place.
        line = raw.decode("utf-8").rstrip("\r\n")
        if line == "EOS":
            break
        surface = line.split("\t", 1)[0]
        # NOTE(review): presumably the tokenizer emits whitespace as its own
        # token; such tokens are dropped so the output stays space-delimited.
        if surface.strip():
            ret.append(surface)
    return " ".join(ret)
# Input: gzip-compressed CirrusSearch dump, one JSON object per line.
in_fname = "../../pretrained_doc2vec_ja/data/jawiki-20200106-cirrussearch-content.json.gz"
# Output: bz2-compressed TSV with one "title<TAB>tokenized text" row per article.
out_fname = "../../pretrained_doc2vec_ja/data/j-20200106cirrus_all.tsv.bz2"

try:
    with gzip.open(in_fname) as fin:
        # Explicit UTF-8 so the output does not depend on the locale encoding.
        with bz2.open(out_fname, "wt", encoding="utf-8") as fout:
            # total is the expected line count of this dump, used only for
            # the progress bar.
            for line in tqdm(fin, total=2271620):
                json_line = json.loads(line)
                # Lines carrying an "index" key are skipped; only the
                # remaining lines are treated as documents.
                if "index" in json_line:
                    continue
                title = json_line.get("title")
                text = json_line.get("text")
                if title and text:
                    print("\t".join([title, wakati_parse(text).strip()]), file=fout)
finally:
    # Shut the tokenizer down: closing its pipes ends its input, then reap
    # the process so it is not left running or as a zombie.
    p.stdin.close()
    p.stdout.close()
    p.wait()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment