knok · January 29, 2020 07:38
diff --git a/parse_cirrus_sudachi.py b/parse_cirrus_sudachi.py
 # refer to https://github.com/yagays/pretrained_doc2vec_ja/blob/master/src/parse_cirrus.py
 #
 import json
 import gzip
 import bz2
 import subprocess

 from tqdm import tqdm

 # Start one long-lived tokenizer process for the whole run; wakati_parse()
 # talks to it through the stdin/stdout pipes. bufsize=0 keeps the pipes
 # unbuffered on the Python side.
 # NOTE(review): "-s" presumably passes inline Sudachi settings JSON — confirm.
 p = subprocess.Popen(
     [
         "java",
         "-jar",
         "sudachi-0.3.2.jar",
         "-s",
         '{"systemDict":"./system_core.dic"}',
     ],
     bufsize=0,
     stdin=subprocess.PIPE,
     stdout=subprocess.PIPE,
 )

 def wakati_parse(text, proc=None):
     """Tokenize ``text`` via the tokenizer subprocess and join the tokens.

     The text is written to the process's stdin as a single UTF-8 line.
     Output lines are then read until a line equal to ``EOS``; the part of
     each line before the first tab is taken as the token.

     Args:
         text: String to tokenize.
         proc: Optional object exposing binary ``stdin`` and ``stdout``
             streams. Defaults to the module-level ``p`` process.

     Returns:
         The tokens joined with single spaces. Whitespace-only tokens are
         dropped.

     Raises:
         RuntimeError: If the process output ends before ``EOS`` is seen.
     """
     if proc is None:
         proc = p

     # Send exactly one input line. An embedded line break would presumably
     # make the tokenizer answer with several EOS-terminated groups, leaving
     # stale output behind for the next call.
     # NOTE(review): confirm against the Sudachi CLI version in use.
     payload = text.replace("\r", " ").replace("\n", " ")
     proc.stdin.write(payload.encode("utf-8") + b"\n")
     proc.stdin.flush()

     surfaces = []
     while True:
         raw = proc.stdout.readline()
         if not raw:
             # readline() returns b"" at EOF; without this check the loop
             # would spin forever once the subprocess has exited.
             raise RuntimeError(
                 "tokenizer process closed its output before sending EOS"
             )
         # Strip only the line terminator so that a whitespace surface does
         # not vanish and shift the feature column into position 0.
         line = raw.decode("utf-8").rstrip("\r\n")
         if line == "EOS":
             break
         surface = line.split("\t", 1)[0]
         if surface.strip():
             surfaces.append(surface)
     return " ".join(surfaces)

 in_fname = "../../pretrained_doc2vec_ja/data/jawiki-20200106-cirrussearch-content.json.gz"
 out_fname = "../../pretrained_doc2vec_ja/data/j-20200106cirrus_all.tsv.bz2"

 # Convert the gzipped JSON-lines dump into a bz2-compressed TSV with one
 # "title<TAB>tokenized text" row per document.
 # The output encoding is pinned to UTF-8 so the result does not depend on
 # the locale of the machine running the script.
 with gzip.open(in_fname) as fin, \
         bz2.open(out_fname, "wt", encoding="utf-8") as fout:
     # total is only the progress-bar denominator for this particular dump.
     for line in tqdm(fin, total=2271620):
         json_line = json.loads(line)
         # Lines carrying an "index" key are skipped; only the remaining
         # lines are treated as documents.
         if "index" in json_line:
             continue
         # .get(): a document lacking either field is skipped the same way
         # as one whose field is empty, instead of aborting the whole run.
         title = json_line.get("title")
         text = json_line.get("text")
         if title and text:
             print("\t".join([title, wakati_parse(text).strip()]), file=fout)

 # Closing stdin signals end of input to the tokenizer subprocess; wait for
 # it to exit so it is not left running after the script finishes.
 p.stdin.close()
 p.wait()
	# refer to https://github.com/yagays/pretrained_doc2vec_ja/blob/master/src/parse_cirrus.py
	#
	import json
	import gzip
	import bz2
	import subprocess

	from tqdm import tqdm

	# Start one long-lived tokenizer process for the whole run; wakati_parse()
	# talks to it through the stdin/stdout pipes. bufsize=0 keeps the pipes
	# unbuffered on the Python side.
	# NOTE(review): "-s" presumably passes inline Sudachi settings JSON — confirm.
	p = subprocess.Popen(
	    [
	        "java",
	        "-jar",
	        "sudachi-0.3.2.jar",
	        "-s",
	        '{"systemDict":"./system_core.dic"}',
	    ],
	    bufsize=0,
	    stdin=subprocess.PIPE,
	    stdout=subprocess.PIPE,
	)

	def wakati_parse(text, proc=None):
	    """Tokenize ``text`` via the tokenizer subprocess and join the tokens.

	    The text is written to the process's stdin as a single UTF-8 line.
	    Output lines are then read until a line equal to ``EOS``; the part of
	    each line before the first tab is taken as the token.

	    Args:
	        text: String to tokenize.
	        proc: Optional object exposing binary ``stdin`` and ``stdout``
	            streams. Defaults to the module-level ``p`` process.

	    Returns:
	        The tokens joined with single spaces. Whitespace-only tokens are
	        dropped.

	    Raises:
	        RuntimeError: If the process output ends before ``EOS`` is seen.
	    """
	    if proc is None:
	        proc = p

	    # Send exactly one input line. An embedded line break would presumably
	    # make the tokenizer answer with several EOS-terminated groups, leaving
	    # stale output behind for the next call.
	    # NOTE(review): confirm against the Sudachi CLI version in use.
	    payload = text.replace("\r", " ").replace("\n", " ")
	    proc.stdin.write(payload.encode("utf-8") + b"\n")
	    proc.stdin.flush()

	    surfaces = []
	    while True:
	        raw = proc.stdout.readline()
	        if not raw:
	            # readline() returns b"" at EOF; without this check the loop
	            # would spin forever once the subprocess has exited.
	            raise RuntimeError(
	                "tokenizer process closed its output before sending EOS"
	            )
	        # Strip only the line terminator so that a whitespace surface does
	        # not vanish and shift the feature column into position 0.
	        line = raw.decode("utf-8").rstrip("\r\n")
	        if line == "EOS":
	            break
	        surface = line.split("\t", 1)[0]
	        if surface.strip():
	            surfaces.append(surface)
	    return " ".join(surfaces)

	in_fname = "../../pretrained_doc2vec_ja/data/jawiki-20200106-cirrussearch-content.json.gz"
	out_fname = "../../pretrained_doc2vec_ja/data/j-20200106cirrus_all.tsv.bz2"

	# Convert the gzipped JSON-lines dump into a bz2-compressed TSV with one
	# "title<TAB>tokenized text" row per document.
	# The output encoding is pinned to UTF-8 so the result does not depend on
	# the locale of the machine running the script.
	with gzip.open(in_fname) as fin, \
	        bz2.open(out_fname, "wt", encoding="utf-8") as fout:
	    # total is only the progress-bar denominator for this particular dump.
	    for line in tqdm(fin, total=2271620):
	        json_line = json.loads(line)
	        # Lines carrying an "index" key are skipped; only the remaining
	        # lines are treated as documents.
	        if "index" in json_line:
	            continue
	        # .get(): a document lacking either field is skipped the same way
	        # as one whose field is empty, instead of aborting the whole run.
	        title = json_line.get("title")
	        text = json_line.get("text")
	        if title and text:
	            print("\t".join([title, wakati_parse(text).strip()]), file=fout)

	# Closing stdin signals end of input to the tokenizer subprocess; wait for
	# it to exit so it is not left running after the script finishes.
	p.stdin.close()
	p.wait()
No results found