Created
November 19, 2019 17:57
-
-
Save tamuhey/9a0cf1833e43d053a7e3c40064b293d9 to your computer and use it in GitHub Desktop.
Convert the livedoor news corpus to spaCy gold-standard JSONL.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import copy
import itertools as it
from pathlib import Path

import srsly
from tqdm.notebook import tqdm
# Category directory names of the livedoor news corpus. Each name is used
# both as the directory to read articles from and as the key of the
# corresponding category in the emitted "cats" annotation.
# NOTE: ".ipynb_checkpoints" used to be listed here; it is a Jupyter
# artefact directory, not a news category, and must not become a label.
labels = [
    "movie-enter",
    "it-life-hack",
    "kaden-channel",
    "topic-news",
    "livedoor-homme",
    "peachy",
    "sports-watch",
    "dokujo-tsushin",
    "smax",
]

# Template annotation: every category switched off (0). It is copied per
# category before the matching entry is set to 1.
base_cats = dict.fromkeys(labels, 0)
def file2text(fpath):
    """Return the body text of one corpus file as a single string.

    The first two lines of the file are discarded (presumably the article
    URL and timestamp header of the livedoor format -- confirm against the
    corpus README), empty lines are dropped, and the remaining lines are
    joined with a newline character.

    Args:
        fpath: A ``pathlib.Path``-like object exposing ``open()``.

    Returns:
        The remaining non-empty lines joined by newlines; an empty string
        when the file has two lines or fewer.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, which breaks on Japanese text on non-UTF-8 platforms.
    with fpath.open(encoding="utf-8") as f:
        body_lines = f.read().split("\n")[2:]
    return "\n".join(line for line in body_lines if line)
def gen_golds(labels):
    """Yield one ``(text, {"cats": ...})`` pair per article file.

    For each name in ``labels`` the directory of that name (relative to the
    current working directory) is scanned for ``*.txt`` files. Each file is
    converted with ``file2text`` and paired with a copy of ``base_cats`` in
    which the entry for the current name is set to 1.

    Args:
        labels: Iterable of category directory names.

    Yields:
        Tuples ``(text, {"cats": cats})`` where ``cats`` maps category
        names to 0/1.
    """
    for d in tqdm(labels):
        cats = copy.copy(base_cats)
        cats[d] = 1
        # Sort so the output order does not depend on filesystem
        # enumeration order and is reproducible between runs.
        for f in tqdm(sorted(Path(d).glob("*.txt"))):
            # Each category directory of the corpus ships a LICENSE.txt
            # next to the articles; it is not an article and must not be
            # emitted as a labelled example.
            if f.name == "LICENSE.txt":
                continue
            text = file2text(f)
            yield text, {"cats": cats}
# Lazily build the stream of gold examples, then serialise it, one JSON
# document per line, into gold.jsonl in the current working directory.
gold_examples = gen_golds(labels)
srsly.write_jsonl("gold.jsonl", gold_examples)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment