Skip to content

Instantly share code, notes, and snippets.

@tamuhey
Created November 19, 2019 17:57
Show Gist options
  • Save tamuhey/9a0cf1833e43d053a7e3c40064b293d9 to your computer and use it in GitHub Desktop.
Save tamuhey/9a0cf1833e43d053a7e3c40064b293d9 to your computer and use it in GitHub Desktop.
Convert the livedoor news corpus to spaCy gold JSONL
from pathlib import Path
import itertools as it
import copy
import srsly
from tqdm.notebook import tqdm
# Category directory names to read; each one is also a text-classification label.
# NOTE: ".ipynb_checkpoints" used to be listed here. It is a Jupyter artifact
# directory, not a category, and keeping it added a bogus class to every
# example's "cats" dict.
labels = [
    "movie-enter",
    "it-life-hack",
    "kaden-channel",
    "topic-news",
    "livedoor-homme",
    "peachy",
    "sports-watch",
    "dokujo-tsushin",
    "smax",
]

# Template annotation: every label mapped to 0. Callers copy it and flip the
# target label to 1.
base_cats = dict.fromkeys(labels, 0)
def file2text(fpath):
    """Return the body of an article file as a single string.

    The first two lines of the file are discarded and empty lines are
    removed; the remaining lines are joined with ``"\\n"``.

    Args:
        fpath: ``pathlib.Path`` (or any object with a compatible ``open``)
            pointing at the article file.

    Returns:
        The cleaned article text. Empty when the file has two lines or fewer.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with fpath.open(encoding="utf-8") as f:
        # Skip the two header lines (presumably the article URL and
        # timestamp -- verify against the corpus layout).
        lines = f.read().split("\n")[2:]
    return "\n".join(line for line in lines if line)
def gen_golds(labels):
    """Yield ``(text, annotations)`` pairs for every article under each label.

    Each entry of ``labels`` is used both as a directory path (relative to
    the current working directory) and as the category to mark positive.

    Args:
        labels: Iterable of category directory names.

    Yields:
        Tuples ``(text, {"cats": cats})`` where ``cats`` maps every known
        label to 0 except the article's own label, which maps to 1.
    """
    for d in tqdm(labels):
        cats = copy.copy(base_cats)
        cats[d] = 1
        # Sort for a reproducible output order (glob order depends on the
        # filesystem); the resulting list also gives tqdm a total.
        files = sorted(Path(d).glob("*.txt"))
        for f in tqdm(files):
            # Each category directory of the corpus ships a LICENSE.txt,
            # which is not an article and must not become training data.
            if f.name == "LICENSE.txt":
                continue
            text = file2text(f)
            # Give every example its own dict so consumers that collect or
            # mutate annotations do not affect sibling examples.
            yield text, {"cats": dict(cats)}
# Build the lazy example generator, then hand it to srsly, which consumes it
# and writes the results to gold.jsonl.
gold_examples = gen_golds(labels)
srsly.write_jsonl("gold.jsonl", gold_examples)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment