Created
May 7, 2022 16:24
-
-
Save yptheangel/aaffbeb52bdf808a9b87152eb85c0037 to your computer and use it in GitHub Desktop.
Snippet that reads a .jsonl export from the Doccano NER annotator and converts it into spaCy v3 format. Filtering spans is optional; uncomment the filter_spans lines if you do not want overlapping spans.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert a Doccano NER export (.jsonl, one JSON object per line) into a
# spaCy DocBin and save it as "training_data.spacy".
#
# NOTE(review): `tqdm`, `nlp` and `doc_bin` (and `filter_spans`, if the
# optional lines below are enabled) are not defined in this snippet.
# Presumably they are set up beforehand (a spaCy pipeline and a DocBin
# instance) -- confirm before running.
import json

custom_data_path = 'path_to_your_ner_dataset.jsonl'

with open(custom_data_path, 'r', encoding="utf8") as json_file:
    # Iterate the file lazily and skip whitespace-only lines so a stray
    # blank or trailing line does not make json.loads raise.
    dataset = [json.loads(jline) for jline in json_file if jline.strip()]
print(f"number of examples : {len(dataset)}")

for training_example in tqdm(dataset):
    # NOTE(review): the keys 'data' and 'label' depend on the Doccano export
    # version -- verify them against the actual file.
    text = training_example['data']
    doc = nlp.make_doc(text)
    ents = []
    # Each annotation is unpacked as a (start, end, label) triple.
    for start, end, label in training_example['label']:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        # char_span yields None when no span can be built from the offsets.
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    # Optional: enable the two lines below (and drop `doc.ents = ents`) if
    # you do not want overlapping spans.
    # filtered_ents = filter_spans(ents)
    # doc.ents = filtered_ents
    doc.ents = ents
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy")  # save the docbin object
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment