Forked from brykneval/convert_dataturks_to_spacy.py
Last active
May 13, 2020 05:36
-
-
Save shawonis08/0df66d793acd2b88e360d4d835c0eae0 to your computer and use it in GitHub Desktop.
Creates NER training data in Spacy format from JSON downloaded from Dataturks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The file is read as UTF-8 text with one JSON document per line. Each
    document supplies its text under ``content`` and its annotations under
    ``annotation`` (which may be ``null``). Only the first entry of an
    annotation's ``points`` list is used; its ``start``/``end`` offsets are
    treated as inclusive and converted to an exclusive end by adding one.
    Leading and trailing whitespace found in the point's ``text`` is trimmed
    off the span by moving the offsets inwards.

    Args:
        dataturks_JSON_FilePath: Path of the Dataturks JSON-lines file.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, one per non-blank input line, or ``None`` if anything goes
        wrong while reading or converting (the error is logged).
    """
    # Imported locally so the function works on its own; the file has no
    # top-level import block providing these modules.
    import json
    import logging

    try:
        training_data = []
        with open(dataturks_JSON_FilePath, 'r', encoding="utf8") as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the
                # export) are not JSON documents; skip them instead of
                # failing the whole conversion.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                data_annotations = data['annotation']
                if data_annotations is not None:
                    for annotation in data_annotations:
                        # Only a single point per text annotation is used.
                        point = annotation['points'][0]
                        labels = annotation['label']
                        # Handle both a list of labels and a single label.
                        if not isinstance(labels, list):
                            labels = [labels]

                        # Shrink the span so it does not cover whitespace at
                        # either edge of the annotated text.
                        point_text = point['text']
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        point_start = point['start'] + lstrip_diff
                        point_end = point['end'] - rstrip_diff

                        # Trimming a whitespace-only annotation pushes the
                        # start past the end; such a span is meaningless, so
                        # drop it rather than emit inverted offsets.
                        if point_start > point_end:
                            continue

                        # Input offsets are inclusive; emit an exclusive end.
                        entities.extend(
                            (point_start, point_end + 1, label)
                            for label in labels
                        )
                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        # Lazy %-formatting keeps the original message text while also
        # accepting non-str paths (e.g. pathlib.Path) without raising here.
        logging.exception(
            "Unable to process %s\nerror = %s", dataturks_JSON_FilePath, e
        )
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment