Skip to content

Instantly share code, notes, and snippets.

@DataTurks
Created May 27, 2018 10:23
Show Gist options
  • Save DataTurks/71e66f7ce5ce7c101f0900da7be915da to your computer and use it in GitHub Desktop.
Save DataTurks/71e66f7ce5ce7c101f0900da7be915da to your computer and use it in GitHub Desktop.
Creates NER training data in Spacy format from JSON downloaded from Dataturks.
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs a list of (text, {"entities": [(start, end, label), ...]}) tuples that can be used for spaCy training.
#
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The file is read as JSON Lines: every non-blank line must be a JSON
    object with a ``content`` string and an ``annotation`` entry. Each
    annotation contributes one entity per label, taken from the first
    element of its ``points`` list.

    Args:
        dataturks_JSON_FilePath: Path of the JSON file downloaded from
            Dataturks.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})``
        tuples, where ``end`` is exclusive. Returns ``None`` (after
        logging the exception) if the file cannot be read or a record
        cannot be processed.
    """
    # Imported locally so the function is usable even when the enclosing
    # module does not import these names itself.
    import json
    import logging

    try:
        training_data = []
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                data = json.loads(line)
                text = data['content']
                entities = []
                # A record whose 'annotation' is null or missing is treated as
                # having no entities instead of aborting the whole conversion.
                for annotation in data.get('annotation') or []:
                    # only a single point in text annotation.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # handle both list of labels or a single label.
                    if not isinstance(labels, list):
                        labels = [labels]
                    # dataturks indices are both inclusive [start, end]
                    # but spacy is not [start, end)
                    start, end = point['start'], point['end'] + 1
                    entities.extend((start, end, label) for label in labels)
                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        # Lazy %-formatting also copes with non-str paths (e.g. pathlib.Path),
        # which the previous string concatenation could not.
        logging.exception("Unable to process %s\nerror = %s",
                          dataturks_JSON_FilePath, e)
        return None
@brykneval
Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment