Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save brykneval/58e0c132edf389db89e5a34834927a34 to your computer and use it in GitHub Desktop.
Save brykneval/58e0c132edf389db89e5a34834927a34 to your computer and use it in GitHub Desktop.
Creates NER training data in Spacy format from JSON downloaded from Dataturks.
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Returns a list of (text, {"entities": [(start, end, label), ...]}) tuples that can be passed directly to spaCy NER training.
#
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The input file is expected to hold one JSON object per line. Each object
    is read for its ``content`` (the raw text) and ``annotation`` (a list of
    annotations, or ``null``). Every annotation contributes its first entry
    of ``points`` (``start``, ``end``, ``text``) and its ``label``, which may
    be a single value or a list of values.

    Leading and trailing whitespace inside an annotated span is trimmed off
    the offsets, and the inclusive ``end`` offset from the file is converted
    to an exclusive one (``end + 1``).

    Args:
        dataturks_JSON_FilePath: Path of the Dataturks JSON-lines file.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per non-blank input line, or ``None`` if the file could not be
        read or parsed (the failure is logged with its traceback).
    """
    # Imported locally so the function works even when the surrounding module
    # does not import these names itself.
    import json
    import logging

    try:
        training_data = []
        # An explicit encoding avoids the platform default (e.g. cp1252 on
        # Windows), which fails with UnicodeDecodeError on non-ASCII text.
        # 'utf-8-sig' reads plain UTF-8 and also drops a leading BOM.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8-sig') as f:
            for line in f:
                # A blank or trailing empty line is not a record; skipping it
                # keeps one stray newline from discarding the whole file.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                data_annotations = data['annotation']
                if data_annotations is not None:
                    for annotation in data_annotations:
                        # Only a single point per text annotation is used.
                        point = annotation['points'][0]
                        labels = annotation['label']
                        # Handle both a list of labels and a single label.
                        if not isinstance(labels, list):
                            labels = [labels]
                        if not labels:
                            continue

                        # The trimmed span depends only on the point, so it is
                        # computed once and shared by all of its labels.
                        point_text = point['text']
                        leading_ws = len(point_text) - len(point_text.lstrip())
                        trailing_ws = len(point_text) - len(point_text.rstrip())
                        point_start = point['start'] + leading_ws
                        point_end = point['end'] - trailing_ws

                        for label in labels:
                            # 'end' in the file is inclusive; emit it exclusive.
                            entities.append((point_start, point_end + 1, label))
                training_data.append((text, {"entities": entities}))
        return training_data
    except Exception as e:
        # Best-effort contract: callers receive None on any failure.
        logging.exception(
            "Unable to process %s\nerror = %s", dataturks_JSON_FilePath, e
        )
        return None
@DataTurks
Copy link

Looks good. I hope you have tested it?
I can have the update made.

@brykneval
Copy link
Author

Yes, I have tested it.

@shawonis08
Copy link

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 5224: character maps to <undefined>.
Please check the fix: https://gist.github.com/shawonis08/0df66d793acd2b88e360d4d835c0eae0

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment