Forked from DataTurks/convert_dataturks_to_spacy.py
Last active
May 13, 2020 05:45
-
-
Save brykneval/58e0c132edf389db89e5a34834927a34 to your computer and use it in GitHub Desktop.
Creates NER training data in Spacy format from JSON downloaded from Dataturks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################ NOTE ########################################################
#
# Creates NER training data in Spacy format from JSON downloaded from Dataturks.
#
# Outputs the Spacy training data which can be used for Spacy training.
#
############################################################################################################
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The input file is read as JSON lines: one JSON object per line, each
    providing a ``content`` string and an ``annotation`` list (or ``null``).
    For every annotation only the first entry of ``points`` is used, and one
    entity is emitted per label attached to that annotation.

    Args:
        dataturks_JSON_FilePath: Path of the JSON-lines file downloaded from
            Dataturks.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        or ``None`` if the file could not be read or parsed (the failure is
        logged together with its traceback).
    """
    # Imported here so the function is usable even when the surrounding
    # module does not import these names itself.
    import json
    import logging

    try:
        training_data = []
        # Decode explicitly as UTF-8: relying on the platform default codec
        # (e.g. cp1252 on Windows) fails with UnicodeDecodeError on
        # non-ASCII content.
        with open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line is not a record; skip it instead of letting
                # json.loads fail and discard the whole file.
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                data_annotations = data['annotation']
                if data_annotations is not None:
                    for annotation in data_annotations:
                        # Only a single point per text annotation is used.
                        point = annotation['points'][0]
                        labels = annotation['label']
                        # Handle both a list of labels and a single label.
                        if not isinstance(labels, list):
                            labels = [labels]

                        # Shrink the span so it excludes any leading or
                        # trailing whitespace in the annotated text.
                        point_text = point['text']
                        lstrip_diff = len(point_text) - len(point_text.lstrip())
                        rstrip_diff = len(point_text) - len(point_text.rstrip())
                        point_start = point['start'] + lstrip_diff
                        point_end = point['end'] - rstrip_diff

                        for label in labels:
                            # The end offset is shifted by one; presumably
                            # Dataturks reports an inclusive end while the
                            # consumer expects an exclusive one.
                            entities.append((point_start, point_end + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        # Best-effort contract: callers get None on any failure. Lazy
        # %-formatting also avoids a TypeError when the path is not a str.
        logging.exception("Unable to process %s\nerror = %s", dataturks_JSON_FilePath, e)
        return None
Yeah I have tested it
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 5224: character maps to <undefined>.
please check the fix: https://gist.github.com/shawonis08/0df66d793acd2b88e360d4d835c0eae0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
looks good..hope you had tested it?
I can have the update made.