Save thatguysimon/6caa622be083f97b8c5c9a10478ba058 to your computer and use it in GitHub Desktop.
# A python script to turn annotated data in standoff format (brat annotation tool) to the formats expected by Stanford NER and Relation Extractor models
# - NER format based on: http://nlp.stanford.edu/software/crf-faq.html#a
# - RE format based on: http://nlp.stanford.edu/software/relationExtractor.html#training

# Usage:
# 1) Install the pycorenlp package
# 2) Run CoreNLP server (change CORENLP_SERVER_ADDRESS if needed)
# 3) Place .ann and .txt files from brat in the location specified in DATA_DIRECTORY
# 4) Run this script

# Cross-sentence annotation is not supported

from pycorenlp import StanfordCoreNLP
import os
from os import listdir
from os.path import isfile, join

# Tag written for tokens that are not covered by any annotated entity.
DEFAULT_OTHER_ANNO = 'O'
# Leading character of brat standoff ids: 'T' marks text-bound (entity)
# annotations and 'R' marks relation annotations.
STANDOFF_ENTITY_PREFIX = 'T'
STANDOFF_RELATION_PREFIX = 'R'
# Input/output files are read and written as UTF-8 so non-ASCII text
# (e.g. French) does not fail on the platform default codec.
FILE_ENCODING = 'utf-8'

DATA_DIRECTORY = 'data'
OUTPUT_DIRECTORY = 'output'
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')

# The output files are opened in append mode below, so results of a previous
# run must be removed first or they would be mixed into the new output.
if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)

# Running sentence index across all documents; first column of every RE row.
sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# looping through .ann files in the data directory
# (sorted so the sentence numbering is the same on every run)
ann_data_files = sorted(
    f for f in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, f)) and f.endswith('.ann')
)

for ann_file in ann_data_files:
    # Swap only the extension; str.replace would also rewrite '.ann'
    # occurring elsewhere in the file name.
    txt_file = os.path.splitext(ann_file)[0] + '.txt'

    entities = []
    relations = []

    # process .ann file - place entities and relations into 2 separate lists of dicts
    with open(join(DATA_DIRECTORY, ann_file), 'r', encoding=FILE_ENCODING) as document_anno_file:
        for line in document_anno_file:
            standoff_line = line.split()
            if not standoff_line:
                # blank line - nothing to parse
                continue

            if standoff_line[0][0] == STANDOFF_ENTITY_PREFIX:
                # e.g. "T1  Person 0 5  Alice"
                entity = {}
                entity['standoff_id'] = int(standoff_line[0][1:])
                entity['entity_type'] = standoff_line[1].capitalize()
                entity['offset_start'] = int(standoff_line[2])
                entity['offset_end'] = int(standoff_line[3])
                entity['word'] = standoff_line[4]
                entities.append(entity)
            elif standoff_line[0][0] == STANDOFF_RELATION_PREFIX:
                # e.g. "R1  Works_for Arg1:T1 Arg2:T2" - keep the numeric part of each argument id
                relation = {}
                relation['standoff_id'] = int(standoff_line[0][1:])
                relation['name'] = standoff_line[1]
                relation['standoff_entity1_id'] = int(standoff_line[2].split(':')[1][1:])
                relation['standoff_entity2_id'] = int(standoff_line[3].split(':')[1][1:])
                relations.append(relation)

    # read the .ann's matching .txt file and tokenize its text using stanford corenlp
    with open(join(DATA_DIRECTORY, txt_file), 'r', encoding=FILE_ENCODING) as document_text_file:
        document_text = document_text_file.read()

    output = nlp.annotate(document_text, properties={
        'annotators': 'tokenize,ssplit,pos',
        'outputFormat': 'json'
    })

    # When the server reply cannot be parsed as JSON the client hands back the
    # raw text; fail with a clear message instead of an opaque
    # "string indices must be integers" further down.
    if not isinstance(output, dict):
        raise RuntimeError(
            'CoreNLP server at {} did not return JSON for {}: {!r}'.format(
                CORENLP_SERVER_ADDRESS, txt_file, output))

    # write text and annotations into NER and RE output files
    with open(NER_TRAINING_DATA_OUTPUT_PATH, 'a', encoding=FILE_ENCODING) as ner_training_data, \
            open(RE_TRAINING_DATA_OUTPUT_PATH, 'a', encoding=FILE_ENCODING) as re_training_data:
        for sentence in output['sentences']:
            # standoff entity id -> index of its row in sentence_re_rows
            entities_in_sentence = {}
            sentence_re_rows = []

            for token in sentence['tokens']:
                offset_start = int(token['characterOffsetBegin'])
                offset_end = int(token['characterOffsetEnd'])

                re_row = {}
                entity_found = False
                ner_anno = DEFAULT_OTHER_ANNO

                # searching for token in annotated entities
                for entity in entities:
                    if offset_start >= entity['offset_start'] and offset_end <= entity['offset_end']:
                        ner_anno = entity['entity_type']

                        # multi-token entities for RE need to be handled differently than NER
                        if offset_start == entity['offset_start']:
                            # first token of the entity opens a new RE row
                            entities_in_sentence[entity['standoff_id']] = len(sentence_re_rows)
                            re_row['entity_type'] = entity['entity_type']
                            re_row['pos_tag'] = token['pos']
                            re_row['word'] = token['word']
                            sentence_re_rows.append(re_row)
                            entity_found = True
                            break
                        elif sentence_re_rows:
                            # later tokens of the entity are merged into the
                            # previous row, joined with '/'
                            sentence_re_rows[-1]['pos_tag'] += '/{}'.format(token['pos'])
                            sentence_re_rows[-1]['word'] += '/{}'.format(token['word'])
                            entity_found = True
                            break

                if not entity_found:
                    re_row['entity_type'] = DEFAULT_OTHER_ANNO
                    re_row['pos_tag'] = token['pos']
                    re_row['word'] = token['word']
                    sentence_re_rows.append(re_row)

                # writing tagged tokens to NER training data
                ner_training_data.write('{}\t{}\n'.format(token['word'], ner_anno))

            # writing tagged tokens to RE training data
            for token_count, sentence_row in enumerate(sentence_re_rows):
                re_training_data.write('{}\t{}\t{}\tO\t{}\t{}\tO\tO\tO\n'.format(
                    sentence_count,
                    sentence_row['entity_type'],
                    token_count,
                    sentence_row['pos_tag'],
                    sentence_row['word']))

            re_training_data.write('\n')

            # writing relations to RE training data
            # (only relations whose two entities both start in this sentence)
            for relation in relations:
                entity1_id = relation['standoff_entity1_id']
                entity2_id = relation['standoff_entity2_id']
                if entity1_id in entities_in_sentence and entity2_id in entities_in_sentence:
                    re_training_data.write('{}\t{}\t{}\n'.format(
                        entities_in_sentence[entity1_id],
                        entities_in_sentence[entity2_id],
                        relation['name']))

            re_training_data.write('\n')

            sentence_count += 1

        # NOTE(review): blank separator line is emitted once per document here;
        # confirm whether it was meant to be written once per sentence instead.
        ner_training_data.write('\n')

    print('Processed file pair: {} and {}'.format(ann_file, txt_file))
Hi @thatguysimon,
I got “UnicodeDecodeError: 'ascii' codec can't decode byte” for French text and added 3 lines to fix this.
import sys
Thanks a lot.
Traceback (most recent call last):
File "convert_format.py", line 71, in
for sentence in output['sentences']:
TypeError: string indices must be integers
Can you help me fix this error ?
Traceback (most recent call last):
File "convert_format.py", line 71, in
for sentence in output['sentences']:
TypeError: string indices must be integers
Can you help me fix this error?
You need to start the CoreNLP server. The best way is to use Docker.
Is the package 'pycorenlp' not available for Windows OS?
Hello, I am trying your script but I am facing issues with the corenlp server. I downloaded the folder from https://stanfordnlp.github.io/CoreNLP/download.html, started the server in my command prompt, imported, and installed the corenlp package in google colab, however it fails with 'Exception: Check whether you have started the CoreNLP server'. So I logged onto the localhost:9000 server and tried getting the output on the server for a random single sentence. The server doesn't respond and my command prompt shows the error of ' java.lang.OutOfMemoryError: Java heap space'. While starting the server I used the command 'java -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse \ -port 9000 -timeout 75000'.
I am aware this is not directly related to NLP, however, I would like to try your code for my project and would like any help that you can offer for this. Thank you.
Hey Gurikat, try adding a question on StackOverflow with tags like stanford-nlp, core-nlp, etc.
Hi @toliwa,
Thanks for the fix!
The part of the code that throws the error and that you changed deals with converting the annotations into the format required by the Stanford Relation Extractor. The RE doesn't support cross-sentence annotation so this gist doesn't either.
If you are only using this for the NER part and your fix works - great! Your change shouldn't have side effects on the NER part.
However, the result of the RE part will be wrong when providing cross-sentence annotations. Still better to not have an error though, so thanks.