A Python script to convert annotated data in standoff format (from the brat annotation tool) to the formats expected by the Stanford NER and Relation Extractor models
# A Python script to convert annotated data in standoff format (brat annotation tool) to the formats expected by Stanford NER and Relation Extractor models
# - NER format based on: http://nlp.stanford.edu/software/crf-faq.html#a
# - RE format based on: http://nlp.stanford.edu/software/relationExtractor.html#training
# Usage:
# 1) Install the pycorenlp package
# 2) Run a CoreNLP server (change CORENLP_SERVER_ADDRESS if needed)
# 3) Place the .ann and .txt files from brat in the location specified in DATA_DIRECTORY (a minimal input sketch follows below)
# 4) Run this script
# Note: cross-sentence annotations are not supported
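
# For reference, a minimal sketch of the input pair this script expects
# (invented example; the script splits each .ann line on whitespace):
#   example.ann:
#     T1 Person 0 5 Alice
#     T2 Org 15 19 Acme
#     R1 Works_for Arg1:T1 Arg2:T2
#   example.txt:
#     Alice works at Acme.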
from pycorenlp import StanfordCoreNLP
import os
from os import listdir
from os.path import isfile, join

DEFAULT_OTHER_ANNO = 'O'
STANDOFF_ENTITY_PREFIX = 'T'
STANDOFF_RELATION_PREFIX = 'R'
DATA_DIRECTORY = 'data'
OUTPUT_DIRECTORY = 'output'
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')
# remove any previous outputs, or create the output directory if it is missing
if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)
sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)

# looping through .ann files in the data directory
# (endswith is safer than splitting on '.' for file names containing extra dots)
ann_data_files = [f for f in listdir(DATA_DIRECTORY) if isfile(join(DATA_DIRECTORY, f)) and f.endswith('.ann')]

for file in ann_data_files:
    entities = []
    relations = []

    # process .ann file - place entities and relations into 2 separate lists of dicts
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        for line in lines:
            standoff_line = line.split()
            if standoff_line[0][0] == STANDOFF_ENTITY_PREFIX:
                entity = {}
                entity['standoff_id'] = int(standoff_line[0][1:])
                entity['entity_type'] = standoff_line[1].capitalize()
                entity['offset_start'] = int(standoff_line[2])
                entity['offset_end'] = int(standoff_line[3])
                # join, since the annotated entity text may span multiple tokens
                entity['word'] = ' '.join(standoff_line[4:])
                entities.append(entity)
            elif standoff_line[0][0] == STANDOFF_RELATION_PREFIX:
                relation = {}
                relation['standoff_id'] = int(standoff_line[0][1:])
                relation['name'] = standoff_line[1]
                relation['standoff_entity1_id'] = int(standoff_line[2].split(':')[1][1:])
                relation['standoff_entity2_id'] = int(standoff_line[3].split(':')[1][1:])
                relations.append(relation)
    # read the .ann's matching .txt file and tokenize its text using stanford corenlp
    with open(join(DATA_DIRECTORY, file.replace('.ann', '.txt')), 'r') as document_text_file:
        document_text = document_text_file.read()
        output = nlp.annotate(document_text, properties={
            'annotators': 'tokenize,ssplit,pos',
            'outputFormat': 'json'
        })
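        # for orientation, the returned JSON looks roughly like this (abridged;
        # see the CoreNLP server docs for the full token schema):
        #   {"sentences": [{"tokens": [{"word": "Alice", "pos": "NNP",
        #                               "characterOffsetBegin": 0,
        #                               "characterOffsetEnd": 5}, ...]}, ...]}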
    # write text and annotations into NER and RE output files
    with open(NER_TRAINING_DATA_OUTPUT_PATH, 'a') as ner_training_data, open(RE_TRAINING_DATA_OUTPUT_PATH, 'a') as re_training_data:
        for sentence in output['sentences']:
            entities_in_sentence = {}
            sentence_re_rows = []
            for token in sentence['tokens']:
                offset_start = int(token['characterOffsetBegin'])
                offset_end = int(token['characterOffsetEnd'])
                re_row = {}
                entity_found = False
                ner_anno = DEFAULT_OTHER_ANNO
                # searching for token in annotated entities
                for entity in entities:
                    if offset_start >= entity['offset_start'] and offset_end <= entity['offset_end']:
                        ner_anno = entity['entity_type']
                        # multi-token entities for RE need to be handled differently than NER
                        if offset_start == entity['offset_start'] and offset_end <= entity['offset_end']:
                            # the first token of an entity opens a new RE row
                            entities_in_sentence[entity['standoff_id']] = len(sentence_re_rows)
                            re_row['entity_type'] = entity['entity_type']
                            re_row['pos_tag'] = token['pos']
                            re_row['word'] = token['word']
                            sentence_re_rows.append(re_row)
                            entity_found = True
                            break
                        elif offset_start > entity['offset_start'] and offset_end <= entity['offset_end'] and len(sentence_re_rows) > 0:
                            # later tokens of the same entity are folded into the previous row
                            sentence_re_rows[-1]['pos_tag'] += '/{}'.format(token['pos'])
                            sentence_re_rows[-1]['word'] += '/{}'.format(token['word'])
                            entity_found = True
                            break
                if not entity_found:
                    re_row['entity_type'] = DEFAULT_OTHER_ANNO
                    re_row['pos_tag'] = token['pos']
                    re_row['word'] = token['word']
                    sentence_re_rows.append(re_row)
                # writing tagged tokens to NER training data
                ner_training_data.write('{}\t{}\n'.format(token['word'], ner_anno))
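                # each NER line is token, tab, tag, e.g. 'Alice<TAB>Person' (invented values)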
            # writing tagged tokens to RE training data
            token_count = 0
            for sentence_row in sentence_re_rows:
                re_training_data.write('{}\t{}\t{}\tO\t{}\t{}\tO\tO\tO\n'.format(str(sentence_count), sentence_row['entity_type'], str(token_count), sentence_row['pos_tag'], sentence_row['word']))
                token_count += 1
            re_training_data.write('\n')
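            # a sketch of one sentence block in the .corp file (invented values;
            # tab-separated columns per the format string above: sentence no.,
            # entity type, token no., 'O', POS tag, word, then three 'O' columns):
            #   0  Person  0  O  NNP  Alice  O  O  O
            #   0  O       1  O  VBZ  works  O  O  O
            #   0  O       2  O  IN   at     O  O  O
            #   0  Org     3  O  NNP  Acme   O  O  O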
            # writing relations to RE training data
            for relation in relations:
                if relation['standoff_entity1_id'] in entities_in_sentence and relation['standoff_entity2_id'] in entities_in_sentence:
                    entity1 = str(entities_in_sentence[relation['standoff_entity1_id']])
                    entity2 = str(entities_in_sentence[relation['standoff_entity2_id']])
                    relation_name = relation['name']
                    re_training_data.write('{}\t{}\t{}\n'.format(entity1, entity2, relation_name))
            re_training_data.write('\n')
            sentence_count += 1
            ner_training_data.write('\n')

    print('Processed file pair: {} and {}'.format(file, file.replace('.ann', '.txt')))
Hey Gurikat, try adding a question on StackOverflow with tags like stanford-nlp, core-nlp, etc.
Hello, I am trying your script but I am facing issues with the CoreNLP server. I downloaded the folder from https://stanfordnlp.github.io/CoreNLP/download.html, started the server from my command prompt, and imported and installed the corenlp package in Google Colab; however, the script fails with 'Exception: Check whether you have started the CoreNLP server'. So I opened localhost:9000 and tried getting output for a random single sentence. The server doesn't respond, and my command prompt shows the error 'java.lang.OutOfMemoryError: Java heap space'. To start the server I used the command 'java -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse -port 9000 -timeout 75000'.
I am aware this is not directly related to NLP; however, I would like to try your code for my project and would appreciate any help you can offer. Thank you.