# A Python script for converting annotated data in standoff format (from the brat annotation tool)
# into the formats expected by the Stanford NER and Relation Extractor models.
# - NER format based on: http://nlp.stanford.edu/software/crf-faq.html#a
# - RE format based on: http://nlp.stanford.edu/software/relationExtractor.html#training
# Usage:
# 1) Install the pycorenlp package
# 2) Run a CoreNLP server (change CORENLP_SERVER_ADDRESS below if needed)
# 3) Place the .ann and .txt files exported from brat in the location specified by DATA_DIRECTORY
# 4) Run this script
# Note: cross-sentence annotations are not supported.
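# Example .ann lines this script expects (brat standoff format, illustrative values):
#   T1  Person 0 10  John Smith
#   R1  Works_For Arg1:T1 Arg2:T2
# If the CoreNLP server runs out of heap space on long documents, it can be started with more
# memory, e.g. (exact command depends on your CoreNLP setup):
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000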
from pycorenlp import StanfordCoreNLP
import os
from os import listdir
from os.path import isfile, join

DEFAULT_OTHER_ANNO = 'O'
STANDOFF_ENTITY_PREFIX = 'T'
STANDOFF_RELATION_PREFIX = 'R'
DATA_DIRECTORY = 'data'
OUTPUT_DIRECTORY = 'output'
CORENLP_SERVER_ADDRESS = 'http://localhost:9000'

NER_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 'ner-crf-training-data.tsv')
RE_TRAINING_DATA_OUTPUT_PATH = join(OUTPUT_DIRECTORY, 're-training-data.corp')
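# Output formats written below:
# - the NER file is a two-column TSV (token<TAB>label) with a blank line after each sentence
# - the RE file is a tab-separated .corp file with one row per token/entity chunk, followed by
#   the relation triples of that sentence
# Any previous output files are removed first so that re-runs do not append to stale data.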
if os.path.exists(OUTPUT_DIRECTORY):
    if os.path.exists(NER_TRAINING_DATA_OUTPUT_PATH):
        os.remove(NER_TRAINING_DATA_OUTPUT_PATH)
    if os.path.exists(RE_TRAINING_DATA_OUTPUT_PATH):
        os.remove(RE_TRAINING_DATA_OUTPUT_PATH)
else:
    os.makedirs(OUTPUT_DIRECTORY)

sentence_count = 0
nlp = StanfordCoreNLP(CORENLP_SERVER_ADDRESS)
# loop through the .ann files in the data directory
ann_data_files = [f for f in listdir(DATA_DIRECTORY) if isfile(join(DATA_DIRECTORY, f)) and f.split('.')[1] == 'ann']
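# each .ann file is expected to have a matching .txt file (same base name) containing the
# original document text; the character offsets in the .ann file refer to that text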
for file in ann_data_files:
    entities = []
    relations = []

    # process the .ann file - place entities and relations into two separate lists of dicts
    with open(join(DATA_DIRECTORY, file), 'r') as document_anno_file:
        lines = document_anno_file.readlines()
        for line in lines:
            standoff_line = line.split()
            if standoff_line[0][0] == STANDOFF_ENTITY_PREFIX:
                entity = {}
                entity['standoff_id'] = int(standoff_line[0][1:])
                entity['entity_type'] = standoff_line[1].capitalize()
                entity['offset_start'] = int(standoff_line[2])
                entity['offset_end'] = int(standoff_line[3])
                entity['word'] = standoff_line[4]
                entities.append(entity)
            elif standoff_line[0][0] == STANDOFF_RELATION_PREFIX:
                relation = {}
                relation['standoff_id'] = int(standoff_line[0][1:])
                relation['name'] = standoff_line[1]
                relation['standoff_entity1_id'] = int(standoff_line[2].split(':')[1][1:])
                relation['standoff_entity2_id'] = int(standoff_line[3].split(':')[1][1:])
                relations.append(relation)
    # read the .ann file's matching .txt file and tokenize its text using Stanford CoreNLP
    with open(join(DATA_DIRECTORY, file.replace('.ann', '.txt')), 'r') as document_text_file:
        document_text = document_text_file.read()
        output = nlp.annotate(document_text, properties={
            'annotators': 'tokenize,ssplit,pos',
            'outputFormat': 'json'
        })
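    # `output` is the CoreNLP response parsed into a dict (pycorenlp parses the JSON when
    # outputFormat is 'json'): output['sentences'] is a list of sentences, each with a 'tokens'
    # list carrying 'word', 'pos', 'characterOffsetBegin' and 'characterOffsetEnd'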
    # write text and annotations into NER and RE output files
    with open(NER_TRAINING_DATA_OUTPUT_PATH, 'a') as ner_training_data, open(RE_TRAINING_DATA_OUTPUT_PATH, 'a') as re_training_data:
        for sentence in output['sentences']:
            entities_in_sentence = {}
            sentence_re_rows = []
            for token in sentence['tokens']:
                offset_start = int(token['characterOffsetBegin'])
                offset_end = int(token['characterOffsetEnd'])
                re_row = {}
                entity_found = False
                ner_anno = DEFAULT_OTHER_ANNO
                # searching for token in annotated entities
                for entity in entities:
                    if offset_start >= entity['offset_start'] and offset_end <= entity['offset_end']:
                        ner_anno = entity['entity_type']
                        # multi-token entities for RE need to be handled differently than for NER
                        if offset_start == entity['offset_start'] and offset_end <= entity['offset_end']:
                            entities_in_sentence[entity['standoff_id']] = len(sentence_re_rows)
                            re_row['entity_type'] = entity['entity_type']
                            re_row['pos_tag'] = token['pos']
                            re_row['word'] = token['word']
                            sentence_re_rows.append(re_row)
                            entity_found = True
                            break
                        elif offset_start > entity['offset_start'] and offset_end <= entity['offset_end'] and len(sentence_re_rows) > 0:
                            sentence_re_rows[-1]['pos_tag'] += '/{}'.format(token['pos'])
                            sentence_re_rows[-1]['word'] += '/{}'.format(token['word'])
                            entity_found = True
                            break
                if not entity_found:
                    re_row['entity_type'] = DEFAULT_OTHER_ANNO
                    re_row['pos_tag'] = token['pos']
                    re_row['word'] = token['word']
                    sentence_re_rows.append(re_row)

                # writing tagged tokens to NER training data
                ner_training_data.write('{}\t{}\n'.format(token['word'], ner_anno))

            # writing tagged tokens to RE training data
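            # each RE token row follows the layout this script targets for the Stanford relation
            # extractor training corpus: sentence number, entity type, token index, 'O', POS tag(s),
            # word(s), and three trailing 'O' placeholder columns; the relation triples written
            # further below are: entity1 index, entity2 index, relation name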
            token_count = 0
            for sentence_row in sentence_re_rows:
                re_training_data.write('{}\t{}\t{}\tO\t{}\t{}\tO\tO\tO\n'.format(str(sentence_count), sentence_row['entity_type'], str(token_count), sentence_row['pos_tag'], sentence_row['word']))
                token_count += 1
            re_training_data.write('\n')
            # writing relations to RE training data
            for relation in relations:
                if relation['standoff_entity1_id'] in entities_in_sentence and relation['standoff_entity2_id'] in entities_in_sentence:
                    entity1 = str(entities_in_sentence[relation['standoff_entity1_id']])
                    entity2 = str(entities_in_sentence[relation['standoff_entity2_id']])
                    relation_name = relation['name']
                    re_training_data.write('{}\t{}\t{}\n'.format(entity1, entity2, relation_name))
            re_training_data.write('\n')

            sentence_count += 1
            ner_training_data.write('\n')

    print('Processed file pair: {} and {}'.format(file, file.replace('.ann', '.txt')))
Hello, I am trying your script but I am facing issues with the CoreNLP server. I downloaded the package from https://stanfordnlp.github.io/CoreNLP/download.html, started the server in my command prompt, and installed and imported the corenlp package in Google Colab; however, it fails with 'Exception: Check whether you have started the CoreNLP server'. So I opened localhost:9000 and tried getting output for a random single sentence. The server doesn't respond, and my command prompt shows the error 'java.lang.OutOfMemoryError: Java heap space'. To start the server I used the command 'java -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,parse -port 9000 -timeout 75000'.
I am aware this is not directly related to NLP, but I would like to try your code for my project and would appreciate any help you can offer. Thank you.
Hey Gurikat, try adding a question on StackOverflow with tags like stanford-nlp, core-nlp, etc.
Is the 'pycorenlp' package not available for Windows OS?