Last active
August 29, 2015 14:10
-
-
Save larsmans/8a5d1f5e272674e1b7f9 to your computer and use it in GitHub Desktop.
Brat-to-CSV converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
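# Quick and dirty Brat-to-CSV conversion: writes one CSV row per token of the
# review text, with the label and level assigned by each of two annotators.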
from __future__ import print_function | |
import csv | |
import io | |
import re | |
import sys | |
# Requires gtbtokenize.py and tokenise.py, copied from Brat's server/src/
# directory, to be importable from this script's location.
from tokenise import gtb_token_boundary_gen | |
def read_annot(fname):
    """Read a Brat standoff annotation (.ann) file.

    Parameters
    ----------
    fname
        Path of the .ann file.  It is decoded as UTF-8, the same way the
        script reads the review text, so that the tokenizer works on the
        same kind of string for both inputs and its offsets are comparable.

    Returns
    -------
    (ann, level)
        ``ann`` maps ``annotation_start + token_start`` to a tuple
        ``(label, token_end, ident)`` for every token the tokenizer finds in
        the text of a line starting with "T".  ``token_end`` is relative to
        the annotation text, not to the document.
        ``level`` maps the third field to the fourth field of every
        four-field line whose second field is ``Level``.

    Raises
    ------
    ValueError
        If a "T" line has fewer than five whitespace-separated fields or
        its two offsets are not plain integers (for example a
        discontinuous span such as ``5;10``).
    """
    ann = {}
    level = {}
    with io.open(fname, encoding='utf-8') as f:
        for ln in f:
            if ln.startswith('T'):
                ident, label, start, end, text = ln.split(None, 4)
                # The end offset is not used for indexing; converting it
                # still makes a malformed span fail loudly here instead of
                # being silently mis-tokenised.
                start, _end = int(start), int(end)
                for t_start, t_end in gtb_token_boundary_gen(text):
                    # Index annotations by token start only, because it's
                    # too hard to get the tokenizer to behave just like it
                    # does in Brat and the ends tend to go wrong.
                    ann[start + int(t_start)] = (label, int(t_end), ident)
                continue

            # Any other line only matters when it has exactly four fields
            # and the second one is "Level"; everything else is skipped.
            fields = ln.split()
            if len(fields) == 4 and fields[1] == 'Level':
                level[fields[2]] = fields[3]
    return ann, level
def _csv_cell(value):
    """Return *value* in a form the csv writer of the running Python accepts.

    Python 2's csv module cannot write non-ASCII unicode objects, so unicode
    cells are encoded to UTF-8 there.  On Python 3 the value is returned
    unchanged: passing bytes to the writer would emit their ``b'...'`` repr.
    """
    if sys.version_info[0] < 3 and isinstance(value, type(u'')):
        return value.encode('utf-8')
    return value


def main():
    """Write a per-token CSV comparison of two annotators to stdout.

    Expects three command-line arguments: the review text file and one .ann
    file per annotator.  Prints a usage message to stderr and exits with
    status 1 when the argument count is wrong.
    """
    if len(sys.argv) != 4:
        print("usage: %s review.txt user1.ann user2.ann" % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    txt_name, ann1_name, ann2_name = sys.argv[1:]

    with io.open(txt_name, encoding='utf-8') as f:
        text = f.read()

    # Tokenise eagerly so that a tokenizer failure happens before any CSV
    # output has been written.
    tok_bound = list(gtb_token_boundary_gen(text))

    ann1, level1 = read_annot(ann1_name)
    ann2, level2 = read_annot(ann2_name)

    wr = csv.writer(sys.stdout, dialect='excel')
    wr.writerow(['Token', 'Label1', 'Level1', 'Label2', 'Level2'])

    # Placeholder for tokens an annotator did not label: empty label and
    # empty identifier, so the level lookup also yields ''.
    missing = ('', 0, '')
    for start, end in tok_bound:
        # Annotations are looked up by token start offset only.
        label1, _, ident1 = ann1.get(start, missing)
        label2, _, ident2 = ann2.get(start, missing)
        row = [text[start:end],
               label1, level1.get(ident1, ''),
               label2, level2.get(ident2, '')]
        wr.writerow([_csv_cell(cell) for cell in row])


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment