Created
October 3, 2019 18:50
-
-
Save mrdrozdov/ecacdb33387c0af34d1f17d75e2fda5e to your computer and use it in GitHub Desktop.
match_ptb_propbank.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
import sys | |
import time | |
import collections | |
import json | |
from tqdm import tqdm | |
# PTB
def convert_binary_bracketing(parse):
    """Convert a binary-bracketed parse string into tokens and transitions.

    The string is split on single spaces and each piece is handled as
    follows:

    * pieces that begin with "(" are ignored;
    * a lone ")" appends transition 1;
    * any other piece is collected as a token and appends transition 0.

    Empty pieces (produced by an empty string or by repeated spaces) are
    skipped instead of raising IndexError.

    Args:
        parse: Space-separated bracketing, e.g. "( ( a b ) c )".

    Returns:
        A ``(tokens, transitions)`` tuple of lists.
    """
    tokens = []
    transitions = []
    for piece in parse.split(' '):
        # Skip empty pieces and anything starting with an opening bracket.
        if not piece or piece.startswith('('):
            continue
        if piece == ')':
            transitions.append(1)
        else:
            tokens.append(piece)
            transitions.append(0)
    return tokens, transitions
def read_ptb(path):
    """Yield one decoded JSON value per non-blank line of a JSON-lines file.

    The file is opened as UTF-8 so decoding does not depend on the
    platform's default locale. Blank or whitespace-only lines (such as a
    trailing empty line) are skipped rather than passed to ``json.loads``.

    Args:
        path: Path of the file to read.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)
# PROPBANK
def parse_lines(lines):
    """Transpose the rows of one example into per-column lists.

    Only the first eight fields of every row are kept; they are stored
    under the column names listed in ``keys`` below. Any further fields
    are ignored.

    Args:
        lines: Non-empty list of rows, each row being a list of fields.
            All rows must have the same number of fields, and at least
            as many as there are named columns.

    Returns:
        A dict mapping each column name to the list of that column's
        values, one per row, in row order.

    Raises:
        ValueError: If ``lines`` is empty or None, if the rows have
            differing lengths, or if the rows have fewer fields than
            there are named columns.
    """
    keys = (
        'fileid',
        'exampleid',
        'tokenid',
        'token',
        'part-of-speech',
        'parse',
        'token.00',
        'token.01',
    )

    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
    if not lines:
        raise ValueError('parse_lines requires at least one row')

    width = len(lines[0])
    if width < len(keys):
        raise ValueError(
            'expected at least {} fields per row, got {}: {!r}'.format(
                len(keys), width, lines[0]))

    data = {k: [] for k in keys}
    for row in lines:
        if len(row) != width:
            raise ValueError(
                'rows have inconsistent lengths ({} != {}): {!r}'.format(
                    len(row), width, row))
        # zip stops after the named columns, dropping any extra fields.
        for k, value in zip(keys, row):
            data[k].append(value)
    return data
def read_file(path):
    """Read a file of blank-line-separated examples.

    Each non-blank line is split on whitespace into a row. Consecutive
    rows form one example, and every example is converted with
    ``parse_lines``. Runs of blank lines, and blank lines at the start
    of the file, are treated as a single separator, so ``parse_lines``
    is never handed an empty group.

    Args:
        path: Path of the file to read.

    Returns:
        A list with one ``parse_lines`` result per example, in file order.
    """
    result = []
    rows = []
    with open(path) as f:
        for line in f:
            fields = line.split()
            if fields:
                rows.append(fields)
            elif rows:
                # Blank line closes the current example, if there is one.
                result.append(parse_lines(rows))
                rows = []
    # The last example may not be followed by a blank line.
    if rows:
        result.append(parse_lines(rows))
    return result
def read_files(options):
    """Yield every example from the ``gold_conll`` files of a directory.

    File names in ``options.propbank`` are visited in sorted order; names
    that do not end with ``gold_conll`` are skipped. Each remaining file
    is loaded with ``read_file`` and its examples are yielded in order.

    Args:
        options: Object whose ``propbank`` attribute is a directory path.

    Yields:
        The items returned by ``read_file`` for each matching file.
    """
    directory = options.propbank
    conll_names = [
        name for name in sorted(os.listdir(directory))
        if name.endswith('gold_conll')
    ]
    for name in conll_names:
        yield from read_file(os.path.join(directory, name))
def main(options):
    """Index PropBank and PTB sentences by token tuple and count overlaps.

    PropBank examples come from ``read_files(options)`` and are keyed by
    the tuple of their ``'token'`` column. PTB records come from
    ``read_ptb(options.ptb)`` and are keyed by the tokens extracted from
    their ``'sentence1_binary_parse'`` field. In both passes only the
    first record seen for a key is kept; later duplicates are counted and
    skipped. Each newly added PTB key that also exists in the PropBank
    index increments ``key_match``.

    The size of each index and the running statistics are printed after
    each pass.

    Args:
        options: Object with ``propbank`` (used by ``read_files``) and
            ``ptb`` (path passed to ``read_ptb``) attributes.
    """
    stats = dict(skip_key_ptb=0, skip_key_propbank=0, key_match=0)

    propbank = {}
    for obj in tqdm(read_files(options)):
        key = tuple(obj['token'])
        if key in propbank:
            # Duplicate sentence: keep the first occurrence only.
            stats['skip_key_propbank'] += 1
            continue
        propbank[key] = obj

    print('propbank size = {}'.format(len(propbank)))
    print(stats)

    ptb = {}
    for obj in tqdm(read_ptb(options.ptb)):
        tokens, _ = convert_binary_bracketing(obj['sentence1_binary_parse'].strip())
        key = tuple(tokens)
        if key in ptb:
            # Duplicate sentence: keep the first occurrence only.
            stats['skip_key_ptb'] += 1
            continue
        ptb[key] = obj
        if key in propbank:
            stats['key_match'] += 1

    print('ptb size = {}'.format(len(ptb)))
    print(stats)
if __name__ == '__main__':
    # Script entry point: read the two input locations from the command
    # line (both default to paths under ~/data) and run the matching.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--propbank',
        type=str,
        default=os.path.expanduser('~/data/ptb-propbank-v3-1'),
    )
    parser.add_argument(
        '--ptb',
        type=str,
        default=os.path.expanduser('~/data/ptb.jsonl'),
    )
    options = parser.parse_args()
    main(options)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment