Created
May 6, 2020 16:02
-
-
Save hankcs/776e7d95c19e5ff5da8469fe4e9ab050 to your computer and use it in GitHub Desktop.
Script to restore empty nodes for IWPT 2020
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
def load_conll_to_str(path):
    """
    Load a conll file to a list of strings, each string represents a sentence in conll format.

    The file content is cut at blank lines (two consecutive newlines). Chunks
    that contain only whitespace are dropped, and newlines left at either end
    of a chunk are removed, so no returned sentence starts or ends with an
    empty line.

    :param path: path of the conll file; it is decoded as UTF-8
    :return: the sentences in file order
    :rtype: list
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(path, encoding='utf-8') as src:
        text = src.read()
    # Three or more newlines in a row, or a file that ends with a single
    # newline, leave stray newlines attached to a chunk after the split;
    # trim them so every chunk holds only the lines of its sentence.
    return [chunk.strip('\n') for chunk in text.split('\n\n') if chunk.strip()]
def _restore_sentence(sent):
    """Return one conll sentence with its collapsed edges expanded, followed by a blank line."""
    # Blank lines never reach the output, so drop them before any parsing.
    lines = [x for x in (raw.strip() for raw in sent.split('\n')) if x]

    # Word lines are the non-comment lines whose ID is a plain integer; this
    # leaves out range IDs such as "1-2" and decimal IDs such as "8.1".
    first_fields = [x.split()[0] for x in lines if not x.startswith('#')]
    word_ids = [field for field in first_fields if field.isdigit()]
    num_tokens = len(word_ids)
    if word_ids and int(word_ids[-1]) != num_tokens:
        # Raised explicitly (rather than via ``assert``) so the check also
        # runs under ``python -O``; the exception type is kept unchanged.
        raise AssertionError(
            f'last word ID {word_ids[-1]} does not match the number of words {num_tokens}')

    out = []
    empty_nodes = []  # "head:par" attachment of every new empty node, in creation order
    for line in lines:
        if line.startswith('#'):
            out.append(line)
            continue
        cells = line.split('\t')
        deps = cells[8].split('|')
        for i, dep in enumerate(deps):
            if '>' not in dep:
                continue
            head, rel = dep.split(':', 1)
            par, cur = rel.split('>', 1)
            # Only the first label after ``par`` is kept when the relation
            # contains more than one '>'.
            cur = cur.split('>')[0]
            empty_nodes.append(f'{head}:{par}')
            # The dependent now points at the empty node that was just created.
            deps[i] = f'{num_tokens}.{len(empty_nodes)}:{cur}'
        cells[8] = '|'.join(deps)
        out.append('\t'.join(cells))

    # Empty nodes go after the last line of the sentence: the ID, seven "_"
    # placeholders, the attachment in the 9th column and "_" in the 10th.
    for idx, attachment in enumerate(empty_nodes, 1):
        out.append(f'{num_tokens}.{idx}\t' + '_\t' * 7 + attachment + '\t_')
    return '\n'.join(out) + '\n\n'


def restore_collapse_edges(src, dst):
    """
    Restore collapse edges from src file and write everything to dst file.

    In the 9th tab-separated column of every non-comment line, each entry of
    the form ``head:par>cur`` is replaced by ``E:cur``, where ``E`` is a newly
    created empty node whose own 9th column is ``head:par``. The empty nodes
    of a sentence get the IDs ``N.1``, ``N.2``, ... (``N`` is the number of
    lines with a plain integer ID) and are appended after the last line of
    that sentence. Comment lines are copied, blank lines inside a sentence
    are skipped, and every sentence is followed by one blank line.

    :param src: source file
    :param dst: destination file, written as UTF-8
    :raises AssertionError: if the ID of the last word line of a sentence
        differs from the number of word lines in that sentence
    """
    sents = load_conll_to_str(src)
    # Encode explicitly as UTF-8 instead of relying on the platform default.
    with open(dst, 'w', encoding='utf-8') as out:
        for sent in sents:
            out.write(_restore_sentence(sent))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment