Skip to content

Instantly share code, notes, and snippets.

@hankcs
Created May 6, 2020 16:02
Show Gist options
  • Save hankcs/776e7d95c19e5ff5da8469fe4e9ab050 to your computer and use it in GitHub Desktop.
Save hankcs/776e7d95c19e5ff5da8469fe4e9ab050 to your computer and use it in GitHub Desktop.
Script to restore empty nodes for IWPT 2020
# -*- coding:utf-8 -*-
def load_conll_to_str(path):
    """
    Load a CoNLL file into a list of strings, one string per sentence.

    The whole file is read at once and split on blank lines (two consecutive
    newline characters). Chunks that contain nothing but whitespace are
    dropped, so leading/trailing blank lines do not yield empty sentences.

    :param path: path of the CoNLL file to read; it is decoded as UTF-8
    :return: the raw text of every sentence, in file order
    :rtype: list
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not guaranteed to be UTF-8 on every system.
    with open(path, encoding='utf-8') as src:
        text = src.read()
    return [x for x in text.split('\n\n') if x.strip()]
def restore_collapse_edges(src, dst):
    """
    Restore collapsed edges from the src file and write everything to the dst file.

    Every sentence of ``src`` is copied to ``dst``. A DEPS entry (9th column)
    of the form ``head:par>cur`` is rewritten as ``N.k:cur``, where ``N`` is
    the number of tokens in the sentence and ``k`` counts the empty nodes
    created so far in that sentence. For each such entry a line for the empty
    node ``N.k`` is appended after the sentence's last line, carrying
    ``head:par`` in its DEPS column and ``_`` in every other column.

    Only the first two relations of a chain are kept: for ``a>b>c`` the empty
    node receives ``a`` and the token receives ``b``.

    NOTE(review): the input is presumably free of empty-node lines already;
    existing ``N.k`` IDs are not checked for collisions.

    :param src: source file
    :param dst: destination file
    :raises AssertionError: if the ID of a sentence's last token differs from
        the number of tokens found in that sentence
    """
    sents = load_conll_to_str(src)
    # Write UTF-8 explicitly instead of relying on the platform default.
    with open(dst, 'w', encoding='utf-8') as out:
        for each in sents:
            out.write(_restore_sentence(each))


def _restore_sentence(sentence):
    """Return one sentence's text with collapsed edges expanded into empty nodes."""
    # Drop blank lines up front: a chunk may start or end with a newline
    # (e.g. the file lacks a final blank line) and a blank line has no ID.
    lines = [x.strip() for x in sentence.split('\n')]
    lines = [x for x in lines if x]
    # Regular tokens have a purely numeric ID; comments and range IDs such as
    # "1-2" are excluded.
    tokens = [x for x in lines if not x.startswith('#') and x.split()[0].isdigit()]
    num_tokens = len(tokens)
    if tokens:
        last_id = int(tokens[-1].split()[0])
        if last_id != num_tokens:
            # Same exception type the original `assert` produced, but raised
            # explicitly so the check is not stripped under `python -O`.
            raise AssertionError(
                f'last token ID {last_id} does not match token count {num_tokens}'
            )

    empty_nodes = {}  # empty-node ID -> "head:rel" entry for its DEPS column
    out_lines = []
    for line in lines:
        if line.startswith('#'):
            out_lines.append(line)
            continue
        cells = line.split('\t')
        deps = cells[8].split('|')
        for i, d in enumerate(deps):
            if '>' not in d:
                continue
            head, rel = d.split(':', 1)
            ehead = f'{num_tokens}.{len(empty_nodes) + 1}'
            par, cur = rel.split('>', 1)
            # Anything after a second '>' is discarded.
            cur = cur.split('>')[0]
            deps[i] = f'{ehead}:{cur}'
            empty_nodes[ehead] = f'{head}:{par}'
        cells[8] = '|'.join(deps)
        out_lines.append('\t'.join(cells))

    # Empty nodes go after the last line: ID, seven '_' columns, DEPS, '_'.
    for ehead, edge in empty_nodes.items():
        out_lines.append(f'{ehead}\t' + '_\t' * 7 + edge + '\t_')
    # One newline per line plus the blank line that terminates the sentence.
    return ''.join(f'{x}\n' for x in out_lines) + '\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment