Last active
May 23, 2020 09:41
-
-
Save simonepri/f4ad91e935a066d39d39e9b0da472988 to your computer and use it in GitHub Desktop.
GED Utils
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from typing import * # pylint: disable=wildcard-import,unused-wildcard-import | |
import argparse | |
import os | |
import re | |
def main(args: argparse.Namespace) -> None:
    """
    Convert an M2 file into a two-column, tab-separated CoNLL-style file.

    Each non-space token of every original sentence is written on its own
    line, followed by a tab and a label: "B-INC" when the token index is
    reported by ``get_edit_indexes`` for the selected annotator, "B-COR"
    otherwise. An empty line terminates each sentence.

    Args:
        args: Parsed command line arguments. Reads ``m2_file_path`` (input),
            ``output`` (destination path) and ``annotator`` (annotator id).
    """
    # Use UTF-8 explicitly: without it open() falls back to the locale's
    # preferred encoding, which makes the result platform dependent.
    with open(args.m2_file_path, "r", encoding="utf-8") as in_m2:
        # Load the M2 file and split it into blocks (one per sentence).
        m2_blocks = in_m2.read().strip().split("\n\n")

    # In the M2 format, space tokens are themselves space separated. Turning
    # every separator space (one that touches a non-space character) into a
    # tab simplifies token splitting below.
    separator = re.compile(r"(?<=[^ ])([ ])|([ ])(?=[^ ])")

    # The output is only written, never read back, so plain "w" is enough.
    with open(args.output, "w", encoding="utf-8") as out_conll:
        for m2_block in m2_blocks:
            # An odd run of blank lines between blocks leaves a leading
            # newline on the next block, and an empty input yields a single
            # empty block. Normalise the former and skip the latter.
            m2_block = m2_block.strip("\n")
            if not m2_block:
                continue
            info = m2_block.split("\n")
            # First line is the sentence ("S tok tok ..."); [1:] drops "S".
            orig = separator.sub("\t", info[0]).split("\t")[1:]
            # Remaining lines are the edits; find the token indexes they touch.
            edit_indexes = get_edit_indexes(info[1:], args.annotator)
            for idx, tok in enumerate(orig):
                # Space (or empty) tokens are not emitted, but they still
                # count towards the token index.
                if tok.strip() == "":
                    continue
                label = "B-INC" if idx in edit_indexes else "B-COR"
                out_conll.write("\t".join((tok, label)) + "\n")
            # Newline at end of sentence
            out_conll.write("\n")
def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
    """
    Collect the original-sentence token positions touched by the given edits.

    Args:
        edits: Edit lines taken from one block of an m2 file; fields are
            separated by "|||" and the first field holds "A <start> <end>".
        annotator_id: Only edits whose annotator field equals this id count.

    Returns:
        The set of affected token indexes. An edit with an empty span
        (start == end, i.e. a missing word) marks the token at the start
        index; "noop" edits contribute nothing.
    """
    touched = set()
    for line in edits:
        fields = line.split("|||")
        # Every field is parsed before any filtering, so a malformed line
        # raises even if it would have been skipped afterwards.
        kind = fields[1]
        owner = int(fields[5])
        first, last = (int(number) for number in fields[0].split(" ")[1:3])
        # Skip "no error" markers and edits made by other annotators.
        if kind == "noop" or owner != annotator_id:
            continue
        if first == last:
            # Insertion: attributed to the token that follows the gap.
            touched.add(first)
        else:
            # Replacement/deletion: may cover several tokens.
            touched.update(range(first, last))
    return touched
def parse_args() -> argparse.Namespace:
    """
    Parse the command line of the M2 to CoNLL converter.

    Returns:
        A namespace with ``m2_file_path`` (positional), ``output``
        (required), ``annotator`` (int, defaults to 0) and ``debug`` (flag).
    """
    cli = argparse.ArgumentParser(description="Convert an M2 file to CoNLL format.")
    # (flags, keyword options) pairs, registered in declaration order.
    argument_specs = (
        (("m2_file_path",), {"type": str, "help": "Path to a M2 file."}),
        (
            ("--output", "-o"),
            {"type": str, "help": "The output filepath.", "required": True},
        ),
        (
            ("--annotator", "-a"),
            {"type": int, "default": 0, "help": "The annotator id to select."},
        ),
        (
            ("--debug",),
            {
                "action": "store_true",
                "help": "If provided it provides additional logging in case of errors.",
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
def normalize_args(args: argparse.Namespace) -> None:
    """
    Rewrite ``args.m2_file_path`` in place as its canonical real path.

    Args:
        args: Parsed command line arguments; modified in place.
    """
    canonical_path = os.path.realpath(args.m2_file_path)
    args.m2_file_path = canonical_path
def validate_args(args: argparse.Namespace) -> None:
    """
    Check that the input path points to an existing regular file.

    Args:
        args: Parsed command line arguments; only ``m2_file_path`` is read.

    Raises:
        ValueError: If ``m2_file_path`` is not "-" and is not an existing file.
    """
    m2_path = args.m2_file_path
    # The literal "-" is exempt from the file check.
    if m2_path == "-":
        return
    if os.path.isfile(m2_path):
        return
    raise ValueError("The provided M2 file path is invalid.")
def run() -> None:
    """
    Command line entry point: parse, normalize and validate the arguments,
    then run the conversion.

    A keyboard interrupt prints "Aborted!". Any other exception is printed
    as a one-line error, unless ``--debug`` was given, in which case it is
    re-raised so the full traceback is shown.
    """
    # Bound before the try block so the error handler below can always
    # inspect it, even if the failure happens while parsing the arguments.
    args = None
    try:
        args = parse_args()
        normalize_args(args)
        validate_args(args)
        main(args)
    except KeyboardInterrupt:
        print("\nAborted!")
    except Exception as err:  # pylint: disable=broad-except
        if args is not None and args.debug:
            raise
        print("Error: %s" % err)
# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment