simonepri · May 23, 2020 09:41
diff --git a/ged_utils.md b/ged_utils.md
diff --git a/ged_utils_parallel_to_conll.ipynb b/ged_utils_parallel_to_conll.ipynb
diff --git a/m2_to_conll.py b/m2_to_conll.py
 #!/usr/bin/env python3

 from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import

 import argparse
 import os
 import re


 def main(args: argparse.Namespace) -> None:
    """
    Convert an M2 file into a two-column, tab-separated CoNLL file.

    Each token of every original sentence is written on its own line followed
    by "B-INC" when it is covered by an edit of the selected annotator, or by
    "B-COR" otherwise. An empty line is written after each sentence.
        Args:
            args: The parsed arguments. Reads `m2_file_path`, `output` and
                `annotator`.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(args.m2_file_path, "r", encoding="utf-8") as in_m2:
        # Load the M2 file and split into blocks. A run of several blank
        # lines counts as one separator, so no block starts with an empty line.
        m2_blocks = re.split(r"\n{2,}", in_m2.read().strip())

    # Matches every space that touches a non-space character on either side.
    space_separator = re.compile(r"(?<=[^ ])([ ])|([ ])(?=[^ ])")

    with open(args.output, "w", encoding="utf-8") as out_conll:
        # Loop through the M2 file blocks
        for m2_block in m2_blocks:
            # An empty input file yields one empty block: nothing to write
            if not m2_block:
                continue
            info = m2_block.split("\n")
            # In the M2 format, space edits are also space separated.
            # We insert a tab character where appropriate to simplify token
            # splitting in the next line.
            info[0] = space_separator.sub("\t", info[0])
            # Get the orig sent and edits
            orig = info[0].split("\t")[1:]  # 1: ignores "S"
            edits = info[1:]
            # Get the indexes of the edited tokens
            edit_indexes = get_edit_indexes(edits, args.annotator)
            # Loop through tokens
            for idx, tok in enumerate(orig):
                # Whitespace-only tokens still consume an index but are
                # not written to the output
                if tok.strip() == "":
                    continue
                label = "B-INC" if idx in edit_indexes else "B-COR"
                out_conll.write("\t".join([tok, label]) + "\n")
            # Newline at end of sentence
            out_conll.write("\n")


 def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
    """
    Get token indexes in the original sentence that are modified by the edits
    provided.

    Each edit line is split on "|||": field 0 holds the start and end token
    offsets after a leading marker word, field 1 the edit type and field 5
    the annotator id.
        Args:
            edits: A list of edit lines from an m2 file. Blank lines are
                ignored.
            annotator_id: The annotator id to select
        Returns:
            A set of edited token indexes. Missing words affect the next token.
    """
    edit_indexes: Set[int] = set()
    for edit in edits:
        # Tolerate stray blank lines instead of failing with an IndexError
        if not edit.strip():
            continue

        parts = edit.split("|||")

        # Ignore noop edits; i.e. no errors
        if parts[1] == "noop":
            continue

        # Choose only edits by the specified annotator
        if int(parts[5]) != annotator_id:
            continue

        # Get the edit start and end span
        edit_start_idx, edit_end_idx = map(int, parts[0].split(" ")[1:3])

        if edit_start_idx == edit_end_idx:
            # Missing words defined as affecting the next token
            edit_indexes.add(edit_start_idx)
        else:
            # Other edits may be more than one token
            edit_indexes.update(range(edit_start_idx, edit_end_idx))
    return edit_indexes


 def parse_args() -> argparse.Namespace:
    """
    Build the command line parser and parse the process arguments.
        Returns:
            The parsed namespace with `m2_file_path`, `output`, `annotator`
            and `debug` attributes.
    """
    cli = argparse.ArgumentParser(description="Convert an M2 file to CoNLL format.")

    # (names, keyword options) for every argument, in registration order
    argument_specs = [
        (("m2_file_path",), {"type": str, "help": "Path to a M2 file."}),
        (
            ("--output", "-o"),
            {"type": str, "help": "The output filepath.", "required": True},
        ),
        (
            ("--annotator", "-a"),
            {"type": int, "default": 0, "help": "The annotator id to select."},
        ),
        (
            ("--debug",),
            {
                "action": "store_true",
                "help": "If provided it provides additional logging in case of errors.",
            },
        ),
    ]
    for names, options in argument_specs:
        cli.add_argument(*names, **options)

    return cli.parse_args()


 def normalize_args(args: argparse.Namespace) -> None:
    """
    Replace `args.m2_file_path` in place with its canonical path, as computed
    by `os.path.realpath`.
        Args:
            args: The parsed arguments; modified in place.
    """
    given_path = args.m2_file_path
    canonical_path = os.path.realpath(given_path)
    args.m2_file_path = canonical_path


 def validate_args(args: argparse.Namespace) -> None:
    """
    Check that the parsed arguments are usable.
        Args:
            args: The parsed arguments; only `m2_file_path` is inspected.
        Raises:
            ValueError: If `m2_file_path` is neither "-" nor an existing file.
    """
    m2_path = args.m2_file_path
    # "-" is exempt from the file existence check
    if m2_path == "-":
        return
    if os.path.isfile(m2_path):
        return
    raise ValueError("The provided M2 file path is invalid.")


 def run() -> None:
    """
    Command line entry point: parse, normalize and validate the arguments,
    then run the conversion.

    A keyboard interrupt prints "Aborted!". Any other error is printed as
    "Error: <message>" and the process exits with status 1, unless `--debug`
    was given, in which case the original exception is re-raised.
        Raises:
            SystemExit: With code 1 when the conversion fails without
                `--debug`.
    """
    # Parse outside the try block so that `args` is always bound when the
    # error handler below reads `args.debug`.
    args = parse_args()

    try:
        normalize_args(args)
        validate_args(args)
        main(args)
    except KeyboardInterrupt:
        print("\nAborted!")
    except Exception as err:  # pylint: disable=broad-except
        if args.debug:
            raise
        print("Error: %s" % err)
        # Report the failure to the caller through a non-zero exit status
        raise SystemExit(1) from err

 # Run the command line interface only when this file is executed as a script.
 if __name__ == "__main__":
    run()
	#!/usr/bin/env python3

	from typing import * # pylint: disable=wildcard-import,unused-wildcard-import

	import argparse
	import os
	import re


	def main(args: argparse.Namespace) -> None:
	    """
	    Convert an M2 file into a two-column, tab-separated CoNLL file.

	    Each token of every original sentence is written on its own line followed
	    by "B-INC" when it is covered by an edit of the selected annotator, or by
	    "B-COR" otherwise. An empty line is written after each sentence.
	        Args:
	            args: The parsed arguments. Reads `m2_file_path`, `output` and
	                `annotator`.
	    """
	    # Read as UTF-8 explicitly so decoding does not depend on the platform's
	    # default encoding.
	    with open(args.m2_file_path, "r", encoding="utf-8") as in_m2:
	        # Load the M2 file and split into blocks. A run of several blank
	        # lines counts as one separator, so no block starts with an empty line.
	        m2_blocks = re.split(r"\n{2,}", in_m2.read().strip())

	    # Matches every space that touches a non-space character on either side.
	    # The two alternatives are joined by a plain "|" (alternation).
	    space_separator = re.compile(r"(?<=[^ ])([ ])|([ ])(?=[^ ])")

	    with open(args.output, "w", encoding="utf-8") as out_conll:
	        # Loop through the M2 file blocks
	        for m2_block in m2_blocks:
	            # An empty input file yields one empty block: nothing to write
	            if not m2_block:
	                continue
	            info = m2_block.split("\n")
	            # In the M2 format, space edits are also space separated.
	            # We insert a tab character where appropriate to simplify token
	            # splitting in the next line.
	            info[0] = space_separator.sub("\t", info[0])
	            # Get the orig sent and edits
	            orig = info[0].split("\t")[1:]  # 1: ignores "S"
	            edits = info[1:]
	            # Get the indexes of the edited tokens
	            edit_indexes = get_edit_indexes(edits, args.annotator)
	            # Loop through tokens
	            for idx, tok in enumerate(orig):
	                # Whitespace-only tokens still consume an index but are
	                # not written to the output
	                if tok.strip() == "":
	                    continue
	                label = "B-INC" if idx in edit_indexes else "B-COR"
	                out_conll.write("\t".join([tok, label]) + "\n")
	            # Newline at end of sentence
	            out_conll.write("\n")


	def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
	    """
	    Get token indexes in the original sentence that are modified by the edits
	    provided.

	    Each edit line is split on "|||": field 0 holds the start and end token
	    offsets after a leading marker word, field 1 the edit type and field 5
	    the annotator id.
	        Args:
	            edits: A list of edit lines from an m2 file. Blank lines are
	                ignored.
	            annotator_id: The annotator id to select
	        Returns:
	            A set of edited token indexes. Missing words affect the next token.
	    """
	    edit_indexes: Set[int] = set()
	    for edit in edits:
	        # Tolerate stray blank lines instead of failing with an IndexError
	        if not edit.strip():
	            continue

	        # The separator is three literal pipe characters, without backslashes
	        parts = edit.split("|||")

	        # Ignore noop edits; i.e. no errors
	        if parts[1] == "noop":
	            continue

	        # Choose only edits by the specified annotator
	        if int(parts[5]) != annotator_id:
	            continue

	        # Get the edit start and end span
	        edit_start_idx, edit_end_idx = map(int, parts[0].split(" ")[1:3])

	        if edit_start_idx == edit_end_idx:
	            # Missing words defined as affecting the next token
	            edit_indexes.add(edit_start_idx)
	        else:
	            # Other edits may be more than one token
	            edit_indexes.update(range(edit_start_idx, edit_end_idx))
	    return edit_indexes


	def parse_args() -> argparse.Namespace:
	    """
	    Build the command line parser and parse the process arguments.
	        Returns:
	            The parsed namespace with `m2_file_path`, `output`, `annotator`
	            and `debug` attributes.
	    """
	    parser = argparse.ArgumentParser(description="Convert an M2 file to CoNLL format.")
	    parser.add_argument(
	        "m2_file_path", type=str, help="Path to a M2 file."
	    )
	    parser.add_argument(
	        "--output", "-o", type=str, help="The output filepath.", required=True,
	    )
	    parser.add_argument(
	        "--annotator",
	        "-a",
	        type=int,
	        default=0,
	        help="The annotator id to select.",
	    )
	    parser.add_argument(
	        "--debug",
	        action="store_true",
	        help="If provided it provides additional logging in case of errors.",
	    )

	    return parser.parse_args()


	def normalize_args(args: argparse.Namespace) -> None:
	    """
	    Replace `args.m2_file_path` in place with its canonical path, as computed
	    by `os.path.realpath`.
	        Args:
	            args: The parsed arguments; modified in place.
	    """
	    args.m2_file_path = os.path.realpath(args.m2_file_path)


	def validate_args(args: argparse.Namespace) -> None:
	    """
	    Check that the parsed arguments are usable.
	        Args:
	            args: The parsed arguments; only `m2_file_path` is inspected.
	        Raises:
	            ValueError: If `m2_file_path` is neither "-" nor an existing file.
	    """
	    # "-" is exempt from the file existence check
	    if args.m2_file_path != "-":
	        if not os.path.isfile(args.m2_file_path):
	            raise ValueError("The provided M2 file path is invalid.")


	def run() -> None:
	    """
	    Command line entry point: parse, normalize and validate the arguments,
	    then run the conversion.

	    A keyboard interrupt prints "Aborted!". Any other error is printed as
	    "Error: <message>" and the process exits with status 1, unless `--debug`
	    was given, in which case the original exception is re-raised.
	        Raises:
	            SystemExit: With code 1 when the conversion fails without
	                `--debug`.
	    """
	    # Parse outside the try block so that `args` is always bound when the
	    # error handler below reads `args.debug`.
	    args = parse_args()

	    try:
	        normalize_args(args)
	        validate_args(args)
	        main(args)
	    except KeyboardInterrupt:
	        print("\nAborted!")
	    except Exception as err:  # pylint: disable=broad-except
	        if args.debug:
	            raise
	        print("Error: %s" % err)
	        # Report the failure to the caller through a non-zero exit status
	        raise SystemExit(1) from err


	# Run the command line interface only when this file is executed as a script.
	if __name__ == "__main__":
	    run()