simonepri · May 23, 2020 09:41
diff --git a/ged_utils.md b/ged_utils.md
diff --git a/ged_utils_parallel_to_conll.ipynb b/ged_utils_parallel_to_conll.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "ged-utils - parallel to conll",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gYUXp4NtAPDK",
        "colab_type": "text"
      },
      "source": [
        "# Setup GDrive"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2VjyO8py_lsZ",
        "colab_type": "code",
        "cellView": "form",
        "colab": {}
      },
      "source": [
        "import os\n",
        "from google.colab import drive as gdrive\n",
        "\n",
        "# @markdown Mount gdrive\n",
        "GDRIVE_ROOT = os.path.abspath('gdrive')\n",
        "GDRIVE_BASE = os.path.join(GDRIVE_ROOT, 'My Drive')\n",
        "print('[INFO] Mounting \"My Drive\" Google Drive in {}'.format(GDRIVE_BASE))\n",
        "gdrive.mount(GDRIVE_ROOT, force_remount = True)\n",
        "\n",
        "DATA_BASE = GDRIVE_BASE\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "G1jHAjOejZtK",
        "colab_type": "text"
      },
      "source": [
        "# Setup Code"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Jsh1siLojXim",
        "colab_type": "code",
        "cellView": "form",
        "colab": {}
      },
      "source": [
        "# @markdown errant_parallel\n",
        "!pip uninstall -y -qq spacy nltk en-core-web-sm fastai multiprocess\n",
        "!pip install --upgrade -qq errant==2.1.0 spacy==1.9.0\n",
        "!python -m spacy download en"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wOV9VL7Et61Z",
        "colab_type": "code",
        "cellView": "form",
        "colab": {}
      },
      "source": [
        "# @markdown m2_to_conll.py\n",
        "!wget -q -O m2_to_conll.py https://gist.githubusercontent.com/simonepri/f4ad91e935a066d39d39e9b0da472988/raw/m2_to_conll.py"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L1zpqsC-_qBk",
        "colab_type": "text"
      },
      "source": [
        "# Parallel -> CoNLL"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QCJm63YGRtFf",
        "colab_type": "code",
        "cellView": "form",
        "colab": {}
      },
      "source": [
        "# @markdown The paths are relative to the root directory on your GDrive.\n",
        "\n",
        "IN_ORIGINAL_FILE_PATH = 'file.orig' # @param {type:'string'}\n",
        "IN_ORIGINAL_FILE_PATH = os.path.join(DATA_BASE, IN_ORIGINAL_FILE_PATH)\n",
        "\n",
        "IN_CORRECTED_FILE_PATH = 'file.cor' # @param {type:'string'}\n",
        "IN_CORRECTED_FILE_PATH = os.path.join(DATA_BASE, IN_CORRECTED_FILE_PATH)\n",
        "\n",
        "OUT_M2_FILE_PATH = 'file.m2' # @param {type:'string'}\n",
        "OUT_M2_FILE_PATH = os.path.join(DATA_BASE, OUT_M2_FILE_PATH)\n",
        "\n",
        "OUT_CONLL_FILE_PATH = 'file.conll' # @param {type:'string'}\n",
        "OUT_CONLL_FILE_PATH = os.path.join(DATA_BASE, OUT_CONLL_FILE_PATH)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KJW9C2LiEUsO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!errant_parallel -tok -orig \"$IN_ORIGINAL_FILE_PATH\" -cor \"$IN_CORRECTED_FILE_PATH\" -out \"$OUT_M2_FILE_PATH\""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "2cgtagw_t1lx",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!python m2_to_conll.py \"$OUT_M2_FILE_PATH\" -o \"$OUT_CONLL_FILE_PATH\" --annotator 0"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
 }
diff --git a/m2_to_conll.py b/m2_to_conll.py
 #!/usr/bin/env python3

 from typing import *  # pylint: disable=wildcard-import,unused-wildcard-import

 import argparse
 import os
 import re


 def main(args: argparse.Namespace) -> None:
    """
    Convert an M2 file into a tab-separated token/label file.
        Args:
            args: Parsed arguments; reads `m2_file_path`, `output` and
                `annotator`.
        Every non-blank source token is written on its own line followed by
        a tab and `B-INC` (its index is touched by an edit of the selected
        annotator) or `B-COR` (it is not). A blank line closes each sentence.
    """
    with open(args.m2_file_path, "r") as m2_handle:
        # A sentence block is the "S" line plus its edit lines; blocks are
        # separated by an empty line.
        sentence_blocks = m2_handle.read().strip().split("\n\n")

    with open(args.output, "w+") as conll_handle:
        for block in sentence_blocks:
            source_line, *edit_lines = block.split("\n")
            # Tokens are space separated, but a token may itself be a space.
            # Turning only the separating spaces into tabs makes the split
            # below unambiguous.
            tabbed_line = re.sub(
                r"(?<=[^ ])([ ])|([ ])(?=[^ ])", "\t", source_line
            )
            # The first field is the leading "S" marker, not a token.
            tokens = tabbed_line.split("\t")[1:]
            flagged = get_edit_indexes(edit_lines, args.annotator)
            for position, token in enumerate(tokens):
                # Whitespace-only tokens keep their index but are not emitted.
                if not token.strip():
                    continue
                label = "B-INC" if position in flagged else "B-COR"
                conll_handle.write(token + "\t" + label + "\n")
            # Sentence boundary
            conll_handle.write("\n")


 def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
    """
    Collect the original-sentence token indexes touched by the given edits.
        Args:
            edits: Edit lines of one M2 block, fields separated by "|||"
            annotator_id: Only edits carrying this annotator id are used
        Returns:
            The set of affected token indexes. An edit with an empty span
            (start == end, i.e. a missing word) marks the token at `start`.
    """
    touched = set()
    for line in edits:
        fields = line.split("|||")
        # Every field is parsed before any filtering so that a malformed
        # line is reported regardless of its type or annotator.
        kind = fields[1]
        owner = int(fields[5])
        # fields[0] looks like "A <start> <end>"
        span_start, span_end = (int(value) for value in fields[0].split(" ")[1:3])

        # Skip "noop" (no error) edits and edits from other annotators.
        if kind == "noop" or owner != annotator_id:
            continue

        if span_start == span_end:
            # Insertion: blame the token that follows the gap.
            touched.add(span_start)
        else:
            # Replacement / deletion: every token in [start, end).
            touched.update(range(span_start, span_end))
    return touched


 def parse_args() -> argparse.Namespace:
    """
    Declare the command-line interface and parse the process arguments.
        Returns:
            A namespace with `m2_file_path`, `output`, `annotator` and
            `debug` attributes.
    """
    cli = argparse.ArgumentParser(description="Convert an M2 file to CoNLL format.")
    # Positional input
    cli.add_argument("m2_file_path", help="Path to a M2 file.", type=str)
    # Mandatory output location
    cli.add_argument(
        "--output", "-o", required=True, help="The output filepath.", type=str
    )
    # Annotator whose edits are used; defaults to the first one
    cli.add_argument(
        "--annotator", "-a", default=0, help="The annotator id to select.", type=int
    )
    cli.add_argument(
        "--debug",
        help="If provided it provides additional logging in case of errors.",
        action="store_true",
    )
    return cli.parse_args()


 def normalize_args(args: argparse.Namespace) -> None:
    """
    Replace `args.m2_file_path` with its canonical path, in place.
        Args:
            args: Parsed arguments; `m2_file_path` is overwritten with the
                result of `os.path.realpath`.
    """
    canonical_path = os.path.realpath(args.m2_file_path)
    args.m2_file_path = canonical_path


 def validate_args(args: argparse.Namespace) -> None:
    """
    Check that the input path points to an existing regular file.
        Args:
            args: Parsed arguments; only `m2_file_path` is inspected. The
                literal value "-" is accepted without a filesystem check.
        Raises:
            ValueError: If the path is not "-" and is not an existing file.
    """
    input_path = args.m2_file_path
    if input_path == "-" or os.path.isfile(input_path):
        return
    raise ValueError("The provided M2 file path is invalid.")


 def run() -> None:
    """
    Command-line entry point: parse, normalize and validate the arguments,
    then run the conversion.

    A keyboard interrupt prints "Aborted!". Any other exception is printed
    as a one-line "Error: ..." message, unless `--debug` was given, in which
    case it is re-raised so the full traceback is shown.
    """
    # Bound before the try block so the handler below can always inspect it,
    # even when the failure happens inside parse_args() itself.
    args = None
    try:
        args = parse_args()

        normalize_args(args)
        validate_args(args)
        main(args)
    except KeyboardInterrupt:
        print("\nAborted!")
    except Exception as err:  # pylint: disable=broad-except
        # Without parsed arguments there is no --debug flag to consult;
        # surface the original error rather than hiding it.
        if args is None or args.debug:
            raise
        print("Error: %s" % err)


 if __name__ == "__main__":
    run()
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "ged-utils - parallel to conll",
	"provenance": [],
	"collapsed_sections": [],
	"toc_visible": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "gYUXp4NtAPDK",
	"colab_type": "text"
	},
	"source": [
	"# Setup GDrive"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "2VjyO8py_lsZ",
	"colab_type": "code",
	"cellView": "form",
	"colab": {}
	},
	"source": [
	"import os\n",
	"from google.colab import drive as gdrive\n",
	"\n",
	"# @markdown Mount gdrive\n",
	"GDRIVE_ROOT = os.path.abspath('gdrive')\n",
	"GDRIVE_BASE = os.path.join(GDRIVE_ROOT, 'My Drive')\n",
	"print('[INFO] Mounting \"My Drive\" Google Drive in {}'.format(GDRIVE_BASE))\n",
	"gdrive.mount(GDRIVE_ROOT, force_remount = True)\n",
	"\n",
	"DATA_BASE = GDRIVE_BASE\n"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "G1jHAjOejZtK",
	"colab_type": "text"
	},
	"source": [
	"# Setup Code"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Jsh1siLojXim",
	"colab_type": "code",
	"cellView": "form",
	"colab": {}
	},
	"source": [
	"# @markdown errant_parallel\n",
	"!pip uninstall -y -qq spacy nltk en-core-web-sm fastai multiprocess\n",
	"!pip install --upgrade -qq errant==2.1.0 spacy==1.9.0\n",
	"!python -m spacy download en"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "wOV9VL7Et61Z",
	"colab_type": "code",
	"cellView": "form",
	"colab": {}
	},
	"source": [
	"# @markdown m2_to_conll.py\n",
	"!wget -q -O m2_to_conll.py https://gist.githubusercontent.com/simonepri/f4ad91e935a066d39d39e9b0da472988/raw/m2_to_conll.py"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "L1zpqsC-_qBk",
	"colab_type": "text"
	},
	"source": [
	"# Parallel -> CoNLL"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "QCJm63YGRtFf",
	"colab_type": "code",
	"cellView": "form",
	"colab": {}
	},
	"source": [
	"# @markdown The paths are relative to the root directory on your GDrive.\n",
	"\n",
	"IN_ORIGINAL_FILE_PATH = 'file.orig' # @param {type:'string'}\n",
	"IN_ORIGINAL_FILE_PATH = os.path.join(DATA_BASE, IN_ORIGINAL_FILE_PATH)\n",
	"\n",
	"IN_CORRECTED_FILE_PATH = 'file.cor' # @param {type:'string'}\n",
	"IN_CORRECTED_FILE_PATH = os.path.join(DATA_BASE, IN_CORRECTED_FILE_PATH)\n",
	"\n",
	"OUT_M2_FILE_PATH = 'file.m2' # @param {type:'string'}\n",
	"OUT_M2_FILE_PATH = os.path.join(DATA_BASE, OUT_M2_FILE_PATH)\n",
	"\n",
	"OUT_CONLL_FILE_PATH = 'file.conll' # @param {type:'string'}\n",
	"OUT_CONLL_FILE_PATH = os.path.join(DATA_BASE, OUT_CONLL_FILE_PATH)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "KJW9C2LiEUsO",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"!errant_parallel -tok -orig \"$IN_ORIGINAL_FILE_PATH\" -cor \"$IN_CORRECTED_FILE_PATH\" -out \"$OUT_M2_FILE_PATH\""
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "2cgtagw_t1lx",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"!python m2_to_conll.py \"$OUT_M2_FILE_PATH\" -o \"$OUT_CONLL_FILE_PATH\" --annotator 0"
	],
	"execution_count": 0,
	"outputs": []
	}
	]
	}
	#!/usr/bin/env python3

	from typing import * # pylint: disable=wildcard-import,unused-wildcard-import

	import argparse
	import os
	import re


	def main(args: argparse.Namespace) -> None:
	    """
	    Convert an M2 file into a tab-separated token/label file.
	        Args:
	            args: Parsed arguments; reads `m2_file_path`, `output` and
	                `annotator`.
	        Every non-blank source token is written on its own line followed by
	        a tab and `B-INC` (its index is touched by an edit of the selected
	        annotator) or `B-COR` (it is not). A blank line closes each sentence.
	    """
	    with open(args.m2_file_path, "r") as in_m2:
	        # Load the M2 file and split into blocks
	        m2_blocks = in_m2.read().strip().split("\n\n")

	    with open(args.output, "w+") as out_conll:
	        # Loop through the M2 file blocks
	        for m2_block in m2_blocks:
	            info = m2_block.split("\n")
	            # In the M2 format, space edits are also space separated.
	            # We insert a tab character where appropriate to simplify token
	            # splitting in the next line. The unescaped `|` is regex
	            # alternation: a space is a separator when it follows OR
	            # precedes a non-space character.
	            info[0] = re.sub(r"(?<=[^ ])([ ])|([ ])(?=[^ ])", "\t", info[0])
	            # Get the orig sent and edits
	            orig = info[0].split("\t")[1:]  # 1: ignores "S"
	            edits = info[1:]
	            # Get the indexes of the edited tokens
	            edit_indexes = get_edit_indexes(edits, args.annotator)
	            # Loop through tokens
	            for idx, tok in enumerate(orig):
	                # Spaces
	                if tok.strip() == "":
	                    continue
	                # Incorrect
	                if idx in edit_indexes:
	                    out_conll.write("\t".join([tok, "B-INC"]) + "\n")
	                # Correct
	                else:
	                    out_conll.write("\t".join([tok, "B-COR"]) + "\n")
	            # Newline at end of sentence
	            out_conll.write("\n")


	def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
	    """
	    Get token indexes in the original sentence that are modified by the edits
	    provided.
	        Args:
	            edits: A list of edit lines from an m2 file
	            annotator_id: The annotator id to select
	        Returns:
	            A set of edited token indexes. Missing words affect the next token.
	        Raises:
	            IndexError: If an edit line has fewer than six "|||" fields.
	            ValueError: If the span or annotator id is not an integer.
	    """
	    edit_indexes = set()
	    for edit in edits:
	        # Fields of an edit line are delimited by three literal pipes.
	        parts = edit.split("|||")
	        # Get edit type
	        edit_type = parts[1]
	        # Get edit annotator id
	        edit_annotator_id = int(parts[5])
	        # Get the edit start and end span
	        edit_start_idx, edit_end_idx = tuple(map(int, parts[0].split(" ")[1:3]))

	        # Ignore noop edits; i.e. no errors
	        if edit_type == "noop":
	            continue

	        # Choose only edits by the specified annotator
	        if edit_annotator_id != annotator_id:
	            continue

	        if edit_start_idx == edit_end_idx:
	            # Missing words defined as affecting the next token
	            edit_indexes.add(edit_start_idx)
	        else:
	            # Other edits may be more than one token
	            edit_indexes.update(range(edit_start_idx, edit_end_idx))
	    return edit_indexes


	def parse_args() -> argparse.Namespace:
	    """
	    Declare the command-line interface and parse the process arguments.
	        Returns:
	            A namespace with `m2_file_path`, `output`, `annotator` and
	            `debug` attributes.
	    """
	    parser = argparse.ArgumentParser(description="Convert an M2 file to CoNLL format.")
	    parser.add_argument(
	        "m2_file_path", type=str, help="Path to a M2 file."
	    )
	    parser.add_argument(
	        "--output", "-o", type=str, help="The output filepath.", required=True,
	    )
	    parser.add_argument(
	        "--annotator",
	        "-a",
	        type=int,
	        default=0,
	        help="The annotator id to select.",
	    )
	    parser.add_argument(
	        "--debug",
	        action="store_true",
	        help="If provided it provides additional logging in case of errors.",
	    )

	    args = parser.parse_args()
	    return args


	def normalize_args(args: argparse.Namespace) -> None:
	    """
	    Replace `args.m2_file_path` with its canonical path, in place.
	        Args:
	            args: Parsed arguments; `m2_file_path` is overwritten with the
	                result of `os.path.realpath`.
	    """
	    args.m2_file_path = os.path.realpath(args.m2_file_path)


	def validate_args(args: argparse.Namespace) -> None:
	    """
	    Check that the input path points to an existing regular file.
	        Args:
	            args: Parsed arguments; only `m2_file_path` is inspected. The
	                literal value "-" is accepted without a filesystem check.
	        Raises:
	            ValueError: If the path is not "-" and is not an existing file.
	    """
	    if args.m2_file_path != "-":
	        if not os.path.isfile(args.m2_file_path):
	            raise ValueError("The provided M2 file path is invalid.")


	def run() -> None:
	    """
	    Command-line entry point: parse, normalize and validate the arguments,
	    then run the conversion.

	    A keyboard interrupt prints "Aborted!". Any other exception is printed
	    as a one-line "Error: ..." message, unless `--debug` was given, in which
	    case it is re-raised so the full traceback is shown.
	    """
	    # Bound before the try block so the handler below can always inspect it,
	    # even when the failure happens inside parse_args() itself.
	    args = None
	    try:
	        args = parse_args()

	        normalize_args(args)
	        validate_args(args)
	        main(args)
	    except KeyboardInterrupt:
	        print("\nAborted!")
	    except Exception as err:  # pylint: disable=broad-except
	        # Without parsed arguments there is no --debug flag to consult;
	        # surface the original error rather than hiding it.
	        if args is None or args.debug:
	            raise
	        print("Error: %s" % err)


	if __name__ == "__main__":
	    run()