Skip to content

Instantly share code, notes, and snippets.

@simonepri
Last active May 23, 2020 09:41
Show Gist options
  • Save simonepri/f4ad91e935a066d39d39e9b0da472988 to your computer and use it in GitHub Desktop.
Save simonepri/f4ad91e935a066d39d39e9b0da472988 to your computer and use it in GitHub Desktop.
GED Utils
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "ged-utils - parallel to conll",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "gYUXp4NtAPDK",
"colab_type": "text"
},
"source": [
"# Setup GDrive"
]
},
{
"cell_type": "code",
"metadata": {
"id": "2VjyO8py_lsZ",
"colab_type": "code",
"cellView": "form",
"colab": {}
},
"source": [
"import os\n",
"from google.colab import drive as gdrive\n",
"\n",
"# @markdown Mount gdrive\n",
"GDRIVE_ROOT = os.path.abspath('gdrive')\n",
"GDRIVE_BASE = os.path.join(GDRIVE_ROOT, 'My Drive')\n",
"print('[INFO] Mounting \"My Drive\" Google Drive in {}'.format(GDRIVE_BASE))\n",
"gdrive.mount(GDRIVE_ROOT, force_remount = True)\n",
"\n",
"DATA_BASE = GDRIVE_BASE\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "G1jHAjOejZtK",
"colab_type": "text"
},
"source": [
"# Setup Code"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Jsh1siLojXim",
"colab_type": "code",
"cellView": "form",
"colab": {}
},
"source": [
"# @markdown errant_parallel\n",
"!pip uninstall -y -qq spacy nltk en-core-web-sm fastai multiprocess\n",
"!pip install --upgrade -qq errant==2.1.0 spacy==1.9.0\n",
"!python -m spacy download en"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "wOV9VL7Et61Z",
"colab_type": "code",
"cellView": "form",
"colab": {}
},
"source": [
"# @markdown m2_to_conll.py\n",
"!wget -q -O m2_to_conll.py https://gist.githubusercontent.com/simonepri/f4ad91e935a066d39d39e9b0da472988/raw/m2_to_conll.py"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "L1zpqsC-_qBk",
"colab_type": "text"
},
"source": [
"# Parallel -> CoNLL"
]
},
{
"cell_type": "code",
"metadata": {
"id": "QCJm63YGRtFf",
"colab_type": "code",
"cellView": "form",
"colab": {}
},
"source": [
"# @markdown The paths are relative to the root directory on your GDrive.\n",
"\n",
"IN_ORIGINAL_FILE_PATH = 'file.orig' # @param {type:'string'}\n",
"IN_ORIGINAL_FILE_PATH = os.path.join(DATA_BASE, IN_ORIGINAL_FILE_PATH)\n",
"\n",
"IN_CORRECTED_FILE_PATH = 'file.cor' # @param {type:'string'}\n",
"IN_CORRECTED_FILE_PATH = os.path.join(DATA_BASE, IN_CORRECTED_FILE_PATH)\n",
"\n",
"OUT_M2_FILE_PATH = 'file.m2' # @param {type:'string'}\n",
"OUT_M2_FILE_PATH = os.path.join(DATA_BASE, OUT_M2_FILE_PATH)\n",
"\n",
"OUT_CONLL_FILE_PATH = 'file.conll' # @param {type:'string'}\n",
"OUT_CONLL_FILE_PATH = os.path.join(DATA_BASE, OUT_CONLL_FILE_PATH)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "KJW9C2LiEUsO",
"colab_type": "code",
"colab": {}
},
"source": [
"!errant_parallel -tok -orig \"$IN_ORIGINAL_FILE_PATH\" -cor \"$IN_CORRECTED_FILE_PATH\" -out \"$OUT_M2_FILE_PATH\""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "2cgtagw_t1lx",
"colab_type": "code",
"colab": {}
},
"source": [
"!python m2_to_conll.py \"$OUT_M2_FILE_PATH\" -o \"$OUT_CONLL_FILE_PATH\" --annotator 0"
],
"execution_count": 0,
"outputs": []
}
]
}
#!/usr/bin/env python3
from typing import * # pylint: disable=wildcard-import,unused-wildcard-import
import argparse
import os
import re
def main(args: argparse.Namespace) -> None:
    """
    Convert an M2 file into a two-column, tab-separated CoNLL file.

    Every non-blank token of each original ("S") sentence is written on its own
    line, followed by "B-INC" when the selected annotator edited it and by
    "B-COR" otherwise. An empty line is written after every sentence.

    Args:
      args: The parsed arguments. `m2_file_path` is the input path, `output`
        the output path and `annotator` the id of the annotator whose edits
        are taken into account.
    """
    # In the M2 format tokens are space separated, and a token may itself be a
    # space. A space that touches a non-space character is a separator; turning
    # those into tabs makes token splitting trivial while keeping space tokens.
    separator = re.compile(r"(?<=[^ ])([ ])|([ ])(?=[^ ])")

    # The encoding is explicit so that the result does not depend on the
    # default locale encoding of the platform.
    with open(args.m2_file_path, "r", encoding="utf-8") as in_m2:
        # Sentence blocks are separated by blank lines; accept runs of them so
        # that a block never starts with an empty line.
        m2_blocks = re.split(r"\n{2,}", in_m2.read().strip())

    with open(args.output, "w", encoding="utf-8") as out_conll:
        # Loop through the M2 file blocks
        for m2_block in m2_blocks:
            # An empty input yields one empty block: there is nothing to emit
            if not m2_block:
                continue
            info = m2_block.split("\n")
            # Get the orig sent and edits
            orig = separator.sub("\t", info[0]).split("\t")[1:]  # 1: ignores "S"
            edits = info[1:]
            # Get the indexes of the edited tokens
            edit_indexes = get_edit_indexes(edits, args.annotator)
            # Loop through tokens
            for idx, tok in enumerate(orig):
                # Space tokens are not written, but they still consume an index
                if tok.strip() == "":
                    continue
                label = "B-INC" if idx in edit_indexes else "B-COR"
                out_conll.write("\t".join([tok, label]) + "\n")
            # Newline at end of sentence
            out_conll.write("\n")
def get_edit_indexes(edits: List[str], annotator_id: int) -> Set[int]:
    """
    Collect the positions of the original-sentence tokens touched by the edits
    of one annotator.

    Args:
      edits: The edit lines of one M2 sentence block, with fields separated by
        "|||" (e.g. "A 1 2|||R:VERB|||went|||REQUIRED|||-NONE-|||0").
      annotator_id: Only edits carrying this annotator id are considered.
    Returns:
      The set of affected token indexes. An edit with an empty span (start
      equal to end) contributes its start index, i.e. the token that follows
      the insertion point.
    """
    touched = set()
    for line in edits:
        fields = line.split("|||")
        # Field 1 is the edit type, field 5 the annotator id, and field 0
        # holds the marker followed by the start and end of the span.
        kind = fields[1]
        owner = int(fields[5])
        span_start, span_end = (int(value) for value in fields[0].split(" ")[1:3])
        # Skip "no error" markers and edits made by other annotators
        if kind == "noop" or owner != annotator_id:
            continue
        if span_start == span_end:
            # Nothing is covered by the span: flag the token right after it
            touched.add(span_start)
        else:
            # The span may cover several tokens
            touched.update(range(span_start, span_end))
    return touched
def parse_args() -> argparse.Namespace:
    """
    Declare the command line interface and parse the process arguments.

    Returns:
      A namespace exposing `m2_file_path`, `output`, `annotator` and `debug`.
    """
    # One (names, options) entry per argument, registered in this order
    argument_specs = (
        (("m2_file_path",), {"type": str, "help": "Path to a M2 file."}),
        (
            ("--output", "-o"),
            {"type": str, "help": "The output filepath.", "required": True},
        ),
        (
            ("--annotator", "-a"),
            {"type": int, "default": 0, "help": "The annotator id to select."},
        ),
        (
            ("--debug",),
            {
                "action": "store_true",
                "help": "If provided it provides additional logging in case of errors.",
            },
        ),
    )
    cli = argparse.ArgumentParser(description="Convert an M2 file to CoNLL format.")
    for names, options in argument_specs:
        cli.add_argument(*names, **options)
    return cli.parse_args()
def normalize_args(args: argparse.Namespace) -> None:
    """
    Replace the M2 input path held by `args` with its canonical form, in place.

    Args:
      args: The parsed arguments; `m2_file_path` is overwritten with the value
        returned by `os.path.realpath` for it.
    """
    canonical_path = os.path.realpath(args.m2_file_path)
    args.m2_file_path = canonical_path
def validate_args(args: argparse.Namespace) -> None:
    """
    Check that the M2 input path points to an existing file.

    The "-" placeholder is accepted without looking at the file system.

    Args:
      args: The parsed arguments whose `m2_file_path` is checked.
    Raises:
      ValueError: If the path is not "-" and is not an existing file.
    """
    m2_path = args.m2_file_path
    if m2_path == "-" or os.path.isfile(m2_path):
        return
    raise ValueError("The provided M2 file path is invalid.")
def run() -> None:
    """
    Command line entry point: parse, normalize and validate the arguments,
    then run the conversion.

    Failures are reported on stderr and turned into a non-zero exit status so
    that a calling shell or notebook can detect them. When `--debug` is given
    the original exception is re-raised instead, exposing the full traceback.

    Raises:
      SystemExit: With status 1 on error, or 130 when interrupted by the user.
    """
    import sys  # pylint: disable=import-outside-toplevel

    # Stays None when argument parsing itself fails, so that the error handler
    # below never reads an unbound name.
    args = None
    try:
        args = parse_args()
        normalize_args(args)
        validate_args(args)
        main(args)
    except KeyboardInterrupt:
        print("\nAborted!", file=sys.stderr)
        sys.exit(130)
    except Exception as err:  # pylint: disable=broad-except
        # `args` may still be None here, hence no direct attribute access
        if getattr(args, "debug", False):
            raise
        print("Error: %s" % err, file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment