Last active
September 20, 2024 06:38
-
-
Save PonteIneptique/0330be0c1e4fdbc921487dc168041649 to your computer and use it in GitHub Desktop.
Simple LASLA compatible GT retroconverter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Requirements: | |
regex | |
colorama | |
pycases | |
click | |
""" | |
import os | |
import itertools | |
from typing import IO, List, Tuple, Dict | |
import click | |
import regex | |
import cases | |
def roman_number(inp: str) -> int:
    """Convert a Roman numeral to its integer value.

    Source: https://stackoverflow.com/questions/19308177/converting-roman-numerals-to-integers-in-python
    Author: https://stackoverflow.com/users/1201737/r366y

    Example: ``roman_number("XXIV")`` returns ``24``. Every conversion is also
    echoed as a debug line through click.

    :param inp: Roman numeral, in any letter case
    :return: Integer value of the numeral (``0`` for an empty string)
    :raises KeyError: If ``inp`` contains a character that is not a Roman digit
    """
    roman_numerals = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
    numeral = inp.upper()
    result = 0
    for i, c in enumerate(numeral):
        value = roman_numerals[c]
        # A digit is subtracted only when a strictly larger digit follows it
        # (the "I" of "IV"); the last digit is always added.
        if i + 1 == len(numeral) or value >= roman_numerals[numeral[i + 1]]:
            result += value
        else:
            result -= value
    # `fg` selects the text colour; `color` is only click's boolean
    # "force ANSI output" switch, so passing a colour name to it styles nothing.
    click.secho(f"[Debug] Automatic numeral conversion {inp} to {result}", fg="blue")
    return result
# Single-pass character mapping: V->U, v->u, J->I, j->i.
_UVJI_TABLE = str.maketrans("VvJj", "UuIi")


def uvji(val):
    """Return ``val`` with every V/v replaced by U/u and every J/j by I/i."""
    return val.translate(_UVJI_TABLE)
def get_tasks(morph):
    """Parse a ``Key=Value|Key=Value`` morphology string into a dict.

    :param morph: Pipe-separated ``Key=Value`` pairs, or ``"_"`` when the token
        carries no morphological information
    :return: Mapping of each key to its value; empty for ``"_"`` or ``""``
    :raises ValueError: If a non-empty segment contains no ``=``
    """
    if morph == "_":
        return {}
    tasks = {}
    for feature in morph.split("|"):
        # An empty string or a stray leading/trailing "|" yields empty
        # segments; they carry no information, so skip them instead of failing.
        if not feature:
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        name, value = feature.split("=", 1)
        tasks[name] = value
    return tasks
# Useful constants

# Lemmas that read_input does not record as clitics when a token repeats the
# form of the previous one.
WRONG_CLITICS = {"quis1"}

# Tokens whose form is found in this string are turned into sentence breaks by
# read_input (the test there is a plain substring check, not a regex match).
DOTS_EXCEPT_APOSTROPHES = r".?!\"“”\"«»…\[\]\(\)„“"

# Output columns, in the order they are written; "form" is renamed "token" in
# the header line of each output file.
TASKS = [
    "form",
    "lemma",
    "Deg",
    "Numb",
    "Person",
    "Mood_Tense_Voice",
    "Case",
    "Gend",
    "Dis",
    "pos",
]
def read_input(opened_file: IO) -> Tuple[Dict[str, List], List[str]]:
    """Read a tab-separated annotation file and group its rows by text.

    The first line is taken as the header; every following line is zipped with
    it into a dict. A row whose ``lemma`` is ``[METADATA]`` starts a new text
    named after that row's ``form``; rows seen before any such marker go to the
    ``"full"`` text.

    :param opened_file: Iterable of lines; the columns ``form``, ``lemma``,
        ``morph`` and ``POS`` are read
    :return: ``(texts, clitics)``. ``texts`` maps each text name to its list of
        entries: an annotation dict for a kept token, ``{}`` or ``None`` for a
        break. ``clitics`` lists the lemmas that were merged into the
        preceding token.
    :raises KeyError: If a row lacks one of the columns above, or a
        ``[ROMAN_NUMBER]`` form holds a character that is not a Roman digit
    """
    clitics = []
    header = []
    files = {"full": []}
    cur_text = "full"
    anno = None
    for lineno, line in enumerate(opened_file):
        line = line.strip().split("\t")
        if lineno == 0:
            header = line
            continue

        previous_anno = anno
        anno = dict(zip(header, line))

        if anno["lemma"] == "[METADATA]":
            cur_text = anno["form"]
            files[cur_text] = []
            continue

        tokens = files[cur_text]

        if anno["form"] in DOTS_EXCEPT_APOSTROPHES:
            # Record one break per run of punctuation. A text that *starts*
            # with punctuation has nothing to close yet, so nothing is added
            # (indexing [-1] on the empty list used to raise IndexError).
            if tokens and tokens[-1] != {}:
                tokens.append({})
            continue

        # Default disambiguation index; the morph string may override it.
        anno["Dis"] = "_"
        anno.update(get_tasks(anno["morph"]))
        anno["Mood_Tense_Voice"] = "|".join(
            anno.get(part, "_") for part in ("Mood", "Tense", "Voice")
        ).replace("_|_|_", "_")

        # Numbers above 3 are all collapsed to "3". isdecimal() is used because
        # isnumeric() also accepts characters int() cannot parse (e.g. "½").
        if anno["lemma"].isdecimal() and int(anno["lemma"]) > 3:
            anno["lemma"] = anno["form"] = "3"

        # A trailing digit on the lemma is moved to the "Dis" column. The
        # length is checked first so an empty lemma cannot raise IndexError.
        if len(anno["lemma"]) > 1 and anno["lemma"][-1].isnumeric():
            anno["lemma"], anno["Dis"] = anno["lemma"][:-1], anno["lemma"][-1]

        if anno["lemma"] == "[ROMAN_NUMBER]":
            value = roman_number(anno["form"])
            anno["lemma"] = anno["form"] = "3" if value > 3 else str(value)

        if anno["lemma"] == "[Greek]":
            continue

        if anno["POS"] == "OUT":
            # `fg` sets the colour; `color` is click's boolean force-ANSI flag.
            click.secho(f"[Debug] Token {anno['form']} is annotated [OUT]", fg="blue")

        if anno["POS"] == "PUNC":
            # Punctuation is never kept as a token; these forms add a break.
            if anno["form"] in "?.;!)(][":
                tokens.append(None)
            continue

        if anno["POS"] == "VERaux":
            anno["POS"] = "VER"

        anno["lemma"] = uvji(anno["lemma"])
        anno["form"] = uvji(anno["form"])

        last = tokens[-1] if tokens else None
        if last and last == previous_anno and last["form"] == anno["form"]:
            # Same form on two consecutive rows: the second row is merged into
            # the first one as a clitic.
            # NOTE(review): the trailing digit has already been stripped from
            # the lemma above, so an entry such as "quis1" in WRONG_CLITICS can
            # never match here - confirm whether the check should use
            # lemma + Dis instead.
            if anno["lemma"] not in WRONG_CLITICS:
                clitics.append(anno["lemma"])
                last["lemma"] = last["lemma"] + "界" + anno["lemma"]
            continue
        elif last and last["form"] == anno["form"][1:-1]:
            # The form equals the previous form wrapped in one extra character
            # on each side: also merged as a clitic.
            clitics.append(anno["lemma"])
            last["lemma"] = last["lemma"] + "界" + anno["lemma"]
            continue

        if "." in anno["form"]:
            click.secho(f"[Debug] Token {anno['form']} is abbreviated", fg="blue")

        # The POS value is copied unchanged into the lower-case output column.
        anno["pos"] = anno["POS"]
        tokens.append(anno)

    return dict(files), clitics
@click.command()
@click.argument("file", type=click.File(mode="r"))
@click.argument("output-dir", type=click.Path(file_okay=False, dir_okay=True))
@click.option("--split", type=bool, default=True)
def command(file: IO, output_dir: str, split: bool):
    """Convert the annotated FILE into TSV files written to OUTPUT_DIR.

    With --split true (the default) one file is written per text found in
    FILE; with --split false all texts are merged into a single "all" file.
    """
    content, clitics = read_input(file)
    # Sorted so the report is stable from one run to the next.
    click.echo(f"Found {len(clitics)} clitics in the source file: {', '.join(sorted(set(clitics)))}")
    n = "\n"
    click.echo(f"Found {len(content)} different texts in the source file: \n - {(n+' - ').join(content)}")
    file.close()

    os.makedirs(output_dir, exist_ok=True)

    if not split:
        # Merge every text into one, with an empty entry between two texts so
        # that they stay separated by a blank line in the output.
        # (The former `c[::2] = [] * len(c)` assigned an empty list to a
        # non-empty extended slice and therefore always raised ValueError.)
        merged = []
        for annots in content.values():
            if not annots:
                continue
            if merged:
                merged.append({})
            merged.extend(annots)
        content = {"all": merged}

    total = 0
    local_total = 0
    for text_id, annots in content.items():
        local_total = 0
        out_path = os.path.join(output_dir, cases.to_kebab(text_id) + ".tsv")
        # Explicit UTF-8: merged lemmas contain the non-ASCII separator "界".
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\t".join(TASKS).replace("form", "token") + "\n")
            for annot in annots:
                # Empty entries ({} or None) mark a break: write a blank line.
                if not annot:
                    f.write("\n")
                    continue
                local_total += 1
                f.write("\t".join([annot.get(h, "_") for h in TASKS]) + "\n")
        click.secho(
            f"Found {local_total:,} tokens in " + out_path,
            fg="green"
        )
        total += local_total

    # Only report a grand total when it differs from the last file's count,
    # and report the grand total itself (the last file's count was printed).
    if total != local_total:
        click.secho(
            f"Found {total:,} tokens in total",
            fg="green"
        )


if __name__ == "__main__":
    command()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment