Last active
October 19, 2022 13:51
-
-
Save xflr6/452ce3b7be31e46e1cad32146af33d15 to your computer and use it in GitHub Desktop.
Make a skeleton ELAN document from (text, translation) pairs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "4e7fe832-c696-48e9-8262-5f147531056e", | |
"metadata": {}, | |
"source": [ | |
"# Make a skeleton ELAN document from (text, translation) pairs\n", | |
"https://nbviewer.org/gist/xflr6/452ce3b7be31e46e1cad32146af33d15/MakeElanSentences.ipynb" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "bb9e386f-f77f-4247-8320-1145d14148e6", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from __future__ import annotations\n", | |
"\n", | |
"import dataclasses\n", | |
"import datetime\n", | |
"import enum\n", | |
"import itertools\n", | |
"import os\n", | |
"import pathlib\n", | |
"from collections.abc import Sequence\n", | |
"from typing import Optional\n", | |
"\n", | |
"import lxml.builder\n", | |
"from lxml import etree\n", | |
"\n", | |
"XSI = 'http://www.w3.org/2001/XMLSchema-instance'\n", | |
"XSI_CLARK = '{%s}' % XSI\n", | |
"\n", | |
"ELAN_SCHEMA = 'http://www.mpi.nl/tools/elan/EAFv3.0.xsd'\n", | |
"ELAN_URN = 'urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506'\n", | |
"ELAN_VERSION = '3.0'\n", | |
"ELAN_SUFFIX = '.eaf'\n", | |
"\n", | |
"ENCODING = 'utf-8'\n", | |
"\n", | |
"E = lxml.builder.ElementMaker(nsmap={'xsi': XSI})" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "ac8ead30-36e2-4f61-8016-c74fe3b1e149", | |
"metadata": {}, | |
"source": [ | |
"## Build document root and header" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "eb92d812-9356-42d3-a9ac-72cc396adf1f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def make_document_skeleton(*, author: str = '', media_file: str = '',\n", | |
" time_units: str = 'milliseconds',\n", | |
" format: str = ELAN_VERSION, version: str = ELAN_VERSION,\n", | |
" date: Optional[datetime.datetime] = None) -> etree._ElementTree:\n", | |
" \"\"\"Create a new annotation document from the given arguments.\"\"\"\n", | |
" kwargs = {f'{XSI_CLARK}noNamespaceSchemaLocation': ELAN_SCHEMA}\n", | |
" if date is None:\n", | |
" date = datetime.datetime.now(datetime.timezone.utc)\n", | |
" root = E.annotation_document(author=author,\n", | |
" date=date.replace(microsecond=0).astimezone().isoformat(),\n", | |
" format=format,\n", | |
" version=format,\n", | |
" **kwargs)\n", | |
" header = E.header(E.property(ELAN_URN, name='URN'), \n", | |
" media_file=media_file, time_units=time_units)\n", | |
" root.append(header)\n", | |
" return etree.ElementTree(root)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "a8c0b933-dfb9-4175-bbc9-8b77f839dd48", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<lxml.etree._ElementTree at 0x23266633c80>" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"make_document_skeleton()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c4e481b5-6758-48ac-b06b-5e6009478532", | |
"metadata": {}, | |
"source": [ | |
"## Pretty-print document" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "fccffba6-ba4b-41f9-82c5-e705a21de423", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pprint(doc: etree._ElementTree, *, file=None,\n", | |
" canonical: bool = False,\n", | |
" pretty_print: bool = True,\n", | |
" end: str = '\\n',\n", | |
" **kwargs) -> None:\n", | |
" \"\"\"Pretty-print the XML serialization of the given document.\"\"\"\n", | |
" if canonical:\n", | |
" doc = canonicalized(doc)\n", | |
" text = etree.tostring(doc, encoding='unicode', pretty_print=pretty_print, **kwargs)\n", | |
" print(text, file=file, end=end)\n", | |
"\n", | |
" \n", | |
"def canonicalized(doc: etree._ElementTree, *,\n", | |
" indent: str = ' ' * 4) -> etree._ElementTree:\n", | |
" \"\"\"Return a copy of the annotation document in ELAN formatting.\"\"\"\n", | |
" doc = etree.ElementTree(etree.fromstring(etree.tostring(doc)))\n", | |
" etree.indent(doc, space=indent)\n", | |
" for elem in doc.iter('*'):\n", | |
" elem.tag = elem.tag.upper()\n", | |
" for k, v in sorted(elem.attrib.items()):\n", | |
" del elem.attrib[k]\n", | |
" elem.attrib[k.upper() if not k.startswith(XSI_CLARK) else k] = v\n", | |
" return doc" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "229ff54e-5a90-4eb9-913a-c379f9daa7ca", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<annotation_document xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" author=\"\" date=\"2022-09-18T22:40:54+02:00\" format=\"3.0\" version=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n", | |
" <header media_file=\"\" time_units=\"milliseconds\">\n", | |
" <property name=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property>\n", | |
" </header>\n", | |
"</annotation_document>\n", | |
"<ANNOTATION_DOCUMENT xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" AUTHOR=\"\" DATE=\"2022-09-18T22:40:54+02:00\" FORMAT=\"3.0\" VERSION=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n", | |
" <HEADER MEDIA_FILE=\"\" TIME_UNITS=\"milliseconds\">\n", | |
" <PROPERTY NAME=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</PROPERTY>\n", | |
" </HEADER>\n", | |
"</ANNOTATION_DOCUMENT>\n" | |
] | |
} | |
], | |
"source": [ | |
"for kwargs in [{}, {'canonical': True}]:\n", | |
" pprint(make_document_skeleton(), **kwargs, end='')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4549b47a-32b7-41a6-80db-32c1bd6d3663", | |
"metadata": {}, | |
"source": [ | |
"## Serialize document" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "bfa790e7-636e-4d28-b2f0-ac0fcc0728d5", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def write(path: os.PathLike | str, doc: etree._ElementTree, *,\n", | |
" canonical: bool = True,\n", | |
" pretty_print: bool = True,\n", | |
" xml_declaration: bool = True,\n", | |
" encoding: str = ENCODING) -> pathlib.Path:\n", | |
" \"\"\"(Over)write the given path with the XML serialization of the given document.\"\"\"\n", | |
" path = pathlib.Path(path)\n", | |
" if canonical:\n", | |
" doc = canonicalized(doc)\n", | |
" doc.write(path, xml_declaration=xml_declaration, pretty_print=pretty_print, encoding=encoding)\n", | |
" return path" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "dd8a0f48-21a3-469d-b300-70a49649c64c", | |
"metadata": {}, | |
"source": [ | |
"## Define linguistic type constraints" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "247802c0-4b1f-4714-acdc-f79a204edfc7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Constraint(enum.Enum):\n", | |
" \"\"\"Possible values for `constraints` in a linguistic type.\"\"\"\n", | |
"\n", | |
" Time_Subdivision = \"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"\n", | |
" Symbolic_Subdivision = 'Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered'\n", | |
" Symbolic_Association = '1-1 association with a parent annotation'\n", | |
" Included_In = \"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"\n", | |
"\n", | |
" def as_xml(self) -> etree._Element:\n", | |
" return E.constraint(stereotype=self.name, description=self.value)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "e7f4a141-d1c2-4a3f-ae8f-2be307cf1b05", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Time_Subdivision\" description=\"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"/>\n", | |
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Symbolic_Subdivision\" description=\"Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered\"/>\n", | |
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Symbolic_Association\" description=\"1-1 association with a parent annotation\"/>\n", | |
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Included_In\" description=\"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"/>\n" | |
] | |
} | |
], | |
"source": [ | |
"for const in Constraint:\n", | |
" pprint(const.as_xml(), end='')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d5465497-6bbb-4782-a90b-0098ba78abb8", | |
"metadata": {}, | |
"source": [ | |
"## Define linguistic types" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "b00294f8-ae38-4a92-8531-b8de0cf3b64e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"@dataclasses.dataclass\n", | |
"class LinguisticType:\n", | |
" \"\"\"Possible targets for `linguistic_type_ref` in a tier.\"\"\"\n", | |
"\n", | |
" id_: str\n", | |
" time_alignable: bool = False\n", | |
" graphic_references: bool = False\n", | |
" constraints: Optional[Constraint] = None\n", | |
"\n", | |
" @classmethod\n", | |
" def make_dict(cls, types) -> dict[str, LinguisticType]:\n", | |
" return {id_: cls(id_=id_, **kwargs) for id_, kwargs in types.items()} \n", | |
"\n", | |
" def as_xml(self) -> etree._Element:\n", | |
" attrs = {'linguistic_type_id': self.id_,\n", | |
" 'time_alignable': 'true' if self.time_alignable else 'false'}\n", | |
" if self.constraints:\n", | |
" attrs['constraints'] = self.constraints.name\n", | |
" attrs['graphic_references'] = 'true' if self.graphic_references else 'false'\n", | |
" return E.linguistic_type(**attrs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "2000a256-f031-44ba-91e4-8fef057a5adc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"LINGUISTIC_TYPES = LinguisticType.make_dict({'default-lt': {'time_alignable': True},\n", | |
" 'translation': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'transcription': {'time_alignable': True},\n", | |
" 'orth': {'time_alignable': True},\n", | |
" 'ref': {'time_alignable': True},\n", | |
" 'tx': {'constraints': Constraint.Included_In, 'time_alignable': True},\n", | |
" 'mb': {'constraints': Constraint.Symbolic_Subdivision},\n", | |
" 'orig': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'ge': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'ps': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'so': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'lxid': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'fte': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'nt': {'constraints': Constraint.Symbolic_Association},\n", | |
" 'imported-sep': {'time_alignable': True}})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "dd557f98-153f-4f19-b23a-17a04f35df60", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"default-lt\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"translation\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"transcription\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"orth\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ref\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"tx\" time_alignable=\"true\" constraints=\"Included_In\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"mb\" time_alignable=\"false\" constraints=\"Symbolic_Subdivision\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"orig\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ge\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ps\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"so\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"lxid\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"fte\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"nt\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"imported-sep\" time_alignable=\"true\" graphic_references=\"false\"/>\n" | |
] | |
} | |
], | |
"source": [ | |
"for lt in LINGUISTIC_TYPES.values():\n", | |
" pprint(lt.as_xml(), end='')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "c0118d81-a94c-4ff1-a17e-d468e05f24a6", | |
"metadata": {}, | |
"source": [ | |
"## Define tiers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "a1ba9d63-0c43-4b1b-89c6-d976fd8be75c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"@dataclasses.dataclass\n", | |
"class Tier:\n", | |
" \"\"\"Container for annotations of one linguistic type. Possible target for `parent_ref` of a tier.\"\"\"\n", | |
"\n", | |
" id_: str\n", | |
" linguistic_type_ref: 'str'\n", | |
" parent_ref: Optional[str] = None\n", | |
" default_locale: str = 'en'\n", | |
"\n", | |
" @classmethod\n", | |
" def make_dict(cls, tiers) -> dict[str, Tier]:\n", | |
" return {id_: cls(id_=id_, **kwargs) for id_, kwargs in tiers.items()}\n", | |
"\n", | |
" @property\n", | |
" def linguistic_type(self):\n", | |
" return LINGUISTIC_TYPES[self.linguistic_type_ref]\n", | |
"\n", | |
" def as_xml(self) -> etree._Element:\n", | |
" attrs = {'tier_id': self.id_,\n", | |
" 'linguistic_type_ref': self.linguistic_type.id_}\n", | |
" if self.parent_ref:\n", | |
" attrs['parent_ref'] = self.parent_ref\n", | |
" attrs['default_locale'] = self.default_locale\n", | |
" return E.tier(**attrs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "4b499ba0-80c8-4e7e-b6ca-4cdd7d6811c1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"TIERS = Tier.make_dict({'ref@A': {'linguistic_type_ref': 'ref'},\n", | |
" 'tx@A': {'linguistic_type_ref': 'tx', 'parent_ref': 'ref@A'},\n", | |
" 'fte@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},\n", | |
" 'mb@A': {'linguistic_type_ref': 'mb', 'parent_ref': 'tx@A'},\n", | |
" 'ge@A': {'linguistic_type_ref': 'ge', 'parent_ref': 'mb@A'},\n", | |
" 'ps@A': {'linguistic_type_ref': 'ps', 'parent_ref': 'mb@A'},\n", | |
" 'lxid@A': {'linguistic_type_ref': 'lxid', 'parent_ref': 'mb@A'},\n", | |
" 'so@A': {'linguistic_type_ref': 'so', 'parent_ref': 'mb@A'},\n", | |
" 'nt@A': {'linguistic_type_ref': 'nt', 'parent_ref': 'ref@A'},\n", | |
" 'orig@A': {'linguistic_type_ref': 'orig', 'parent_ref': 'tx@A'},\n", | |
" 'fta@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'}})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "881ef3fa-49c5-46c9-b2cb-8eb1ef6f23bb", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ref@A\" linguistic_type_ref=\"ref\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"tx@A\" linguistic_type_ref=\"tx\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"fte@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"mb@A\" linguistic_type_ref=\"mb\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ge@A\" linguistic_type_ref=\"ge\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ps@A\" linguistic_type_ref=\"ps\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"lxid@A\" linguistic_type_ref=\"lxid\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"so@A\" linguistic_type_ref=\"so\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"nt@A\" linguistic_type_ref=\"nt\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"orig@A\" linguistic_type_ref=\"orig\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n", | |
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"fta@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n" | |
] | |
} | |
], | |
"source": [ | |
"for tier in TIERS.values():\n", | |
" pprint(tier.as_xml(), end='')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "5ebe6048-e986-4f8c-b2d3-0d67927c148d", | |
"metadata": {}, | |
"source": [ | |
"## Build document" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "6d7fb8c1-80ba-42c9-abb5-54c9004ef965", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"SENTENCE_TIERS = (TIERS['ref@A'].id_, TIERS['tx@A'].id_, TIERS['fte@A'].id_)\n", | |
"\n", | |
"TIME_TICK_MILIS = 10_000" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "6d65f3e7-63c4-4833-8b42-ece59e7690e7", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def make_document(basename: str, sentences, *,\n", | |
" language_code: str = 'en', country_code: str = 'US',\n", | |
" sentence_tiers: Sequence[str] = SENTENCE_TIERS,\n", | |
" time_tick: int = TIME_TICK_MILIS,\n", | |
" **kwargs) -> etree._ElementTree:\n", | |
" \"\"\"Return a new ELAN document from the given (text, translation) pairs.\"\"\"\n", | |
" doc = make_document_skeleton(**kwargs)\n", | |
" root = doc.getroot()\n", | |
"\n", | |
" tiers = {id_: t.as_xml() for id_, t in TIERS.items()}\n", | |
" assert all(t in tiers for t in sentence_tiers)\n", | |
" annotation_tiers = [elem for id_, elem in tiers.items() if id_ in sentence_tiers]\n", | |
" time_order = E.time_order()\n", | |
" for time_slots, annotations in iterannotations(basename, sentences, time_tick=time_tick):\n", | |
" time_order.extend(time_slots)\n", | |
" for tier, elem in zip(annotation_tiers, annotations):\n", | |
" tier.append(elem)\n", | |
" root.append(time_order)\n", | |
" root.extend(tiers.values())\n", | |
"\n", | |
" last_id, = root.xpath('(tier/annotation/*[self::ref_annotation or self::alignable_annotation]/@annotation_id)[last()]')\n", | |
" last_id = int(last_id.removeprefix('a'))\n", | |
" root.find('header').append(E.property(str(last_id), name='lastUsedAnnotationId'))\n", | |
"\n", | |
" root.extend(linguistic_type.as_xml() for linguistic_type in LINGUISTIC_TYPES.values())\n", | |
" root.append(E.locale(language_code=language_code, country_code=country_code))\n", | |
" root.extend(const.as_xml() for const in Constraint)\n", | |
" return doc\n", | |
"\n", | |
"\n", | |
"def iterannotations(basename: str, sentences, *, time_tick: int):\n", | |
" assert set(map(len, sentences)) == {2}\n", | |
"\n", | |
" annotation_ids = (f'a{i}' for i in itertools.count(1))\n", | |
"\n", | |
" def iterchildren(start_end_value):\n", | |
" for start, end, value in start_end_value:\n", | |
" ann = E.alignable_annotation(E.annotation_value(value),\n", | |
" annotation_id=next(annotation_ids),\n", | |
" time_slot_ref1=start.attrib['time_slot_id'],\n", | |
" time_slot_ref2=end.attrib['time_slot_id'])\n", | |
" yield E.annotation(ann)\n", | |
" yield E.annotation(E.ref_annotation(E.annotation_value(fte_value),\n", | |
" annotation_id=next(annotation_ids),\n", | |
" annotation_ref=ann.attrib['annotation_id']))\n", | |
"\n", | |
" time_slot_ids = (f't{i}' for i in itertools.count(1))\n", | |
"\n", | |
" for sentence_index, (tx_value, fte_value) in enumerate(sentences):\n", | |
" start_time, end_time = (time_tick * i for i in range(sentence_index, sentence_index + 2))\n", | |
"\n", | |
" align_values = [f'{basename}.{sentence_index + 1:03d}', tx_value]\n", | |
"\n", | |
" start = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(start_time)) for _ in align_values]\n", | |
" end = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(end_time)) for _ in align_values]\n", | |
"\n", | |
" children = iterchildren(zip(start, end, align_values))\n", | |
" yield start + end, list(itertools.starmap(E.annotation,children))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "9905cec6-3e6f-49dc-925e-566510ecfcbb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"TEST_SENTENCES = [('My Hovercraft is full of eels.', 'Can I please buy some matches?'),\n", | |
" ('Please fondle my buttocks.', 'Can you direct me to the station?')]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "4774c88c-05fc-4d35-8a30-5f664e54ca5f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<annotation_document xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" author=\"\" date=\"2022-09-18T22:40:54+02:00\" format=\"3.0\" version=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n", | |
" <header media_file=\"\" time_units=\"milliseconds\">\n", | |
" <property name=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property>\n", | |
" <property name=\"lastUsedAnnotationId\">6</property>\n", | |
" </header>\n", | |
" <time_order>\n", | |
" <time_slot time_slot_id=\"t1\" time_value=\"0\"/>\n", | |
" <time_slot time_slot_id=\"t2\" time_value=\"0\"/>\n", | |
" <time_slot time_slot_id=\"t3\" time_value=\"10000\"/>\n", | |
" <time_slot time_slot_id=\"t4\" time_value=\"10000\"/>\n", | |
" <time_slot time_slot_id=\"t5\" time_value=\"10000\"/>\n", | |
" <time_slot time_slot_id=\"t6\" time_value=\"10000\"/>\n", | |
" <time_slot time_slot_id=\"t7\" time_value=\"20000\"/>\n", | |
" <time_slot time_slot_id=\"t8\" time_value=\"20000\"/>\n", | |
" </time_order>\n", | |
" <tier tier_id=\"ref@A\" linguistic_type_ref=\"ref\" default_locale=\"en\">\n", | |
" <annotation>\n", | |
" <alignable_annotation annotation_id=\"a1\" time_slot_ref1=\"t1\" time_slot_ref2=\"t3\">\n", | |
" <annotation_value>test_sentences.001</annotation_value>\n", | |
" </alignable_annotation>\n", | |
" </annotation>\n", | |
" <annotation>\n", | |
" <alignable_annotation annotation_id=\"a4\" time_slot_ref1=\"t5\" time_slot_ref2=\"t7\">\n", | |
" <annotation_value>test_sentences.002</annotation_value>\n", | |
" </alignable_annotation>\n", | |
" </annotation>\n", | |
" </tier>\n", | |
" <tier tier_id=\"tx@A\" linguistic_type_ref=\"tx\" parent_ref=\"ref@A\" default_locale=\"en\">\n", | |
" <annotation>\n", | |
" <alignable_annotation annotation_id=\"a2\" time_slot_ref1=\"t2\" time_slot_ref2=\"t4\">\n", | |
" <annotation_value>My Hovercraft is full of eels.</annotation_value>\n", | |
" </alignable_annotation>\n", | |
" </annotation>\n", | |
" <annotation>\n", | |
" <alignable_annotation annotation_id=\"a5\" time_slot_ref1=\"t6\" time_slot_ref2=\"t8\">\n", | |
" <annotation_value>Please fondle my buttocks.</annotation_value>\n", | |
" </alignable_annotation>\n", | |
" </annotation>\n", | |
" </tier>\n", | |
" <tier tier_id=\"fte@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\">\n", | |
" <annotation>\n", | |
" <ref_annotation annotation_id=\"a3\" annotation_ref=\"a2\">\n", | |
" <annotation_value>Can I please buy some matches?</annotation_value>\n", | |
" </ref_annotation>\n", | |
" </annotation>\n", | |
" <annotation>\n", | |
" <ref_annotation annotation_id=\"a6\" annotation_ref=\"a5\">\n", | |
" <annotation_value>Can you direct me to the station?</annotation_value>\n", | |
" </ref_annotation>\n", | |
" </annotation>\n", | |
" </tier>\n", | |
" <tier tier_id=\"mb@A\" linguistic_type_ref=\"mb\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"ge@A\" linguistic_type_ref=\"ge\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"ps@A\" linguistic_type_ref=\"ps\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"lxid@A\" linguistic_type_ref=\"lxid\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"so@A\" linguistic_type_ref=\"so\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"nt@A\" linguistic_type_ref=\"nt\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"orig@A\" linguistic_type_ref=\"orig\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n", | |
" <tier tier_id=\"fta@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n", | |
" <linguistic_type linguistic_type_id=\"default-lt\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"translation\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"transcription\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"orth\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"ref\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"tx\" time_alignable=\"true\" constraints=\"Included_In\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"mb\" time_alignable=\"false\" constraints=\"Symbolic_Subdivision\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"orig\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"ge\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"ps\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"so\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"lxid\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"fte\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"nt\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n", | |
" <linguistic_type linguistic_type_id=\"imported-sep\" time_alignable=\"true\" graphic_references=\"false\"/>\n", | |
" <locale language_code=\"en\" country_code=\"US\"/>\n", | |
" <constraint stereotype=\"Time_Subdivision\" description=\"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"/>\n", | |
" <constraint stereotype=\"Symbolic_Subdivision\" description=\"Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered\"/>\n", | |
" <constraint stereotype=\"Symbolic_Association\" description=\"1-1 association with a parent annotation\"/>\n", | |
" <constraint stereotype=\"Included_In\" description=\"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"/>\n", | |
"</annotation_document>\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"pprint(make_document('test_sentences', TEST_SENTENCES))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "160ef689-0a40-4a3c-9307-a054a1e4cdae", | |
"metadata": {}, | |
"source": [ | |
"## Write `.eaf` file" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "e3419f0c-57dd-4790-9140-60cb5ab52f16", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"test_sentences.eaf\n" | |
] | |
} | |
], | |
"source": [ | |
"TEST_TARGET = pathlib.Path('test_sentences').with_suffix(ELAN_SUFFIX)\n", | |
"\n", | |
"print(write(TEST_TARGET, make_document(TEST_TARGET.stem, TEST_SENTENCES)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "dee5530f-3ea2-4588-a7d6-33cdbb047f17", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"5_954 bytes\n" | |
] | |
} | |
], | |
"source": [ | |
"print(f'{TEST_TARGET.stat().st_size:_d}', 'bytes')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.7" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment