Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active October 19, 2022 13:51
Show Gist options
  • Save xflr6/452ce3b7be31e46e1cad32146af33d15 to your computer and use it in GitHub Desktop.
Save xflr6/452ce3b7be31e46e1cad32146af33d15 to your computer and use it in GitHub Desktop.
Make a skeleton ELAN document from (text, translation) pairs
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "4e7fe832-c696-48e9-8262-5f147531056e",
"metadata": {},
"source": [
"# Make a skeleton ELAN document from (text, translation) pairs\n",
"https://nbviewer.org/gist/xflr6/452ce3b7be31e46e1cad32146af33d15/MakeElanSentences.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "bb9e386f-f77f-4247-8320-1145d14148e6",
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import dataclasses\n",
"import datetime\n",
"import enum\n",
"import itertools\n",
"import os\n",
"import pathlib\n",
"from typing import Optional, Sequence\n",
"\n",
"import lxml.builder\n",
"from lxml import etree\n",
"\n",
"# XML Schema instance namespace, needed for the schema location attribute on the root.\n",
"XSI = 'http://www.w3.org/2001/XMLSchema-instance'\n",
"# The namespace in '{uri}' form, prepended to the attribute name in make_document_skeleton().\n",
"XSI_CLARK = '{%s}' % XSI\n",
"\n",
"# Value written to xsi:noNamespaceSchemaLocation by make_document_skeleton().\n",
"ELAN_SCHEMA = 'http://www.mpi.nl/tools/elan/EAFv3.0.xsd'\n",
"# Text of the URN property in the document header.\n",
"ELAN_URN = 'urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506'\n",
"# Default for the format and version parameters of make_document_skeleton().\n",
"ELAN_VERSION = '3.0'\n",
"# File extension given to the written document.\n",
"ELAN_SUFFIX = '.eaf'\n",
"\n",
"# Default output encoding of write().\n",
"ENCODING = 'utf-8'\n",
"\n",
"# Element factory; maps the xsi prefix to the XSI namespace on the elements it creates.\n",
"E = lxml.builder.ElementMaker(nsmap={'xsi': XSI})"
]
},
{
"cell_type": "markdown",
"id": "ac8ead30-36e2-4f61-8016-c74fe3b1e149",
"metadata": {},
"source": [
"## Build document root and header"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "eb92d812-9356-42d3-a9ac-72cc396adf1f",
"metadata": {},
"outputs": [],
"source": [
"def make_document_skeleton(*, author: str = '', media_file: str = '',\n",
"                           time_units: str = 'milliseconds',\n",
"                           format: str = ELAN_VERSION, version: str = ELAN_VERSION,\n",
"                           date: Optional[datetime.datetime] = None) -> etree._ElementTree:\n",
"    \"\"\"Create a new annotation document from the given arguments.\n",
"\n",
"    Args:\n",
"        author: Value of the root element's `author` attribute.\n",
"        media_file: Value of the header's `media_file` attribute.\n",
"        time_units: Value of the header's `time_units` attribute.\n",
"        format: Value of the root element's `format` attribute.\n",
"        version: Value of the root element's `version` attribute.\n",
"        date: Creation timestamp; defaults to the current time. It is written\n",
"            without microseconds, converted to the local timezone.\n",
"\n",
"    Returns:\n",
"        An element tree whose root holds only the header with its URN property.\n",
"    \"\"\"\n",
"    if date is None:\n",
"        date = datetime.datetime.now(datetime.timezone.utc)\n",
"    # The namespaced attribute name is not a valid Python identifier,\n",
"    # so it can only be passed through ** unpacking.\n",
"    schema_attrs = {f'{XSI_CLARK}noNamespaceSchemaLocation': ELAN_SCHEMA}\n",
"    root = E.annotation_document(author=author,\n",
"                                 date=date.replace(microsecond=0).astimezone().isoformat(),\n",
"                                 format=format,\n",
"                                 # Honor the caller's version instead of repeating format.\n",
"                                 version=version,\n",
"                                 **schema_attrs)\n",
"    header = E.header(E.property(ELAN_URN, name='URN'),\n",
"                      media_file=media_file, time_units=time_units)\n",
"    root.append(header)\n",
"    return etree.ElementTree(root)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "a8c0b933-dfb9-4175-bbc9-8b77f839dd48",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<lxml.etree._ElementTree at 0x23266633c80>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Smoke test: the cell result is the repr of the returned element tree.\n",
"make_document_skeleton()"
]
},
{
"cell_type": "markdown",
"id": "c4e481b5-6758-48ac-b06b-5e6009478532",
"metadata": {},
"source": [
"## Pretty-print document"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "fccffba6-ba4b-41f9-82c5-e705a21de423",
"metadata": {},
"outputs": [],
"source": [
"def pprint(doc: etree._ElementTree, *, file=None,\n",
"           canonical: bool = False,\n",
"           pretty_print: bool = True,\n",
"           end: str = '\\n',\n",
"           **kwargs) -> None:\n",
"    \"\"\"Print the XML serialization of the given document.\n",
"\n",
"    With `canonical` set, a canonicalized copy is printed instead. `file` and\n",
"    `end` are handed to `print()`; any other keyword arguments go to\n",
"    `etree.tostring()`.\n",
"    \"\"\"\n",
"    tree = canonicalized(doc) if canonical else doc\n",
"    serialized = etree.tostring(tree, encoding='unicode',\n",
"                                pretty_print=pretty_print, **kwargs)\n",
"    print(serialized, end=end, file=file)\n",
"\n",
" \n",
"def canonicalized(doc: etree._ElementTree, *,\n",
"                  indent: str = ' ' * 4) -> etree._ElementTree:\n",
"    \"\"\"Return a re-parsed copy with upper-cased tags and sorted, upper-cased attributes.\n",
"\n",
"    Attributes in the xsi namespace keep their original name. The input\n",
"    document is left unmodified.\n",
"    \"\"\"\n",
"    # Serializing and parsing again yields an independent copy to mutate.\n",
"    copy = etree.ElementTree(etree.fromstring(etree.tostring(doc)))\n",
"    etree.indent(copy, space=indent)\n",
"    for element in copy.iter('*'):\n",
"        element.tag = element.tag.upper()\n",
"        attributes = element.attrib\n",
"        # Removing and re-adding each attribute in sorted order also\n",
"        # establishes that order in the serialization.\n",
"        for name, value in sorted(attributes.items()):\n",
"            del attributes[name]\n",
"            new_name = name if name.startswith(XSI_CLARK) else name.upper()\n",
"            attributes[new_name] = value\n",
"    return copy"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "229ff54e-5a90-4eb9-913a-c379f9daa7ca",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<annotation_document xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" author=\"\" date=\"2022-09-18T22:40:54+02:00\" format=\"3.0\" version=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n",
" <header media_file=\"\" time_units=\"milliseconds\">\n",
" <property name=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property>\n",
" </header>\n",
"</annotation_document>\n",
"<ANNOTATION_DOCUMENT xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" AUTHOR=\"\" DATE=\"2022-09-18T22:40:54+02:00\" FORMAT=\"3.0\" VERSION=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n",
" <HEADER MEDIA_FILE=\"\" TIME_UNITS=\"milliseconds\">\n",
" <PROPERTY NAME=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</PROPERTY>\n",
" </HEADER>\n",
"</ANNOTATION_DOCUMENT>\n"
]
}
],
"source": [
"# Render the skeleton twice: as built, then in canonical form.\n",
"for canonical in (False, True):\n",
"    pprint(make_document_skeleton(), canonical=canonical, end='')"
]
},
{
"cell_type": "markdown",
"id": "4549b47a-32b7-41a6-80db-32c1bd6d3663",
"metadata": {},
"source": [
"## Serialize document"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bfa790e7-636e-4d28-b2f0-ac0fcc0728d5",
"metadata": {},
"outputs": [],
"source": [
"def write(path: os.PathLike | str, doc: etree._ElementTree, *,\n",
"          canonical: bool = True,\n",
"          pretty_print: bool = True,\n",
"          xml_declaration: bool = True,\n",
"          encoding: str = ENCODING) -> pathlib.Path:\n",
"    \"\"\"Serialize the given document to the given path, replacing any existing file.\n",
"\n",
"    With `canonical` set (the default), a canonicalized copy is written.\n",
"    Returns the target as a `pathlib.Path`.\n",
"    \"\"\"\n",
"    target = pathlib.Path(path)\n",
"    tree = canonicalized(doc) if canonical else doc\n",
"    tree.write(target, encoding=encoding,\n",
"               xml_declaration=xml_declaration, pretty_print=pretty_print)\n",
"    return target"
]
},
{
"cell_type": "markdown",
"id": "dd8a0f48-21a3-469d-b300-70a49649c64c",
"metadata": {},
"source": [
"## Define linguistic type constraints"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "247802c0-4b1f-4714-acdc-f79a204edfc7",
"metadata": {},
"outputs": [],
"source": [
"class Constraint(enum.Enum):\n",
"    \"\"\"Possible values for `constraints` in a linguistic type.\n",
"\n",
"    The member name is written as the `stereotype` attribute and the member\n",
"    value as the `description` attribute of a `constraint` element.\n",
"    \"\"\"\n",
"\n",
"    # NOTE(review): the descriptions (including the spelling 'refering') presumably\n",
"    # mirror the strings ELAN itself writes -- confirm before editing them.\n",
"    Time_Subdivision = \"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"\n",
"    Symbolic_Subdivision = 'Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered'\n",
"    Symbolic_Association = '1-1 association with a parent annotation'\n",
"    Included_In = \"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"\n",
"\n",
"    def as_xml(self) -> etree._Element:\n",
"        \"\"\"Return the `constraint` element describing this member.\"\"\"\n",
"        return E.constraint(stereotype=self.name, description=self.value)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e7f4a141-d1c2-4a3f-ae8f-2be307cf1b05",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Time_Subdivision\" description=\"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"/>\n",
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Symbolic_Subdivision\" description=\"Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered\"/>\n",
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Symbolic_Association\" description=\"1-1 association with a parent annotation\"/>\n",
"<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Included_In\" description=\"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"/>\n"
]
}
],
"source": [
"# Show the element generated for every constraint stereotype.\n",
"for constraint in Constraint:\n",
"    pprint(constraint.as_xml(), end='')"
]
},
{
"cell_type": "markdown",
"id": "d5465497-6bbb-4782-a90b-0098ba78abb8",
"metadata": {},
"source": [
"## Define linguistic types"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "b00294f8-ae38-4a92-8531-b8de0cf3b64e",
"metadata": {},
"outputs": [],
"source": [
"@dataclasses.dataclass\n",
"class LinguisticType:\n",
"    \"\"\"Definition of a linguistic type, the target of a tier's `linguistic_type_ref`.\"\"\"\n",
"\n",
"    id_: str\n",
"    time_alignable: bool = False\n",
"    graphic_references: bool = False\n",
"    constraints: Optional[Constraint] = None\n",
"\n",
"    @classmethod\n",
"    def make_dict(cls, types) -> dict[str, LinguisticType]:\n",
"        \"\"\"Build an id-keyed registry from a mapping of id to field keyword arguments.\"\"\"\n",
"        registry = {}\n",
"        for type_id, fields in types.items():\n",
"            registry[type_id] = cls(id_=type_id, **fields)\n",
"        return registry\n",
"\n",
"    def as_xml(self) -> etree._Element:\n",
"        \"\"\"Return the `linguistic_type` element for this type.\"\"\"\n",
"        def xml_bool(value) -> str:\n",
"            return 'true' if value else 'false'\n",
"\n",
"        attrs = {'linguistic_type_id': self.id_}\n",
"        attrs['time_alignable'] = xml_bool(self.time_alignable)\n",
"        # Only constrained types carry the attribute; its value is the stereotype name.\n",
"        if self.constraints:\n",
"            attrs['constraints'] = self.constraints.name\n",
"        attrs['graphic_references'] = xml_bool(self.graphic_references)\n",
"        return E.linguistic_type(**attrs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2000a256-f031-44ba-91e4-8fef057a5adc",
"metadata": {},
"outputs": [],
"source": [
"# Registry of linguistic types keyed by id. Tier.linguistic_type looks a tier's type up\n",
"# here, and make_document() writes every entry into the document.\n",
"LINGUISTIC_TYPES = LinguisticType.make_dict({'default-lt': {'time_alignable': True},\n",
"                                             'translation': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'transcription': {'time_alignable': True},\n",
"                                             'orth': {'time_alignable': True},\n",
"                                             'ref': {'time_alignable': True},\n",
"                                             'tx': {'constraints': Constraint.Included_In, 'time_alignable': True},\n",
"                                             'mb': {'constraints': Constraint.Symbolic_Subdivision},\n",
"                                             'orig': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'ge': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'ps': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'so': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'lxid': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'fte': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'nt': {'constraints': Constraint.Symbolic_Association},\n",
"                                             'imported-sep': {'time_alignable': True}})"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "dd557f98-153f-4f19-b23a-17a04f35df60",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"default-lt\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"translation\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"transcription\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"orth\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ref\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"tx\" time_alignable=\"true\" constraints=\"Included_In\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"mb\" time_alignable=\"false\" constraints=\"Symbolic_Subdivision\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"orig\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ge\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ps\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"so\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"lxid\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"fte\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"nt\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
"<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"imported-sep\" time_alignable=\"true\" graphic_references=\"false\"/>\n"
]
}
],
"source": [
"# Show the element generated for every registered linguistic type.\n",
"for linguistic_type in LINGUISTIC_TYPES.values():\n",
"    pprint(linguistic_type.as_xml(), end='')"
]
},
{
"cell_type": "markdown",
"id": "c0118d81-a94c-4ff1-a17e-d468e05f24a6",
"metadata": {},
"source": [
"## Define tiers"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a1ba9d63-0c43-4b1b-89c6-d976fd8be75c",
"metadata": {},
"outputs": [],
"source": [
"@dataclasses.dataclass\n",
"class Tier:\n",
"    \"\"\"Container for annotations of one linguistic type. Possible target for `parent_ref` of a tier.\"\"\"\n",
"\n",
"    # Written as the `tier_id` attribute.\n",
"    id_: str\n",
"    # Key into LINGUISTIC_TYPES (plain `str` annotation, like the sibling fields).\n",
"    linguistic_type_ref: str\n",
"    # Id of the parent tier; the attribute is omitted when unset.\n",
"    parent_ref: Optional[str] = None\n",
"    default_locale: str = 'en'\n",
"\n",
"    @classmethod\n",
"    def make_dict(cls, tiers) -> dict[str, Tier]:\n",
"        \"\"\"Build an id-keyed registry from a mapping of id to field keyword arguments.\"\"\"\n",
"        return {id_: cls(id_=id_, **kwargs) for id_, kwargs in tiers.items()}\n",
"\n",
"    @property\n",
"    def linguistic_type(self) -> LinguisticType:\n",
"        \"\"\"The referenced linguistic type; raises KeyError if it is not registered.\"\"\"\n",
"        return LINGUISTIC_TYPES[self.linguistic_type_ref]\n",
"\n",
"    def as_xml(self) -> etree._Element:\n",
"        \"\"\"Return the (still empty) `tier` element for this tier.\"\"\"\n",
"        # Going through the property checks that the referenced type exists.\n",
"        attrs = {'tier_id': self.id_,\n",
"                 'linguistic_type_ref': self.linguistic_type.id_}\n",
"        if self.parent_ref:\n",
"            attrs['parent_ref'] = self.parent_ref\n",
"        attrs['default_locale'] = self.default_locale\n",
"        return E.tier(**attrs)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "4b499ba0-80c8-4e7e-b6ca-4cdd7d6811c1",
"metadata": {},
"outputs": [],
"source": [
"# Registry of tiers keyed by id; make_document() writes all of them, in this order.\n",
"TIERS = Tier.make_dict({'ref@A': {'linguistic_type_ref': 'ref'},\n",
"                        'tx@A': {'linguistic_type_ref': 'tx', 'parent_ref': 'ref@A'},\n",
"                        'fte@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},\n",
"                        'mb@A': {'linguistic_type_ref': 'mb', 'parent_ref': 'tx@A'},\n",
"                        'ge@A': {'linguistic_type_ref': 'ge', 'parent_ref': 'mb@A'},\n",
"                        'ps@A': {'linguistic_type_ref': 'ps', 'parent_ref': 'mb@A'},\n",
"                        'lxid@A': {'linguistic_type_ref': 'lxid', 'parent_ref': 'mb@A'},\n",
"                        'so@A': {'linguistic_type_ref': 'so', 'parent_ref': 'mb@A'},\n",
"                        'nt@A': {'linguistic_type_ref': 'nt', 'parent_ref': 'ref@A'},\n",
"                        'orig@A': {'linguistic_type_ref': 'orig', 'parent_ref': 'tx@A'},\n",
"                        'fta@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'}})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "881ef3fa-49c5-46c9-b2cb-8eb1ef6f23bb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ref@A\" linguistic_type_ref=\"ref\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"tx@A\" linguistic_type_ref=\"tx\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"fte@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"mb@A\" linguistic_type_ref=\"mb\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ge@A\" linguistic_type_ref=\"ge\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ps@A\" linguistic_type_ref=\"ps\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"lxid@A\" linguistic_type_ref=\"lxid\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"so@A\" linguistic_type_ref=\"so\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"nt@A\" linguistic_type_ref=\"nt\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"orig@A\" linguistic_type_ref=\"orig\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
"<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"fta@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n"
]
}
],
"source": [
"# Show the element generated for every registered tier.\n",
"for registered_tier in TIERS.values():\n",
"    pprint(registered_tier.as_xml(), end='')"
]
},
{
"cell_type": "markdown",
"id": "5ebe6048-e986-4f8c-b2d3-0d67927c148d",
"metadata": {},
"source": [
"## Build document"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6d7fb8c1-80ba-42c9-abb5-54c9004ef965",
"metadata": {},
"outputs": [],
"source": [
"# Default tiers that receive the generated annotations; indexing TIERS makes a\n",
"# misspelled tier id fail right here with a KeyError.\n",
"SENTENCE_TIERS = (TIERS['ref@A'].id_, TIERS['tx@A'].id_, TIERS['fte@A'].id_)\n",
"\n",
"# Default time span given to each sentence; sentence i runs from i * tick to (i + 1) * tick.\n",
"TIME_TICK_MILIS = 10_000"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "6d65f3e7-63c4-4833-8b42-ece59e7690e7",
"metadata": {},
"outputs": [],
"source": [
"def make_document(basename: str, sentences, *,\n",
"                  language_code: str = 'en', country_code: str = 'US',\n",
"                  sentence_tiers: Sequence[str] = SENTENCE_TIERS,\n",
"                  time_tick: int = TIME_TICK_MILIS,\n",
"                  **kwargs) -> etree._ElementTree:\n",
"    \"\"\"Return a new ELAN document from the given (text, translation) pairs.\n",
"\n",
"    Args:\n",
"        basename: Prefix of the reference annotation values (`{basename}.001`, ...).\n",
"        sentences: Iterable of (text, translation) pairs; may be empty or a\n",
"            one-shot iterator.\n",
"        language_code: `language_code` attribute of the `locale` element.\n",
"        country_code: `country_code` attribute of the `locale` element.\n",
"        sentence_tiers: Ids of the tiers that receive the generated annotations;\n",
"            each must be a key of `TIERS`.\n",
"        time_tick: Time span given to each sentence.\n",
"        **kwargs: Passed on to `make_document_skeleton()`.\n",
"    \"\"\"\n",
"    doc = make_document_skeleton(**kwargs)\n",
"    root = doc.getroot()\n",
"\n",
"    tiers = {id_: t.as_xml() for id_, t in TIERS.items()}\n",
"    assert all(t in tiers for t in sentence_tiers)\n",
"    # Target tiers are taken in TIERS definition order and paired positionally\n",
"    # with the (reference, text, translation) annotations of each sentence.\n",
"    annotation_tiers = [elem for id_, elem in tiers.items() if id_ in sentence_tiers]\n",
"    time_order = E.time_order()\n",
"    for time_slots, annotations in iterannotations(basename, sentences, time_tick=time_tick):\n",
"        time_order.extend(time_slots)\n",
"        for tier, elem in zip(annotation_tiers, annotations):\n",
"            tier.append(elem)\n",
"    root.append(time_order)\n",
"    root.extend(tiers.values())\n",
"\n",
"    # Highest numeric annotation id in use, or 0 when the document holds no\n",
"    # annotations (unpacking a single xpath result would raise ValueError then).\n",
"    annotation_ids = root.xpath('tier/annotation/*[self::ref_annotation or self::alignable_annotation]/@annotation_id')\n",
"    last_id = max((int(id_.removeprefix('a')) for id_ in annotation_ids), default=0)\n",
"    root.find('header').append(E.property(str(last_id), name='lastUsedAnnotationId'))\n",
"\n",
"    root.extend(linguistic_type.as_xml() for linguistic_type in LINGUISTIC_TYPES.values())\n",
"    root.append(E.locale(language_code=language_code, country_code=country_code))\n",
"    root.extend(const.as_xml() for const in Constraint)\n",
"    return doc\n",
"\n",
"\n",
"def iterannotations(basename: str, sentences, *, time_tick: int):\n",
"    \"\"\"Yield `(time_slots, annotations)` for each (text, translation) pair.\n",
"\n",
"    `time_slots` holds four `time_slot` elements (two start slots followed by\n",
"    two end slots). `annotations` holds three `annotation` elements: the\n",
"    reference annotation, the text annotation, and the translation, which\n",
"    refers to the text annotation. Annotation and time slot ids are numbered\n",
"    consecutively across all sentences.\n",
"    \"\"\"\n",
"    # Materialize first: validating a one-shot iterable (e.g. zip(...)) in place\n",
"    # would exhaust it before any annotation is generated.\n",
"    sentences = list(sentences)\n",
"    assert all(len(pair) == 2 for pair in sentences)\n",
"\n",
"    annotation_ids = (f'a{i}' for i in itertools.count(1))\n",
"    time_slot_ids = (f't{i}' for i in itertools.count(1))\n",
"\n",
"    for sentence_index, (tx_value, fte_value) in enumerate(sentences):\n",
"        start_time = time_tick * sentence_index\n",
"        end_time = time_tick * (sentence_index + 1)\n",
"\n",
"        align_values = [f'{basename}.{sentence_index + 1:03d}', tx_value]\n",
"\n",
"        # Each aligned annotation gets its own pair of slots, even though the\n",
"        # reference and the text cover the same time span.\n",
"        start = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(start_time)) for _ in align_values]\n",
"        end = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(end_time)) for _ in align_values]\n",
"\n",
"        annotations = []\n",
"        for start_slot, end_slot, value in zip(start, end, align_values):\n",
"            aligned = E.alignable_annotation(E.annotation_value(value),\n",
"                                             annotation_id=next(annotation_ids),\n",
"                                             time_slot_ref1=start_slot.attrib['time_slot_id'],\n",
"                                             time_slot_ref2=end_slot.attrib['time_slot_id'])\n",
"            annotations.append(E.annotation(aligned))\n",
"        # After the loop `aligned` is the text annotation, which the translation refers to.\n",
"        translation = E.ref_annotation(E.annotation_value(fte_value),\n",
"                                       annotation_id=next(annotation_ids),\n",
"                                       annotation_ref=aligned.attrib['annotation_id'])\n",
"        annotations.append(E.annotation(translation))\n",
"        yield start + end, annotations"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "9905cec6-3e6f-49dc-925e-566510ecfcbb",
"metadata": {},
"outputs": [],
"source": [
"# Sample (text, translation) pairs used by the demo cells below.\n",
"TEST_SENTENCES = [('My Hovercraft is full of eels.', 'Can I please buy some matches?'),\n",
"                  ('Please fondle my buttocks.', 'Can you direct me to the station?')]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "4774c88c-05fc-4d35-8a30-5f664e54ca5f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<annotation_document xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" author=\"\" date=\"2022-09-18T22:40:54+02:00\" format=\"3.0\" version=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n",
" <header media_file=\"\" time_units=\"milliseconds\">\n",
" <property name=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property>\n",
" <property name=\"lastUsedAnnotationId\">6</property>\n",
" </header>\n",
" <time_order>\n",
" <time_slot time_slot_id=\"t1\" time_value=\"0\"/>\n",
" <time_slot time_slot_id=\"t2\" time_value=\"0\"/>\n",
" <time_slot time_slot_id=\"t3\" time_value=\"10000\"/>\n",
" <time_slot time_slot_id=\"t4\" time_value=\"10000\"/>\n",
" <time_slot time_slot_id=\"t5\" time_value=\"10000\"/>\n",
" <time_slot time_slot_id=\"t6\" time_value=\"10000\"/>\n",
" <time_slot time_slot_id=\"t7\" time_value=\"20000\"/>\n",
" <time_slot time_slot_id=\"t8\" time_value=\"20000\"/>\n",
" </time_order>\n",
" <tier tier_id=\"ref@A\" linguistic_type_ref=\"ref\" default_locale=\"en\">\n",
" <annotation>\n",
" <alignable_annotation annotation_id=\"a1\" time_slot_ref1=\"t1\" time_slot_ref2=\"t3\">\n",
" <annotation_value>test_sentences.001</annotation_value>\n",
" </alignable_annotation>\n",
" </annotation>\n",
" <annotation>\n",
" <alignable_annotation annotation_id=\"a4\" time_slot_ref1=\"t5\" time_slot_ref2=\"t7\">\n",
" <annotation_value>test_sentences.002</annotation_value>\n",
" </alignable_annotation>\n",
" </annotation>\n",
" </tier>\n",
" <tier tier_id=\"tx@A\" linguistic_type_ref=\"tx\" parent_ref=\"ref@A\" default_locale=\"en\">\n",
" <annotation>\n",
" <alignable_annotation annotation_id=\"a2\" time_slot_ref1=\"t2\" time_slot_ref2=\"t4\">\n",
" <annotation_value>My Hovercraft is full of eels.</annotation_value>\n",
" </alignable_annotation>\n",
" </annotation>\n",
" <annotation>\n",
" <alignable_annotation annotation_id=\"a5\" time_slot_ref1=\"t6\" time_slot_ref2=\"t8\">\n",
" <annotation_value>Please fondle my buttocks.</annotation_value>\n",
" </alignable_annotation>\n",
" </annotation>\n",
" </tier>\n",
" <tier tier_id=\"fte@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\">\n",
" <annotation>\n",
" <ref_annotation annotation_id=\"a3\" annotation_ref=\"a2\">\n",
" <annotation_value>Can I please buy some matches?</annotation_value>\n",
" </ref_annotation>\n",
" </annotation>\n",
" <annotation>\n",
" <ref_annotation annotation_id=\"a6\" annotation_ref=\"a5\">\n",
" <annotation_value>Can you direct me to the station?</annotation_value>\n",
" </ref_annotation>\n",
" </annotation>\n",
" </tier>\n",
" <tier tier_id=\"mb@A\" linguistic_type_ref=\"mb\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"ge@A\" linguistic_type_ref=\"ge\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"ps@A\" linguistic_type_ref=\"ps\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"lxid@A\" linguistic_type_ref=\"lxid\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"so@A\" linguistic_type_ref=\"so\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"nt@A\" linguistic_type_ref=\"nt\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"orig@A\" linguistic_type_ref=\"orig\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
" <tier tier_id=\"fta@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
" <linguistic_type linguistic_type_id=\"default-lt\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"translation\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"transcription\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"orth\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"ref\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"tx\" time_alignable=\"true\" constraints=\"Included_In\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"mb\" time_alignable=\"false\" constraints=\"Symbolic_Subdivision\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"orig\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"ge\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"ps\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"so\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"lxid\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"fte\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"nt\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
" <linguistic_type linguistic_type_id=\"imported-sep\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
" <locale language_code=\"en\" country_code=\"US\"/>\n",
" <constraint stereotype=\"Time_Subdivision\" description=\"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"/>\n",
" <constraint stereotype=\"Symbolic_Subdivision\" description=\"Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered\"/>\n",
" <constraint stereotype=\"Symbolic_Association\" description=\"1-1 association with a parent annotation\"/>\n",
" <constraint stereotype=\"Included_In\" description=\"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"/>\n",
"</annotation_document>\n",
"\n"
]
}
],
"source": [
"# Show the complete document built from the sample sentences (not canonicalized).\n",
"pprint(make_document('test_sentences', TEST_SENTENCES))"
]
},
{
"cell_type": "markdown",
"id": "160ef689-0a40-4a3c-9307-a054a1e4cdae",
"metadata": {},
"source": [
"## Write `.eaf` file"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "e3419f0c-57dd-4790-9140-60cb5ab52f16",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"test_sentences.eaf\n"
]
}
],
"source": [
"# Output file in the current working directory; its stem doubles as the document basename.\n",
"TEST_TARGET = pathlib.Path('test_sentences').with_suffix(ELAN_SUFFIX)\n",
"\n",
"# write() returns the path it wrote to, which is printed as confirmation.\n",
"print(write(TEST_TARGET, make_document(TEST_TARGET.stem, TEST_SENTENCES)))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "dee5530f-3ea2-4588-a7d6-33cdbb047f17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5_954 bytes\n"
]
}
],
"source": [
"# Report the size of the written file, with '_' as the thousands separator.\n",
"print(f'{TEST_TARGET.stat().st_size:_d}', 'bytes')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment