xflr6 · October 19, 2022 13:51
diff --git a/MakeElanSentences.ipynb b/MakeElanSentences.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4e7fe832-c696-48e9-8262-5f147531056e",
   "metadata": {},
   "source": [
    "# Make a skeleton ELAN document from (text, translation) pairs\n",
    "https://nbviewer.org/gist/xflr6/452ce3b7be31e46e1cad32146af33d15/MakeElanSentences.ipynb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "bb9e386f-f77f-4247-8320-1145d14148e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import annotations\n",
    "\n",
    "import dataclasses\n",
    "import datetime\n",
    "import enum\n",
    "import itertools\n",
    "import pathlib\n",
    "import os\n",
    "from typing import Optional\n",
    "\n",
    "import lxml.builder\n",
    "from lxml import etree\n",
    "\n",
    "XSI = 'http://www.w3.org/2001/XMLSchema-instance'\n",
    "XSI_CLARK = '{%s}' % XSI\n",
    "\n",
    "ELAN_SCHEMA = 'http://www.mpi.nl/tools/elan/EAFv3.0.xsd'\n",
    "ELAN_URN = 'urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506'\n",
    "ELAN_VERSION = '3.0'\n",
    "ELAN_SUFFIX = '.eaf'\n",
    "\n",
    "ENCODING = 'utf-8'\n",
    "\n",
    "E = lxml.builder.ElementMaker(nsmap={'xsi': XSI})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ac8ead30-36e2-4f61-8016-c74fe3b1e149",
   "metadata": {},
   "source": [
    "## Build document root and header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "eb92d812-9356-42d3-a9ac-72cc396adf1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_document_skeleton(*, author: str = '', media_file: str = '',\n",
    "                           time_units: str = 'milliseconds',\n",
    "                           format: str = ELAN_VERSION, version: str = ELAN_VERSION,\n",
    "                           date: Optional[datetime.datetime] = None) -> etree._ElementTree:\n",
    "    \"\"\"Create a new annotation document from the given arguments.\"\"\"\n",
    "    kwargs = {f'{XSI_CLARK}noNamespaceSchemaLocation': ELAN_SCHEMA}\n",
    "    if date is None:\n",
    "        date = datetime.datetime.now(datetime.timezone.utc)\n",
    "    root = E.annotation_document(author=author,\n",
    "                                 date=date.replace(microsecond=0).astimezone().isoformat(),\n",
    "                                 format=format,\n",
    "                                 version=format,\n",
    "                                 **kwargs)\n",
    "    header = E.header(E.property(ELAN_URN, name='URN'), \n",
    "                      media_file=media_file, time_units=time_units)\n",
    "    root.append(header)\n",
    "    return etree.ElementTree(root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a8c0b933-dfb9-4175-bbc9-8b77f839dd48",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<lxml.etree._ElementTree at 0x23266633c80>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "make_document_skeleton()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4e481b5-6758-48ac-b06b-5e6009478532",
   "metadata": {},
   "source": [
    "## Pretty-print document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fccffba6-ba4b-41f9-82c5-e705a21de423",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pprint(doc: etree._ElementTree, *, file=None,\n",
    "           canonical: bool = False,\n",
    "           pretty_print: bool = True,\n",
    "           end: str = '\\n',\n",
    "           **kwargs) -> None:\n",
    "    \"\"\"Pretty-print the XML serialization of the given document.\"\"\"\n",
    "    if canonical:\n",
    "        doc = canonicalized(doc)\n",
    "    text = etree.tostring(doc, encoding='unicode', pretty_print=pretty_print, **kwargs)\n",
    "    print(text, file=file, end=end)\n",
    "\n",
    "    \n",
    "def canonicalized(doc: etree._ElementTree, *,\n",
    "                  indent: str = ' ' * 4) -> etree._ElementTree:\n",
    "    \"\"\"Return a copy of the annotation document in ELAN formatting.\"\"\"\n",
    "    doc = etree.ElementTree(etree.fromstring(etree.tostring(doc)))\n",
    "    etree.indent(doc, space=indent)\n",
    "    for elem in doc.iter('*'):\n",
    "        elem.tag = elem.tag.upper()\n",
    "        for k, v in sorted(elem.attrib.items()):\n",
    "            del elem.attrib[k]\n",
    "            elem.attrib[k.upper() if not k.startswith(XSI_CLARK) else k] = v\n",
    "    return doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "229ff54e-5a90-4eb9-913a-c379f9daa7ca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<annotation_document xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" author=\"\" date=\"2022-09-18T22:40:54+02:00\" format=\"3.0\" version=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n",
      "  <header media_file=\"\" time_units=\"milliseconds\">\n",
      "    <property name=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property>\n",
      "  </header>\n",
      "</annotation_document>\n",
      "<ANNOTATION_DOCUMENT xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" AUTHOR=\"\" DATE=\"2022-09-18T22:40:54+02:00\" FORMAT=\"3.0\" VERSION=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n",
      "    <HEADER MEDIA_FILE=\"\" TIME_UNITS=\"milliseconds\">\n",
      "        <PROPERTY NAME=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</PROPERTY>\n",
      "    </HEADER>\n",
      "</ANNOTATION_DOCUMENT>\n"
     ]
    }
   ],
   "source": [
    "for kwargs in [{}, {'canonical': True}]:\n",
    "    pprint(make_document_skeleton(), **kwargs, end='')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4549b47a-32b7-41a6-80db-32c1bd6d3663",
   "metadata": {},
   "source": [
    "## Serialize document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "bfa790e7-636e-4d28-b2f0-ac0fcc0728d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def write(path: os.PathLike | str, doc: etree._ElementTree, *,\n",
    "          canonical: bool = True,\n",
    "          pretty_print: bool = True,\n",
    "          xml_declaration: bool = True,\n",
    "          encoding: str = ENCODING) -> pathlib.Path:\n",
    "    \"\"\"(Over)write the given path with the XML serialization of the given document.\"\"\"\n",
    "    path = pathlib.Path(path)\n",
    "    if canonical:\n",
    "        doc = canonicalized(doc)\n",
    "    doc.write(path, xml_declaration=xml_declaration, pretty_print=pretty_print, encoding=encoding)\n",
    "    return path"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd8a0f48-21a3-469d-b300-70a49649c64c",
   "metadata": {},
   "source": [
    "## Define linguistic type constraints"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "247802c0-4b1f-4714-acdc-f79a204edfc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Constraint(enum.Enum):\n",
    "    \"\"\"Possible values for `constraints` in a linguistic type.\"\"\"\n",
    "\n",
    "    Time_Subdivision = \"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"\n",
    "    Symbolic_Subdivision = 'Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered'\n",
    "    Symbolic_Association = '1-1 association with a parent annotation'\n",
    "    Included_In = \"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"\n",
    "\n",
    "    def as_xml(self) -> etree._Element:\n",
    "        return E.constraint(stereotype=self.name, description=self.value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e7f4a141-d1c2-4a3f-ae8f-2be307cf1b05",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Time_Subdivision\" description=\"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"/>\n",
      "<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Symbolic_Subdivision\" description=\"Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered\"/>\n",
      "<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Symbolic_Association\" description=\"1-1 association with a parent annotation\"/>\n",
      "<constraint xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" stereotype=\"Included_In\" description=\"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"/>\n"
     ]
    }
   ],
   "source": [
    "for const in Constraint:\n",
    "    pprint(const.as_xml(), end='')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5465497-6bbb-4782-a90b-0098ba78abb8",
   "metadata": {},
   "source": [
    "## Define linguistic types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b00294f8-ae38-4a92-8531-b8de0cf3b64e",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclasses.dataclass\n",
    "class LinguisticType:\n",
    "    \"\"\"Possible targets for `linguistic_type_ref` in a tier.\"\"\"\n",
    "\n",
    "    id_: str\n",
    "    time_alignable: bool = False\n",
    "    graphic_references: bool = False\n",
    "    constraints: Optional[Constraint] = None\n",
    "\n",
    "    @classmethod\n",
    "    def make_dict(cls, types) -> dict[str, LinguisticType]:\n",
    "        return {id_: cls(id_=id_, **kwargs) for id_, kwargs in types.items()}        \n",
    "\n",
    "    def as_xml(self) -> etree._Element:\n",
    "        attrs = {'linguistic_type_id': self.id_,\n",
    "                 'time_alignable': 'true' if self.time_alignable else 'false'}\n",
    "        if self.constraints:\n",
    "            attrs['constraints'] = self.constraints.name\n",
    "        attrs['graphic_references'] = 'true' if self.graphic_references else 'false'\n",
    "        return E.linguistic_type(**attrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "2000a256-f031-44ba-91e4-8fef057a5adc",
   "metadata": {},
   "outputs": [],
   "source": [
    "LINGUISTIC_TYPES = LinguisticType.make_dict({'default-lt': {'time_alignable': True},\n",
    "                                             'translation': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'transcription': {'time_alignable': True},\n",
    "                                             'orth': {'time_alignable': True},\n",
    "                                             'ref': {'time_alignable': True},\n",
    "                                             'tx': {'constraints': Constraint.Included_In, 'time_alignable': True},\n",
    "                                             'mb': {'constraints': Constraint.Symbolic_Subdivision},\n",
    "                                             'orig': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'ge': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'ps': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'so': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'lxid': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'fte': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'nt': {'constraints': Constraint.Symbolic_Association},\n",
    "                                             'imported-sep': {'time_alignable': True}})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "dd557f98-153f-4f19-b23a-17a04f35df60",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"default-lt\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"translation\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"transcription\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"orth\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ref\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"tx\" time_alignable=\"true\" constraints=\"Included_In\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"mb\" time_alignable=\"false\" constraints=\"Symbolic_Subdivision\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"orig\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ge\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"ps\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"so\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"lxid\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"fte\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"nt\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "<linguistic_type xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" linguistic_type_id=\"imported-sep\" time_alignable=\"true\" graphic_references=\"false\"/>\n"
     ]
    }
   ],
   "source": [
    "for lt in LINGUISTIC_TYPES.values():\n",
    "    pprint(lt.as_xml(), end='')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c0118d81-a94c-4ff1-a17e-d468e05f24a6",
   "metadata": {},
   "source": [
    "## Define tiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "a1ba9d63-0c43-4b1b-89c6-d976fd8be75c",
   "metadata": {},
   "outputs": [],
   "source": [
    "@dataclasses.dataclass\n",
    "class Tier:\n",
    "    \"\"\"Container for annotations of one linguistic type. Possible target for `parent_ref` of a tier.\"\"\"\n",
    "\n",
    "    id_: str\n",
    "    linguistic_type_ref: 'str'\n",
    "    parent_ref: Optional[str] = None\n",
    "    default_locale: str = 'en'\n",
    "\n",
    "    @classmethod\n",
    "    def make_dict(cls, tiers) -> dict[str, Tier]:\n",
    "        return {id_: cls(id_=id_, **kwargs) for id_, kwargs in tiers.items()}\n",
    "\n",
    "    @property\n",
    "    def linguistic_type(self):\n",
    "        return LINGUISTIC_TYPES[self.linguistic_type_ref]\n",
    "\n",
    "    def as_xml(self) -> etree._Element:\n",
    "        attrs = {'tier_id': self.id_,\n",
    "                 'linguistic_type_ref': self.linguistic_type.id_}\n",
    "        if self.parent_ref:\n",
    "            attrs['parent_ref'] = self.parent_ref\n",
    "        attrs['default_locale'] = self.default_locale\n",
    "        return E.tier(**attrs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4b499ba0-80c8-4e7e-b6ca-4cdd7d6811c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "TIERS = Tier.make_dict({'ref@A': {'linguistic_type_ref': 'ref'},\n",
    "                        'tx@A': {'linguistic_type_ref': 'tx', 'parent_ref': 'ref@A'},\n",
    "                        'fte@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'},\n",
    "                        'mb@A': {'linguistic_type_ref': 'mb', 'parent_ref': 'tx@A'},\n",
    "                        'ge@A': {'linguistic_type_ref': 'ge', 'parent_ref': 'mb@A'},\n",
    "                        'ps@A': {'linguistic_type_ref': 'ps', 'parent_ref': 'mb@A'},\n",
    "                        'lxid@A': {'linguistic_type_ref': 'lxid', 'parent_ref': 'mb@A'},\n",
    "                        'so@A': {'linguistic_type_ref': 'so', 'parent_ref': 'mb@A'},\n",
    "                        'nt@A': {'linguistic_type_ref': 'nt', 'parent_ref': 'ref@A'},\n",
    "                        'orig@A': {'linguistic_type_ref': 'orig', 'parent_ref': 'tx@A'},\n",
    "                        'fta@A': {'linguistic_type_ref': 'translation', 'parent_ref': 'tx@A'}})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "881ef3fa-49c5-46c9-b2cb-8eb1ef6f23bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ref@A\" linguistic_type_ref=\"ref\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"tx@A\" linguistic_type_ref=\"tx\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"fte@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"mb@A\" linguistic_type_ref=\"mb\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ge@A\" linguistic_type_ref=\"ge\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"ps@A\" linguistic_type_ref=\"ps\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"lxid@A\" linguistic_type_ref=\"lxid\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"so@A\" linguistic_type_ref=\"so\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"nt@A\" linguistic_type_ref=\"nt\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"orig@A\" linguistic_type_ref=\"orig\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
      "<tier xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" tier_id=\"fta@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n"
     ]
    }
   ],
   "source": [
    "for tier in TIERS.values():\n",
    "    pprint(tier.as_xml(), end='')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ebe6048-e986-4f8c-b2d3-0d67927c148d",
   "metadata": {},
   "source": [
    "## Build document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "6d7fb8c1-80ba-42c9-abb5-54c9004ef965",
   "metadata": {},
   "outputs": [],
   "source": [
    "SENTENCE_TIERS = (TIERS['ref@A'].id_, TIERS['tx@A'].id_, TIERS['fte@A'].id_)\n",
    "\n",
    "TIME_TICK_MILIS = 10_000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "6d65f3e7-63c4-4833-8b42-ece59e7690e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_document(basename: str, sentences, *,\n",
    "                  language_code: str = 'en', country_code: str = 'US',\n",
    "                  sentence_tiers: Sequence[str] = SENTENCE_TIERS,\n",
    "                  time_tick: int = TIME_TICK_MILIS,\n",
    "                  **kwargs) -> etree._ElementTree:\n",
    "    \"\"\"Return a new ELAN document from the given (text, translation) pairs.\"\"\"\n",
    "    doc = make_document_skeleton(**kwargs)\n",
    "    root = doc.getroot()\n",
    "\n",
    "    tiers = {id_: t.as_xml() for id_, t in TIERS.items()}\n",
    "    assert all(t in tiers for t in sentence_tiers)\n",
    "    annotation_tiers = [elem for id_, elem in tiers.items() if id_ in sentence_tiers]\n",
    "    time_order = E.time_order()\n",
    "    for time_slots, annotations in iterannotations(basename, sentences, time_tick=time_tick):\n",
    "        time_order.extend(time_slots)\n",
    "        for tier, elem in zip(annotation_tiers, annotations):\n",
    "            tier.append(elem)\n",
    "    root.append(time_order)\n",
    "    root.extend(tiers.values())\n",
    "\n",
    "    last_id, = root.xpath('(tier/annotation/*[self::ref_annotation or self::alignable_annotation]/@annotation_id)[last()]')\n",
    "    last_id = int(last_id.removeprefix('a'))\n",
    "    root.find('header').append(E.property(str(last_id), name='lastUsedAnnotationId'))\n",
    "\n",
    "    root.extend(linguistic_type.as_xml() for linguistic_type in LINGUISTIC_TYPES.values())\n",
    "    root.append(E.locale(language_code=language_code, country_code=country_code))\n",
    "    root.extend(const.as_xml() for const in Constraint)\n",
    "    return doc\n",
    "\n",
    "\n",
    "def iterannotations(basename: str, sentences, *, time_tick: int):\n",
    "    assert set(map(len, sentences)) == {2}\n",
    "\n",
    "    annotation_ids = (f'a{i}' for i in itertools.count(1))\n",
    "\n",
    "    def iterchildren(start_end_value):\n",
    "        for start, end, value in start_end_value:\n",
    "            ann = E.alignable_annotation(E.annotation_value(value),\n",
    "                                         annotation_id=next(annotation_ids),\n",
    "                                         time_slot_ref1=start.attrib['time_slot_id'],\n",
    "                                         time_slot_ref2=end.attrib['time_slot_id'])\n",
    "            yield E.annotation(ann)\n",
    "        yield E.annotation(E.ref_annotation(E.annotation_value(fte_value),\n",
    "                                            annotation_id=next(annotation_ids),\n",
    "                                            annotation_ref=ann.attrib['annotation_id']))\n",
    "\n",
    "    time_slot_ids =  (f't{i}' for i in itertools.count(1))\n",
    "\n",
    "    for sentence_index, (tx_value, fte_value) in enumerate(sentences):\n",
    "        start_time, end_time = (time_tick * i  for i in range(sentence_index, sentence_index + 2))\n",
    "\n",
    "        align_values = [f'{basename}.{sentence_index + 1:03d}', tx_value]\n",
    "\n",
    "        start = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(start_time)) for _ in align_values]\n",
    "        end = [E.time_slot(time_slot_id=next(time_slot_ids), time_value=str(end_time)) for _ in align_values]\n",
    "\n",
    "        children = iterchildren(zip(start, end, align_values))\n",
    "        yield start + end, list(itertools.starmap(E.annotation,children))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9905cec6-3e6f-49dc-925e-566510ecfcbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "TEST_SENTENCES = [('My Hovercraft is full of eels.', 'Can I please buy some matches?'),\n",
    "                  ('Please fondle my buttocks.', 'Can you direct me to the station?')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "4774c88c-05fc-4d35-8a30-5f664e54ca5f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<annotation_document xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" author=\"\" date=\"2022-09-18T22:40:54+02:00\" format=\"3.0\" version=\"3.0\" xsi:noNamespaceSchemaLocation=\"http://www.mpi.nl/tools/elan/EAFv3.0.xsd\">\n",
      "  <header media_file=\"\" time_units=\"milliseconds\">\n",
      "    <property name=\"URN\">urn:nl-mpi-tools-elan-eaf:315e945c-5e96-4492-9605-fbfe3487b506</property>\n",
      "    <property name=\"lastUsedAnnotationId\">6</property>\n",
      "  </header>\n",
      "  <time_order>\n",
      "    <time_slot time_slot_id=\"t1\" time_value=\"0\"/>\n",
      "    <time_slot time_slot_id=\"t2\" time_value=\"0\"/>\n",
      "    <time_slot time_slot_id=\"t3\" time_value=\"10000\"/>\n",
      "    <time_slot time_slot_id=\"t4\" time_value=\"10000\"/>\n",
      "    <time_slot time_slot_id=\"t5\" time_value=\"10000\"/>\n",
      "    <time_slot time_slot_id=\"t6\" time_value=\"10000\"/>\n",
      "    <time_slot time_slot_id=\"t7\" time_value=\"20000\"/>\n",
      "    <time_slot time_slot_id=\"t8\" time_value=\"20000\"/>\n",
      "  </time_order>\n",
      "  <tier tier_id=\"ref@A\" linguistic_type_ref=\"ref\" default_locale=\"en\">\n",
      "    <annotation>\n",
      "      <alignable_annotation annotation_id=\"a1\" time_slot_ref1=\"t1\" time_slot_ref2=\"t3\">\n",
      "        <annotation_value>test_sentences.001</annotation_value>\n",
      "      </alignable_annotation>\n",
      "    </annotation>\n",
      "    <annotation>\n",
      "      <alignable_annotation annotation_id=\"a4\" time_slot_ref1=\"t5\" time_slot_ref2=\"t7\">\n",
      "        <annotation_value>test_sentences.002</annotation_value>\n",
      "      </alignable_annotation>\n",
      "    </annotation>\n",
      "  </tier>\n",
      "  <tier tier_id=\"tx@A\" linguistic_type_ref=\"tx\" parent_ref=\"ref@A\" default_locale=\"en\">\n",
      "    <annotation>\n",
      "      <alignable_annotation annotation_id=\"a2\" time_slot_ref1=\"t2\" time_slot_ref2=\"t4\">\n",
      "        <annotation_value>My Hovercraft is full of eels.</annotation_value>\n",
      "      </alignable_annotation>\n",
      "    </annotation>\n",
      "    <annotation>\n",
      "      <alignable_annotation annotation_id=\"a5\" time_slot_ref1=\"t6\" time_slot_ref2=\"t8\">\n",
      "        <annotation_value>Please fondle my buttocks.</annotation_value>\n",
      "      </alignable_annotation>\n",
      "    </annotation>\n",
      "  </tier>\n",
      "  <tier tier_id=\"fte@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\">\n",
      "    <annotation>\n",
      "      <ref_annotation annotation_id=\"a3\" annotation_ref=\"a2\">\n",
      "        <annotation_value>Can I please buy some matches?</annotation_value>\n",
      "      </ref_annotation>\n",
      "    </annotation>\n",
      "    <annotation>\n",
      "      <ref_annotation annotation_id=\"a6\" annotation_ref=\"a5\">\n",
      "        <annotation_value>Can you direct me to the station?</annotation_value>\n",
      "      </ref_annotation>\n",
      "    </annotation>\n",
      "  </tier>\n",
      "  <tier tier_id=\"mb@A\" linguistic_type_ref=\"mb\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"ge@A\" linguistic_type_ref=\"ge\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"ps@A\" linguistic_type_ref=\"ps\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"lxid@A\" linguistic_type_ref=\"lxid\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"so@A\" linguistic_type_ref=\"so\" parent_ref=\"mb@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"nt@A\" linguistic_type_ref=\"nt\" parent_ref=\"ref@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"orig@A\" linguistic_type_ref=\"orig\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
      "  <tier tier_id=\"fta@A\" linguistic_type_ref=\"translation\" parent_ref=\"tx@A\" default_locale=\"en\"/>\n",
      "  <linguistic_type linguistic_type_id=\"default-lt\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"translation\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"transcription\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"orth\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"ref\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"tx\" time_alignable=\"true\" constraints=\"Included_In\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"mb\" time_alignable=\"false\" constraints=\"Symbolic_Subdivision\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"orig\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"ge\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"ps\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"so\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"lxid\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"fte\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"nt\" time_alignable=\"false\" constraints=\"Symbolic_Association\" graphic_references=\"false\"/>\n",
      "  <linguistic_type linguistic_type_id=\"imported-sep\" time_alignable=\"true\" graphic_references=\"false\"/>\n",
      "  <locale language_code=\"en\" country_code=\"US\"/>\n",
      "  <constraint stereotype=\"Time_Subdivision\" description=\"Time subdivision of parent annotation's time interval, no time gaps allowed within this interval\"/>\n",
      "  <constraint stereotype=\"Symbolic_Subdivision\" description=\"Symbolic subdivision of a parent annotation. Annotations refering to the same parent are ordered\"/>\n",
      "  <constraint stereotype=\"Symbolic_Association\" description=\"1-1 association with a parent annotation\"/>\n",
      "  <constraint stereotype=\"Included_In\" description=\"Time alignable annotations within the parent annotation's time interval, gaps are allowed\"/>\n",
      "</annotation_document>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "pprint(make_document('test_sentences', TEST_SENTENCES))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "160ef689-0a40-4a3c-9307-a054a1e4cdae",
   "metadata": {},
   "source": [
    "## Write `.eaf` file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e3419f0c-57dd-4790-9140-60cb5ab52f16",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test_sentences.eaf\n"
     ]
    }
   ],
   "source": [
    "TEST_TARGET = pathlib.Path('test_sentences').with_suffix(ELAN_SUFFIX)\n",
    "\n",
    "print(write(TEST_TARGET, make_document(TEST_TARGET.stem, TEST_SENTENCES)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "dee5530f-3ea2-4588-a7d6-33cdbb047f17",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5_954 bytes\n"
     ]
    }
   ],
   "source": [
    "print(f'{TEST_TARGET.stat().st_size:_d}', 'bytes')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }