Created
January 2, 2015 20:54
-
-
Save frederik-elwert/fef31d94b3ef4589a983 to your computer and use it in GitHub Desktop.
Flat to hierarchical XML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from lxml import etree\n", | |
"\n", | |
"origtree = etree.XML('''\n", | |
"<sp xml:id=\"sp-0001\" who=\"#Egeon_Err\">\n", | |
"<speaker xml:id=\"spk-0001\">\n", | |
"<w xml:id=\"w0000410\">EGEON</w>\n", | |
"</speaker>\n", | |
"<ab xml:id=\"ab-0001\">\n", | |
"<lb xml:id=\"lb-00009\"/>\n", | |
"<milestone unit=\"ftln\" xml:id=\"ftln-0001\" n=\"1.1.1\" ana=\"#verse\"\n", | |
"corresp=\"#w0000420 #p0000430 #c0000440 #w0000450 #p0000460 #c0000470\n", | |
"#w0000480 #c0000490 #w0000500 #c0000510 #w0000520 #c0000530 #w0000540\n", | |
"#p0000550\"/>\n", | |
"<w xml:id=\"w0000420\" n=\"1.1.1\">Proceed</w>\n", | |
"<pc xml:id=\"p0000430\" n=\"1.1.1\">,</pc>\n", | |
"<c xml:id=\"c0000440\" n=\"1.1.1\"> </c>\n", | |
"<w xml:id=\"w0000450\" n=\"1.1.1\">Solinus</w>\n", | |
"<pc xml:id=\"p0000460\" n=\"1.1.1\">,</pc>\n", | |
"<c xml:id=\"c0000470\" n=\"1.1.1\"> </c>\n", | |
"<w xml:id=\"w0000480\" n=\"1.1.1\">to</w>\n", | |
"<c xml:id=\"c0000490\" n=\"1.1.1\"> </c>\n", | |
"<w xml:id=\"w0000500\" n=\"1.1.1\">procure</w>\n", | |
"<c xml:id=\"c0000510\" n=\"1.1.1\"> </c>\n", | |
"<w xml:id=\"w0000520\" n=\"1.1.1\">my</w>\n", | |
"<c xml:id=\"c0000530\" n=\"1.1.1\"> </c>\n", | |
"<w xml:id=\"w0000540\" n=\"1.1.1\">fall</w>\n", | |
"<pc xml:id=\"p0000550\" n=\"1.1.1\">,</pc>\n", | |
"<lb xml:id=\"lb-00010\"/>\n", | |
"<milestone unit=\"ftln\" xml:id=\"ftln-0002\" n=\"1.1.2\" ana=\"#verse\"\n", | |
"corresp=\"#w0000560 #c0000570 #w0000580 #c0000590 #w0000600 #c0000610\n", | |
"#w0000620 #c0000630 #w0000640 #c0000650 #w0000660 #c0000670 #w0000680\n", | |
"#c0000690 #w0000700 #c0000710 #w0000720 #c0000730 #w0000740 #p0000750\"/>\n", | |
"<w xml:id=\"w0000560\" n=\"1.1.2\">And</w>\n", | |
"<c xml:id=\"c0000570\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000580\" n=\"1.1.2\">by</w>\n", | |
"<c xml:id=\"c0000590\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000600\" n=\"1.1.2\">the</w>\n", | |
"<c xml:id=\"c0000610\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000620\" n=\"1.1.2\">doom</w>\n", | |
"<c xml:id=\"c0000630\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000640\" n=\"1.1.2\">of</w>\n", | |
"<c xml:id=\"c0000650\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000660\" n=\"1.1.2\">death</w>\n", | |
"<c xml:id=\"c0000670\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000680\" n=\"1.1.2\">end</w>\n", | |
"<c xml:id=\"c0000690\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000700\" n=\"1.1.2\">woes</w>\n", | |
"<c xml:id=\"c0000710\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000720\" n=\"1.1.2\">and</w>\n", | |
"<c xml:id=\"c0000730\" n=\"1.1.2\"> </c>\n", | |
"<w xml:id=\"w0000740\" n=\"1.1.2\">all</w>\n", | |
"<pc xml:id=\"p0000750\" n=\"1.1.2\">.</pc>\n", | |
"</ab>\n", | |
"</sp>\n", | |
"''', parser=etree.XMLParser(remove_blank_text=True))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from copy import deepcopy\n", | |
"tree = deepcopy(origtree)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for sp in tree.xpath('//sp'):\n", | |
" ab = sp.xpath('ab')[0] # Assumes only one ab per sp.\n", | |
" for milestone in ab.xpath('milestone'):\n", | |
" line = etree.SubElement(sp, 'l')\n", | |
" corr_ids = [id_.lstrip('#') for id_ in milestone.get('corresp').split()]\n", | |
" for id_ in corr_ids:\n", | |
" elem = sp.xpath('id($cid)', cid=id_)[0]\n", | |
" line.append(elem)\n", | |
" sp.remove(ab)\n", | |
"print(etree.tostring(tree, pretty_print=True, encoding='unicode'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<sp xml:id=\"sp-0001\" who=\"#Egeon_Err\">\n", | |
" <speaker xml:id=\"spk-0001\">\n", | |
" <w xml:id=\"w0000410\">EGEON</w>\n", | |
" </speaker>\n", | |
" <l>\n", | |
" <w xml:id=\"w0000420\" n=\"1.1.1\">Proceed</w>\n", | |
" <pc xml:id=\"p0000430\" n=\"1.1.1\">,</pc>\n", | |
" <c xml:id=\"c0000440\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000450\" n=\"1.1.1\">Solinus</w>\n", | |
" <pc xml:id=\"p0000460\" n=\"1.1.1\">,</pc>\n", | |
" <c xml:id=\"c0000470\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000480\" n=\"1.1.1\">to</w>\n", | |
" <c xml:id=\"c0000490\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000500\" n=\"1.1.1\">procure</w>\n", | |
" <c xml:id=\"c0000510\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000520\" n=\"1.1.1\">my</w>\n", | |
" <c xml:id=\"c0000530\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000540\" n=\"1.1.1\">fall</w>\n", | |
" <pc xml:id=\"p0000550\" n=\"1.1.1\">,</pc>\n", | |
" </l>\n", | |
" <l>\n", | |
" <w xml:id=\"w0000560\" n=\"1.1.2\">And</w>\n", | |
" <c xml:id=\"c0000570\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000580\" n=\"1.1.2\">by</w>\n", | |
" <c xml:id=\"c0000590\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000600\" n=\"1.1.2\">the</w>\n", | |
" <c xml:id=\"c0000610\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000620\" n=\"1.1.2\">doom</w>\n", | |
" <c xml:id=\"c0000630\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000640\" n=\"1.1.2\">of</w>\n", | |
" <c xml:id=\"c0000650\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000660\" n=\"1.1.2\">death</w>\n", | |
" <c xml:id=\"c0000670\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000680\" n=\"1.1.2\">end</w>\n", | |
" <c xml:id=\"c0000690\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000700\" n=\"1.1.2\">woes</w>\n", | |
" <c xml:id=\"c0000710\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000720\" n=\"1.1.2\">and</w>\n", | |
" <c xml:id=\"c0000730\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000740\" n=\"1.1.2\">all</w>\n", | |
" <pc xml:id=\"p0000750\" n=\"1.1.2\">.</pc>\n", | |
" </l>\n", | |
"</sp>\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from copy import deepcopy\n", | |
"tree = deepcopy(origtree)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for sp in tree.xpath('//sp'):\n", | |
" ab = sp.xpath('ab')[0] # Assumes only one ab per sp.\n", | |
" for milestone in ab.xpath('milestone'):\n", | |
" line = etree.SubElement(sp, 'l')\n", | |
" elem = milestone.getnext()\n", | |
" while True:\n", | |
" if elem is None or elem.tag == 'milestone':\n", | |
" break\n", | |
" line.append(deepcopy(elem)) # Can't append directly, would break getnext().\n", | |
" elem = elem.getnext()\n", | |
" sp.remove(ab)\n", | |
"print(etree.tostring(tree, pretty_print=True, encoding='unicode'))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"<sp xml:id=\"sp-0001\" who=\"#Egeon_Err\">\n", | |
" <speaker xml:id=\"spk-0001\">\n", | |
" <w xml:id=\"w0000410\">EGEON</w>\n", | |
" </speaker>\n", | |
" <l>\n", | |
" <w xml:id=\"w0000420\" n=\"1.1.1\">Proceed</w>\n", | |
" <pc xml:id=\"p0000430\" n=\"1.1.1\">,</pc>\n", | |
" <c xml:id=\"c0000440\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000450\" n=\"1.1.1\">Solinus</w>\n", | |
" <pc xml:id=\"p0000460\" n=\"1.1.1\">,</pc>\n", | |
" <c xml:id=\"c0000470\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000480\" n=\"1.1.1\">to</w>\n", | |
" <c xml:id=\"c0000490\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000500\" n=\"1.1.1\">procure</w>\n", | |
" <c xml:id=\"c0000510\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000520\" n=\"1.1.1\">my</w>\n", | |
" <c xml:id=\"c0000530\" n=\"1.1.1\"> </c>\n", | |
" <w xml:id=\"w0000540\" n=\"1.1.1\">fall</w>\n", | |
" <pc xml:id=\"p0000550\" n=\"1.1.1\">,</pc>\n", | |
" <lb xml:id=\"lb-00010\"/>\n", | |
" </l>\n", | |
" <l>\n", | |
" <w xml:id=\"w0000560\" n=\"1.1.2\">And</w>\n", | |
" <c xml:id=\"c0000570\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000580\" n=\"1.1.2\">by</w>\n", | |
" <c xml:id=\"c0000590\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000600\" n=\"1.1.2\">the</w>\n", | |
" <c xml:id=\"c0000610\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000620\" n=\"1.1.2\">doom</w>\n", | |
" <c xml:id=\"c0000630\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000640\" n=\"1.1.2\">of</w>\n", | |
" <c xml:id=\"c0000650\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000660\" n=\"1.1.2\">death</w>\n", | |
" <c xml:id=\"c0000670\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000680\" n=\"1.1.2\">end</w>\n", | |
" <c xml:id=\"c0000690\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000700\" n=\"1.1.2\">woes</w>\n", | |
" <c xml:id=\"c0000710\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000720\" n=\"1.1.2\">and</w>\n", | |
" <c xml:id=\"c0000730\" n=\"1.1.2\"> </c>\n", | |
" <w xml:id=\"w0000740\" n=\"1.1.2\">all</w>\n", | |
" <pc xml:id=\"p0000750\" n=\"1.1.2\">.</pc>\n", | |
" </l>\n", | |
"</sp>\n", | |
"\n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment