Created
February 24, 2016 22:00
-
-
Save robclewley/fa12e0efe7070d67267a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Partial lxml tutorial\n", | |
"\n", | |
"Derived from the official lxml tutorial at http://lxml.de/tutorial.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from lxml import etree" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<Element root at 0x3f54b70>\n" | |
] | |
} | |
], | |
"source": [ | |
"root = etree.Element(\"root\")\n", | |
"print(root)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"root.append( etree.Element(\"child1\") )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Help on _Element object:\n", | |
"\n", | |
"class _Element(__builtin__.object)\n", | |
" | Element class.\n", | |
" | \n", | |
" | References a document object and a libxml node.\n", | |
" | \n", | |
" | By pointing to a Document instance, a reference is kept to\n", | |
" | _Document as long as there is some pointer to a node in it.\n", | |
" | \n", | |
" | Methods defined here:\n", | |
" | \n", | |
" | __contains__(...)\n", | |
" | __contains__(self, element)\n", | |
" | \n", | |
" | __copy__(...)\n", | |
" | __copy__(self)\n", | |
" | \n", | |
" | __deepcopy__(...)\n", | |
" | __deepcopy__(self, memo)\n", | |
" | \n", | |
" | __delitem__(...)\n", | |
" | __delitem__(self, x)\n", | |
" | \n", | |
" | Deletes the given subelement or a slice.\n", | |
" | \n", | |
" | __getitem__(...)\n", | |
" | Returns the subelement at the given position or the requested\n", | |
" | slice.\n", | |
" | \n", | |
" | __iter__(...)\n", | |
" | __iter__(self)\n", | |
" | \n", | |
" | __len__(...)\n", | |
" | __len__(self)\n", | |
" | \n", | |
" | Returns the number of subelements.\n", | |
" | \n", | |
" | __nonzero__(...)\n", | |
" | x.__nonzero__() <==> x != 0\n", | |
" | \n", | |
" | __repr__(...)\n", | |
" | __repr__(self)\n", | |
" | \n", | |
" | __reversed__(...)\n", | |
" | __reversed__(self)\n", | |
" | \n", | |
" | __setitem__(...)\n", | |
" | __setitem__(self, x, value)\n", | |
" | \n", | |
" | Replaces the given subelement index or slice.\n", | |
" | \n", | |
" | addnext(...)\n", | |
" | addnext(self, element)\n", | |
" | \n", | |
" | Adds the element as a following sibling directly after this\n", | |
" | element.\n", | |
" | \n", | |
" | This is normally used to set a processing instruction or comment after\n", | |
" | the root node of a document. Note that tail text is automatically\n", | |
" | discarded when adding at the root level.\n", | |
" | \n", | |
" | addprevious(...)\n", | |
" | addprevious(self, element)\n", | |
" | \n", | |
" | Adds the element as a preceding sibling directly before this\n", | |
" | element.\n", | |
" | \n", | |
" | This is normally used to set a processing instruction or comment\n", | |
" | before the root node of a document. Note that tail text is\n", | |
" | automatically discarded when adding at the root level.\n", | |
" | \n", | |
" | append(...)\n", | |
" | append(self, element)\n", | |
" | \n", | |
" | Adds a subelement to the end of this element.\n", | |
" | \n", | |
" | clear(...)\n", | |
" | clear(self)\n", | |
" | \n", | |
" | Resets an element. This function removes all subelements, clears\n", | |
" | all attributes and sets the text and tail properties to None.\n", | |
" | \n", | |
" | extend(...)\n", | |
" | extend(self, elements)\n", | |
" | \n", | |
" | Extends the current children by the elements in the iterable.\n", | |
" | \n", | |
" | find(...)\n", | |
" | find(self, path, namespaces=None)\n", | |
" | \n", | |
" | Finds the first matching subelement, by tag name or path.\n", | |
" | \n", | |
" | The optional ``namespaces`` argument accepts a\n", | |
" | prefix-to-namespace mapping that allows the usage of XPath\n", | |
" | prefixes in the path expression.\n", | |
" | \n", | |
" | findall(...)\n", | |
" | findall(self, path, namespaces=None)\n", | |
" | \n", | |
" | Finds all matching subelements, by tag name or path.\n", | |
" | \n", | |
" | The optional ``namespaces`` argument accepts a\n", | |
" | prefix-to-namespace mapping that allows the usage of XPath\n", | |
" | prefixes in the path expression.\n", | |
" | \n", | |
" | findtext(...)\n", | |
" | findtext(self, path, default=None, namespaces=None)\n", | |
" | \n", | |
" | Finds text for the first matching subelement, by tag name or path.\n", | |
" | \n", | |
" | The optional ``namespaces`` argument accepts a\n", | |
" | prefix-to-namespace mapping that allows the usage of XPath\n", | |
" | prefixes in the path expression.\n", | |
" | \n", | |
" | get(...)\n", | |
" | get(self, key, default=None)\n", | |
" | \n", | |
" | Gets an element attribute.\n", | |
" | \n", | |
" | getchildren(...)\n", | |
" | getchildren(self)\n", | |
" | \n", | |
" | Returns all direct children. The elements are returned in document\n", | |
" | order.\n", | |
" | \n", | |
" | :deprecated: Note that this method has been deprecated as of\n", | |
" | ElementTree 1.3 and lxml 2.0. New code should use\n", | |
" | ``list(element)`` or simply iterate over elements.\n", | |
" | \n", | |
" | getiterator(...)\n", | |
" | getiterator(self, tag=None, *tags)\n", | |
" | \n", | |
" | Returns a sequence or iterator of all elements in the subtree in\n", | |
" | document order (depth first pre-order), starting with this\n", | |
" | element.\n", | |
" | \n", | |
" | Can be restricted to find only elements with a specific tag,\n", | |
" | see `iter`.\n", | |
" | \n", | |
" | :deprecated: Note that this method is deprecated as of\n", | |
" | ElementTree 1.3 and lxml 2.0. It returns an iterator in\n", | |
" | lxml, which diverges from the original ElementTree\n", | |
" | behaviour. If you want an efficient iterator, use the\n", | |
" | ``element.iter()`` method instead. You should only use this\n", | |
" | method in new code if you require backwards compatibility\n", | |
" | with older versions of lxml or ElementTree.\n", | |
" | \n", | |
" | getnext(...)\n", | |
" | getnext(self)\n", | |
" | \n", | |
" | Returns the following sibling of this element or None.\n", | |
" | \n", | |
" | getparent(...)\n", | |
" | getparent(self)\n", | |
" | \n", | |
" | Returns the parent of this element or None for the root element.\n", | |
" | \n", | |
" | getprevious(...)\n", | |
" | getprevious(self)\n", | |
" | \n", | |
" | Returns the preceding sibling of this element or None.\n", | |
" | \n", | |
" | getroottree(...)\n", | |
" | getroottree(self)\n", | |
" | \n", | |
" | Return an ElementTree for the root node of the document that\n", | |
" | contains this element.\n", | |
" | \n", | |
" | This is the same as following element.getparent() up the tree until it\n", | |
" | returns None (for the root element) and then build an ElementTree for\n", | |
" | the last parent that was returned.\n", | |
" | \n", | |
" | index(...)\n", | |
" | index(self, child, start=None, stop=None)\n", | |
" | \n", | |
" | Find the position of the child within the parent.\n", | |
" | \n", | |
" | This method is not part of the original ElementTree API.\n", | |
" | \n", | |
" | insert(...)\n", | |
" | insert(self, index, element)\n", | |
" | \n", | |
" | Inserts a subelement at the given position in this element\n", | |
" | \n", | |
" | items(...)\n", | |
" | items(self)\n", | |
" | \n", | |
" | Gets element attributes, as a sequence. The attributes are returned in\n", | |
" | an arbitrary order.\n", | |
" | \n", | |
" | iter(...)\n", | |
" | iter(self, tag=None, *tags)\n", | |
" | \n", | |
" | Iterate over all elements in the subtree in document order (depth\n", | |
" | first pre-order), starting with this element.\n", | |
" | \n", | |
" | Can be restricted to find only elements with a specific tag:\n", | |
" | pass ``\"{ns}localname\"`` as tag. Either or both of ``ns`` and\n", | |
" | ``localname`` can be ``*`` for a wildcard; ``ns`` can be empty\n", | |
" | for no namespace. ``\"localname\"`` is equivalent to ``\"{}localname\"``\n", | |
" | but ``\"*\"`` is ``\"{*}*\"``, not ``\"{}*\"``.\n", | |
" | \n", | |
" | You can also pass the Element, Comment, ProcessingInstruction and\n", | |
" | Entity factory functions to look only for the specific element type.\n", | |
" | \n", | |
" | Passing a sequence of tags will let the iterator return all\n", | |
" | elements matching any of these tags, in document order.\n", | |
" | \n", | |
" | iterancestors(...)\n", | |
" | iterancestors(self, tag=None, *tags)\n", | |
" | \n", | |
" | Iterate over the ancestors of this element (from parent to parent).\n", | |
" | \n", | |
" | Can be restricted to find only elements with a specific tag,\n", | |
" | see `iter`.\n", | |
" | \n", | |
" | iterchildren(...)\n", | |
" | iterchildren(self, tag=None, *tags, reversed=False)\n", | |
" | \n", | |
" | Iterate over the children of this element.\n", | |
" | \n", | |
" | As opposed to using normal iteration on this element, the returned\n", | |
" | elements can be reversed with the 'reversed' keyword and restricted\n", | |
" | to find only elements with a specific tag, see `iter`.\n", | |
" | \n", | |
" | iterdescendants(...)\n", | |
" | iterdescendants(self, tag=None, *tags)\n", | |
" | \n", | |
" | Iterate over the descendants of this element in document order.\n", | |
" | \n", | |
" | As opposed to ``el.iter()``, this iterator does not yield the element\n", | |
" | itself. The returned elements can be restricted to find only elements\n", | |
" | with a specific tag, see `iter`.\n", | |
" | \n", | |
" | iterfind(...)\n", | |
" | iterfind(self, path, namespaces=None)\n", | |
" | \n", | |
" | Iterates over all matching subelements, by tag name or path.\n", | |
" | \n", | |
" | The optional ``namespaces`` argument accepts a\n", | |
" | prefix-to-namespace mapping that allows the usage of XPath\n", | |
" | prefixes in the path expression.\n", | |
" | \n", | |
" | itersiblings(...)\n", | |
" | itersiblings(self, tag=None, *tags, preceding=False)\n", | |
" | \n", | |
" | Iterate over the following or preceding siblings of this element.\n", | |
" | \n", | |
" | The direction is determined by the 'preceding' keyword which\n", | |
" | defaults to False, i.e. forward iteration over the following\n", | |
" | siblings. When True, the iterator yields the preceding\n", | |
" | siblings in reverse document order, i.e. starting right before\n", | |
" | the current element and going backwards.\n", | |
" | \n", | |
" | Can be restricted to find only elements with a specific tag,\n", | |
" | see `iter`.\n", | |
" | \n", | |
" | itertext(...)\n", | |
" | itertext(self, tag=None, *tags, with_tail=True)\n", | |
" | \n", | |
" | Iterates over the text content of a subtree.\n", | |
" | \n", | |
" | You can pass a tag name to restrict text content to specific elements,\n", | |
" | see `iter`.\n", | |
" | \n", | |
" | You can set the ``with_tail`` keyword argument to ``False`` to skip\n", | |
" | over tail text.\n", | |
" | \n", | |
" | keys(...)\n", | |
" | keys(self)\n", | |
" | \n", | |
" | Gets a list of attribute names. The names are returned in an\n", | |
" | arbitrary order (just like for an ordinary Python dictionary).\n", | |
" | \n", | |
" | makeelement(...)\n", | |
" | makeelement(self, _tag, attrib=None, nsmap=None, **_extra)\n", | |
" | \n", | |
" | Creates a new element associated with the same document.\n", | |
" | \n", | |
" | remove(...)\n", | |
" | remove(self, element)\n", | |
" | \n", | |
" | Removes a matching subelement. Unlike the find methods, this\n", | |
" | method compares elements based on identity, not on tag value\n", | |
" | or contents.\n", | |
" | \n", | |
" | replace(...)\n", | |
" | replace(self, old_element, new_element)\n", | |
" | \n", | |
" | Replaces a subelement with the element passed as second argument.\n", | |
" | \n", | |
" | set(...)\n", | |
" | set(self, key, value)\n", | |
" | \n", | |
" | Sets an element attribute.\n", | |
" | \n", | |
" | values(...)\n", | |
" | values(self)\n", | |
" | \n", | |
" | Gets element attribute values as a sequence of strings. The\n", | |
" | attributes are returned in an arbitrary order.\n", | |
" | \n", | |
" | xpath(...)\n", | |
" | xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)\n", | |
" | \n", | |
" | Evaluate an xpath expression using the element as context node.\n", | |
" | \n", | |
" | ----------------------------------------------------------------------\n", | |
" | Data descriptors defined here:\n", | |
" | \n", | |
" | attrib\n", | |
" | Element attribute dictionary. Where possible, use get(), set(),\n", | |
" | keys(), values() and items() to access element attributes.\n", | |
" | \n", | |
" | base\n", | |
" | The base URI of the Element (xml:base or HTML base URL).\n", | |
" | None if the base URI is unknown.\n", | |
" | \n", | |
" | Note that the value depends on the URL of the document that\n", | |
" | holds the Element if there is no xml:base attribute on the\n", | |
" | Element or its ancestors.\n", | |
" | \n", | |
" | Setting this property will set an xml:base attribute on the\n", | |
" | Element, regardless of the document type (XML or HTML).\n", | |
" | \n", | |
" | nsmap\n", | |
" | Namespace prefix->URI mapping known in the context of this\n", | |
" | Element. This includes all namespace declarations of the\n", | |
" | parents.\n", | |
" | \n", | |
" | Note that changing the returned dict has no effect on the Element.\n", | |
" | \n", | |
" | prefix\n", | |
" | Namespace prefix or None.\n", | |
" | \n", | |
" | sourceline\n", | |
" | Original line number as found by the parser or None if unknown.\n", | |
" | \n", | |
" | tag\n", | |
" | Element tag\n", | |
" | \n", | |
" | tail\n", | |
" | Text after this element's end tag, but before the next sibling\n", | |
" | element's start tag. This is either a string or the value None, if\n", | |
" | there was no text.\n", | |
" | \n", | |
" | text\n", | |
" | Text before the first subelement. This is either a string or \n", | |
" | the value None, if there was no text.\n", | |
" | \n", | |
" | ----------------------------------------------------------------------\n", | |
" | Data and other attributes defined here:\n", | |
" | \n", | |
" | __new__ = <built-in method __new__ of type object>\n", | |
" | T.__new__(S, ...) -> a new object with type S, a subtype of T\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"help(root)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"child2 = etree.SubElement(root, \"child2\")\n", | |
"child3 = etree.SubElement(root, \"child3\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**This is really XML!** Serialise the tree to see the markup it represents:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"<root>\n", | |
" <child1/>\n", | |
" <child2/>\n", | |
" <child3/>\n", | |
"</root>\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print(etree.tostring(root, pretty_print=True))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Elements are lists**\n", | |
"\n", | |
"To make access to these subelements easy and straightforward, elements mimic the behaviour of normal Python lists as closely as possible:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"child1\n" | |
] | |
} | |
], | |
"source": [ | |
"child = root[0]\n", | |
"print(child.tag)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(root))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"root.index(root[1]) # lxml.etree only!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"child1\n", | |
"child2\n", | |
"child3\n" | |
] | |
} | |
], | |
"source": [ | |
"children = list(root)\n", | |
"for child in root:\n", | |
" print(child.tag)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"('child0', 'child3')\n" | |
] | |
} | |
], | |
"source": [ | |
"root.insert(0, etree.Element(\"child0\"))\n", | |
"start = root[:1]\n", | |
"end = root[-1:]\n", | |
"print(start[0].tag, end[0].tag)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment