Skip to content

Instantly share code, notes, and snippets.

@robclewley
Created February 24, 2016 22:00
Show Gist options
  • Save robclewley/fa12e0efe7070d67267a to your computer and use it in GitHub Desktop.
Save robclewley/fa12e0efe7070d67267a to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# LXML partial tutorial\n",
"Derived from http://lxml.de/tutorial.html"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from lxml import etree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Element root at 0x3f54b70>\n"
]
}
],
"source": [
"root = etree.Element(\"root\")\n",
"print(root)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"root.append( etree.Element(\"child1\") )"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on _Element object:\n",
"\n",
"class _Element(__builtin__.object)\n",
" | Element class.\n",
" | \n",
" | References a document object and a libxml node.\n",
" | \n",
" | By pointing to a Document instance, a reference is kept to\n",
" | _Document as long as there is some pointer to a node in it.\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __contains__(...)\n",
" | __contains__(self, element)\n",
" | \n",
" | __copy__(...)\n",
" | __copy__(self)\n",
" | \n",
" | __deepcopy__(...)\n",
" | __deepcopy__(self, memo)\n",
" | \n",
" | __delitem__(...)\n",
" | __delitem__(self, x)\n",
" | \n",
" | Deletes the given subelement or a slice.\n",
" | \n",
" | __getitem__(...)\n",
" | Returns the subelement at the given position or the requested\n",
" | slice.\n",
" | \n",
" | __iter__(...)\n",
" | __iter__(self)\n",
" | \n",
" | __len__(...)\n",
" | __len__(self)\n",
" | \n",
" | Returns the number of subelements.\n",
" | \n",
" | __nonzero__(...)\n",
" | x.__nonzero__() <==> x != 0\n",
" | \n",
" | __repr__(...)\n",
" | __repr__(self)\n",
" | \n",
" | __reversed__(...)\n",
" | __reversed__(self)\n",
" | \n",
" | __setitem__(...)\n",
" | __setitem__(self, x, value)\n",
" | \n",
" | Replaces the given subelement index or slice.\n",
" | \n",
" | addnext(...)\n",
" | addnext(self, element)\n",
" | \n",
" | Adds the element as a following sibling directly after this\n",
" | element.\n",
" | \n",
" | This is normally used to set a processing instruction or comment after\n",
" | the root node of a document. Note that tail text is automatically\n",
" | discarded when adding at the root level.\n",
" | \n",
" | addprevious(...)\n",
" | addprevious(self, element)\n",
" | \n",
" | Adds the element as a preceding sibling directly before this\n",
" | element.\n",
" | \n",
" | This is normally used to set a processing instruction or comment\n",
" | before the root node of a document. Note that tail text is\n",
" | automatically discarded when adding at the root level.\n",
" | \n",
" | append(...)\n",
" | append(self, element)\n",
" | \n",
" | Adds a subelement to the end of this element.\n",
" | \n",
" | clear(...)\n",
" | clear(self)\n",
" | \n",
" | Resets an element. This function removes all subelements, clears\n",
" | all attributes and sets the text and tail properties to None.\n",
" | \n",
" | extend(...)\n",
" | extend(self, elements)\n",
" | \n",
" | Extends the current children by the elements in the iterable.\n",
" | \n",
" | find(...)\n",
" | find(self, path, namespaces=None)\n",
" | \n",
" | Finds the first matching subelement, by tag name or path.\n",
" | \n",
" | The optional ``namespaces`` argument accepts a\n",
" | prefix-to-namespace mapping that allows the usage of XPath\n",
" | prefixes in the path expression.\n",
" | \n",
" | findall(...)\n",
" | findall(self, path, namespaces=None)\n",
" | \n",
" | Finds all matching subelements, by tag name or path.\n",
" | \n",
" | The optional ``namespaces`` argument accepts a\n",
" | prefix-to-namespace mapping that allows the usage of XPath\n",
" | prefixes in the path expression.\n",
" | \n",
" | findtext(...)\n",
" | findtext(self, path, default=None, namespaces=None)\n",
" | \n",
" | Finds text for the first matching subelement, by tag name or path.\n",
" | \n",
" | The optional ``namespaces`` argument accepts a\n",
" | prefix-to-namespace mapping that allows the usage of XPath\n",
" | prefixes in the path expression.\n",
" | \n",
" | get(...)\n",
" | get(self, key, default=None)\n",
" | \n",
" | Gets an element attribute.\n",
" | \n",
" | getchildren(...)\n",
" | getchildren(self)\n",
" | \n",
" | Returns all direct children. The elements are returned in document\n",
" | order.\n",
" | \n",
" | :deprecated: Note that this method has been deprecated as of\n",
" | ElementTree 1.3 and lxml 2.0. New code should use\n",
" | ``list(element)`` or simply iterate over elements.\n",
" | \n",
" | getiterator(...)\n",
" | getiterator(self, tag=None, *tags)\n",
" | \n",
" | Returns a sequence or iterator of all elements in the subtree in\n",
" | document order (depth first pre-order), starting with this\n",
" | element.\n",
" | \n",
" | Can be restricted to find only elements with a specific tag,\n",
" | see `iter`.\n",
" | \n",
" | :deprecated: Note that this method is deprecated as of\n",
" | ElementTree 1.3 and lxml 2.0. It returns an iterator in\n",
" | lxml, which diverges from the original ElementTree\n",
" | behaviour. If you want an efficient iterator, use the\n",
" | ``element.iter()`` method instead. You should only use this\n",
" | method in new code if you require backwards compatibility\n",
" | with older versions of lxml or ElementTree.\n",
" | \n",
" | getnext(...)\n",
" | getnext(self)\n",
" | \n",
" | Returns the following sibling of this element or None.\n",
" | \n",
" | getparent(...)\n",
" | getparent(self)\n",
" | \n",
" | Returns the parent of this element or None for the root element.\n",
" | \n",
" | getprevious(...)\n",
" | getprevious(self)\n",
" | \n",
" | Returns the preceding sibling of this element or None.\n",
" | \n",
" | getroottree(...)\n",
" | getroottree(self)\n",
" | \n",
" | Return an ElementTree for the root node of the document that\n",
" | contains this element.\n",
" | \n",
" | This is the same as following element.getparent() up the tree until it\n",
" | returns None (for the root element) and then build an ElementTree for\n",
" | the last parent that was returned.\n",
" | \n",
" | index(...)\n",
" | index(self, child, start=None, stop=None)\n",
" | \n",
" | Find the position of the child within the parent.\n",
" | \n",
" | This method is not part of the original ElementTree API.\n",
" | \n",
" | insert(...)\n",
" | insert(self, index, element)\n",
" | \n",
" | Inserts a subelement at the given position in this element\n",
" | \n",
" | items(...)\n",
" | items(self)\n",
" | \n",
" | Gets element attributes, as a sequence. The attributes are returned in\n",
" | an arbitrary order.\n",
" | \n",
" | iter(...)\n",
" | iter(self, tag=None, *tags)\n",
" | \n",
" | Iterate over all elements in the subtree in document order (depth\n",
" | first pre-order), starting with this element.\n",
" | \n",
" | Can be restricted to find only elements with a specific tag:\n",
" | pass ``\"{ns}localname\"`` as tag. Either or both of ``ns`` and\n",
" | ``localname`` can be ``*`` for a wildcard; ``ns`` can be empty\n",
" | for no namespace. ``\"localname\"`` is equivalent to ``\"{}localname\"``\n",
" | but ``\"*\"`` is ``\"{*}*\"``, not ``\"{}*\"``.\n",
" | \n",
" | You can also pass the Element, Comment, ProcessingInstruction and\n",
" | Entity factory functions to look only for the specific element type.\n",
" | \n",
" | Passing a sequence of tags will let the iterator return all\n",
" | elements matching any of these tags, in document order.\n",
" | \n",
" | iterancestors(...)\n",
" | iterancestors(self, tag=None, *tags)\n",
" | \n",
" | Iterate over the ancestors of this element (from parent to parent).\n",
" | \n",
" | Can be restricted to find only elements with a specific tag,\n",
" | see `iter`.\n",
" | \n",
" | iterchildren(...)\n",
" | iterchildren(self, tag=None, *tags, reversed=False)\n",
" | \n",
" | Iterate over the children of this element.\n",
" | \n",
" | As opposed to using normal iteration on this element, the returned\n",
" | elements can be reversed with the 'reversed' keyword and restricted\n",
" | to find only elements with a specific tag, see `iter`.\n",
" | \n",
" | iterdescendants(...)\n",
" | iterdescendants(self, tag=None, *tags)\n",
" | \n",
" | Iterate over the descendants of this element in document order.\n",
" | \n",
" | As opposed to ``el.iter()``, this iterator does not yield the element\n",
" | itself. The returned elements can be restricted to find only elements\n",
" | with a specific tag, see `iter`.\n",
" | \n",
" | iterfind(...)\n",
" | iterfind(self, path, namespaces=None)\n",
" | \n",
" | Iterates over all matching subelements, by tag name or path.\n",
" | \n",
" | The optional ``namespaces`` argument accepts a\n",
" | prefix-to-namespace mapping that allows the usage of XPath\n",
" | prefixes in the path expression.\n",
" | \n",
" | itersiblings(...)\n",
" | itersiblings(self, tag=None, *tags, preceding=False)\n",
" | \n",
" | Iterate over the following or preceding siblings of this element.\n",
" | \n",
" | The direction is determined by the 'preceding' keyword which\n",
" | defaults to False, i.e. forward iteration over the following\n",
" | siblings. When True, the iterator yields the preceding\n",
" | siblings in reverse document order, i.e. starting right before\n",
" | the current element and going backwards.\n",
" | \n",
" | Can be restricted to find only elements with a specific tag,\n",
" | see `iter`.\n",
" | \n",
" | itertext(...)\n",
" | itertext(self, tag=None, *tags, with_tail=True)\n",
" | \n",
" | Iterates over the text content of a subtree.\n",
" | \n",
" | You can pass a tag name to restrict text content to specific elements,\n",
" | see `iter`.\n",
" | \n",
" | You can set the ``with_tail`` keyword argument to ``False`` to skip\n",
" | over tail text.\n",
" | \n",
" | keys(...)\n",
" | keys(self)\n",
" | \n",
" | Gets a list of attribute names. The names are returned in an\n",
" | arbitrary order (just like for an ordinary Python dictionary).\n",
" | \n",
" | makeelement(...)\n",
" | makeelement(self, _tag, attrib=None, nsmap=None, **_extra)\n",
" | \n",
" | Creates a new element associated with the same document.\n",
" | \n",
" | remove(...)\n",
" | remove(self, element)\n",
" | \n",
" | Removes a matching subelement. Unlike the find methods, this\n",
" | method compares elements based on identity, not on tag value\n",
" | or contents.\n",
" | \n",
" | replace(...)\n",
" | replace(self, old_element, new_element)\n",
" | \n",
" | Replaces a subelement with the element passed as second argument.\n",
" | \n",
" | set(...)\n",
" | set(self, key, value)\n",
" | \n",
" | Sets an element attribute.\n",
" | \n",
" | values(...)\n",
" | values(self)\n",
" | \n",
" | Gets element attribute values as a sequence of strings. The\n",
" | attributes are returned in an arbitrary order.\n",
" | \n",
" | xpath(...)\n",
" | xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)\n",
" | \n",
" | Evaluate an xpath expression using the element as context node.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors defined here:\n",
" | \n",
" | attrib\n",
" | Element attribute dictionary. Where possible, use get(), set(),\n",
" | keys(), values() and items() to access element attributes.\n",
" | \n",
" | base\n",
" | The base URI of the Element (xml:base or HTML base URL).\n",
" | None if the base URI is unknown.\n",
" | \n",
" | Note that the value depends on the URL of the document that\n",
" | holds the Element if there is no xml:base attribute on the\n",
" | Element or its ancestors.\n",
" | \n",
" | Setting this property will set an xml:base attribute on the\n",
" | Element, regardless of the document type (XML or HTML).\n",
" | \n",
" | nsmap\n",
" | Namespace prefix->URI mapping known in the context of this\n",
" | Element. This includes all namespace declarations of the\n",
" | parents.\n",
" | \n",
" | Note that changing the returned dict has no effect on the Element.\n",
" | \n",
" | prefix\n",
" | Namespace prefix or None.\n",
" | \n",
" | sourceline\n",
" | Original line number as found by the parser or None if unknown.\n",
" | \n",
" | tag\n",
" | Element tag\n",
" | \n",
" | tail\n",
" | Text after this element's end tag, but before the next sibling\n",
" | element's start tag. This is either a string or the value None, if\n",
" | there was no text.\n",
" | \n",
" | text\n",
" | Text before the first subelement. This is either a string or \n",
" | the value None, if there was no text.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __new__ = <built-in method __new__ of type object>\n",
" | T.__new__(S, ...) -> a new object with type S, a subtype of T\n",
"\n"
]
}
],
"source": [
"help(root)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"child2 = etree.SubElement(root, \"child2\")\n",
"child3 = etree.SubElement(root, \"child3\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**This is really XML!**"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<root>\n",
" <child1/>\n",
" <child2/>\n",
" <child3/>\n",
"</root>\n",
"\n"
]
}
],
"source": [
"print(etree.tostring(root, pretty_print=True))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Elements are lists**\n",
"\n",
"To make the access to these subelements easy and straightforward, elements mimic the behaviour of normal Python lists as closely as possible:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"child1\n"
]
}
],
"source": [
"child = root[0]\n",
"print(child.tag)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3\n"
]
}
],
"source": [
"print(len(root))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"root.index(root[1]) # lxml.etree only!"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"child1\n",
"child2\n",
"child3\n"
]
}
],
"source": [
"children = list(root)\n",
"for child in root:\n",
" print(child.tag)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('child0', 'child3')\n"
]
}
],
"source": [
"root.insert(0, etree.Element(\"child0\"))\n",
"start = root[:1]\n",
"end = root[-1:]\n",
"print(start[0].tag, end[0].tag)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment