Created
November 24, 2015 19:42
-
-
Save IanHopkinson/ad45831a2fb73f537a79 to your computer and use it in GitHub Desktop.
Examples of xpath queries using lxml in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
import lxml.etree | |
import lxml.html | |
import requests | |
# Sample document for the XML half of the demonstration.
#
# It declares two namespaces on the root element:
#   * the prefix ``foo`` is bound to http://www.foo.com
#   * the default (un-prefixed) namespace is http://www.bah.com
# The first two <Record> elements use the ``foo`` prefix; the last two are
# un-prefixed and therefore live in the default namespace.
#
# The XML declaration must be the very first thing in the document, so the
# literal starts immediately after the opening quotes. The text is encoded to
# bytes because lxml rejects ``str`` input that carries an encoding
# declaration.
xml_sample = """<?xml version="1.0" encoding="UTF-8"?>
<foo:Results xmlns:foo="http://www.foo.com" xmlns="http://www.bah.com">
<foo:Recordset setCount="2">
<foo:Record setEntry="0">
<foo:Title>First title</foo:Title>
</foo:Record>
<foo:Record setEntry="1">
<foo:Title>Second title</foo:Title>
</foo:Record>
<Record setEntry="2">
<Title>Third title</Title>
</Record>
<Record setEntry="3">
<Title>Fourth title</Title>
</Record>
</foo:Recordset>
</foo:Results>
""".encode("utf-8")
# Shown in place of a value when an XPath query against the live page finds
# nothing (the remote markup is outside this script's control).
_NO_MATCH = "<no match>"


def _first_text(nodes, deep=False):
    """Return the stripped text of the first node in *nodes*.

    Args:
        nodes: The list returned by an ``xpath()`` element query.
        deep: When true, use ``text_content()`` (text of the node and all of
            its descendants) instead of ``.text`` (only the text before the
            first child element).

    Returns:
        The stripped text, or ``_NO_MATCH`` when the query matched nothing or
        the first match has no text.
    """
    if not nodes:
        return _NO_MATCH
    node = nodes[0]
    text = node.text_content() if deep else node.text
    return _NO_MATCH if text is None else text.strip()


def main():
    """Print a series of XPath examples, first on HTML and then on XML.

    The HTML examples fetch http://www.ianhopkinson.org.uk and query the
    parsed page; the XML examples query the module-level ``xml_sample``
    document and show how namespaces affect element selection.

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    print("Demonstrating xpath on HTML")
    print("===========================")
    # A timeout keeps the demo from hanging forever on an unresponsive host;
    # raise_for_status() reports HTTP errors instead of parsing an error page.
    r = requests.get("http://www.ianhopkinson.org.uk", timeout=30)
    r.raise_for_status()
    root = lxml.html.fromstring(r.content)

    title = root.xpath('/html/body/div/div/div[2]/h1')
    print("My blog title is: '{}'".format(_first_text(title)))

    title = root.xpath('//div[2]/h1')
    print("We can use the // shortcut to get the same thing more easily: '{}'".format(_first_text(title, deep=True)))

    # Selecting an attribute yields a list of plain strings, not elements.
    ids = root.xpath('//li/@id')
    first_id = ids[0] if ids else _NO_MATCH
    print("We can get the id attributes of all the <li> elements. There are {} of them, the first one is {}".format(len(ids), first_id))

    tagcloud = root.xpath('//*[@class="tagcloud"]')
    print("We can get the parent element of the tagcloud using an attribute selector: {}".format(tagcloud))

    title = root.xpath("//h1[contains(., 'SomeBeans')]")
    print("Another way to get the title is to select by element text content: '{}'".format(_first_text(title)))

    subtitle = root.xpath('//h1[contains(@class,"header_title")]/../h2')
    print("We can use the .. operator is select the subtitle: '{}'".format(_first_text(subtitle)))

    subtitle = root.xpath('//h1[contains(@class,"header_title")]/following-sibling::h2')
    print("Or we can use following-sibling to same effect: '{}'".format(_first_text(subtitle)))

    print("\nDemonstrating xpath on XML")
    print("============================")
    print("Processing XML is pretty similar except for namespaces")
    # XPath has no notion of a default namespace, so even the document's
    # un-prefixed namespace needs an explicit prefix ("bah") in queries.
    foo_ns = {"foo": "http://www.foo.com"}
    bah_ns = {"bah": "http://www.bah.com"}
    root = lxml.etree.fromstring(xml_sample)

    record_count = root.xpath('//@setCount')[0]
    print("Attributes are easy, this is the @setCount: {}".format(record_count))

    print("These are the elements defined by the XML string at the top of this program:")
    # iter() replaces the deprecated getiterator(); it walks the root and
    # all of its descendants in document order.
    for element in root.iter():
        print(element.tag)

    print("We can select elements by defining a namespace in our queries")
    for record in root.xpath('//foo:Title', namespaces=foo_ns):
        print(record.text)

    print("Without defining the default namespace, we get nothing")
    for record in root.xpath('//Title'):
        print(record.text)

    print("With the default namespace, we get something")
    for record in root.xpath('//bah:Title', namespaces=bah_ns):
        print("Element name: {}, element text '{}'".format(record.tag, record.text))
# Run the demonstration only when this file is executed as a script, so that
# importing it (e.g. to reuse xml_sample) has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Gotcha. I thought that might be the case.
I forked your gist and made some changes. Then I added on some other examples of processing XML that contains QTI (Question & Test Interoperability) data. Experimenting with `lxml.etree`, I found that the default, unnamed namespace in the XML is available in the tree's data in `nsmap[None]`. See my lxml-test-etree.py, line 11… I found that naming it `_` makes it convenient to refer to it in the XPath statement, as on line 23… I wanted to get that namespace used by default when `xpath()` is called. I tried setting the key to `None` or using `root.nsmap` itself, but those caused an error ("`TypeError: empty namespace prefix is not supported in XPath`").

I'd like to not need to use the `_:` prefix for the element name, but at least it's minimally obtrusive. Trying to set a truly default namespace is a lost cause, apparently. As written in the lxml FAQ, "How can I specify a default namespace for XPath expressions?". The short answer: "You can't." 🤷

As it turns out, I may prefer using `lxml.objectify` rather than `lxml.etree`, but I need to investigate a little more before I know for sure. See my lxml-test-objectify.py, for example.