-
-
Save IanHopkinson/ad45831a2fb73f537a79 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
# encoding: utf-8 | |
import lxml.etree | |
import lxml.html | |
import requests | |
# Sample document used by the XML half of the demo. It mixes two namespaces:
# elements carrying the explicit ``foo:`` prefix (http://www.foo.com) and
# un-prefixed elements that fall into the default namespace
# (http://www.bah.com).
#
# The text is encoded to bytes because it starts with an XML declaration that
# names an encoding; lxml refuses to parse a ``str`` that carries such a
# declaration, so the parser must be handed bytes.
xml_sample = """<?xml version="1.0" encoding="UTF-8"?>
<foo:Results xmlns:foo="http://www.foo.com" xmlns="http://www.bah.com">
<foo:Recordset setCount="2">
<foo:Record setEntry="0">
<foo:Title>First title</foo:Title>
</foo:Record>
<foo:Record setEntry="1">
<foo:Title>Second title</foo:Title>
</foo:Record>
<Record setEntry="2">
<Title>Third title</Title>
</Record>
<Record setEntry="3">
<Title>Fourth title</Title>
</Record>
</foo:Recordset>
</foo:Results>
""".encode("utf-8")
def main():
    """Run the XPath demonstration and print the results.

    The first half downloads http://www.ianhopkinson.org.uk, parses it with
    ``lxml.html`` and shows several XPath selection techniques (absolute
    paths, the ``//`` shortcut, attribute selection, text matching, parent
    and sibling axes).

    The second half parses the module-level ``xml_sample`` with
    ``lxml.etree`` and shows how namespaces affect XPath queries.

    Raises:
        requests.RequestException: if the page cannot be fetched, the
            request times out, or the server answers with an error status.
        IndexError: if one of the page-specific queries matches nothing
            (the HTML queries are tied to the layout of that page).
    """
    print("Demonstrating xpath on HTML")
    print("===========================")
    # A timeout stops the demo from hanging forever on an unresponsive
    # server; raise_for_status() surfaces HTTP errors directly instead of
    # letting an error page fail later with an obscure IndexError.
    r = requests.get("http://www.ianhopkinson.org.uk", timeout=10)
    r.raise_for_status()
    root = lxml.html.fromstring(r.content)

    title = root.xpath('/html/body/div/div/div[2]/h1')
    print("My blog title is: '{}'".format(title[0].text.strip()))

    title = root.xpath('//div[2]/h1')
    print("We can use the // shortcut to get the same thing more easily: '{}'".format(title[0].text_content().strip()))

    ids = root.xpath('//li/@id')
    print("We can get the id attributes of all the <li> elements. There are {} of them, the first one is {}".format(len(ids), ids[0]))

    tagcloud = root.xpath('//*[@class="tagcloud"]')
    print("We can get the parent element of the tagcloud using an attribute selector: {}".format(tagcloud))

    title = root.xpath("//h1[contains(., 'SomeBeans')]")
    print("Another way to get the title is to select by element text content: '{}'".format(title[0].text.strip()))

    subtitle = root.xpath('//h1[contains(@class,"header_title")]/../h2')
    print("We can use the .. operator is select the subtitle: '{}'".format(subtitle[0].text.strip()))

    subtitle = root.xpath('//h1[contains(@class,"header_title")]/following-sibling::h2')
    print("Or we can use following-sibling to same effect: '{}'".format(subtitle[0].text.strip()))

    print("\nDemonstrating xpath on XML")
    print("============================")
    print("Processing XML is pretty similar except for namespaces")
    namespace = "http://www.foo.com"
    # Prefix -> URI mapping handed to xpath() so that "foo:" can be resolved.
    NSMAP = {"foo": namespace}
    root = lxml.etree.fromstring(xml_sample)

    record_count = root.xpath('//@setCount')[0]
    print("Attributes are easy, this is the @setCount: {}".format(record_count))

    print("These are the elements defined by the XML string at the top of this program:")
    # iter() is the supported replacement for the deprecated getiterator().
    for element in root.iter():
        print(element.tag)

    print("We can select elements by defining a namespace in our queries")
    records = root.xpath('//foo:Title', namespaces=NSMAP)
    for record in records:
        print(record.text)

    print("Without defining the default namespace, we get nothing")
    # Un-prefixed names in XPath match only elements in *no* namespace, so
    # the Title elements living in the document's default namespace are
    # not found and this loop prints nothing.
    records = root.xpath('//Title')
    for record in records:
        print(record.text)

    print("With the default namespace, we get something")
    # XPath has no notion of a default namespace: bind the document's
    # default namespace URI to an arbitrary prefix and use that prefix.
    records = root.xpath('//bah:Title', namespaces={"bah": "http://www.bah.com"})
    for record in records:
        print("Element name: {}, element text '{}'".format(record.tag, record.text))
# Run the demonstration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@lsloan I think that is probably a hangover from an earlier version of the code; in this version it serves no purpose. The lxml documentation uses that style of namespace definition. I probably intended to use it down at line 74 and then forgot!
Gotcha. I thought that might be the case.
I forked your gist and made some changes. Then I added on some other examples of processing XML that contains QTI (Question & Test Interoperability) data. Experimenting with `lxml.etree`, I found that the default, unnamed namespace in the XML is available in the tree's data in `nsmap[None]`. See my lxml-test-etree.py, line 11…
defaultNamespace = {'_': root.nsmap[None]}
I found that naming it `_` makes it convenient to refer to it in the XPath statement, as on line 23…
items = root.xpath('//_:item', namespaces=defaultNamespace)
I wanted to get that namespace used by default when `xpath()` is called. I tried setting the key to `None` or using `root.nsmap` itself, but those caused an error ("`TypeError: empty namespace prefix is not supported in XPath`").
I'd like to not need to use the `_:` prefix for the element name, but at least it's minimally obtrusive. Trying to set a truly default namespace is a lost cause, apparently: the lxml FAQ asks, "How can I specify a default namespace for XPath expressions?", and the short answer is: "You can't." 🤷
As it turns out, I may prefer using `lxml.objectify` rather than `lxml.etree`, but I need to investigate a little more before I know for sure. See my lxml-test-objectify.py, for example.
What is the purpose of the namespace variables on lines 59–61? They're not used anywhere else that I can see.