Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active October 12, 2025 09:40
Show Gist options
  • Save xflr6/eafc69268a60d6fda8ff9d8d3e12cacc to your computer and use it in GitHub Desktop.
Save xflr6/eafc69268a60d6fda8ff9d8d3e12cacc to your computer and use it in GitHub Desktop.
Use some advanced XPath features of lxml for scraping html/xml
"""Use advanced XPath features of lxml (see also scrapy parsel)."""
from collections.abc import Callable
import functools
from typing import Any, Self, overload
import urllib.request
import lxml.etree
import lxml.html
# HTTP request headers; the User-Agent imitates a desktop Firefox browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
}
@overload
def register_xpath[F: Callable[..., Any]](func: F, /) -> F:
"""Return the decorated function when used as bare decorator."""
@overload
def register_xpath[F: Callable[..., Any]](*, ns_uri: str | None = ...) -> Callable[[F], F]:
"""Return a function-returning decorator when used as decorator factory."""
def register_xpath(func: Callable[..., Any] | None = None, /, *,
ns_uri: str | None = None):
if func is None:
return functools.partial(register_xpath, ns_uri=ns_uri)
ns = lxml.etree.FunctionNamespace(ns_uri)
name = func.__name__.replace('_', '-')
ns[name] = func
return func
@register_xpath
def has_class(context, *args) -> bool:
    """XPath extension ``has-class(name, ...)``: match on CSS class tokens.

    Evaluates a nested XPath on ``context.context_node`` that is true only
    when the node has a ``class`` attribute and every given name occurs in
    it as a whole whitespace-separated token.

    Args:
        context: Evaluation context supplied by lxml; only its
            ``context_node`` attribute is used.
        *args: Class names that must all be present. With no names the
            generated predicate is incomplete (``[@class and ]``) and the
            nested evaluation fails.

    Returns:
        ``True`` if the context node carries all requested classes.
    """
    # Bind the class names as XPath variables instead of formatting them
    # into the expression text, so a name containing a quote character
    # cannot break or alter the expression.
    variables = {f'cls{index}': str(cls) for index, cls in enumerate(args)}
    # Padding both sides with spaces makes ``contains`` match whole tokens.
    cls_contains = ('contains(concat(" ", normalize-space(@class), " "),'
                    ' concat(" ", ${}, " "))')
    has_cls = ' and '.join(cls_contains.format(name) for name in variables)
    xpath = f'self::*[@class and {has_cls}]'
    return bool(context.context_node.xpath(xpath, **variables))
class XpathEval(lxml.etree.XPathDocumentEvaluator):
    """Document-level XPath evaluator with EXSLT prefixes bound by default.

    Unless the caller overrides them, the ``set`` and ``re`` prefixes are
    mapped to the EXSLT sets / regular-expressions namespaces and
    ``smart_strings`` is switched off. The parsed tree is kept on
    ``self.tree``.
    """

    # Keyword defaults applied in __init__ via ``setdefault``: a caller that
    # passes its own ``namespaces`` replaces the whole default mapping.
    _defaults = {'namespaces': {'set': 'http://exslt.org/sets',
                                're': 'http://exslt.org/regular-expressions'},
                 'smart_strings': False}

    @classmethod
    def from_url(cls, url: str, *, xml: bool = False, **kwargs) -> Self:
        """Fetch ``url`` and return an evaluator for the parsed document.

        Args:
            url: Address to download; the request is sent with ``HEADERS``.
            xml: Parse with the generic XML parser instead of the HTML one.
            **kwargs: Forwarded to the constructor.

        Returns:
            A new instance of ``cls`` wrapping the parsed tree.
        """
        # lxml has no ``lxml.xml`` module; the generic XML parser is
        # ``lxml.etree.parse``.
        parse_func = lxml.etree.parse if xml else lxml.html.parse
        request = urllib.request.Request(url, headers=HEADERS)
        with urllib.request.urlopen(request) as f:
            tree = parse_func(f)
        return cls(tree, **kwargs)

    def __init__(self, tree: lxml.etree._ElementTree, *args, **kwargs) -> None:
        """Wrap ``tree``, filling in the class defaults for missing keywords.

        Args:
            tree: Parsed document to evaluate expressions against.
            *args: Passed through to ``XPathDocumentEvaluator``.
            **kwargs: Passed through to ``XPathDocumentEvaluator``; any
                key of ``_defaults`` not given here takes its default.
        """
        for kw, value in self._defaults.items():
            kwargs.setdefault(kw, value)
        super().__init__(tree, *args, **kwargs)
        self.tree = tree
if __name__ == '__main__':
    # Demo: fetch a Wikipedia category page and print the link texts listed
    # under the headings whose text is exactly "E" or "I".
    category_url = 'https://en.wikipedia.org/wiki/Category:Monty_Python_songs'
    link_texts = (
        '//div[has-class("mw-category")]'
        '//h3[re:test(., "^[EI]$")]'
        '/following-sibling::ul/li/a/text()'
    )
    evaluator = XpathEval.from_url(category_url)
    print(evaluator(link_texts))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment