Skip to content

Instantly share code, notes, and snippets.

@alexdelorenzo
Last active August 29, 2015 14:17
Show Gist options
  • Save alexdelorenzo/44665bedf97468612b3a to your computer and use it in GitHub Desktop.
Save alexdelorenzo/44665bedf97468612b3a to your computer and use it in GitHub Desktop.
BeauToLxml
from functools import lru_cache
from lxml.html import HtmlElement, Element, fromstring
from lxml.etree import XPath
from bs4 import BeautifulSoup, Tag
class BeauToLxml(object):
    """
    Adapter exposing a small subset of the BeautifulSoup4 API on top of lxml.

    The wrapped lxml element is stored in ``self.html``.  Lookups are
    translated into XPath queries executed directly by lxml (via the
    module-level ``find`` / ``find_all`` helpers) rather than going through
    bs4's own tree, which the original author found slow.

    Supported operations:
      * ``wrapper.find(tag, ...)`` / ``wrapper.find_all(tag, ...)``
      * ``wrapper.<tag>`` -- shorthand for ``wrapper.find('<tag>')``
      * ``wrapper['attr']`` -- attribute access (``class`` yields a list)
      * ``wrapper.text`` -- text of the element and all its descendants
    """

    def __init__(self, html):
        """
        Wrap ``html``, parsing it first when necessary.

        ``html`` may be markup (``str`` / ``bytes``), another ``BeauToLxml``
        (its element is shared, not copied), an lxml ``HtmlElement`` or a
        bs4 ``Tag`` / ``BeautifulSoup`` (serialised and re-parsed by lxml).

        Raises ``TypeError`` for anything else, instead of silently leaving
        ``self.html`` unset.
        """
        super().__init__()

        # isinstance() rather than exact type comparison: lxml.html hands out
        # HtmlElement subclasses (FormElement, InputElement, ...) for some
        # tags, and those must be accepted too.  ``lxml.html.Element`` is a
        # factory function, not a class, so it cannot be used in this check.
        if isinstance(html, (str, bytes)):
            self.html = fromstring(html)
        elif isinstance(html, BeauToLxml):
            self.html = html.html
        elif isinstance(html, HtmlElement):
            self.html = html
        elif isinstance(html, (Tag, BeautifulSoup)):
            self.html = fromstring(str(html))
        else:
            raise TypeError(
                'cannot build BeauToLxml from %r' % type(html).__name__
            )

    def __repr__(self):
        return 'BeauToLxml: ' + repr(self.html)

    def __str__(self):
        # ``.text`` is None when the element has no leading text, and
        # __str__ must always return a str.
        return self.html.text or ''

    def __getitem__(self, item):
        """
        Return the value of attribute ``item`` of the wrapped element.

        The ``class`` attribute is returned as a list of class names, split
        on any run of whitespace.  Raises ``KeyError`` when the attribute is
        absent.
        """
        value = self.html.attrib[item]

        if item == 'class':
            # split() without an argument drops the empty strings that
            # repeated or surrounding whitespace would otherwise produce.
            return value.split()

        return value

    def __getattr__(self, item):
        """
        Resolve an unknown attribute as a child-tag lookup.

        ``wrapper.div`` is ``wrapper.find('div')``.  When no such descendant
        exists the attribute of the same name on the wrapped lxml element is
        returned, or ``None`` if there is none either.
        """
        # __getattr__ only runs when normal lookup fails.  If ``html`` itself
        # is missing (instance created through __new__, e.g. by copy/pickle)
        # the lookup below would re-enter __getattr__ forever.  Dunder
        # probes must not be answered by the wrapped element either.
        if item == 'html' or (item.startswith('__') and item.endswith('__')):
            raise AttributeError(item)

        found = self.find(item)

        if found is not None:
            return found

        return getattr(self.html, item, None)

    @property
    def text(self) -> str:
        """Text content of the element and its descendants ('' if none)."""
        return self.html.text_content() or ''

    def find(self, tag: str, _class: str=None, **kwargs):
        """Return the first matching descendant wrapped, or ``None``."""
        return find(self.html, tag, _class, **kwargs)

    def find_all(self, tag: str, _class: str=None, **kwargs) -> tuple:
        """Return a tuple of all matching descendants, each wrapped."""
        return find_all(self.html, tag, _class, **kwargs)
def find(html: Element, tag: str, _class: str=None, **kwargs) -> BeauToLxml or None:
    """
    Return the first element under ``html`` that matches, or ``None``.

    Arguments are forwarded to ``find_all``, which is asked for a lazy
    iterator (``gen=True``) so that only the first match gets wrapped.
    """
    matches = find_all(html, tag, _class, gen=True, **kwargs)

    # find_all hands back an empty tuple when nothing matched.
    if not matches:
        return None

    return next(iter(matches))
def find_all(html: Element, tag: str, _class: str=None, gen: bool=False, **kwargs) -> iter or tuple:
    """
    Return every element under ``html`` that matches, wrapped in BeauToLxml.

    The query is built by ``get_xpath`` from ``tag``, ``_class`` and the
    attribute filters in ``kwargs``.  With ``gen=True`` a lazy ``map``
    iterator is returned, otherwise a tuple.  When nothing matches the
    result is an empty tuple regardless of ``gen``.
    """
    matched = get_xpath(tag, _class, **kwargs)(html)

    if matched:
        wrapped = map(BeauToLxml, matched)  # lazy: wraps on demand
        return wrapped if gen else tuple(wrapped)

    return ()
def _xpath_literal(value) -> str:
    """Render ``value`` as a correctly quoted XPath 1.0 string literal."""
    text = str(value)

    if '"' not in text:
        return '"%s"' % text

    if "'" not in text:
        return "'%s'" % text

    # XPath 1.0 has no escape sequences, so a value containing both kinds
    # of quote is assembled with concat() from double-quoted pieces joined
    # by a single-quoted double quote.
    pieces = ('"%s"' % piece for piece in text.split('"'))
    return 'concat(%s)' % ', \'"\', '.join(pieces)


def _attr_predicate(attr: str, val) -> str:
    """Return the XPath predicate body (without brackets) for one filter."""
    attr_xp = '@' + attr

    if isinstance(val, bool):
        # True: the attribute must be present; False: it must be absent.
        return attr_xp if val else 'not(%s)' % attr_xp

    if isinstance(val, str):
        return 'contains(%s, %s)' % (attr_xp, _xpath_literal(val))

    if isinstance(val, (tuple, list, set, frozenset)):
        if not val:
            # No candidate values, hence nothing can match.
            return 'false()'

        # Any one of the candidates may match, following the bs4
        # convention for list-valued filters.
        return ' or '.join(
            'contains(%s, %s)' % (attr_xp, _xpath_literal(candidate))
            for candidate in val
        )

    return '%s=%s' % (attr_xp, _xpath_literal(val))


@lru_cache(maxsize=None)
def _compile_xpath(expression: str) -> XPath:
    """Compile ``expression`` once; later calls reuse the compiled object."""
    return XPath(expression)


def get_xpath(tag: str, _class: str=None, **kwargs) -> XPath:
    """
    Return a compiled XPath selecting ``tag`` descendants that satisfy
    every given attribute filter.

    ``_class``, when truthy, is a filter on the ``class`` attribute and
    replaces any ``class`` entry in ``kwargs``.  Each remaining keyword is
    an attribute name whose value selects the kind of test:

      * ``True`` / ``False`` -- the attribute is present / absent
      * ``str``              -- the attribute value contains the string
      * tuple/list/set       -- the attribute value contains any one item
      * anything else        -- the attribute value equals ``str(value)``

    NOTE: string tests use XPath ``contains()`` and are therefore substring
    matches (``"foo"`` also matches ``class="foobar"``).

    The cache is keyed on the generated expression text, so unhashable
    filter values such as lists are accepted and ``True`` is never
    confused with ``1``.
    """
    if _class:
        kwargs['class'] = _class

    predicates = ''.join(
        '[%s]' % _attr_predicate(attr, val) for attr, val in kwargs.items()
    )

    return _compile_xpath('.//' + tag + predicates)


# get_xpath itself used to be the lru_cache-decorated function; keep its
# cache introspection helpers reachable under the old name.
get_xpath.cache_info = _compile_xpath.cache_info
get_xpath.cache_clear = _compile_xpath.cache_clear
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment