Skip to content

Instantly share code, notes, and snippets.

@alexdelorenzo
Last active September 28, 2016 18:55
Show Gist options
  • Save alexdelorenzo/be0eb422f705b7540189 to your computer and use it in GitHub Desktop.
Save alexdelorenzo/be0eb422f705b7540189 to your computer and use it in GitHub Desktop.
HtmlWrapper -> Fast drop-in replacement for BeautifulSoup
from functools import lru_cache
from lxml.html import HtmlElement, fromstring
from lxml.etree import XPath
# Fully qualified names of the BeautifulSoup types that HtmlWrapper accepts.
BS4_TYPES = ("bs4.element.Tag", "bs4.BeautifulSoup")
class HtmlWrapper(object):
    """
    An lxml adapter over a subset of the BeautifulSoup API.

    Wraps a single lxml ``HtmlElement`` (kept on ``self.html``) and offers
    BeautifulSoup-style access: ``wrapper.tagname`` returns the first
    descendant with that tag, ``wrapper['attr']`` returns an attribute, and
    ``find`` / ``find_all`` run attribute-filtered searches.
    """

    def __init__(self, html):
        """Wrap *html*.

        Accepts markup (``str``/``bytes``), another ``HtmlWrapper``, an lxml
        ``HtmlElement``, or a BeautifulSoup object (re-parsed from its string
        form).

        Raises:
            TypeError: if *html* is none of the supported types.
        """
        if isinstance(html, (str, bytes)):
            self.html = fromstring(html)
        elif isinstance(html, HtmlWrapper):
            self.html = html.html
        elif isinstance(html, HtmlElement):
            self.html = html
        elif self._is_bs4(html):
            self.html = fromstring(str(html))
        else:
            msg = "Object of type %s not compatible with BTL" % str(type(html))
            raise TypeError(msg)

    @staticmethod
    def _is_bs4(obj) -> bool:
        """Return True if *obj* is an instance of one of ``BS4_TYPES``.

        The check is done by qualified class name so that bs4 never has to be
        imported. ``str(type(obj))`` cannot be used for this: it renders as
        ``"<class 'bs4.element.Tag'>"`` and would never equal a bare name.
        """
        return any(
            '%s.%s' % (cls.__module__, cls.__qualname__) in BS4_TYPES
            for cls in type(obj).__mro__
        )

    def __repr__(self):
        return 'HtmlWrapper: ' + repr(self.html)

    def __str__(self):
        # lxml reports "no leading text" as None; __str__ must return a str.
        return self.html.text or ''

    def __getitem__(self, item):
        """Return attribute *item*; ``class`` is returned as a list of names.

        Raises:
            KeyError: if the element has no such attribute.
        """
        value = self.html.attrib[item]

        if item == 'class':
            # split() without arguments ignores repeated/leading whitespace,
            # so no empty class names are produced.
            return value.split()

        return value

    def __getattr__(self, item):
        """Resolve unknown attributes BeautifulSoup-style.

        Tries, in order: the first descendant whose tag is *item*, then an
        attribute of that name on the wrapped lxml element, then ``None``.

        Raises:
            AttributeError: for ``html`` itself and for dunder names.
        """
        # ``html`` is only looked up here when it has not been set yet (bare
        # instance during copy/unpickle); searching would read ``self.html``
        # again and recurse forever. Dunder probes (``__setstate__`` etc.)
        # must report "missing" rather than None so protocols keep working.
        if item == 'html' or (item.startswith('__') and item.endswith('__')):
            raise AttributeError(item)

        val = self.find(item)

        if val is not None:
            return val

        return getattr(self.html, item, None)

    def name(self):
        """Return the tag name of the wrapped element (a method, not an attribute)."""
        return self.html.tag

    @property
    def text(self) -> str:
        """All text inside the element, or ``''`` when there is none."""
        return self.html.text_content() or ''

    def find(self, tag: str, _class: str=None, **kwargs):
        """Return the first descendant matching the query, or ``None``."""
        return find(self.html, tag, _class, **kwargs)

    def find_all(self, tag: str, _class: str=None, *, gen=False, **kwargs) -> iter or tuple:
        """Return every descendant matching the query.

        With ``gen=True`` the module-level ``find_all`` is asked for a lazy
        result instead of a tuple.
        """
        return find_all(self.html, tag, _class, gen=gen, **kwargs)
def find(html: HtmlElement, tag: str,
         _class: str=None, **kwargs) -> HtmlWrapper or None:
    """Return the first descendant of *html* matching the query, or ``None``.

    *tag*, *_class* and the keyword attribute filters are forwarded unchanged
    to ``find_all``.
    """
    results = find_all(html, tag, _class, gen=True, **kwargs)

    # next() with a default copes with both an empty tuple and an exhausted
    # iterator, so this does not depend on the result's truthiness.
    return next(iter(results), None)
def find_all(html: HtmlElement, tag: str,
             _class: str=None, gen: bool=False, **kwargs) -> iter or tuple:
    """Collect every descendant of *html* selected by the query.

    The query is turned into a compiled XPath via ``get_xpath`` and each hit
    is wrapped in an ``HtmlWrapper``. With ``gen=True`` the hits are wrapped
    lazily (a ``map`` iterator); otherwise a tuple is built. When nothing
    matches, an empty tuple is returned regardless of *gen*.
    """
    matches = get_xpath(tag, _class, **kwargs)(html)

    if matches:
        wrapped = map(HtmlWrapper, matches)  # lazy: wraps on iteration
        return wrapped if gen else tuple(wrapped)

    return ()
def _xpath_literal(text: str) -> str:
    """Return *text* as an XPath 1.0 string literal."""
    if '"' not in text:
        return '"%s"' % text

    if "'" not in text:
        return "'%s'" % text

    # XPath 1.0 has no escape syntax, so a value holding both quote kinds is
    # stitched together with concat(), putting each '"' in single quotes.
    pieces = ', \'"\', '.join('"%s"' % piece for piece in text.split('"'))
    return 'concat(%s)' % pieces


def get_xpath_str(tag: str, _class: str=None, **kwargs) -> str:
    """Build a relative XPath selecting *tag* descendants filtered by attributes.

    Each keyword argument becomes one ``[...]`` predicate; *_class*, when
    truthy, is treated as the ``class`` attribute. The value type decides the
    predicate:

    * ``True`` / ``False`` -- the attribute must be present / absent.
    * ``set`` / ``list`` / ``tuple`` -- the attribute must contain any one of
      the items; an empty collection matches nothing.
    * ``str`` -- the attribute must contain the value (substring match).
    * anything else -- the attribute must equal ``str(value)``.
    """
    if _class:
        kwargs['class'] = _class

    predicates = []

    for attr, val in kwargs.items():
        attr_xp = '@' + attr

        # bool is tested first so True/False mean presence, not equality.
        if isinstance(val, bool):
            cond = attr_xp if val else 'not(%s)' % attr_xp

        elif isinstance(val, (set, list, tuple)):
            options = [
                'contains(%s, %s)' % (attr_xp, _xpath_literal(str(item)))
                for item in val
            ]
            cond = ' or '.join(options) if options else 'false()'

        elif isinstance(val, str):
            cond = 'contains(%s, %s)' % (attr_xp, _xpath_literal(val))

        else:
            cond = '%s=%s' % (attr_xp, _xpath_literal(str(val)))

        predicates.append('[%s]' % cond)

    return './/' + tag + ''.join(predicates)
def _freeze(value):
    """Return a hashable stand-in for *value* so it can be part of a cache key."""
    if isinstance(value, (list, set)):
        return tuple(value)
    return value


@lru_cache(maxsize=None)
def _compile_xpath(tag, _class, attrs):
    """Compile and memoize the XPath for an already-hashable query.

    *attrs* is a tuple of ``(attribute, value)`` pairs in call order.
    """
    xpath_str = get_xpath_str(tag, _class, **dict(attrs))
    return XPath(xpath_str)


def get_xpath(tag: str, _class: str=None, **kwargs) -> XPath:
    """Return a compiled, cached XPath for the given tag/attribute query.

    Arguments have the same meaning as for ``get_xpath_str``. List and set
    values are converted to tuples first: ``lru_cache`` hashes its arguments,
    so passing them through directly would raise ``TypeError`` (unhashable).
    """
    frozen = tuple((attr, _freeze(val)) for attr, val in kwargs.items())
    return _compile_xpath(tag, _freeze(_class), frozen)


# Keep the cache-management helpers reachable under the public name.
get_xpath.cache_info = _compile_xpath.cache_info
get_xpath.cache_clear = _compile_xpath.cache_clear
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment