Last active
August 29, 2015 14:17
-
-
Save alexdelorenzo/44665bedf97468612b3a to your computer and use it in GitHub Desktop.
BeauToLxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from functools import lru_cache | |
from lxml.html import HtmlElement, Element, fromstring | |
from lxml.etree import XPath | |
from bs4 import BeautifulSoup, Tag | |
class BeauToLxml(object):
    """
    An adapter for a small subset of the BeautifulSoup4 API.

    Calls are translated to an lxml backend instead of going through
    bs4's own (slow) lxml-based engine. The wrapped lxml element is
    available as ``self.html``.
    """

    def __init__(self, html):
        """
        Wrap *html* as an lxml element.

        Accepted inputs:

        * ``str`` / ``bytes`` markup, parsed with ``fromstring``;
        * another ``BeauToLxml``, whose element is shared (not copied);
        * an ``HtmlElement`` (or any subclass of it), used as-is;
        * a bs4 ``Tag`` / ``BeautifulSoup``, serialised with ``str()``
          and re-parsed with ``fromstring``.

        Raises:
            TypeError: if *html* is none of the above. Without this the
                instance would be left without ``self.html`` and every
                later attribute access would recurse in ``__getattr__``.
        """
        super().__init__()

        # isinstance() rather than exact type comparison: lxml.html hands
        # out HtmlElement subclasses for some tags, and those must be
        # wrappable too. lxml.html.Element is a factory function, not a
        # class, so it cannot take part in the check.
        if isinstance(html, (str, bytes)):
            self.html = fromstring(html)
        elif isinstance(html, BeauToLxml):
            self.html = html.html
        elif isinstance(html, HtmlElement):
            self.html = html
        elif isinstance(html, (Tag, BeautifulSoup)):
            self.html = fromstring(str(html))
        else:
            raise TypeError(
                'BeauToLxml cannot wrap objects of type %s'
                % type(html).__name__
            )

    def __repr__(self):
        return 'BeauToLxml: ' + repr(self.html)

    def __str__(self):
        # The element's leading text is None when it has none; __str__
        # must still return a string.
        return self.html.text or ''

    def __getitem__(self, item):
        """
        Return the value of attribute *item* of the wrapped element.

        ``'class'`` is returned as a list of class names; every other
        attribute is returned as the raw string. A missing attribute
        propagates the lookup error raised by ``self.html.attrib``.
        """
        value = self.html.attrib[item]

        if item == 'class':
            # split() without an argument collapses runs of whitespace,
            # so no empty class names are produced.
            return value.split()

        return value

    def __getattr__(self, item):
        """
        Resolve unknown attributes the way bs4's ``soup.tagname`` does.

        The first descendant tag named *item* is returned if there is
        one; otherwise the lookup falls back to the attribute of the same
        name on the wrapped lxml element, and finally to ``None``.
        """
        # 'html' only reaches __getattr__ when the instance was created
        # without __init__ (e.g. by copy/pickle); looking it up through
        # find() would recurse forever. Dunder names are protocol probes
        # and must not be answered with document content.
        if item == 'html' or (item.startswith('__') and item.endswith('__')):
            raise AttributeError(item)

        val = self.find(item)

        if val is not None:
            return val

        return getattr(self.html, item, None)

    @property
    def text(self) -> str:
        """Text content of the element, or ``''`` when it is empty."""
        text = self.html.text_content()
        return text if text else ''

    def find(self, tag: str, _class: str=None, **kwargs):
        """Delegate to the module-level ``find`` using the wrapped element."""
        return find(self.html, tag, _class, **kwargs)

    def find_all(self, tag: str, _class: str=None, **kwargs) -> tuple:
        """Delegate to the module-level ``find_all`` using the wrapped element."""
        return find_all(self.html, tag, _class, **kwargs)
def find(html: Element, tag: str, _class: str=None, **kwargs) -> BeauToLxml or None:
    """
    Return the first match of the query below *html*, or ``None``.

    The query is run through ``find_all`` with ``gen=True``, so only the
    first result is pulled from the returned iterator.
    """
    matches = find_all(html, tag, _class, gen=True, **kwargs)

    # find_all hands back an empty tuple when nothing matched.
    if not matches:
        return None

    return next(iter(matches))
def find_all(html: Element, tag: str, _class: str=None, gen: bool=False, **kwargs) -> iter or tuple:
    """
    Return every match of the query below *html*, wrapped in BeauToLxml.

    The query is turned into a compiled XPath by ``get_xpath`` and
    evaluated against *html*. An empty result is always an empty tuple;
    otherwise the wrappers come back as a lazy ``map`` iterator when
    *gen* is true and as a tuple when it is false.
    """
    matched = get_xpath(tag, _class, **kwargs)(html)

    if matched:
        wrapped = map(BeauToLxml, matched)  # lazy: wraps on iteration
        return wrapped if gen else tuple(wrapped)

    return tuple()
def _xpath_literal(value) -> str:
    """Return ``str(value)`` as an XPath 1.0 string literal."""
    text = str(value)

    if '"' not in text:
        return '"%s"' % text

    if "'" not in text:
        return "'%s'" % text

    # XPath 1.0 has no escape syntax inside string literals, so a value
    # holding both quote kinds is spliced together with concat().
    pieces = ('"%s"' % piece for piece in text.split('"'))
    return 'concat(%s)' % ', \'"\', '.join(pieces)


def _attr_predicate(attr: str, val) -> str:
    """Return the XPath predicate body testing attribute *attr* against *val*."""
    attr_xp = '@' + attr

    # bool: test for presence (True) or absence (False) of the attribute.
    if isinstance(val, bool):
        return attr_xp if val else 'not(%s)' % attr_xp

    # Collection: match when the attribute contains any of the items.
    # An empty collection can match nothing.
    if isinstance(val, (set, frozenset, list, tuple)):
        tests = [
            'contains(%s, %s)' % (attr_xp, _xpath_literal(item))
            for item in val
        ]
        return ' or '.join(tests) if tests else 'false()'

    # str: substring match on the attribute value.
    if isinstance(val, str):
        return 'contains(%s, %s)' % (attr_xp, _xpath_literal(val))

    # Anything else: exact comparison against its string form.
    return '%s=%s' % (attr_xp, _xpath_literal(val))


@lru_cache(maxsize=None)
def _compile_xpath(expression: str) -> XPath:
    """Compile *expression* once; repeated expressions reuse the result."""
    return XPath(expression)


def get_xpath(tag: str, _class: str=None, **kwargs) -> XPath:
    """
    Build the compiled XPath selecting descendant *tag* elements.

    Args:
        tag: element name to search for (``.//tag``).
        _class: shortcut for a ``class`` attribute filter; when truthy it
            replaces any ``class`` entry in *kwargs*.
        **kwargs: attribute filters, one predicate each:
            ``bool`` tests presence/absence of the attribute, ``str`` is
            a substring match, a list/tuple/set matches when any item is
            a substring, and any other value is compared for equality
            with its string form.

    Returns:
        A compiled ``XPath`` object. Compilation is cached on the
        expression text, so unhashable filter values such as lists are
        accepted.
    """
    if _class:
        kwargs['class'] = _class

    predicates = ''.join(
        '[%s]' % _attr_predicate(attr, val)
        for attr, val in kwargs.items()
    )

    return _compile_xpath('.//' + tag + predicates)


# get_xpath used to be the lru_cache-wrapped callable itself; keep the
# cache helpers reachable under the old names.
get_xpath.cache_info = _compile_xpath.cache_info
get_xpath.cache_clear = _compile_xpath.cache_clear
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment