nrrb · September 19, 2014 19:53
diff --git a/lxml_utils.py b/lxml_utils.py
 from urlparse import urljoin
 import lxml.html


 def clean_xpath(path):
 	"""
 	Remove browser-inserted 'tbody' steps from an XPath expression.

 	When copying XPath from the developer console of Firefox or Chrome,
 	the browser inserts 'tbody' tags in table declarations which are
 	generally not there in the original source. These need to be removed
 	in order for lxml to work with said XPath expression.

 	Every 'tbody/' occurrence is dropped. A final '/tbody' step is dropped
 	as well (e.g. '//table/tbody' becomes '//table'), but only when a
 	non-empty expression that does not end in '/' remains in front of it.

 	path -- the XPath expression, as a string.
 	Returns the cleaned expression as a new string.
 	"""
 	cleaned = path.replace('tbody/', '')
 	trailing_step = '/tbody'
 	if cleaned.endswith(trailing_step):
 		head = cleaned[:-len(trailing_step)]
 		# Keep the step when removing it would leave '' or a dangling '/',
 		# neither of which is a usable expression.
 		if head and not head.endswith('/'):
 			cleaned = head
 	return cleaned


 def links_to_dicts(links, base_url=None):
 	"""
 	Convert <a> elements into dictionaries holding their name and URL.

 	Takes a list of lxml.html.HtmlElement elements representing <a> links,
 	and returns a list of dictionaries representing those links with each
 	dict containing the name and url of the corresponding <a> link.

 	links -- iterable of elements exposing `.text` and `.attrib`.
 	base_url -- optional base handed to urljoin to resolve each href.

 	Returns one dict per element, in input order:
 	'name' is the element's `.text` (whatever that attribute holds,
 	including None), and 'url' is urljoin(base_url, href), or None when
 	the element has no 'href' attribute.
 	"""
 	results = []
 	for link in links:
 		href = link.attrib.get('href')
 		# An anchor without href (e.g. <a name="top">) has nothing to
 		# resolve; report None instead of failing the whole list with KeyError.
 		url = urljoin(base_url, href) if href is not None else None
 		results.append({'name': link.text, 'url': url})
 	return results
	from urlparse import urljoin
	import lxml.html


	def clean_xpath(path):
		"""
		Remove browser-inserted 'tbody' steps from an XPath expression.

		When copying XPath from the developer console of Firefox or Chrome,
		the browser inserts 'tbody' tags in table declarations which are
		generally not there in the original source. These need to be removed
		in order for lxml to work with said XPath expression.

		Every 'tbody/' occurrence is dropped. A final '/tbody' step is dropped
		as well (e.g. '//table/tbody' becomes '//table'), but only when a
		non-empty expression that does not end in '/' remains in front of it.

		path -- the XPath expression, as a string.
		Returns the cleaned expression as a new string.
		"""
		cleaned = path.replace('tbody/', '')
		trailing_step = '/tbody'
		if cleaned.endswith(trailing_step):
			head = cleaned[:-len(trailing_step)]
			# Keep the step when removing it would leave '' or a dangling '/',
			# neither of which is a usable expression.
			if head and not head.endswith('/'):
				cleaned = head
		return cleaned


	def links_to_dicts(links, base_url=None):
		"""
		Convert <a> elements into dictionaries holding their name and URL.

		Takes a list of lxml.html.HtmlElement elements representing <a> links,
		and returns a list of dictionaries representing those links with each
		dict containing the name and url of the corresponding <a> link.

		links -- iterable of elements exposing `.text` and `.attrib`.
		base_url -- optional base handed to urljoin to resolve each href.

		Returns one dict per element, in input order:
		'name' is the element's `.text` (whatever that attribute holds,
		including None), and 'url' is urljoin(base_url, href), or None when
		the element has no 'href' attribute.
		"""
		results = []
		for link in links:
			href = link.attrib.get('href')
			# An anchor without href (e.g. <a name="top">) has nothing to
			# resolve; report None instead of failing the whole list with KeyError.
			url = urljoin(base_url, href) if href is not None else None
			results.append({'name': link.text, 'url': url})
		return results
No results found