magical · March 19, 2012 04:55
diff --git a/html2data.py b/html2data.py
 #!/usr/bin/env python3

 """Download an HTML page, translating <img> tags to data: URIs"""

 import sys

 import lxml.html as etree

 from urllib.request import urlopen
 from base64 import b64encode
 from functools import lru_cache, partial

 # Decorator factory for an unbounded cache, so each distinct URL is fetched
 # at most once per run even if several <img> tags share it.
 memoize = partial(lru_cache, maxsize=None)

 @memoize()
 def data_from_url(url):
     """Fetch *url* and return its content as a base64 ``data:`` URI.

     The MIME type is taken from the response's Content-Type header with any
     parameters (e.g. ``; charset=...``) removed.  If the header is missing
     or empty, ``application/octet-stream`` is used so the result is still a
     well-formed data URI.

     Results are cached per URL by the ``memoize`` decorator.
     Errors raised by ``urlopen`` propagate to the caller.
     """
     with urlopen(url) as f:
         content_type = f.headers['content-type']
         data = f.read()
     # The headers mapping returns None for an absent header; guard against
     # it before splitting off the parameters.
     mime_type = (content_type or '').partition(';')[0].strip()
     if not mime_type:
         mime_type = 'application/octet-stream'
     payload = b64encode(data).decode('ascii')
     return 'data:{0};base64,{1}'.format(mime_type, payload)

 # The page location is the first command-line argument; it is used both as
 # the parser input and as the base for resolving relative links.
 page_url = sys.argv[1]

 tree = etree.parse(page_url)
 # Make every link absolute so each image source can be fetched on its own.
 tree.getroot().make_links_absolute(page_url)

 # Only <img> elements that actually carry a src attribute can be inlined;
 # selecting on [@src] avoids a KeyError on images that lack one.
 for img in tree.xpath('//img[@src]'):
     img.attrib['src'] = data_from_url(img.attrib['src'])

 # tostring() with an explicit encoding yields bytes, so write them to the
 # binary layer of stdout rather than the text wrapper.
 sys.stdout.buffer.write(etree.tostring(tree, encoding='utf-8'))
	#!/usr/bin/env python3

	"""Download an HTML page, translating <img> tags to data: URIs"""

	import sys

	import lxml.html as etree

	from urllib.request import urlopen
	from base64 import b64encode
	from functools import lru_cache, partial

	# Decorator factory for an unbounded cache, so each distinct URL is fetched
	# at most once per run even if several <img> tags share it.
	memoize = partial(lru_cache, maxsize=None)

	@memoize()
	def data_from_url(url):
	    """Fetch *url* and return its content as a base64 ``data:`` URI.

	    The MIME type is taken from the response's Content-Type header with any
	    parameters (e.g. ``; charset=...``) removed.  If the header is missing
	    or empty, ``application/octet-stream`` is used so the result is still a
	    well-formed data URI.

	    Results are cached per URL by the ``memoize`` decorator.
	    Errors raised by ``urlopen`` propagate to the caller.
	    """
	    with urlopen(url) as f:
	        content_type = f.headers['content-type']
	        data = f.read()
	    # The headers mapping returns None for an absent header; guard against
	    # it before splitting off the parameters.
	    mime_type = (content_type or '').partition(';')[0].strip()
	    if not mime_type:
	        mime_type = 'application/octet-stream'
	    payload = b64encode(data).decode('ascii')
	    return 'data:{0};base64,{1}'.format(mime_type, payload)

	# The page location is the first command-line argument; it is used both as
	# the parser input and as the base for resolving relative links.
	page_url = sys.argv[1]

	tree = etree.parse(page_url)
	# Make every link absolute so each image source can be fetched on its own.
	tree.getroot().make_links_absolute(page_url)

	# Only <img> elements that actually carry a src attribute can be inlined;
	# selecting on [@src] avoids a KeyError on images that lack one.
	for img in tree.xpath('//img[@src]'):
	    img.attrib['src'] = data_from_url(img.attrib['src'])

	# tostring() with an explicit encoding yields bytes, so write them to the
	# binary layer of stdout rather than the text wrapper.
	sys.stdout.buffer.write(etree.tostring(tree, encoding='utf-8'))
No results found