Created
March 19, 2012 04:55
-
-
Save magical/2095875 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Download an HTML page, translating <img> tags to data: URIs""" | |
import sys | |
import lxml.html as etree | |
from urllib.request import urlopen | |
from base64 import b64encode | |
from functools import lru_cache, partial | |
# Unbounded cache decorator factory: each distinct URL is fetched only once.
memoize = partial(lru_cache, maxsize=None)


@memoize()
def data_from_url(url):
    """Fetch *url* and return its content as a base64 ``data:`` URI.

    The MIME type is taken from the response's Content-Type header with any
    parameters (e.g. ``; charset=...``) dropped.  When the header is missing
    or empty, ``application/octet-stream`` is used instead.

    Results are cached per URL, so a resource referenced several times is
    downloaded once.

    Args:
        url: Anything accepted by ``urllib.request.urlopen``.

    Returns:
        A string of the form ``data:<mime-type>;base64,<payload>``.

    Raises:
        Whatever ``urlopen`` raises when the resource cannot be fetched.
    """
    with urlopen(url) as f:
        content_type = f.headers['content-type']
        data = f.read()
    # The header lookup yields None when the server sent no Content-Type;
    # guard against that before splitting off the parameters.
    mime_type = (content_type or '').partition(';')[0].strip()
    if not mime_type:
        # An empty media type in a data: URI would be read as text/plain,
        # which is wrong for binary payloads.
        mime_type = 'application/octet-stream'
    payload = b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
def main():
    """Print the page named on the command line with its images inlined.

    Reads the page location from ``sys.argv[1]``, rewrites the ``src`` of
    every ``<img>`` element to a ``data:`` URI via ``data_from_url``, and
    writes the resulting document to standard output as UTF-8 bytes.
    """
    if len(sys.argv) < 2:
        sys.exit('usage: {0} URL'.format(sys.argv[0]))
    page_url = sys.argv[1]

    if page_url.startswith(('http://', 'https://')):
        # libxml2's built-in fetcher cannot load HTTPS pages, so download
        # with urllib and hand lxml the open response instead.
        with urlopen(page_url) as response:
            tree = etree.parse(response, base_url=page_url)
    else:
        tree = etree.parse(page_url)

    # Resolve relative image links so that they can be fetched on their own.
    tree.getroot().make_links_absolute(page_url)

    # Select only <img> elements that actually carry a src attribute.
    for img in tree.xpath('//img[@src]'):
        src = img.attrib['src']
        try:
            img.attrib['src'] = data_from_url(src)
        except (OSError, ValueError) as err:
            # One unreachable or unsupported image should not discard the
            # whole page; keep the absolute link and report the problem.
            sys.stderr.write('warning: could not inline {0}: {1}\n'.format(src, err))

    sys.stdout.buffer.write(etree.tostring(tree, encoding='utf-8'))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment