Last active
December 18, 2015 22:48
-
-
Save adriaant/5856786 to your computer and use it in GitHub Desktop.
Streamed downloading of gzip compressed files. Based on https://bitbucket.org/blueluna/transmissionrpc/ and http://rationalpie.wordpress.com/2010/06/02/python-streaming-gzip-decompression/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
import zlib | |
from urllib2 import (build_opener, HTTPPasswordMgrWithDefaultRealm, | |
HTTPBasicAuthHandler, HTTPDigestAuthHandler, HTTPError, URLError) | |
from httplib import BadStatusLine | |
class HTTPHandlerError(Exception):
    """
    Raised when HTTPHandler fails to open a URL.

    Attributes:
        url:     URL that was requested ('' when not supplied).
        code:    status or error code (600 when not supplied).
        message: short description of the failure ('' when not supplied).
        headers: response headers as a dict ({} when not supplied).
        data:    response body, if any was available ('' when not supplied).
    """

    def __init__(self, httpurl=None, httpcode=None, httpmsg=None, httpheaders=None, httpdata=None):
        # The base class is deliberately initialised without arguments, so
        # ``args`` stays empty and all details live on the attributes below.
        super(HTTPHandlerError, self).__init__()
        # Any falsy argument (None, 0, '', {}) is replaced by its default.
        self.url = httpurl if httpurl else ''
        self.code = httpcode if httpcode else 600
        self.message = httpmsg if httpmsg else ''
        self.headers = httpheaders if httpheaders else {}
        self.data = httpdata if httpdata else ''

    def __repr__(self):
        details = (self.code, self.message)
        return '<HTTPHandlerError %d, %s>' % details

    def __str__(self):
        details = (self.code, self.message)
        return 'HTTPHandlerError %d: %s' % details

    def __unicode__(self):
        details = (self.code, self.message)
        return 'HTTPHandlerError %d: %s' % details
class HTTPHandler(object):
    """
    An HTTP download handler with support for basic and digest auth.

    Response bodies are streamed to disk in READ_BLOCK_SIZE chunks. Bodies
    sent with a gzip or deflate Content-Encoding are decompressed on the fly,
    so the file on disk always holds the decoded content.
    """

    READ_BLOCK_SIZE = 1024*8

    # Headers installed on every opener this class builds. Both encodings
    # advertised here are decoded by download().
    _DEFAULT_HEADERS = [('Accept-encoding', 'gzip,deflate')]

    def __init__(self):
        # http://bugs.python.org/issue9639
        # Python 2.6.6+ never resets the retry counter of the basic auth
        # handler; patch in the upstream fix on the affected versions only.
        if sys.version_info[:2] == (2, 6) and sys.version_info[2] >= 6:
            def fixed_http_error_401(handler, req, fp, code, msg, headers):
                url = req.get_full_url()
                response = handler.http_error_auth_reqed('www-authenticate',
                                                         url, req, headers)
                handler.retried = 0
                return response
            HTTPBasicAuthHandler.http_error_401 = fixed_http_error_401
        self.http_opener = build_opener()
        # Request compressed transfers even when no authentication is set up;
        # previously only set_authentication() installed this header.
        self.http_opener.addheaders = list(self._DEFAULT_HEADERS)

    def set_authentication(self, uri, login, password):
        """
        Rebuild the opener so that requests to *uri* authenticate with
        *login* / *password* using either basic or digest auth.
        """
        password_manager = HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(realm=None, uri=uri, user=login, passwd=password)
        self.http_opener = build_opener(HTTPBasicAuthHandler(password_manager), HTTPDigestAuthHandler(password_manager))
        self.http_opener.addheaders = list(self._DEFAULT_HEADERS)

    def _open(self, url, timeout):
        """
        Open *url* and return the response object, translating urllib2 and
        httplib failures into HTTPHandlerError.
        """
        try:
            # The ``timeout`` keyword of open() only exists from Python 2.6 on.
            if (sys.version_info[0] == 2 and sys.version_info[1] > 5) or sys.version_info[0] > 2:
                return self.http_opener.open(url, timeout=timeout)
            return self.http_opener.open(url)
        except HTTPError as error:
            # HTTPError must be handled before URLError (it is a subclass).
            if error.fp is None:
                raise HTTPHandlerError(error.filename, error.code, error.msg, dict(error.hdrs))
            else:
                raise HTTPHandlerError(error.filename, error.code, error.msg, dict(error.hdrs), error.read())
        except URLError as error:
            # Try to get the tuple arguments of URLError
            if hasattr(error.reason, 'args') and isinstance(error.reason.args, tuple) and len(error.reason.args) == 2:
                raise HTTPHandlerError(httpcode=error.reason.args[0], httpmsg=error.reason.args[1])
            else:
                raise HTTPHandlerError(httpmsg='urllib2.URLError: %s' % (error.reason))
        except BadStatusLine as error:
            raise HTTPHandlerError(httpmsg='httplib.BadStatusLine: %s' % (error.line))

    def download(self, url, path, timeout=30):
        """
        Download *url* into the directory *path* and return the file name.

        The file is named after the last '/'-separated component of *url*.
        Progress is reported on stdout, one dot per block read.

        Args:
            url:     address to fetch.
            path:    existing directory the file is written into.
            timeout: timeout in seconds handed to the opener (where supported).

        Returns:
            The full path of the written file.

        Raises:
            HTTPHandlerError: if the URL cannot be opened.
        """
        response = self._open(url, timeout)
        try:
            # save to file
            file_name = os.path.join(path, url.split('/')[-1])
            encoding = response.headers.get('content-encoding', '').lower()
            decoder = None
            if 'gzip' in encoding or 'deflate' in encoding:
                # 32 + MAX_WBITS makes zlib auto-detect a gzip or zlib header,
                # covering both encodings advertised in Accept-encoding.
                decoder = zlib.decompressobj(32 + zlib.MAX_WBITS)
            # ``with`` guarantees the file is closed even if a read fails.
            with open(file_name, 'wb') as out:
                sys.stdout.write("Downloading: {0}".format(url))
                while True:
                    data = response.read(HTTPHandler.READ_BLOCK_SIZE)
                    sys.stdout.write('.')
                    if not data:
                        break
                    if decoder is not None:
                        data = decoder.decompress(data)
                    out.write(data)
                if decoder is not None:
                    # Emit whatever the decompressor is still buffering.
                    out.write(decoder.flush())
                sys.stdout.write("done\n")
        finally:
            # Release the connection on success and on failure alike.
            response.close()
        return file_name
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment