Skip to content

Instantly share code, notes, and snippets.

@shufflebits
Forked from mjohnsullivan/download.py
Last active January 14, 2019 12:22
Show Gist options
  • Save shufflebits/8d4a0f7ed518a3e88e96767e17a0fb0d to your computer and use it in GitHub Desktop.
Save shufflebits/8d4a0f7ed518a3e88e96767e17a0fb0d to your computer and use it in GitHub Desktop.
Python HTTP download with resume and optional MD5 hash checking
#!/usr/bin/env python
import logging
logger = logging.getLogger(__name__)
import argparse
from pprint import pprint,pformat
import os.path
import urllib2
import shutil
import hashlib
def validate_file(file_path, hash):
    """
    Validates a file against an MD5 hash value.

    The file is read in 1MB chunks so it is never held in memory as a whole.
    The expected value is compared case-insensitively and with surrounding
    whitespace removed, because hashlib always produces lower-case hex while
    published digests are often upper-case or copied with a trailing newline.

    :param file_path: path to the file for hash validation
    :type file_path: string
    :param hash: expected hash value of the file
    :type hash: string -- MD5 hash value (hex digits)
    :returns: True if the file's MD5 digest equals the expected value
    :rtype: bool
    """
    m = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for chunk in iter(lambda: f.read(1000 * 1000), b''):  # 1MB
            m.update(chunk)
    # An empty/None expected value is passed through untouched so it simply
    # fails the comparison instead of raising on .strip()
    expected = hash.strip().lower() if hash else hash
    return m.hexdigest() == expected
def download_with_resume(url, file_path, hash=None, timeout=10,block_size=1000 * 1000):
    """
    Performs a HTTP(S) download that can be restarted if prematurely terminated.
    The HTTP server must support byte ranges.

    Data is appended to ``file_path + '.part'`` one byte range at a time; the
    part file is moved to ``file_path`` only once its size equals the
    Content-Length reported by the server (and, when a hash is given, once it
    passes MD5 validation). If an I/O error interrupts the transfer, the
    error is logged and the function returns, leaving the part file in place
    so that calling the function again resumes where it stopped.

    If ``file_path`` already exists nothing is downloaded; the existing file
    is only validated when a hash is given.

    :param url: the URL to download
    :type url: string
    :param file_path: the path to the file to write to disk
    :type file_path: string
    :param hash: hash value for file validation
    :type hash: string (MD5 hash value)
    :param timeout: timeout in seconds applied to every HTTP request
    :type timeout: int
    :param block_size: number of bytes requested per range request
    :type block_size: int
    :raises ValueError: if block_size is not positive
    :raises Exception: if the file fails MD5 validation, if the server does
        not report a Content-Length, or if the server ignores the byte range
        while resuming a partial download
    """
    # don't download if the file exists
    if os.path.exists(file_path):
        if hash:
            # but still validate it
            if not validate_file(file_path, hash):
                raise Exception('Error validating the file against its MD5 hash')
            logger.info('File passes MD5 validation')
        return

    # a non-positive block size would request empty/invalid ranges forever
    if block_size < 1:
        raise ValueError('block_size must be a positive number of bytes')

    tmp_file_path = file_path + '.part'
    first_byte = os.path.getsize(tmp_file_path) if os.path.exists(tmp_file_path) else 0
    logger.debug('Starting download at %.1fMB', first_byte / 1e6)
    file_size = -1
    try:
        # Probe the total size; only the headers are needed, so the
        # response is closed without reading the body
        response = urllib2.urlopen(url, timeout=timeout)
        try:
            file_size = int(response.info().get('Content-Length', -1))
        finally:
            response.close()
        logger.debug('File size is %s', file_size)
        while first_byte < file_size:
            # Limits are inclusive
            last_byte = min(first_byte + block_size, file_size) - 1
            logger.debug('Downloading byte range %d - %d', first_byte, last_byte)
            # create the request and set the byte range in the header
            req = urllib2.Request(url)
            req.headers['Range'] = 'bytes=%s-%s' % (first_byte, last_byte)
            response = urllib2.urlopen(req, timeout=timeout)
            try:
                status = response.getcode()
                data_chunk = response.read()
            finally:
                response.close()
            # Anything but 206 Partial Content means the body is not the
            # requested range; appending it after existing data would
            # corrupt the file
            if status != 206 and first_byte > 0:
                raise Exception('Server ignored the byte range request (status %s): %s'
                                % (status, url))
            if not data_chunk:
                # no progress is possible; treat it like any other I/O failure
                raise IOError('No data received for byte range %d - %d'
                              % (first_byte, last_byte))
            # Read the data from the URL and write it to the file
            with open(tmp_file_path, 'ab') as f:
                f.write(data_chunk)
            # advance by what was actually received so a short read is
            # re-requested instead of leaving a hole in the file
            first_byte += len(data_chunk)
    except IOError as e:
        # best effort: keep the part file so a later call can resume
        logger.warning('IO Error - %s', e)

    if file_size == -1:
        raise Exception('Error getting Content-Length from server: %s' % url)

    if file_size == 0 and not os.path.exists(tmp_file_path):
        # zero-length resource: the loop wrote nothing, so create the file
        open(tmp_file_path, 'ab').close()

    part_size = os.path.getsize(tmp_file_path) if os.path.exists(tmp_file_path) else 0
    if part_size != file_size:
        logger.warning('Download not complete: have %d of %d bytes', part_size, file_size)
        return

    # rename the temp download file to the correct name if fully downloaded
    # if there's a hash value, validate the file
    if hash:
        if not validate_file(tmp_file_path, hash):
            raise Exception('Error validating the file against its MD5 hash')
        # nice to see if it worked.
        logger.info('File passes MD5 validation')
    shutil.move(tmp_file_path, file_path)
    logger.info("URL '%s' downloaded to '%s' ok", url, file_path)
def console_logging(level=logging.DEBUG,format="%(message)s"):
    """
    Routes all logging to standard output.

    Every handler currently attached to the root logger is removed and
    replaced by a single stream handler writing to sys.stdout.

    :param level: level to set on the root logger
    :type level: int (a logging level constant)
    :param format: format string for the emitted records
    :type format: string
    """
    # Imported here because the module itself only imports sys inside its
    # __main__ guard, so the name is not defined when this module is imported
    # by other code.
    import sys
    root = logging.getLogger()
    # iterate over a copy: removeHandler mutates root.handlers
    for handler in list(root.handlers):
        root.removeHandler(handler)
    stream = logging.StreamHandler(sys.stdout)
    stream.setFormatter(logging.Formatter(format))
    root.addHandler(stream)
    root.setLevel(level)
def main(arg_list=None):
    """
    Command line entry point: parses the arguments and runs the download.

    :param arg_list: command line arguments without the program name; when
        None, argparse falls back to sys.argv[1:]
    :type arg_list: list of strings
    :raises SystemExit: raised by argparse on invalid arguments or --help
    """
    parser = argparse.ArgumentParser(description='Restarting Downloader')
    parser.add_argument('URL',
                        metavar='<url>',
                        help='URL to load')
    parser.add_argument('OUTFILE',
                        metavar='<filename>',
                        help='Filename to write')
    parser.add_argument('-t', '--timeout', required=False, help='timeout',
                        metavar='<seconds>', type=int, default=10)
    parser.add_argument('-c', '--chunksize', required=False, help='Chunk size',
                        metavar='<bytes>', type=int, default=1024 * 1024)
    parser.add_argument('-m', '--md5', required=False, help='MD5 hash to check',
                        metavar='<md5>')
    parser.add_argument('-v', required=False, help='verbose output',
                        action='store_true')
    args = parser.parse_args(arg_list)
    if args.v:
        # lower the root logger threshold so debug messages are emitted
        logging.getLogger().setLevel(logging.DEBUG)
    download_with_resume(args.URL, args.OUTFILE, hash=args.md5,
                         timeout=args.timeout, block_size=args.chunksize)
if __name__ == "__main__":
    import sys
    console_logging(level=logging.INFO)
    ret = main(sys.argv[1:])
    # Accept either a plain exit status (None/int) or a non-empty
    # (exit_status, message) sequence from main()
    if isinstance(ret, (tuple, list)) and ret:
        # only print a message when one is actually present
        if len(ret) > 1:
            sys.stderr.write('%s\n' % (ret[1],))
        sys.exit(ret[0])
    sys.exit(ret)
@shufflebits
Copy link
Author

This version adds a command-line handler.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment