Resumable downloads with plain Python
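The script below downloads each URL to a file, resuming an interrupted transfer by asking the server for the remaining bytes with an HTTP Range request, and retrying 5xx errors with exponential back-off (honouring a 503's Retry-After header).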
#!/usr/bin/env python3
import sys
import os
from argparse import ArgumentParser
from contextlib import ExitStack
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from http.client import HTTPResponse
from shutil import copyfileobj
from tempfile import TemporaryFile
from time import sleep
from typing import cast, BinaryIO, Optional
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from urllib.parse import urlparse
from warnings import warn

BUFSIZE = 2 ** 16

def get_content_length(response: HTTPResponse) -> int:
    """Get the whole content length from either a normal or a Range request."""
    content_range = response.getheader('Content-Range', '').split('/')
    if len(content_range) == 2 and content_range[1] != '*':
        return int(content_range[1])
    size = response.getheader('Content-Length')
    if size is not None:
        return int(size)
    raise ValueError('No content size')
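
# For example (values illustrative): a 206 response to 'Range: bytes=500-' on a
# 1200-byte resource carries 'Content-Range: bytes 500-1199/1200', and the part
# after the '/' is the full size we want. A server that does not know the total
# sends 'Content-Range: bytes 500-1199/*', hence the '*' check above.
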
def parse_retry_after(retry_after: Optional[str]) -> float:
    """Parse a Retry-After header value into seconds to wait."""
    if retry_after is None:
        raise ValueError('No Retry-After header')
    if retry_after.isdigit():
        return int(retry_after)
    else:
        # The header can also be an HTTP date. parsedate_to_datetime returns
        # an aware datetime for the usual GMT-suffixed dates, so compare
        # against an aware "now" to avoid a naive/aware TypeError.
        diff = parsedate_to_datetime(retry_after) - datetime.now(timezone.utc)
        return diff.total_seconds()
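
# Retry-After comes in two forms (values illustrative):
#   Retry-After: 120                            -> wait 120 seconds
#   Retry-After: Wed, 21 Oct 2015 07:28:00 GMT  -> wait until that moment
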
def download(url: str, file: BinaryIO, *, retries: int = 10, wait: float = 30.0) -> None:
    attempt = 0
    timeout = 0.0
    size = None

    while size is None or file.tell() < size:
        attempt += 1
        if attempt > retries:
            raise Exception('Ran out of retries')

        if timeout > 0:
            sleep(timeout)

        request = Request(url, headers={
            'Range': f'bytes={file.tell()}-'
        })

        if file.tell() > 0:
            warn(f'Resuming download from byte {file.tell()}')

        try:
            with urlopen(request) as fin:
                response = cast(HTTPResponse, fin)

                if response.status not in {200, 206}:
                    raise RuntimeError(f'Server responded with {response.status}')

                # Make sure we got a partial response. If not (i.e. 200 instead
                # of 206) the server sent the whole file, so start writing our
                # output from the start as well.
                if response.status == 200:
                    warn('Server does not support Range requests')
                    file.seek(0)

                # Get the expected full content length (raises if not available)
                size = get_content_length(response)

                # Read downloaded bytes, writing them to the file.
                while True:
                    chunk = fin.read(BUFSIZE)
                    if len(chunk) == 0:
                        break
                    file.write(chunk)

            # If we're somehow past our expected size, something went wrong
            # and we can't recover from that by retrying.
            if file.tell() > size:
                raise Exception(f'Downloaded too much: {file.tell()} > {size}')

            # Incomplete? Retry without a pause: there was no error, the
            # connection just got closed early.
            if file.tell() < size:
                warn(f'Server gave incomplete response: {file.tell()} < {size}')
                timeout = 0
        except HTTPError as e:
            if 500 <= e.code < 600:
                # Back off: wait the base interval first, then double it on
                # each subsequent retry.
                timeout = wait if timeout == 0 else timeout * 2
                # The server is rate-limiting us; try to honour its wishes.
                if e.code == 503:
                    try:
                        timeout = parse_retry_after(e.headers.get('Retry-After'))
                    except ValueError:
                        pass
                warn(f'Server responded with {e.code}, retry {attempt} after a {timeout}s pause')
                continue
            else:
                raise

    # At the end of the loop, we assume we've got all our data.
    assert size is not None and file.tell() == size
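
# A minimal sketch of calling download() directly (URL and filename are
# illustrative). Position the file at its end first, so that file.tell()
# reflects any bytes kept from a previous partial download:
#
#     with open('model.bin', 'r+b') as f:
#         f.seek(0, os.SEEK_END)
#         download('https://example.com/model.bin', f, retries=5, wait=10.0)
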
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--retries', '-r', type=int, default=10)
    parser.add_argument('--wait', '-w', type=float, default=30.0)
    parser.add_argument('--output', '-o', type=str, default='./')
    parser.add_argument('url', type=str, nargs='+')
    args = parser.parse_args()

    for url in args.url:
        with ExitStack() as ctx:
            use_stdout = args.output in {'-', '/dev/stdout'}
            if use_stdout:
                # Buffer to a temporary file so a partially downloaded file
                # never reaches stdout.
                dest = ctx.enter_context(TemporaryFile('w+b'))
            else:
                if args.output.endswith('/') and not os.path.exists(args.output):
                    os.makedirs(args.output)
                if os.path.isdir(args.output):
                    filename = os.path.basename(urlparse(url).path.rstrip('/'))
                    output = os.path.join(args.output, filename)
                else:
                    if len(args.url) > 1:
                        raise RuntimeError('Downloading multiple URLs to the same output file does not make much sense')
                    output = args.output
                # Append mode ('a+b') would force every write to the end of
                # the file, defeating the file.seek(0) fallback in download()
                # for servers without Range support. Open read/write instead
                # and seek to the end ourselves so tell() reflects the size
                # of any earlier partial download.
                dest = ctx.enter_context(open(output, 'r+b' if os.path.exists(output) else 'w+b'))
                dest.seek(0, os.SEEK_END)
            download(url, dest, retries=args.retries, wait=args.wait)
            if use_stdout:
                dest.seek(0)
                copyfileobj(dest, sys.stdout.buffer)
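
Invoking the script from a shell, assuming it is saved as download.py (the filename and URLs are illustrative):

python3 download.py -o downloads/ https://example.com/large-file.tar.gz
python3 download.py --retries 5 --wait 10 -o - https://example.com/data.bin > data.bin

Re-running the first command after an interruption picks up from the partial file already in downloads/.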