Resumable downloads with plain Python
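The script below downloads each URL to a file, resuming an interrupted transfer by asking the server for the remaining bytes with an HTTP Range request, and retrying 5xx errors with exponential back-off (honouring a 503's Retry-After header).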
#!/usr/bin/env python3
import sys
import os
from argparse import ArgumentParser
from contextlib import ExitStack
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from http.client import HTTPResponse
from shutil import copyfileobj
from tempfile import TemporaryFile
from time import sleep
from typing import cast, BinaryIO, Optional
from urllib.request import urlopen, Request
from urllib.error import HTTPError
from urllib.parse import urlparse
from warnings import warn

BUFSIZE = 2 ** 16

def get_content_length(response: HTTPResponse) -> int:
    """Get the whole content length from either a normal or a Range request."""
    content_range = response.getheader('Content-Range', '').split('/')
    if len(content_range) == 2 and content_range[1] != '*':
        return int(content_range[1])
    size = response.getheader('Content-Length')
    if size is not None:
        return int(size)
    raise ValueError('No content size')
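
# For example (values illustrative): a 206 response to 'Range: bytes=500-' on a
# 1200-byte resource carries 'Content-Range: bytes 500-1199/1200', and the part
# after the '/' is the full size we want. A server that does not know the total
# sends 'Content-Range: bytes 500-1199/*', hence the '*' check above.
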
def parse_retry_after(retry_after: Optional[str]) -> float:
    """Parse a Retry-After header value into seconds to wait."""
    if retry_after is None:
        raise ValueError('No Retry-After header')
    if retry_after.isdigit():
        return int(retry_after)
    else:
        # The header can also be an HTTP date. parsedate_to_datetime returns
        # an aware datetime for the usual GMT-suffixed dates, so compare
        # against an aware "now" to avoid a naive/aware TypeError.
        diff = parsedate_to_datetime(retry_after) - datetime.now(timezone.utc)
        return diff.total_seconds()
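
# Retry-After comes in two forms (values illustrative):
#   Retry-After: 120                            -> wait 120 seconds
#   Retry-After: Wed, 21 Oct 2015 07:28:00 GMT  -> wait until that moment
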
def download(url: str, file: BinaryIO, *, retries: int = 10, wait: float = 30.0) -> None:
    attempt = 0
    timeout = 0.0
    size = None

    while size is None or file.tell() < size:
        attempt += 1
        if attempt > retries:
            raise Exception('Ran out of retries')

        if timeout > 0:
            sleep(timeout)

        request = Request(url, headers={
            'Range': f'bytes={file.tell()}-'
        })

        if file.tell() > 0:
            warn(f'Resuming download from byte {file.tell()}')

        try:
            with urlopen(request) as fin:
                response = cast(HTTPResponse, fin)

                if response.status not in {200, 206}:
                    raise RuntimeError(f'Server responded with {response.status}')

                # Make sure we got a partial response. If not (i.e. 200 instead
                # of 206) the server sent the whole file, so start writing our
                # output from the start as well.
                if response.status == 200:
                    warn('Server does not support Range requests')
                    file.seek(0)

                # Get the expected full content length (raises if not available)
                size = get_content_length(response)

                # Read downloaded bytes, writing them to the file.
                while True:
                    chunk = fin.read(BUFSIZE)
                    if len(chunk) == 0:
                        break
                    file.write(chunk)

            # If we're somehow past our expected size, something went wrong
            # and we can't recover from that by retrying.
            if file.tell() > size:
                raise Exception(f'Downloaded too much: {file.tell()} > {size}')

            # Incomplete? Retry without a pause: there was no error, the
            # connection just got closed early.
            if file.tell() < size:
                warn(f'Server gave incomplete response: {file.tell()} < {size}')
                timeout = 0
        except HTTPError as e:
            if 500 <= e.code < 600:
                # Back off: wait the base interval first, then double it on
                # each subsequent retry.
                timeout = wait if timeout == 0 else timeout * 2
                # The server is rate-limiting us; try to honour its wishes.
                if e.code == 503:
                    try:
                        timeout = parse_retry_after(e.headers.get('Retry-After'))
                    except ValueError:
                        pass
                warn(f'Server responded with {e.code}, retry {attempt} after a {timeout}s pause')
                continue
            else:
                raise

    # At the end of the loop, we assume we've got all our data.
    assert size is not None and file.tell() == size
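
# A minimal sketch of calling download() directly (URL and filename are
# illustrative). Position the file at its end first, so that file.tell()
# reflects any bytes kept from a previous partial download:
#
#     with open('model.bin', 'r+b') as f:
#         f.seek(0, os.SEEK_END)
#         download('https://example.com/model.bin', f, retries=5, wait=10.0)
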
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--retries', '-r', type=int, default=10)
    parser.add_argument('--wait', '-w', type=float, default=30.0)
    parser.add_argument('--output', '-o', type=str, default='./')
    parser.add_argument('url', type=str, nargs='+')
    args = parser.parse_args()

    for url in args.url:
        with ExitStack() as ctx:
            use_stdout = args.output in {'-', '/dev/stdout'}
            if use_stdout:
                # Buffer to a temporary file so a partially downloaded file
                # never reaches stdout.
                dest = ctx.enter_context(TemporaryFile('w+b'))
            else:
                if args.output.endswith('/') and not os.path.exists(args.output):
                    os.makedirs(args.output)
                if os.path.isdir(args.output):
                    filename = os.path.basename(urlparse(url).path.rstrip('/'))
                    output = os.path.join(args.output, filename)
                else:
                    if len(args.url) > 1:
                        raise RuntimeError('Downloading multiple URLs to the same output file does not make much sense')
                    output = args.output
                # Append mode ('a+b') would force every write to the end of
                # the file, defeating the file.seek(0) fallback in download()
                # for servers without Range support. Open read/write instead
                # and seek to the end ourselves so tell() reflects the size
                # of any earlier partial download.
                dest = ctx.enter_context(open(output, 'r+b' if os.path.exists(output) else 'w+b'))
                dest.seek(0, os.SEEK_END)
            download(url, dest, retries=args.retries, wait=args.wait)
            if use_stdout:
                dest.seek(0)
                copyfileobj(dest, sys.stdout.buffer)
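
Invoking the script from a shell, assuming it is saved as download.py (the filename and URLs are illustrative):

python3 download.py -o downloads/ https://example.com/large-file.tar.gz
python3 download.py --retries 5 --wait 10 -o - https://example.com/data.bin > data.bin

Re-running the first command after an interruption picks up from the partial file already in downloads/.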