Created
March 26, 2019 02:03
-
-
Save jesserobertson/061e4dda314fcc83f97cfc6d26997aba to your computer and use it in GitHub Desktop.
Nice downloads in Python for largish files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" file: download.py
author: Jess Robertson (@jesserobertson)
date: Tuesday, 26 March 2019
description: Download large files with requests and a tqdm progress bar.
Modified lightly from https://stackoverflow.com/questions/37573483
"""
import pathlib
from urllib.parse import urlparse

import requests
from tqdm import tqdm
def download(url, download_folder=None, overwrite=False):
    """
    Stream some large download with a progress bar from tqdm to a download folder

    The payload is streamed into a temporary ``<filename>.part`` file next to
    the destination and only moved into place once the transfer has been
    verified, so an interrupted or truncated download is never mistaken for a
    finished one on a later call with overwrite=False.

    Parameters:
        url - the thing to download. The last component of the URL path is
            used as the local filename.
        download_folder - the folder to download to. Will be created (with parents)
            if it doesn't already exist. Defaults to the current working
            directory.
        overwrite - whether to overwrite a local file if already downloaded
            (default False)

    Returns:
        a pathlib.Path pointing to the downloaded file

    Raises:
        ValueError - if no filename can be derived from the URL path
        requests.HTTPError - if the server responds with an error status
        IOError - if the number of bytes received does not match the
            content-length advertised by the server
    """
    # Make sure we've got somewhere to download to
    filename = pathlib.Path(urlparse(url).path).name
    if not filename:
        # Without this guard output_location would be the folder itself, which
        # always "already exists" and would be returned as if it were the file.
        raise ValueError(f'Cannot work out a filename to download to from {url!r}')
    if download_folder is not None:
        download_folder = pathlib.Path(download_folder)
    else:
        download_folder = pathlib.Path('.').absolute()
    # exist_ok avoids the check-then-create race of exists() followed by mkdir()
    download_folder.mkdir(parents=True, exist_ok=True)
    output_location = download_folder / filename
    if not overwrite and output_location.exists():
        print(f'{output_location} already exists and overwrite=False, skipping')
        return output_location

    partial_location = output_location.with_name(filename + '.part')
    block_size, wrote_size = 2048, 0
    try:
        # Open a streaming connection to the endpoint; the context manager
        # releases the connection even if the transfer fails part-way through
        with requests.get(url, stream=True) as response:
            # Don't save an HTML error page as though it were the download
            response.raise_for_status()

            # Set up progress bar with total size (0 means "unknown")
            total_size = int(response.headers.get('content-length', 0))
            content_encoding = response.headers.get('content-encoding', 'identity')
            if content_encoding.strip().lower() != 'identity':
                # content-length counts the encoded (e.g. gzip) bytes, but
                # iter_content yields decoded bytes, so the sizes can't be
                # compared - treat the total as unknown.
                total_size = 0
            tqdm_kwargs = {
                'total': total_size,
                'unit': 'B',  # we count bytes; unit_scale adds the k/M/G prefix
                'unit_scale': True,
                'desc': f'Downloading {filename}'
            }

            # Actually stream to file - we can clobber if we're here
            with open(partial_location, 'wb') as sink, \
                    tqdm(**tqdm_kwargs) as pbar:
                for data in response.iter_content(block_size):
                    sink.write(data)
                    wrote_size += len(data)
                    # The final chunk is usually shorter than block_size
                    pbar.update(len(data))

        # Check we got everything
        if total_size != 0 and wrote_size != total_size:
            raise IOError(f"Something went wrong downloading {output_location} - we're missing data!")

        # Only now expose the file under its real name
        partial_location.replace(output_location)
    finally:
        # On success the partial file has already been moved, so this only
        # cleans up after a failed or interrupted transfer.
        if partial_location.exists():
            partial_location.unlink()
    return output_location
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment