Last active
August 16, 2021 12:11
-
-
Save teshanshanuka/01f79074e89d68678705f017c8c2e2f9 to your computer and use it in GitHub Desktop.
A better download function for Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Teshan Liyanage <[email protected]> | |
import os | |
from urllib.parse import urlparse | |
import requests | |
import re | |
def is_valid_url(x):
    """Return True if *x* parses as a URL with both a scheme and a network location.

    Args:
        x: Candidate URL (normally a string).

    Returns:
        True when ``urlparse`` yields a non-empty scheme and netloc,
        False otherwise, including when *x* cannot be parsed at all.
    """
    try:
        result = urlparse(x)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 brackets).
        # TypeError / AttributeError: *x* is not a str/bytes-like object.
        # Deliberately not a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return all([result.scheme, result.netloc])
def download(url: str,
             download_dir: str = '.',
             default_extension: str = None,
             overwrite: bool = False,
             exists_ok: bool = False,
             chunk_size: int = 1024*1024) -> str:
    """Download the file at *url* into *download_dir* and return its path.

    The file name is taken from the response's ``Content-Disposition``
    header when it carries a ``filename=`` parameter, otherwise from the last
    segment of the URL path. Any directory components in a header-supplied
    name are discarded so the file is always written inside *download_dir*.

    Args:
        url: Download url
        download_dir: Download directory (created if it does not exist)
        default_extension: Suffix appended to the file name when the name
            contains no '.' (it is appended verbatim, so include the dot)
        overwrite: Whether to overwrite if the file exists
        exists_ok: If the file exists and overwrite is false, return the
            existing path instead of raising
        chunk_size: Chunk size (bytes) to download at a time

    Returns:
        Path of the downloaded (or already existing) file

    Raises:
        RuntimeError: If the url is invalid, the download request fails, or
            no usable file name can be determined
        FileExistsError: If file exists and both overwrite and exists_ok
            are false
    """
    if not is_valid_url(url):
        raise RuntimeError(f"'{url}' is not a valid url")

    if not os.path.exists(download_dir):
        # exist_ok guards against another process creating the directory
        # between the check above and this call.
        os.makedirs(download_dir, exist_ok=True)

    # The context manager releases the streamed connection on every exit
    # path, including the early return and the exceptions below.
    with requests.get(url, stream=True) as r:
        if not r.ok:
            raise RuntimeError(f"Download request returned status code {r.status_code}")

        file_name = ''
        disposition = r.headers.get('content-disposition', '')
        # Accept either a quoted value or a bare token, stopping at the next
        # ';' so trailing header parameters are not swallowed into the name.
        match = re.search(r'filename\s*=\s*(?:"([^"]*)"|([^;]+))',
                          disposition, flags=re.IGNORECASE)
        if match:
            raw_name = match.group(1) if match.group(1) is not None else match.group(2)
            raw_name = raw_name.strip().strip('"\'')
            # The header is server-controlled: keep only the final path
            # component so it cannot escape download_dir.
            file_name = os.path.basename(raw_name.replace('\\', '/'))
            if file_name in ('.', '..'):
                file_name = ''

        if not file_name:
            file_name = urlparse(url).path.split('/')[-1]

        # No '.' means the name seems to have no extension.
        if '.' not in file_name and default_extension is not None:
            file_name += default_extension

        if file_name in ('', '.', '..'):
            # Fail with a clear message rather than an OSError from open()
            # on a path that resolves to a directory.
            raise RuntimeError(f"Could not determine a file name for '{url}'")

        outfile = os.path.join(download_dir, file_name)

        if os.path.isfile(outfile) and not overwrite:
            if exists_ok:
                return outfile
            raise FileExistsError(f"Download file '{outfile}' exists")

        with open(outfile, "wb") as fp:
            for chunk in r.iter_content(chunk_size=chunk_size):
                # Skip empty chunks; write the rest one at a time.
                if chunk:
                    fp.write(chunk)

    return outfile
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment