-
-
Save yngfoxx/3f9c2f5d786034685b2151443f0450f5 to your computer and use it in GitHub Desktop.
Script to download files, resume interrupted downloads, and validate the results, all wrapped in a progress bar.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import click | |
import hashlib | |
import requests | |
from pathlib import Path | |
from tqdm import tqdm | |
# Click context settings: accept ``-h`` in addition to ``--help``.
CONTEXT_SETTINGS = {'help_option_names': ['-h', '--help']}

URL_BASE = 'http://www.patentsview.org/data'
"""str: Base url for PatentsView. Just to reduce url length."""

URLS = [
    f'{URL_BASE}/20171226/botanic.tsv.zip',
    f'{URL_BASE}/20171226/cpc_subsection.tsv.zip',
]
"""List: Contains urls of files which need to be downloaded.

Make sure that you add a hash in the same position in ``HASHES`` so that the
integrity of the file can be verified. The hash has to be a lowercase sha256.
The hash can be computed in Powershell with ``Get-FileHash <file>``. Notice
that Powershell returns uppercase letters and Python lowercase."""

HASHES = [
    '94c642405619b20ecaf657b30e84bab787320649e751ed6ac629c0be613ded44',
    '8437a94cf0c777486a10f77df89749a4278c6dbc69ff6bc62c2dd01b515a84f4',
]
"""List: Contains sha256 hashes calculated for the files in ``URLS``."""

DOWNLOAD_FOLDER = Path('.')
"""pathlib.Path: Points to the target directory of downloads."""
def downloader(position: int, resume_byte_pos: 'int | None' = None):
    """Download url in ``URLS[position]`` to disk with possible resumption.

    The file is stored in ``DOWNLOAD_FOLDER`` under the last path segment of
    the url. If a resume position is given, only the missing tail is requested
    and appended; if the server does not honour the range request, the
    download restarts from the beginning instead of corrupting the file.

    Parameters
    ----------
    position: int
        Position of url.
    resume_byte_pos: int, optional
        Position of byte from where to resume the download. ``None`` or ``0``
        starts a fresh download.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    url = URLS[position]
    file = DOWNLOAD_FOLDER / url.split('/')[-1]
    block_size = 1024
    # Seconds to wait for the connection / next bytes before giving up,
    # so a stalled server cannot hang the program forever.
    timeout = 30

    # Append information to resume download at specific byte position
    # to header
    resume_header = ({'Range': f'bytes={resume_byte_pos}-'}
                     if resume_byte_pos else None)

    # The context manager guarantees the connection is released.
    with requests.get(url, stream=True, headers=resume_header,
                      timeout=timeout) as r:
        # Never write an HTTP error page to disk as if it were the file.
        r.raise_for_status()

        # Only a 206 (Partial Content) answer means the range was honoured.
        # A plain 200 carries the whole file, which must overwrite, not append.
        resumed = bool(resume_byte_pos) and r.status_code == 206
        initial_pos = resume_byte_pos if resumed else 0
        mode = 'ab' if resumed else 'wb'

        # For a 206 answer content-length is the size of the remaining part,
        # so the full size is the already present bytes plus that remainder.
        remaining = int(r.headers.get('content-length', 0))
        file_size = initial_pos + remaining if remaining else None

        with open(file, mode) as f:
            with tqdm(total=file_size, unit='B',
                      unit_scale=True, unit_divisor=1024,
                      desc=file.name, initial=initial_pos,
                      ascii=True, miniters=1) as pbar:
                for chunk in r.iter_content(32 * block_size):
                    f.write(chunk)
                    pbar.update(len(chunk))
def download_file(position: int) -> None:
    """Execute the correct download operation.

    Depending on the size of the file online and offline, resume the
    download if the file offline is smaller than online, skip it if both
    sizes match, and restart it if the offline file cannot be a prefix of
    the online file (offline is larger, or the online size is unknown).

    Parameters
    ----------
    position: int
        Position of url.
    """
    # Establish connection to header of file. ``requests.head`` does not
    # follow redirects by default, which would report the size of the
    # redirect response instead of the size of the actual file.
    url = URLS[position]
    r = requests.head(url, allow_redirects=True, timeout=30)

    # Get filesize of online and offline file
    file_size_online = int(r.headers.get('content-length', 0))
    file = DOWNLOAD_FOLDER / url.split('/')[-1]

    if not file.exists():
        click.echo(f'File {file} does not exist. Start download.')
        downloader(position)
        return

    file_size_offline = file.stat().st_size
    if file_size_offline == file_size_online:
        click.echo(f'File {file} is complete. Skip download.')
    elif file_size_offline < file_size_online:
        click.echo(f'File {file} is incomplete. Resume download.')
        downloader(position, file_size_offline)
    else:
        # Resuming past the end of the online file would request an
        # unsatisfiable byte range, so start over from the first byte.
        click.echo(f'File {file} does not match the online file. '
                   'Restart download.')
        downloader(position)
def validate_file(position: int) -> None:
    """Validate a given file with its hash.

    The downloaded file is hashed with sha256 and compared to the
    pre-registered hash value in ``HASHES`` to validate the download
    procedure. The outcome is reported with ``click.echo``; nothing is
    returned.

    Parameters
    ----------
    position: int
        Position of url and hash.
    """
    file = DOWNLOAD_FOLDER / URLS[position].split('/')[-1]

    try:
        expected = HASHES[position]
    except IndexError:
        click.echo(f'File {file.name} has no hash.')
        return

    # A file that was never downloaded cannot be validated; report it
    # instead of crashing with FileNotFoundError.
    if not file.is_file():
        click.echo(f'File {file.name} does not exist. Download it first.')
        return

    sha = hashlib.sha256()
    with open(file, 'rb') as f:
        # Read 1MB at a time so that memory is not exhausted.
        for chunk in iter(lambda: f.read(1000 * 1000), b''):
            sha.update(chunk)

    # Plain comparison instead of ``assert``: assertions are removed when
    # Python runs with -O, which would report every file as valid.
    # ``hexdigest`` is lowercase, so normalise hashes pasted in uppercase.
    if sha.hexdigest() == expected.lower():
        click.echo(f'File {file} is validated.')
    else:
        click.echo(f'File {file.name} is corrupt. '
                   'Delete it manually and restart the program.')
@click.group(context_settings=CONTEXT_SETTINGS, chain=True)
def cli():
    """Program for downloading and validating files.

    It is possible to run both operations consecutively with

    .. code-block:: shell

        $ python python-downloader.py download validate

    To download a file, add the link to ``URLS`` and its hash to ``HASHES`` if
    you want to validate downloaded files.
    """
    # Intentionally empty: the group only hosts the chained sub-commands.
@cli.command()
def download():
    """Download files specified in ``URLS``."""
    click.echo('\n### Start downloading required files.\n')
    # Only the index is needed; download_file looks the url up itself.
    for position, _ in enumerate(URLS):
        download_file(position)
    click.echo('\n### End\n')
@cli.command()
def validate():
    """Validate downloads with hashes in ``HASHES``."""
    click.echo('### Start validating required files.\n')
    # Only the index is needed; validate_file resolves url and hash itself.
    for position, _ in enumerate(URLS):
        validate_file(position)
    click.echo('\n### End\n')
# Run the click command group only when executed as a script.
if __name__ == '__main__':
    cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment