Given a text file containing URLs to download, this script downloads them all for you, concurrently and with a progress bar.

Requirements:

- tqdm
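A minimal usage sketch, assuming the script is saved as `download.py` (the filename is not given here). The input is parsed as CSV, so it can simply contain one URL per line, with `--data_index` selecting the column that holds the URL.

`data.txt`:

```
https://example.com/files/a.zip
https://example.com/files/b.zip
```

Run:

```
python download.py --data data.txt --save Download --threads 20 --sleep 1
```

Downloads land in the `Download` directory; the line indices of any failed downloads are appended to `fail.txt`.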
```python
import csv
import time
from argparse import ArgumentParser
from concurrent import futures
from pathlib import Path
from urllib import request

from tqdm import tqdm

def download(url: str, save_dir: Path, sleep_time: int = 1):
    """Download a single URL into save_dir, then sleep to throttle requests."""
    # The local file is named after the last path component of the URL.
    save_path = save_dir / Path(url).name
    request.urlretrieve(url, save_path)
    time.sleep(sleep_time)
    return "OK"

def main():
    parser = ArgumentParser()
    parser.add_argument("--data", type=str, default="data.txt")
    parser.add_argument("--data_index", type=int, default=0)  # CSV column holding the URL
    parser.add_argument("--save", type=str, default="Download")
    parser.add_argument("--fail", type=str, default="fail.txt")
    parser.add_argument("--threads", type=int, default=20)
    parser.add_argument("--sleep", type=int, default=1)
    args = parser.parse_args()

    # Read all rows up front so the input file is closed promptly.
    data_path = Path(args.data)
    with data_path.open(newline="") as f:
        data = list(csv.reader(f, delimiter=","))

    save_path = Path(args.save)
    save_path.mkdir(exist_ok=True, parents=True)
    fail_record = Path(args.fail)
    fails = []
    with futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        # Map each future back to its input line index so failures can be recorded.
        jobs = {
            executor.submit(download, line[args.data_index], save_path, args.sleep): ith
            for ith, line in enumerate(data)
            if line  # skip blank lines
        }
        for future in tqdm(futures.as_completed(jobs), total=len(jobs)):
            try:
                status = future.result()
            except Exception as e:
                tqdm.write("{} : {}".format(jobs[future], e))
                fails.append(jobs[future])
            else:
                tqdm.write("{} : {}".format(jobs[future], status))

    # Append the line indices of failed downloads for later inspection or retry.
    with fail_record.open("a") as f:
        f.writelines("{}\n".format(x) for x in fails)
if "__main__" == __name__: | |
main() |
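Since `fail.txt` records only the line indices of failed downloads, a small follow-up script can pull those rows back out of the input for a retry run. This is a hypothetical helper sketch, not part of the script above; `data.txt`, `fail.txt`, and `retry.txt` here just mirror the defaults:

```python
# retry_fails.py — hypothetical helper: collect failed rows for a retry run.
import csv
from pathlib import Path

# fail.txt holds one line index per line, as written by the downloader.
failed = {int(x) for x in Path("fail.txt").read_text().split()}

with Path("data.txt").open(newline="") as src, Path("retry.txt").open("w", newline="") as dst:
    writer = csv.writer(dst)
    for ith, row in enumerate(csv.reader(src)):
        if ith in failed:
            writer.writerow(row)
```

The resulting `retry.txt` can then be fed back to the downloader with `--data retry.txt`.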