import argparse
import concurrent.futures
import os
import re
import sys

import requests
from tqdm import tqdm

URLS = 'urls'


def _get_blog_urls(url='https://pybit.es/archives', outfile=URLS):
    """Scrape the archives page and cache all article URLs to a file."""
    resp = requests.get(url)
    urls = re.findall(r'(https://pybit.es/[\w]+\.html)', resp.text)
    with open(outfile, 'w') as f:
        f.write('\n'.join(urls) + '\n')


def _download_page(url):
    """Save a single article under downloads/ using its basename."""
    fname = os.path.basename(url)
    r = requests.get(url)
    with open(f'downloads/{fname}', 'wb') as outfile:
        outfile.write(r.content)


def _parse_args():
    parser = argparse.ArgumentParser(
        description='Download all PyBites articles.')
    parser.add_argument("-s", "--seq", action='store_true',
                        help="download sequentially")
    parser.add_argument("-c", "--conc", action='store_true',
                        help='download concurrently (32 workers)')
    return parser.parse_args()


def download_urls_sequentially(urls):
    for url in urls:
        yield _download_page(url)


def download_urls_concurrently(urls):
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        future_to_url = {executor.submit(_download_page, url): url
                         for url in urls}
        # yield as downloads finish, in completion order
        for future in concurrent.futures.as_completed(future_to_url):
            yield future_to_url[future]


if __name__ == '__main__':
    if not os.path.isfile(URLS):
        print('getting urls')
        _get_blog_urls()

    # _download_page writes into downloads/, so make sure it exists
    os.makedirs('downloads', exist_ok=True)

    with open(URLS) as f:
        urls = [u.rstrip() for u in f]

    args = _parse_args()
    if args.seq:
        for _ in tqdm(download_urls_sequentially(urls), total=len(urls)):
            pass
    elif args.conc:
        for _ in tqdm(download_urls_concurrently(urls), total=len(urls)):
            pass
    else:
        print("select -s or -c")
        sys.exit(1)
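
The argparse interface makes the sequential-versus-concurrent comparison a one-flag switch. A hypothetical invocation, assuming the gist is saved as download_pybites.py (the actual filename is not shown above):

    python download_pybites.py -s   # download one page at a time
    python download_pybites.py -c   # 32 worker threads via ThreadPoolExecutor

Both download functions are generators, so tqdm can advance its progress bar as each page finishes. In the concurrent case, concurrent.futures.as_completed yields futures in completion order rather than submission order, which is what keeps the bar moving accurately while the worker threads run.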