Created
October 17, 2020 14:24
-
-
Save pybites/6a15bfe006057b6d82e85b4fd1240beb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures | |
import os | |
import re | |
from timeit import timeit | |
import requests | |
# Path of the text file that stores the discovered article URLs, one per line.
URLS = 'urls'


def _get_blog_urls(url='https://pybit.es/archives', outfile=URLS, timeout=30):
    """Collect article links from an archive page and save them to a file.

    Args:
        url: Page to fetch and scan for article links.
        outfile: Path of the file to (over)write with one URL per line.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection problems, a timeout, or
            (as ``requests.HTTPError``) an error status from the server.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail before touching `outfile`: an error page would otherwise produce a
    # useless URL file.
    resp.raise_for_status()
    urls = re.findall(r'(https://pybit.es/[\w]+\.html)', resp.text)
    with open(outfile, 'w') as f:
        # One terminated line per URL; with no matches the file stays empty
        # instead of holding a single blank line.
        f.writelines(f'{found}\n' for found in urls)
def _download_page(url, timeout=30):
    """Download one page and store it under the ``downloads`` directory.

    The file is named after the last path component of ``url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection problems, a timeout, or
            (as ``requests.HTTPError``) an error status from the server.
    """
    fname = os.path.basename(url)
    r = requests.get(url, timeout=timeout)
    # Do not save an HTTP error page as if it were the article.
    r.raise_for_status()
    # Create the target directory on first use; `exist_ok` keeps this safe
    # when several worker threads get here at the same time.
    os.makedirs('downloads', exist_ok=True)
    with open(f'downloads/{fname}', 'wb') as outfile:
        outfile.write(r.content)
def _parse_args(): | |
parser = argparse.ArgumentParser( | |
description='Download all PyBites articles.') | |
parser.add_argument("-s", "--seq", action='store_true', | |
help="download sequentially") | |
parser.add_argument("-c", "--conc", action='store_true', | |
help='download concurrently (32 workers)') | |
return parser.parse_args() | |
def download_urls_sequentially(urls):
    """Download every URL in ``urls`` one after another, in iteration order."""
    for page_url in urls:
        _download_page(page_url)
def download_urls_concurrently(urls, max_workers=32):
    """Download every URL in ``urls`` using a pool of worker threads.

    A failed download does not stop the others; each failure is reported on
    standard output once its task has finished.

    Args:
        urls: Iterable of page addresses to download.
        max_workers: Maximum number of worker threads in the pool.
    """
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers) as executor:
        future_to_url = {executor.submit(_download_page, url): url
                         for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            # An exception raised in a worker is stored on its future; fetch
            # it here so a failure is visible instead of silently dropped.
            exc = future.exception()
            if exc is not None:
                print(f'{url!r} generated an exception: {exc}')
if __name__ == '__main__':
    # Script entry point: make sure the URL list exists, then time both
    # download strategies once each.
    import sys
    import argparse

    # Fetch the list of article URLs only when no URL file exists yet.
    if not os.path.isfile(URLS):
        print('getting urls')
        _get_blog_urls()

    with open(URLS) as f:
        # Skip blank lines so they never turn into empty-string URLs.
        urls = [u for u in (line.rstrip() for line in f) if u]

    # Disabled command-line mode, kept for reference (the `sys` and
    # `argparse` imports above exist for this path):
    #
    # args = _parse_args()
    # if args.seq:
    #     download_urls_sequentially(urls)
    # elif args.conc:
    #     download_urls_concurrently(urls)
    # else:
    #     print("select -s or -c")
    #     sys.exit(1)

    for func in (download_urls_sequentially, download_urls_concurrently):
        print(func.__name__)
        # timeit accepts a callable, so no statement string has to be
        # compiled and nothing is re-imported from __main__. The default
        # argument binds the current function to the lambda.
        print(timeit(lambda func=func: func(urls), number=1))
        print()
Author
pybites
commented
Oct 17, 2020
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment