Skip to content

Instantly share code, notes, and snippets.

@pybites
Created October 17, 2020 14:38
Show Gist options
  • Save pybites/a571758b0561fbebd0ed35e47d16275b to your computer and use it in GitHub Desktop.
import concurrent.futures
import os
import re
from timeit import timeit
import requests
from tqdm import tqdm
URLS = 'urls'
def _get_blog_urls(url='https://pybit.es/archives', outfile=URLS):
    """Scrape the PyBites archive page and cache all article URLs.

    Args:
        url: Archive page to scrape for article links.
        outfile: Path of the file the newline-separated URLs are written to.

    Raises:
        requests.HTTPError: if the archive page request fails.
    """
    resp = requests.get(url)
    # Fail loudly on a 4xx/5xx instead of caching the text of an error page.
    resp.raise_for_status()
    urls = re.findall(r'(https://pybit.es/[\w]+\.html)', resp.text)
    with open(outfile, 'w') as f:
        f.write('\n'.join(urls) + '\n')
def _download_page(url):
    """Download *url* and save its body to downloads/<basename of url>.

    Args:
        url: Full URL of the article to fetch.

    Raises:
        requests.HTTPError: if the page request fails.
    """
    fname = os.path.basename(url)
    r = requests.get(url)
    # Surface HTTP errors instead of writing an error page to disk.
    r.raise_for_status()
    # Original crashed with FileNotFoundError when downloads/ was missing.
    os.makedirs('downloads', exist_ok=True)
    with open(f'downloads/{fname}', 'wb') as outfile:
        outfile.write(r.content)
def _parse_args():
parser = argparse.ArgumentParser(
description='Download all PyBites articles.')
parser.add_argument("-s", "--seq", action='store_true',
help="download sequentially")
parser.add_argument("-c", "--conc", action='store_true',
help='download concurrently (32 workers)')
return parser.parse_args()
def download_urls_sequentially(urls):
    """Lazily download each URL one after another, yielding per page."""
    yield from map(_download_page, urls)
def download_urls_concurrently(urls):
    """Download all *urls* on a 32-worker thread pool.

    Yields each url as its download finishes, i.e. in completion
    order rather than input order.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as pool:
        pending = {pool.submit(_download_page, u): u for u in urls}
        for done in concurrent.futures.as_completed(pending):
            yield pending[done]
if __name__ == '__main__':
    import sys
    import argparse

    # Build the URL cache on first run.
    if not os.path.isfile(URLS):
        print('getting urls')
        _get_blog_urls()

    with open(URLS) as f:
        urls = [line.rstrip() for line in f]

    args = _parse_args()
    # Pick the download strategy from the flags, then drive it once.
    if args.seq:
        downloader = download_urls_sequentially
    elif args.conc:
        downloader = download_urls_concurrently
    else:
        print("select -s or -c")
        sys.exit(1)
    for _ in tqdm(downloader(urls), total=len(urls)):
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment