@pybites
Created October 17, 2020 14:24
import argparse
import concurrent.futures
import os
import re
import sys
from timeit import timeit

import requests

URLS = 'urls'


def _get_blog_urls(url='https://pybit.es/archives', outfile=URLS):
    # scrape the archive page and cache all article URLs to a file
    resp = requests.get(url)
    urls = re.findall(r'(https://pybit.es/[\w]+\.html)', resp.text)
    with open(outfile, 'w') as f:
        f.write('\n'.join(urls) + '\n')


def _download_page(url):
    # save one article under downloads/, named after the URL's basename
    fname = os.path.basename(url)
    r = requests.get(url)
    with open(f'downloads/{fname}', 'wb') as outfile:
        outfile.write(r.content)


def _parse_args():
    parser = argparse.ArgumentParser(
        description='Download all PyBites articles.')
    parser.add_argument("-s", "--seq", action='store_true',
                        help="download sequentially")
    parser.add_argument("-c", "--conc", action='store_true',
                        help="download concurrently (32 workers)")
    return parser.parse_args()


def download_urls_sequentially(urls):
    for url in urls:
        _download_page(url)


def download_urls_concurrently(urls):
    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        future_to_url = {executor.submit(_download_page, url): url
                         for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            # result() re-raises any exception hit while downloading
            future.result()


if __name__ == '__main__':
    if not os.path.isfile(URLS):
        print('getting urls')
        _get_blog_urls()

    with open(URLS) as f:
        urls = [u.rstrip() for u in f]

    # make sure the download target exists before writing into it
    os.makedirs('downloads', exist_ok=True)

    # CLI entry point, disabled while benchmarking below:
    """
    args = _parse_args()
    if args.seq:
        download_urls_sequentially(urls)
    elif args.conc:
        download_urls_concurrently(urls)
    else:
        print("select -s or -c")
        sys.exit(1)
    """

    # time each strategy once; timeit pulls the names from __main__
    funcs = 'download_urls_sequentially, download_urls_concurrently'
    for func in funcs.split(', '):
        print(func)
        print(timeit(f"{func}(urls)",
                     f"from __main__ import {funcs}, urls",
                     number=1))
        print()
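With the triple-quoted argparse block re-enabled, the script could be driven from the command line instead of running the timeit comparison. Hypothetical invocations, based only on the flags defined in _parse_args above:

$ python dl.py -s   # download sequentially
$ python dl.py -c   # download concurrently (32 workers)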
pybites commented Oct 17, 2020

$ python dl.py
download_urls_sequentially
53.683453743

download_urls_concurrently
2.6830952339999996

$ ls downloads/|wc -l
     228
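The roughly 20x speedup (53.7s down to 2.7s over 228 pages) is what you would expect for an I/O-bound workload: each download spends most of its time waiting on the network, and CPython releases the GIL during socket I/O, so 32 threads can overlap those waits. As a minimal alternative sketch (not in the original gist), if you don't need the per-future bookkeeping of as_completed, executor.map gives the same fan-out more compactly:

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        # consuming the iterator with list() waits for all downloads
        # and re-raises the first exception, if any
        list(executor.map(_download_page, urls))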
