pybites · October 17, 2020 14:24 · pybites · Oct 17, 2020
diff --git a/dl.py b/dl.py
 import concurrent.futures
 import os
 import re
 from timeit import timeit

 import requests

 URLS = 'urls'


 def _get_blog_urls(url='https://pybit.es/archives', outfile=URLS,
                    timeout=30):
    """Scrape article links from the archive page and save them to a file.

    Fetches ``url``, collects every ``https://pybit.es/<name>.html`` link
    found in the response text and writes them to ``outfile``, one per line.

    :param url: page that lists the articles.
    :param outfile: path of the text file that receives the URLs.
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.HTTPError: if the page answers with an error status.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly rather than writing an empty list scraped from an error
    # page: the script reuses ``outfile`` whenever it already exists.
    resp.raise_for_status()
    # The host's dot is escaped so that only the literal name matches.
    urls = re.findall(r'(https://pybit\.es/[\w]+\.html)', resp.text)
    with open(outfile, 'w') as f:
        f.write('\n'.join(urls) + '\n')


 def _download_page(url, timeout=30):
    """Download one page and store it under ``downloads/``.

    The file is named after the last path component of ``url``.

    :param url: address of the page to fetch.
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    fname = os.path.basename(url)
    r = requests.get(url, timeout=timeout)
    # Do not save an error page as if it were the article.
    r.raise_for_status()
    # The target directory may not exist yet; create it on demand.
    os.makedirs('downloads', exist_ok=True)
    with open(f'downloads/{fname}', 'wb') as outfile:
        outfile.write(r.content)


 def _parse_args():
    """Parse the command line flags of the download script.

    :returns: an ``argparse.Namespace`` with the booleans ``seq`` and
        ``conc``.
    """
    # Imported locally: the module only imports argparse inside its
    # ``__main__`` guard, so the name is missing when the file is imported.
    import argparse

    parser = argparse.ArgumentParser(
        description='Download all PyBites articles.')
    parser.add_argument("-s", "--seq", action='store_true',
                        help="download sequentially")
    parser.add_argument("-c", "--conc", action='store_true',
                        help='download concurrently (32 workers)')
    return parser.parse_args()


 def download_urls_sequentially(urls):
    """Fetch every page in ``urls`` one after another, in the given order."""
    for page_url in urls:
        _download_page(page_url)


 def download_urls_concurrently(urls, max_workers=32):
    """Download all pages in ``urls`` in parallel using a thread pool.

    A failed download does not stop the others; each failure is reported on
    stderr once its task has finished.

    :param urls: iterable of page addresses to fetch.
    :param max_workers: number of threads in the pool.
    """
    import sys  # the module only imports sys inside its __main__ guard

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers) as executor:
        future_to_url = {executor.submit(_download_page, url): url
                         for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            # Ask every finished task for its exception; otherwise errors
            # raised inside the worker threads vanish silently.
            error = future.exception()
            if error is not None:
                print(f'{future_to_url[future]} failed: {error!r}',
                      file=sys.stderr)


 if __name__ == '__main__':
    # Script-only imports, bound as module globals so the command line
    # helpers in this file can still find them.
    import sys
    import argparse

    # Build the URL list once; later runs reuse the saved file.
    if not os.path.isfile(URLS):
        print('getting urls')
        _get_blog_urls()

    # Read the list and close the file before the downloads start. Blank
    # lines are dropped because an empty string is not a usable URL.
    with open(URLS) as f:
        urls = [u for u in (line.strip() for line in f) if u]

    # Disabled command line mode, kept for reference:
    #
    # args = _parse_args()
    #
    # if args.seq:
    #     download_urls_sequentially(urls)
    # elif args.conc:
    #     download_urls_concurrently(urls)
    # else:
    #     print("select -s or -c")
    #     sys.exit(1)

    # Benchmark: run each strategy once and print the elapsed seconds.
    for func in (download_urls_sequentially, download_urls_concurrently):
        print(func.__name__)
        print(timeit(lambda: func(urls), number=1))
        print()
	import concurrent.futures
	import os
	import re
	from timeit import timeit

	import requests

	URLS = 'urls'


	def _get_blog_urls(url='https://pybit.es/archives', outfile=URLS,
	                   timeout=30):
	    """Scrape article links from the archive page and save them to a file.

	    Fetches ``url``, collects every ``https://pybit.es/<name>.html`` link
	    found in the response text and writes them to ``outfile``, one per line.

	    :param url: page that lists the articles.
	    :param outfile: path of the text file that receives the URLs.
	    :param timeout: seconds to wait for the server before giving up.
	    :raises requests.HTTPError: if the page answers with an error status.
	    """
	    resp = requests.get(url, timeout=timeout)
	    # Fail loudly rather than writing an empty list scraped from an error
	    # page: the script reuses ``outfile`` whenever it already exists.
	    resp.raise_for_status()
	    # The host's dot is escaped so that only the literal name matches.
	    urls = re.findall(r'(https://pybit\.es/[\w]+\.html)', resp.text)
	    with open(outfile, 'w') as f:
	        f.write('\n'.join(urls) + '\n')


	def _download_page(url, timeout=30):
	    """Download one page and store it under ``downloads/``.

	    The file is named after the last path component of ``url``.

	    :param url: address of the page to fetch.
	    :param timeout: seconds to wait for the server before giving up.
	    :raises requests.HTTPError: if the server answers with an error status.
	    """
	    fname = os.path.basename(url)
	    r = requests.get(url, timeout=timeout)
	    # Do not save an error page as if it were the article.
	    r.raise_for_status()
	    # The target directory may not exist yet; create it on demand.
	    os.makedirs('downloads', exist_ok=True)
	    with open(f'downloads/{fname}', 'wb') as outfile:
	        outfile.write(r.content)


	def _parse_args():
	    """Parse the command line flags of the download script.

	    :returns: an ``argparse.Namespace`` with the booleans ``seq`` and
	        ``conc``.
	    """
	    # Imported locally: the module only imports argparse inside its
	    # ``__main__`` guard, so the name is missing when the file is imported.
	    import argparse

	    parser = argparse.ArgumentParser(
	        description='Download all PyBites articles.')
	    parser.add_argument("-s", "--seq", action='store_true',
	                        help="download sequentially")
	    parser.add_argument("-c", "--conc", action='store_true',
	                        help='download concurrently (32 workers)')
	    return parser.parse_args()


	def download_urls_sequentially(urls):
	    """Fetch every page in ``urls`` one after another, in the given order."""
	    for url in urls:
	        _download_page(url)


	def download_urls_concurrently(urls, max_workers=32):
	    """Download all pages in ``urls`` in parallel using a thread pool.

	    A failed download does not stop the others; each failure is reported on
	    stderr once its task has finished.

	    :param urls: iterable of page addresses to fetch.
	    :param max_workers: number of threads in the pool.
	    """
	    import sys  # the module only imports sys inside its __main__ guard

	    with concurrent.futures.ThreadPoolExecutor(
	            max_workers=max_workers) as executor:
	        future_to_url = {executor.submit(_download_page, url): url
	                         for url in urls}
	        for future in concurrent.futures.as_completed(future_to_url):
	            # Ask every finished task for its exception; otherwise errors
	            # raised inside the worker threads vanish silently.
	            error = future.exception()
	            if error is not None:
	                print(f'{future_to_url[future]} failed: {error!r}',
	                      file=sys.stderr)


	if __name__ == '__main__':
	    # Script-only imports, bound as module globals so the command line
	    # helpers in this file can still find them.
	    import sys
	    import argparse

	    # Build the URL list once; later runs reuse the saved file.
	    if not os.path.isfile(URLS):
	        print('getting urls')
	        _get_blog_urls()

	    # Read the list and close the file before the downloads start. Blank
	    # lines are dropped because an empty string is not a usable URL.
	    with open(URLS) as f:
	        urls = [u for u in (line.strip() for line in f) if u]

	    # Disabled command line mode, kept for reference:
	    #
	    # args = _parse_args()
	    #
	    # if args.seq:
	    #     download_urls_sequentially(urls)
	    # elif args.conc:
	    #     download_urls_concurrently(urls)
	    # else:
	    #     print("select -s or -c")
	    #     sys.exit(1)

	    # Benchmark: run each strategy once and print the elapsed seconds.
	    for func in (download_urls_sequentially, download_urls_concurrently):
	        print(func.__name__)
	        print(timeit(lambda: func(urls), number=1))
	        print()