bbelderbos · April 6, 2023 06:12
diff --git a/article-downloader.py b/article-downloader.py
 from pathlib import Path
 import concurrent.futures

 from fake_useragent import UserAgent
 import requests

 # API endpoint that get_links() queries for the list of articles.
 ARTICLE_ENDPOINT = "https://codechalleng.es/api/articles/"
 # Text file that caches the article links, one URL per line.
 ARTICLE_LINKS = Path("links")
 # Folder that _download_url() writes the downloaded articles into.
 DOWNLOADS_FOLDER = Path("downloads")
 # Request headers sent with each article download; presumably a Chrome
 # User-Agent string generated by fake_useragent -- TODO confirm.
 HEADERS = {"User-Agent": str(UserAgent().chrome)}


 def get_links(timeout=10):
    """Return the article URLs listed by the ARTICLE_ENDPOINT API.

    Args:
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list with the "link" value of every row in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within ``timeout``.
    """
    # Without a timeout requests may block forever on a stalled server.
    response = requests.get(ARTICLE_ENDPOINT, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as JSON.
    response.raise_for_status()
    return [row["link"] for row in response.json()]


 def _download_url(url, timeout=10):
    """Download one article and save its body under DOWNLOADS_FOLDER.

    The file name is the last path segment of ``url`` with any trailing
    slash and ``.html`` suffix removed.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Do not save 4xx/5xx error pages as if they were articles.
    response.raise_for_status()
    filename = url.rstrip("/").split("/")[-1].removesuffix(".html")
    path = DOWNLOADS_FOLDER / filename
    # Explicit encoding: the platform default may not be able to encode
    # every character of the page.
    path.write_text(response.text, encoding="utf-8")


 def download_articles(urls, max_workers=5):
    """Download every URL in ``urls`` concurrently using a thread pool.

    A failed download does not stop the others; each failure is reported
    on stderr together with the URL that caused it.

    Args:
        urls: Iterable of article URLs handed to ``_download_url``.
        max_workers: Maximum number of concurrent download threads.

    Returns:
        The list of URLs whose download raised an exception.
    """
    import sys

    failed = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future back to its URL so failures can be attributed.
        future_to_url = {pool.submit(_download_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                future.result()
            except Exception as exc:
                # Report and keep going so one bad URL does not hide the
                # outcome of the remaining downloads.
                print(f"{url!r} generated an exception: {exc}", file=sys.stderr)
                failed.append(url)
    return failed


 if __name__ == "__main__":
    # Fetch the link list only once; later runs reuse the cached file.
    if not ARTICLE_LINKS.exists():
        links = get_links()
        ARTICLE_LINKS.write_text("\n".join(links) + "\n", encoding="utf-8")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    DOWNLOADS_FOLDER.mkdir(parents=True, exist_ok=True)

    # Skip blank lines so an empty entry is never passed to the downloader.
    urls = [
        line.strip()
        for line in ARTICLE_LINKS.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
    download_articles(urls)
	from pathlib import Path
	import concurrent.futures

	from fake_useragent import UserAgent
	import requests

	# API endpoint that get_links() queries for the list of articles.
	ARTICLE_ENDPOINT = "https://codechalleng.es/api/articles/"
	# Text file that caches the article links, one URL per line.
	ARTICLE_LINKS = Path("links")
	# Folder that _download_url() writes the downloaded articles into.
	DOWNLOADS_FOLDER = Path("downloads")
	# Request headers sent with each article download; presumably a Chrome
	# User-Agent string generated by fake_useragent -- TODO confirm.
	HEADERS = {"User-Agent": str(UserAgent().chrome)}


	def get_links(timeout=10):
	    """Return the article URLs listed by the ARTICLE_ENDPOINT API.

	    Args:
	        timeout: Seconds to wait for the API before giving up.

	    Returns:
	        A list with the "link" value of every row in the JSON response.

	    Raises:
	        requests.HTTPError: If the API answers with an error status.
	        requests.Timeout: If the API does not answer within ``timeout``.
	    """
	    # Without a timeout requests may block forever on a stalled server.
	    response = requests.get(ARTICLE_ENDPOINT, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of parsing an error page as JSON.
	    response.raise_for_status()
	    return [row["link"] for row in response.json()]


	def _download_url(url, timeout=10):
	    """Download one article and save its body under DOWNLOADS_FOLDER.

	    The file name is the last path segment of ``url`` with any trailing
	    slash and ``.html`` suffix removed.

	    Args:
	        url: Address of the article to fetch.
	        timeout: Seconds to wait for the server before giving up.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.Timeout: If the server does not answer within ``timeout``.
	    """
	    response = requests.get(url, headers=HEADERS, timeout=timeout)
	    # Do not save 4xx/5xx error pages as if they were articles.
	    response.raise_for_status()
	    filename = url.rstrip("/").split("/")[-1].removesuffix(".html")
	    path = DOWNLOADS_FOLDER / filename
	    # Explicit encoding: the platform default may not be able to encode
	    # every character of the page.
	    path.write_text(response.text, encoding="utf-8")


	def download_articles(urls, max_workers=5):
	    """Download every URL in ``urls`` concurrently using a thread pool.

	    A failed download does not stop the others; each failure is reported
	    on stderr together with the URL that caused it.

	    Args:
	        urls: Iterable of article URLs handed to ``_download_url``.
	        max_workers: Maximum number of concurrent download threads.

	    Returns:
	        The list of URLs whose download raised an exception.
	    """
	    import sys

	    failed = []
	    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
	        # Map each future back to its URL so failures can be attributed.
	        future_to_url = {pool.submit(_download_url, url): url for url in urls}
	        for future in concurrent.futures.as_completed(future_to_url):
	            url = future_to_url[future]
	            try:
	                future.result()
	            except Exception as exc:
	                # Report and keep going so one bad URL does not hide the
	                # outcome of the remaining downloads.
	                print(f"{url!r} generated an exception: {exc}", file=sys.stderr)
	                failed.append(url)
	    return failed


	if __name__ == "__main__":
	    # Fetch the link list only once; later runs reuse the cached file.
	    if not ARTICLE_LINKS.exists():
	        links = get_links()
	        ARTICLE_LINKS.write_text("\n".join(links) + "\n", encoding="utf-8")

	    # exist_ok avoids the check-then-create race of a separate exists() test.
	    DOWNLOADS_FOLDER.mkdir(parents=True, exist_ok=True)

	    # Skip blank lines so an empty entry is never passed to the downloader.
	    urls = [
	        line.strip()
	        for line in ARTICLE_LINKS.read_text(encoding="utf-8").splitlines()
	        if line.strip()
	    ]
	    download_articles(urls)