Created
April 6, 2023 06:12
-
-
Save bbelderbos/1b13763e1ba91cb5595cef8c1c9821c0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import concurrent.futures
import logging
from pathlib import Path

import requests
from fake_useragent import UserAgent
# API endpoint whose JSON response lists the articles; each row has a "link".
ARTICLE_ENDPOINT = "https://codechalleng.es/api/articles/"
# Local cache file that stores one article URL per line.
ARTICLE_LINKS = Path("links")
# Directory the downloaded article bodies are written to.
DOWNLOADS_FOLDER = Path("downloads")
# Headers sent with every article download; the User-Agent value is the
# Chrome string produced by fake_useragent.
HEADERS = {"User-Agent": str(UserAgent().chrome)}
def get_links(timeout=30):
    """Fetch the article listing and return the article URLs.

    Sends a GET request to ``ARTICLE_ENDPOINT`` and collects the ``"link"``
    field of every row in the JSON response.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one ``"link"`` value per row, in response order.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # requests waits indefinitely unless a timeout is given, so pass one to
    # keep a stalled connection from hanging the script.
    response = requests.get(ARTICLE_ENDPOINT, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error body.
    response.raise_for_status()
    return [row["link"] for row in response.json()]
def _download_url(url, timeout=30):
    """Download one article and save its body under ``DOWNLOADS_FOLDER``.

    The file name is the last path segment of *url*, after dropping any
    trailing slashes and a ``.html`` suffix.

    Args:
        url: Address of the article to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # requests waits indefinitely unless a timeout is given.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Do not store an error page on disk as if it were the article.
    response.raise_for_status()
    filename = url.rstrip("/").split("/")[-1].removesuffix(".html")
    path = DOWNLOADS_FOLDER / filename
    # Explicit UTF-8: the locale default encoding may be unable to represent
    # every character in the page.
    path.write_text(response.text, encoding="utf-8")
def download_articles(urls):
    """Download every URL in *urls* using a pool of five worker threads.

    Each URL is handed to ``_download_url``. A failure for one URL is logged
    as a warning and does not prevent the remaining results from being
    collected.

    Args:
        urls: Iterable of article URLs.

    Returns:
        A list of the URLs whose download raised, in completion order
        (empty when everything succeeded).
    """
    failed = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # Map each future back to its URL so failures can be attributed.
        future_to_url = {executor.submit(_download_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                # Boundary handler: report the failing URL and keep going.
                logging.getLogger(__name__).warning(
                    "Failed to download %s: %s", url, exc
                )
                failed.append(url)
    return failed
if __name__ == "__main__":
    # Fetch the list of article links once and cache it on disk, so later
    # runs reuse the file instead of calling the API again.
    if not ARTICLE_LINKS.exists():
        links = get_links()
        ARTICLE_LINKS.write_text("\n".join(links) + "\n", encoding="utf-8")

    # exist_ok avoids the separate exists() check and its race window.
    DOWNLOADS_FOLDER.mkdir(exist_ok=True)

    # Skip blank lines: an empty link list is stored as a lone newline, which
    # would otherwise turn into a request for the empty URL.
    urls = [
        line
        for line in ARTICLE_LINKS.read_text(encoding="utf-8").splitlines()
        if line.strip()
    ]
    download_articles(urls)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment