Last active
August 8, 2023 16:38
-
-
Save bbelderbos/f37ef9336e8564489352db98bb5f34a8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlparse | |
from bs4 import BeautifulSoup | |
import httpx | |
# Pybites API endpoint that lists all published articles (JSON rows with a "link" field).
API_URL = "https://codechalleng.es/api/articles/"
# Browser-like User-Agent so article servers don't reject scripted requests.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
HEADERS = {"User-Agent": USER_AGENT}
# Link netlocs treated as internal Pybites links worth status-checking.
PYBITES_DOMAINS = ("pybit.es",)
class ArticleChecker:
    """Fetch Pybites articles and check their internal pybit.es links for dead URLs.

    On construction this downloads the article list from the API and the HTML
    of each article, so instantiation performs network I/O.
    """

    # Per-request timeout in seconds so a hung server cannot stall the whole run.
    REQUEST_TIMEOUT = 10

    def __init__(self, max_articles=None):
        # Optional cap on how many article URLs to process (None = all).
        self.max_articles = max_articles
        # URLs already status-checked across all articles, so each is hit once.
        self._seen = set()
        self.articles = self._get_articles()
        self.contents = self._get_articles_html_content()

    def _get_articles(self) -> list[str]:
        """Return up to ``max_articles`` article URLs from the Pybites API.

        Returns an empty list (after printing an error) on any HTTP failure.
        """
        try:
            # Use HEADERS here too, consistent with every other request this
            # class makes (the original omitted them for this call only).
            response = httpx.get(API_URL, headers=HEADERS, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            links = [row["link"] for row in response.json()]
            if self.max_articles is not None:
                links = links[: self.max_articles]
            return links
        except httpx.HTTPError:
            print("Error fetching articles.")
            return []

    def _get_articles_html_content(self) -> dict[str, str]:
        """Download each article page; return a mapping of URL -> HTML text.

        Articles that fail to download are reported and omitted from the map.
        """
        contents = {}
        with httpx.Client(headers=HEADERS, timeout=self.REQUEST_TIMEOUT) as client:
            for article in self.articles:
                try:
                    response = client.get(article)
                    response.raise_for_status()
                    contents[article] = response.text
                except httpx.HTTPError:
                    print(f"Error fetching content for {article}")
        return contents

    @staticmethod
    def get_links_from_article_html(content: str) -> list[str]:
        """Extract all hrefs pointing at a Pybites domain from an article's HTML."""
        soup = BeautifulSoup(content, "html.parser")
        links = [
            link.get("href")
            for link in soup.find_all("a")
            if link.get("href") and urlparse(link.get("href")).netloc in PYBITES_DOMAINS
        ]
        return links

    def check_status_links(self, links: list[str]) -> dict[str, bool]:
        """HEAD-check each unseen link; return URL -> True when the final status is 200.

        Links already checked in a previous call are skipped (deduplicated via
        ``self._seen``) and omitted from the result, as are links whose request
        raises an HTTP error (those are reported instead).
        """
        results = {}
        # follow_redirects: httpx does not follow redirects by default, which
        # would flag every permanently-redirected URL as broken. A redirect
        # that resolves to 200 is not a dead link.
        with httpx.Client(
            headers=HEADERS, timeout=self.REQUEST_TIMEOUT, follow_redirects=True
        ) as client:
            for link in links:
                if link in self._seen:
                    continue
                self._seen.add(link)
                try:
                    response = client.head(link)
                    results[link] = response.status_code == 200
                except httpx.HTTPError:
                    print(f"Error checking status for {link}")
        return results
def main():
    """Print, per article, every internal pybit.es link that is not reachable."""
    checker = ArticleChecker()
    for url, html in checker.contents.items():
        internal_links = checker.get_links_from_article_html(html)
        statuses = checker.check_status_links(internal_links)
        broken = [link for link, ok in statuses.items() if not ok]
        print(f"\n** {url}")
        for link in broken:
            print(f"- {link}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment