Last active
August 8, 2023 16:38
-
-
Save bbelderbos/f37ef9336e8564489352db98bb5f34a8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urlparse | |
from bs4 import BeautifulSoup | |
import httpx | |
# Pybites API endpoint that lists all published articles (JSON rows with a "link" field).
API_URL = "https://codechalleng.es/api/articles/"
# Browser-like User-Agent so article servers don't reject scripted requests.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
HEADERS = {"User-Agent": USER_AGENT}
# Link netlocs treated as internal Pybites links worth status-checking.
PYBITES_DOMAINS = ("pybit.es",)
class ArticleChecker:
    """Fetch Pybites articles and check their internal pybit.es links for dead URLs.

    On construction this downloads the article list from the API and the HTML
    of each article, so instantiation performs network I/O.
    """

    # Per-request timeout in seconds so a hung server cannot stall the whole run.
    REQUEST_TIMEOUT = 10

    def __init__(self, max_articles=None):
        # Optional cap on how many article URLs to process (None = all).
        self.max_articles = max_articles
        # URLs already status-checked across all articles, so each is hit once.
        self._seen = set()
        self.articles = self._get_articles()
        self.contents = self._get_articles_html_content()

    def _get_articles(self) -> list[str]:
        """Return up to ``max_articles`` article URLs from the Pybites API.

        Returns an empty list (after printing an error) on any HTTP failure.
        """
        try:
            # Use HEADERS here too, consistent with every other request this
            # class makes (the original omitted them for this call only).
            response = httpx.get(API_URL, headers=HEADERS, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            links = [row["link"] for row in response.json()]
            if self.max_articles is not None:
                links = links[: self.max_articles]
            return links
        except httpx.HTTPError:
            print("Error fetching articles.")
            return []

    def _get_articles_html_content(self) -> dict[str, str]:
        """Download each article page; return a mapping of URL -> HTML text.

        Articles that fail to download are reported and omitted from the map.
        """
        contents = {}
        with httpx.Client(headers=HEADERS, timeout=self.REQUEST_TIMEOUT) as client:
            for article in self.articles:
                try:
                    response = client.get(article)
                    response.raise_for_status()
                    contents[article] = response.text
                except httpx.HTTPError:
                    print(f"Error fetching content for {article}")
        return contents

    @staticmethod
    def get_links_from_article_html(content: str) -> list[str]:
        """Extract all hrefs pointing at a Pybites domain from an article's HTML."""
        soup = BeautifulSoup(content, "html.parser")
        links = [
            link.get("href")
            for link in soup.find_all("a")
            if link.get("href") and urlparse(link.get("href")).netloc in PYBITES_DOMAINS
        ]
        return links

    def check_status_links(self, links: list[str]) -> dict[str, bool]:
        """HEAD-check each unseen link; return URL -> True when the final status is 200.

        Links already checked in a previous call are skipped (deduplicated via
        ``self._seen``) and omitted from the result, as are links whose request
        raises an HTTP error (those are reported instead).
        """
        results = {}
        # follow_redirects: httpx does not follow redirects by default, which
        # would flag every permanently-redirected URL as broken. A redirect
        # that resolves to 200 is not a dead link.
        with httpx.Client(
            headers=HEADERS, timeout=self.REQUEST_TIMEOUT, follow_redirects=True
        ) as client:
            for link in links:
                if link in self._seen:
                    continue
                self._seen.add(link)
                try:
                    response = client.head(link)
                    results[link] = response.status_code == 200
                except httpx.HTTPError:
                    print(f"Error checking status for {link}")
        return results
def main():
    """Print, per article, every internal pybit.es link that is not reachable."""
    checker = ArticleChecker()
    for url, html in checker.contents.items():
        internal_links = checker.get_links_from_article_html(html)
        statuses = checker.check_status_links(internal_links)
        broken = [link for link, ok in statuses.items() if not ok]
        print(f"\n** {url}")
        for link in broken:
            print(f"- {link}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment