A CPython script to collect all self-links from a site.
import chalky
import bs4
import requests

ROOT = "https://wavetro.net"  # Change this to the root of the site you want to crawl.
WANTED_SITE = "wavetro.net"   # Must appear in a URL for it to be scraped. Warning: this is a plain substring check, not a check of the registered root domain.
PRINT_INFO = True  # Whether to print extra info.
gotten = set()  # URLs collected so far.

# Shadow the built-in print so log output can be silenced via PRINT_INFO;
# keep a reference to the real print for the final result dump.
__print = print

def print(*args, **kwargs) -> None:
    if PRINT_INFO:
        __print(*args, **kwargs)
def is_html_page(url: str) -> bool:
    # Skip obvious image links without making a request.
    if url.rsplit(".", 1)[-1].lower() in {"jpg", "png"}:
        return False
    try:
        response = requests.head(url)
    except Exception as error:
        print(f"ERROR: Error while getting {url}: {error}")
        return False
    return "html" in response.headers.get("Content-Type", "")
def add_url(url: str) -> None:
    print("INFO : Got", chalky.fg.green | url)
    gotten.add(url)
def get_urls_from_url(url: str) -> list[str | None]:
    print(f"INFO : Scraping {url}")
    raw_html = requests.get(url).text
    print(f"INFO : Request for {url} done")
    html = bs4.BeautifulSoup(raw_html, features="html5lib")
    # Collect every <a> href; anchors without one yield None, which
    # scrape_links filters out. Root-relative links are resolved against ROOT.
    return [
        (ROOT + u) if u and u.startswith("/") else u
        for u in (a.get("href") for a in html.find_all("a"))
    ]
def scrape_links(url: str | None) -> None:
    if url is None:
        return
    url = url.removesuffix("/")
    if "#" in url:  # Drop fragments so anchored links don't count as new pages.
        url = url.split("#")[0]
    if WANTED_SITE not in url:
        return
    if url in gotten:
        return
    if not is_html_page(url):
        return
    add_url(url)
    for link in get_urls_from_url(url):
        scrape_links(link)
try:
    scrape_links(ROOT)
except KeyboardInterrupt:
    pass  # Ctrl-C stops crawling early; everything gathered so far is still printed.
finally:
    # Print the collected URLs, sorted, once crawling stops.
    __print("\n")
    __print("\n".join(sorted(gotten)))
Note: chalky, bs4, and requests are needed.
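All three are on PyPI; assuming the usual distribution names (bs4 is published as beautifulsoup4, and the html5lib parser the script passes to BeautifulSoup needs its own package), something like the following should install them:

pip install chalky beautifulsoup4 html5lib requests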