ddjerqq · December 13, 2022 22:23 · ddjerqq · Dec 13, 2022
diff --git a/recursive url and title extractor.py b/recursive url and title extractor.py
 """recursively extract links and their titles from a web address
 """
 from __future__ import annotations

 import re
 import asyncio as aio

 import aiohttp
 from aiohttp import ClientTimeout


 # First <title> element; tolerant of attributes, upper-case tags and titles
 # that span several lines.
 _TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
 # Value of every href attribute, quoted or not.
 _HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')


 def _parse_title(html: str) -> str | None:
     """Return the stripped text of the first ``<title>`` in *html*, or None."""
     match = _TITLE_RE.search(html)
     return match.group(1).strip() if match else None


 def _parse_links(html: str) -> list[str]:
     """Return every absolute http(s) href found in *html*, in page order."""
     return [
         link
         for link in _HREF_RE.findall(html)
         if link.startswith(("http://", "https://"))
     ]


 async def _crawl(
     session: aiohttp.ClientSession, address: str, depth: int
 ) -> list[dict[str, str | None]]:
     """Fetch *address* through *session* and collect titles *depth* links away."""
     async with session.get(address) as response:
         try:
             html = await response.text()
         except UnicodeDecodeError:
             # The body could not be decoded as text; nothing to extract.
             return []

     # ``<=`` rather than ``==`` so a negative depth cannot recurse forever.
     if depth <= 0:
         return [{"title": _parse_title(html), "link": address}]

     pages = await aio.gather(
         *(_crawl(session, link, depth - 1) for link in _parse_links(html)),
         return_exceptions=True,
     )

     collected: list[dict[str, str | None]] = []
     for page in pages:
         # gather() returns failures as exception objects. CancelledError is a
         # BaseException, so checking only Exception would let it through.
         if isinstance(page, BaseException):
             continue
         collected.extend(page)
     return collected


 async def extract_links(address: str, depth: int = 1) -> list[dict[str, str | None]]:
     """Extract links and their page titles from a web address, recursively.

     Notes:
         A depth of 0 (or less) returns only the title of the page at
         ``address``. For a positive depth, only the pages reached after
         following exactly ``depth`` links are reported; the pages passed
         through on the way are not included.

         Errors raised while fetching ``address`` itself propagate to the
         caller. Linked pages that fail to load are skipped.

     Args:
         address (str): The web address to extract links from.
         depth (int, optional): How many links deep to follow. Defaults to 1.

     Returns:
         list[dict[str, str | None]]: One entry per reported page,
             ``{"title": ..., "link": ...}``. ``title`` is None when the
             page has no ``<title>`` element.
     """
     # One session for the whole crawl, so every request reuses the same
     # connection pool instead of opening a new session per page.
     async with aiohttp.ClientSession(timeout=ClientTimeout(total=60)) as session:
         return await _crawl(session, address, depth)


 async def main():
     """Crawl the gist front page one level deep and print ``title: link`` pairs."""
     for item in await extract_links("https://gist.github.com/", 1):
         # Pages without a usable title are left out of the listing.
         if not item["title"]:
             continue
         print(f"{item['title']}: {item['link']}")


 # Start the crawl only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    aio.run(main())
	"""recursively extract links and their titles from a web address
	"""
	from __future__ import annotations

	import re
	import asyncio as aio

	import aiohttp
	from aiohttp import ClientTimeout


	# First <title> element; tolerant of attributes, upper-case tags and titles
	# that span several lines.
	_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
	# Value of every href attribute, quoted or not.
	_HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')


	def _parse_title(html: str) -> str | None:
	    """Return the stripped text of the first ``<title>`` in *html*, or None."""
	    match = _TITLE_RE.search(html)
	    return match.group(1).strip() if match else None


	def _parse_links(html: str) -> list[str]:
	    """Return every absolute http(s) href found in *html*, in page order."""
	    return [
	        link
	        for link in _HREF_RE.findall(html)
	        if link.startswith(("http://", "https://"))
	    ]


	async def _crawl(
	    session: aiohttp.ClientSession, address: str, depth: int
	) -> list[dict[str, str | None]]:
	    """Fetch *address* through *session* and collect titles *depth* links away."""
	    async with session.get(address) as response:
	        try:
	            html = await response.text()
	        except UnicodeDecodeError:
	            # The body could not be decoded as text; nothing to extract.
	            return []

	    # ``<=`` rather than ``==`` so a negative depth cannot recurse forever.
	    if depth <= 0:
	        return [{"title": _parse_title(html), "link": address}]

	    pages = await aio.gather(
	        *(_crawl(session, link, depth - 1) for link in _parse_links(html)),
	        return_exceptions=True,
	    )

	    collected: list[dict[str, str | None]] = []
	    for page in pages:
	        # gather() returns failures as exception objects. CancelledError is a
	        # BaseException, so checking only Exception would let it through.
	        if isinstance(page, BaseException):
	            continue
	        collected.extend(page)
	    return collected


	async def extract_links(address: str, depth: int = 1) -> list[dict[str, str | None]]:
	    """Extract links and their page titles from a web address, recursively.

	    Notes:
	        A depth of 0 (or less) returns only the title of the page at
	        ``address``. For a positive depth, only the pages reached after
	        following exactly ``depth`` links are reported; the pages passed
	        through on the way are not included.

	        Errors raised while fetching ``address`` itself propagate to the
	        caller. Linked pages that fail to load are skipped.

	    Args:
	        address (str): The web address to extract links from.
	        depth (int, optional): How many links deep to follow. Defaults to 1.

	    Returns:
	        list[dict[str, str | None]]: One entry per reported page,
	            ``{"title": ..., "link": ...}``. ``title`` is None when the
	            page has no ``<title>`` element.
	    """
	    # One session for the whole crawl, so every request reuses the same
	    # connection pool instead of opening a new session per page.
	    async with aiohttp.ClientSession(timeout=ClientTimeout(total=60)) as session:
	        return await _crawl(session, address, depth)


	async def main():
	    """Crawl the gist front page one level deep and print ``title: link`` pairs."""
	    for item in await extract_links("https://gist.github.com/", 1):
	        # Pages without a usable title are left out of the listing.
	        if not item["title"]:
	            continue
	        print(f"{item['title']}: {item['link']}")


	# Start the crawl only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    aio.run(main())