Created
December 13, 2022 22:23
-
-
Save ddjerqq/dd1f2a1e4efeef5b0f4484cad632b82e to your computer and use it in GitHub Desktop.
A better version of url_extract_and_save.py.
Recursively extracts URLs from a web page and follows them one level deeper.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """recursively extract links and their titles from a web address | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import asyncio as aio | |
| import aiohttp | |
| from aiohttp import ClientTimeout | |
# Compiled once at import time; both patterns are applied to every fetched page.
# The title pattern tolerates attributes, upper-case tags and multi-line titles.
_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
_HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')


async def _crawl(
    session: "aiohttp.ClientSession", address: str, depth: int
) -> list[dict[str, str | None]]:
    """Fetch *address* with *session* and collect titles ``depth`` levels down."""
    async with session.get(address) as response:
        try:
            html = await response.text()
        except UnicodeDecodeError:
            # Body is not decodable text (e.g. a binary download); nothing to extract.
            return []
    # The response is closed here so the connection goes back to the pool
    # before any child pages are requested.

    found = _TITLE_RE.search(html)
    title = found.group(1).strip() if found else None

    # `<=` rather than `==`: a negative depth must stop the crawl as well,
    # otherwise the recursion would never reach its base case.
    if depth <= 0:
        return [{"title": title, "link": address}]

    # Keep absolute links only; dict.fromkeys drops duplicates while keeping
    # first-seen order, so the same URL is not fetched several times per page.
    links_on_page = list(
        dict.fromkeys(
            link for link in _HREF_RE.findall(html) if link.startswith("http")
        )
    )

    outcomes = await aio.gather(
        *(_crawl(session, link, depth - 1) for link in links_on_page),
        return_exceptions=True,
    )

    collected: list[dict[str, str | None]] = []
    for outcome in outcomes:
        # Best effort: a child page that failed (or was cancelled) is skipped.
        # BaseException is checked because asyncio.CancelledError has not been
        # an Exception subclass since Python 3.8.
        if isinstance(outcome, BaseException):
            continue
        # Every child already returns a flat list, so extending is enough.
        collected.extend(outcome)
    return collected


async def extract_links(address: str, depth: int = 1) -> list[dict[str, str | None]]:
    """Extract links and their page titles from a web address, recursively.

    Notes:
        A depth of 0 (or less) returns only the title of the page at
        ``address``. With a positive depth the result holds the pages reached
        after following links ``depth`` levels down; the intermediate pages
        themselves are not included.

        Only links starting with ``http`` are followed. Linked pages that
        fail to load are skipped silently.

    Args:
        address (str): The web address to extract links from.
        depth (int, optional): How many levels of links to follow. Defaults to 1.

    Returns:
        list[dict[str, str | None]]: A flat list of
            ``{"title": ..., "link": ...}`` items. ``title`` is ``None`` when
            the page has no ``<title>`` element.

    Raises:
        Exception: Whatever ``aiohttp`` raises while requesting ``address``
            itself (connection errors, timeouts). Only failures of linked
            pages are suppressed.
    """
    # One session for the whole crawl: connections are pooled and reused
    # instead of opening a fresh session for every page. The 60 second total
    # timeout still applies to each request individually.
    async with aiohttp.ClientSession(timeout=ClientTimeout(total=60)) as session:
        return await _crawl(session, address, depth)
async def main():
    """Crawl the gist front page one level deep and print every titled link."""
    pages = await extract_links("https://gist.github.com/", 1)
    for page in pages:
        # Entries without a usable title are not printed.
        if not page["title"]:
            continue
        print(f"{page['title']}: {page['link']}")
# Run the demo crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    entry_point = main()
    aio.run(entry_point)
Author
ddjerqq
commented
Dec 13, 2022

Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment