Skip to content

Instantly share code, notes, and snippets.

@ddjerqq
Created December 13, 2022 22:23
Show Gist options
  • Save ddjerqq/dd1f2a1e4efeef5b0f4484cad632b82e to your computer and use it in GitHub Desktop.
Save ddjerqq/dd1f2a1e4efeef5b0f4484cad632b82e to your computer and use it in GitHub Desktop.
A better version of url_extract_and_save.py: recursively extracts URLs from a URL, going one level deeper.
"""recursively extract links and their titles from a web address
"""
from __future__ import annotations
import re
import asyncio as aio
import aiohttp
from aiohttp import ClientTimeout
# Compiled once at import time; both patterns run on every fetched page.
# The title pattern tolerates attributes, upper-case tags and titles that
# span several lines, all of which are common in real-world HTML.
_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
_HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')


async def _crawl(
    session: "aiohttp.ClientSession", address: str, depth: int
) -> list[dict[str, str | None]]:
    """Fetch ``address`` through ``session`` and collect the pages ``depth`` hops away."""
    async with session.get(address) as response:
        try:
            html = await response.text()
        except UnicodeDecodeError:
            # Body is not decodable text (e.g. a binary download): best-effort skip.
            return []

    # ``<=`` rather than ``==`` so a negative depth terminates immediately
    # instead of crawling without bound.
    if depth <= 0:
        found = _TITLE_RE.search(html)
        title = found.group(1).strip() if found else None
        return [{"title": title, "link": address}]

    # dict.fromkeys de-duplicates while keeping first-seen order, so a URL
    # that appears several times on one page is only fetched once.
    links_on_page = dict.fromkeys(
        link for link in _HREF_RE.findall(html) if link.startswith("http")
    )

    pages = await aio.gather(
        *(_crawl(session, link, depth - 1) for link in links_on_page),
        return_exceptions=True,
    )

    # With return_exceptions=True failures come back as values. Filter on
    # BaseException so CancelledError (not an Exception subclass) cannot leak
    # into the result. Each surviving value is already a flat list of entries.
    return [
        entry
        for page in pages
        if not isinstance(page, BaseException)
        for entry in page
    ]


async def extract_links(address: str, depth: int = 1) -> list[dict[str, str | None]]:
    """Extract links and their page titles from a web address, recursively.

    The page at ``address`` is downloaded, every ``href`` value starting with
    ``http`` is followed, and this repeats ``depth`` times. Only the pages
    reached after exactly ``depth`` hops are reported; intermediate pages are
    used for their links only.

    Notes:
        A depth of 0 (or any negative depth) returns only the title of the
        page you provide.
        A single HTTP session is shared by the whole crawl.
        Pages nested below ``address`` that fail to download are silently
        left out of the result.

    Args:
        address (str): The web address to extract links from.
        depth (int, optional): How many levels of links to follow.
            Defaults to 1.

    Returns:
        list[dict[str, str | None]]: One entry per reported page, shaped as
        ``{"title": ..., "link": ...}``. ``title`` is ``None`` when the page
        has no ``<title>`` element. The list is empty when the body of
        ``address`` cannot be decoded as text.

    Raises:
        Exception: Whatever aiohttp raises while requesting ``address``
            itself (connection failure, timeout, ...) is propagated.
    """
    async with aiohttp.ClientSession(timeout=ClientTimeout(total=60)) as session:
        return await _crawl(session, address, depth)
async def main():
    """Crawl the gist front page one level deep and print every titled link."""
    crawled = await extract_links("https://gist.github.com/", 1)
    # Pages without a usable title are skipped rather than printed as "None".
    titled = (entry for entry in crawled if entry["title"])
    for item in titled:
        print(f"{item['title']}: {item['link']}")
if __name__ == "__main__":
    # Only start the event loop when run as a script, never on import.
    aio.run(main())
@ddjerqq
Copy link
Author

ddjerqq commented Dec 13, 2022

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment