Skip to content

Instantly share code, notes, and snippets.

@e-dreyer
Created August 2, 2023 12:01
Show Gist options
  • Save e-dreyer/058b9e39c0aec1aeab3a57456de2bb15 to your computer and use it in GitHub Desktop.
Save e-dreyer/058b9e39c0aec1aeab3a57456de2bb15 to your computer and use it in GitHub Desktop.
BeautifulSoup async find and find_all
import asyncio
import time
from typing import List, Dict, AnyStr, Any
from urllib.parse import urljoin

import aiohttp
import html5lib
from bs4 import BeautifulSoup, ResultSet, Tag, PageElement
# NOTE(review): defined but never read — getLatestPostsPage hard-codes its own URL.
SELECTED_URL = "https://discuss.python.org/latest"
async def getPageContent(url: str) -> BeautifulSoup | None:
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Returns None for any non-200 status. A fresh ClientSession is opened
    per call, matching the original behaviour.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            if resp.status != 200:
                return None
            # resp.text() decodes using the charset the server declared
            # (falling back sensibly) instead of hard-coding UTF-8, which
            # mis-decodes pages served with another encoding.
            return BeautifulSoup(await resp.text(), "html5lib")
async def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
    """Return every *name* tag in *soup* carrying the CSS class *class_*."""
    return soup.find_all(name, class_=class_)
async def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement:
    """Return the first *name* tag in *soup* with CSS class *class_* (or None)."""
    return soup.find(name, class_=class_)
async def getLatestPostsPage() -> None:
    """Fetch the /latest listing, then fetch every listed topic page concurrently.

    Prints the list of parsed topic-page soups (None for non-200 responses).
    """
    LATEST_PAGE_URL = "https://discuss.python.org/latest"
    latest_page_soup = await getPageContent(LATEST_PAGE_URL)
    if latest_page_soup is None:
        return

    topics_soup = await findAllTags(latest_page_soup, "tr", class_="topic-list-item")
    page_tasks = set()
    for topic_soup in topics_soup:
        topic_url_element = await findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
        if topic_url_element is None:
            # Row without a title link — skip instead of raising AttributeError.
            continue
        topic_url = topic_url_element.get("href")
        if not topic_url:
            continue
        # hrefs may be site-relative; resolve them against the listing URL.
        full_url = urljoin(LATEST_PAGE_URL, topic_url)
        page_tasks.add(asyncio.create_task(getPageContent(full_url)))

    # gather runs all fetches concurrently and returns their results,
    # rather than printing the raw Task objects.
    pages = await asyncio.gather(*page_tasks)
    print(pages)
if __name__ == "__main__":
    # perf_counter is monotonic and the appropriate clock for measuring
    # elapsed wall time; time.time can jump with system clock adjustments.
    start_time = time.perf_counter()
    asyncio.run(getLatestPostsPage())
    end_time = time.perf_counter()
    print(end_time - start_time)
import asyncio
import time
import aiohttp
import html5lib
from bs4 import BeautifulSoup, ResultSet, Tag, PageElement
from typing import List, Dict, AnyStr, Any
# NOTE(review): defined but never read — getLatestPostsPage hard-codes its own URL.
SELECTED_URL = "https://discuss.python.org/latest"
async def getPageContent(url: str) -> BeautifulSoup | None:
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Returns None for any non-200 status. A fresh ClientSession is opened
    per call, matching the original behaviour.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            if resp.status != 200:
                return None
            # resp.text() decodes using the charset the server declared
            # (falling back sensibly) instead of hard-coding UTF-8, which
            # mis-decodes pages served with another encoding.
            return BeautifulSoup(await resp.text(), "html5lib")
def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
    """Return every *name* tag in *soup* carrying the CSS class *class_*."""
    return soup.find_all(name, class_=class_)
def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement:
    """Return the first *name* tag in *soup* with CSS class *class_* (or None)."""
    return soup.find(name, class_=class_)
async def getLatestPostsPage() -> None:
    """Fetch the /latest listing, then fetch every listed topic page concurrently.

    Prints the list of parsed topic-page soups (None for non-200 responses).
    """
    LATEST_PAGE_URL = "https://discuss.python.org/latest"
    latest_page_soup = await getPageContent(LATEST_PAGE_URL)
    if latest_page_soup is None:
        return

    fetches = []
    for topic_soup in findAllTags(latest_page_soup, "tr", class_="topic-list-item"):
        link = findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
        if link is None:
            # Row without a title link — skip instead of raising AttributeError.
            continue
        href = link.get("href")
        if not href:
            continue
        # hrefs may be site-relative; resolve them against the listing URL.
        fetches.append(getPageContent(urljoin(LATEST_PAGE_URL, href)))

    # The original awaited each page inside the loop, serialising every
    # request and defeating asyncio; gather fans them out concurrently.
    pages = await asyncio.gather(*fetches)
    print(pages)
if __name__ == "__main__":
    # perf_counter is monotonic and the appropriate clock for measuring
    # elapsed wall time; time.time can jump with system clock adjustments.
    start_time = time.perf_counter()
    asyncio.run(getLatestPostsPage())
    end_time = time.perf_counter()
    print(end_time - start_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment