Created
August 2, 2023 12:01
-
-
Save e-dreyer/058b9e39c0aec1aeab3a57456de2bb15 to your computer and use it in GitHub Desktop.
BeautifulSoup: async find and find_all
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import time | |
import aiohttp | |
import html5lib | |
from bs4 import BeautifulSoup, ResultSet, Tag, PageElement | |
from typing import List, Dict, AnyStr, Any | |
# Address of the forum's "latest topics" listing.
# NOTE(review): not referenced by the code visible in this file; the crawler
# function defines its own copy of this URL -- confirm whether this is needed.
SELECTED_URL = "https://discuss.python.org/latest"
async def getPageContent(url: str) -> BeautifulSoup | None:
    """Download ``url`` and parse the response body as HTML.

    A fresh ``aiohttp.ClientSession`` is opened (and closed) on every call.

    Args:
        url: Address of the page to fetch.

    Returns:
        The document parsed with the ``html5lib`` parser when the server
        answers with status 200, otherwise ``None``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            if resp.status != 200:
                return None
            # Let aiohttp choose the charset (Content-Type header first)
            # instead of assuming UTF-8, which raised UnicodeDecodeError on
            # pages served in any other encoding.
            return BeautifulSoup(await resp.text(), 'html5lib')
async def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
    """Return every ``name`` element below ``soup`` whose CSS class is ``class_``.

    This coroutine contains no ``await``; it only forwards to
    ``soup.find_all`` and hands back whatever that call returns.
    """
    return soup.find_all(name, class_=class_)
async def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement | None:
    """Return the first ``name`` element below ``soup`` with CSS class ``class_``.

    This coroutine contains no ``await``; it only forwards to ``soup.find``.

    Returns:
        The first matching element, or ``None`` when nothing matches. The
        return annotation now says so, so callers are prompted to check
        before using the result.
    """
    return soup.find(name, class_=class_)
async def getLatestPostsPage() -> None:
    """Fetch the "latest" topic list, then fetch every linked topic concurrently.

    Each ``tr.topic-list-item`` row is searched for its title link; one task
    per link is scheduled with ``getPageContent`` and all tasks are awaited
    together. The set of finished tasks is printed at the end. Nothing is
    printed when the listing page itself cannot be loaded.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    LATEST_PAGE_URL = "https://discuss.python.org/latest"

    latest_page_soup = await getPageContent(LATEST_PAGE_URL)
    if latest_page_soup is None:
        return

    topics_soup = await findAllTags(latest_page_soup, "tr", class_="topic-list-item")

    page_tasks = set()
    for topic_soup in topics_soup:
        # Get URL
        topic_url_element = await findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
        if topic_url_element is None:
            # Row without a title link: calling .get on None would raise
            # AttributeError and abort the whole crawl.
            continue
        topic_url = topic_url_element.get("href")
        if not topic_url:
            continue

        # Get page data. urljoin leaves absolute links untouched and resolves
        # relative ones against the listing URL.
        task = asyncio.create_task(getPageContent(urljoin(LATEST_PAGE_URL, topic_url)))
        page_tasks.add(task)

    # Wait for all downloads at once rather than awaiting task by task.
    await asyncio.gather(*page_tasks)
    print(page_tasks)
if __name__ == "__main__":
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() follows the wall clock and can jump
    # if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    asyncio.run(getLatestPostsPage())
    end_time = time.perf_counter()
    # Elapsed seconds for the whole crawl.
    print(end_time - start_time)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import time | |
import aiohttp | |
import html5lib | |
from bs4 import BeautifulSoup, ResultSet, Tag, PageElement | |
from typing import List, Dict, AnyStr, Any | |
# Address of the forum's "latest topics" listing.
# NOTE(review): not referenced by the code visible in this file; the crawler
# function defines its own copy of this URL -- confirm whether this is needed.
SELECTED_URL = "https://discuss.python.org/latest"
async def getPageContent(url: str) -> BeautifulSoup | None:
    """Download ``url`` and parse the response body as HTML.

    A fresh ``aiohttp.ClientSession`` is opened (and closed) on every call.

    Args:
        url: Address of the page to fetch.

    Returns:
        The document parsed with the ``html5lib`` parser when the server
        answers with status 200, otherwise ``None``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            if resp.status != 200:
                return None
            # Let aiohttp choose the charset (Content-Type header first)
            # instead of assuming UTF-8, which raised UnicodeDecodeError on
            # pages served in any other encoding.
            return BeautifulSoup(await resp.text(), 'html5lib')
def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
    """Return every ``name`` element below ``soup`` whose CSS class is ``class_``.

    Plain synchronous wrapper: the result of ``soup.find_all`` is handed back
    unchanged.
    """
    return soup.find_all(name, class_=class_)
def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement | None:
    """Return the first ``name`` element below ``soup`` with CSS class ``class_``.

    Plain synchronous wrapper around ``soup.find``.

    Returns:
        The first matching element, or ``None`` when nothing matches. The
        return annotation now says so, so callers are prompted to check
        before using the result.
    """
    return soup.find(name, class_=class_)
async def getLatestPostsPage() -> None:
    """Fetch the "latest" topic list, then fetch every linked topic in turn.

    Each ``tr.topic-list-item`` row is searched for its title link and the
    linked page is downloaded with ``getPageContent`` before moving on to the
    next row. The set of results (parsed pages, or ``None`` for failed
    downloads) is printed at the end. Nothing is printed when the listing
    page itself cannot be loaded.
    """
    # Local import: only this function needs it.
    from urllib.parse import urljoin

    LATEST_PAGE_URL = "https://discuss.python.org/latest"

    latest_page_soup = await getPageContent(LATEST_PAGE_URL)
    if latest_page_soup is None:
        return

    topics_soup = findAllTags(latest_page_soup, "tr", class_="topic-list-item")

    page_tasks = set()
    for topic_soup in topics_soup:
        # Get URL
        topic_url_element = findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
        if topic_url_element is None:
            # Row without a title link: calling .get on None would raise
            # AttributeError and abort the whole crawl.
            continue
        topic_url = topic_url_element.get("href")
        if not topic_url:
            continue

        # Get page data. urljoin leaves absolute links untouched and resolves
        # relative ones against the listing URL.
        # NOTE(review): each download is awaited before the next one starts;
        # switch to asyncio.gather if sequential fetching is not intentional.
        page_soup = await getPageContent(urljoin(LATEST_PAGE_URL, topic_url))
        page_tasks.add(page_soup)

    print(page_tasks)
if __name__ == "__main__":
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() follows the wall clock and can jump
    # if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    asyncio.run(getLatestPostsPage())
    end_time = time.perf_counter()
    # Elapsed seconds for the whole crawl.
    print(end_time - start_time)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment