e-dreyer · August 2, 2023 12:01
diff --git a/bs4_async_find_test.py b/bs4_async_find_test.py
 import asyncio
 import time
 import aiohttp
 import html5lib
 from bs4 import BeautifulSoup, ResultSet, Tag, PageElement

 from typing import List, Dict, AnyStr, Any

 # URL of the discuss.python.org "/latest" page.
 # NOTE(review): nothing in the code visible here reads this constant;
 # getLatestPostsPage declares its own local copy of the same URL. Confirm it
 # is unused elsewhere before removing it.
 SELECTED_URL = "https://discuss.python.org/latest"

 async def getPageContent(url: str) -> BeautifulSoup | None:
    """Download *url* and parse the response body with the html5lib parser.

    A fresh aiohttp ClientSession is opened for the single request.

    Returns:
        The parsed document when the response status is 200, otherwise None.
    """
    async with aiohttp.ClientSession() as http, http.get(url) as response:
        # Guard clause: any status other than 200 produces no document.
        if response.status != 200:
            return None

        raw_body = await response.read()
        markup = raw_body.decode('utf-8')
        return BeautifulSoup(markup, 'html5lib')

 async def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
    """Coroutine wrapper that forwards to ``soup.find_all(name, class_=class_)``.

    The body contains no ``await``: the lookup itself runs synchronously and
    its result is handed back unchanged.
    """
    return soup.find_all(name, class_=class_)

 async def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement:
    """Coroutine wrapper that forwards to ``soup.find(name, class_=class_)``.

    The body contains no ``await``: the lookup itself runs synchronously and
    its result is handed back unchanged.

    NOTE(review): ``soup.find`` presumably yields None when nothing matches,
    which the return annotation does not express - confirm against callers.
    """
    return soup.find(name, class_=class_)

 async def getLatestPostsPage() -> None:
    """Fetch the "/latest" listing and download every linked topic page concurrently.

    The listing is fetched with getPageContent. For each ``tr.topic-list-item``
    row, the topic anchor's ``href`` is read and a task fetching that page is
    scheduled. All tasks are awaited, then the set of tasks is printed.

    If the listing cannot be fetched (getPageContent returned None), no topic
    pages are requested and an empty set is printed. Rows without a matching
    anchor or without an ``href`` are skipped.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    LATEST_PAGE_URL = "https://discuss.python.org/latest"
    latest_page_soup = await getPageContent(LATEST_PAGE_URL)

    # Without a fallback value, a failed listing fetch would leave topics_soup
    # unbound and the loop below would raise UnboundLocalError.
    if latest_page_soup is None:
        topics_soup = []
    else:
        topics_soup = await findAllTags(latest_page_soup, "tr", class_="topic-list-item")

    page_tasks = set()
    for topic_soup in topics_soup:
        # Get URL
        topic_url_element = await findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
        if topic_url_element is None:
            # Row has no topic link; nothing to fetch.
            continue
        topic_url = topic_url_element.get("href")
        if not topic_url:
            continue

        # Get page data. urljoin leaves absolute links untouched and resolves
        # relative ones against the listing URL.
        task = asyncio.create_task(getPageContent(urljoin(LATEST_PAGE_URL, topic_url)))
        page_tasks.add(task)

    # Wait for every download to finish before reporting.
    await asyncio.gather(*page_tasks)

    print(page_tasks)
    
 if __name__ == "__main__":
    # Time the whole run. perf_counter is a monotonic, high-resolution clock,
    # so the measured interval cannot be skewed by system clock adjustments
    # the way a time.time() difference can.
    start_time = time.perf_counter()
    asyncio.run(getLatestPostsPage())
    end_time = time.perf_counter()

    # Elapsed seconds.
    print(end_time-start_time)
diff --git a/bs4_normal_find_test.py b/bs4_normal_find_test.py
 import asyncio
 import time
 import aiohttp
 import html5lib
 from bs4 import BeautifulSoup, ResultSet, Tag, PageElement

 from typing import List, Dict, AnyStr, Any

 # URL of the discuss.python.org "/latest" page.
 # NOTE(review): nothing in the code visible here reads this constant;
 # getLatestPostsPage declares its own local copy of the same URL. Confirm it
 # is unused elsewhere before removing it.
 SELECTED_URL = "https://discuss.python.org/latest"

 async def getPageContent(url: str) -> BeautifulSoup | None:
    """Request *url* and return its body parsed with html5lib.

    Each call opens its own aiohttp ClientSession for the one request.

    Returns:
        A BeautifulSoup document for a 200 response; None for any other status.
    """
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as reply:
            succeeded = reply.status == 200
            if not succeeded:
                return None

            payload = await reply.read()
            return BeautifulSoup(payload.decode('utf-8'), 'html5lib')

 def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
    """Forward to ``soup.find_all(name, class_=class_)`` and return its result unchanged."""
    return soup.find_all(name, class_=class_)

 def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement:
    """Forward to ``soup.find(name, class_=class_)`` and return its result unchanged.

    NOTE(review): ``soup.find`` presumably yields None when nothing matches,
    which the return annotation does not express - confirm against callers.
    """
    return soup.find(name, class_=class_)

 async def getLatestPostsPage() -> None:
    """Fetch the "/latest" listing and download every linked topic page, one at a time.

    The listing is fetched with getPageContent. For each ``tr.topic-list-item``
    row, the topic anchor's ``href`` is read and that page is fetched and
    awaited before moving on to the next row. The set of fetched results
    (parsed documents, or None for failed fetches) is printed at the end.

    If the listing cannot be fetched (getPageContent returned None), no topic
    pages are requested and an empty set is printed. Rows without a matching
    anchor or without an ``href`` are skipped.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    LATEST_PAGE_URL = "https://discuss.python.org/latest"
    latest_page_soup = await getPageContent(LATEST_PAGE_URL)

    # Without a fallback value, a failed listing fetch would leave topics_soup
    # unbound and the loop below would raise UnboundLocalError.
    if latest_page_soup is None:
        topics_soup = []
    else:
        topics_soup = findAllTags(latest_page_soup, "tr", class_="topic-list-item")

    fetched_pages = set()
    for topic_soup in topics_soup:
        # Get URL
        topic_url_element = findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
        if topic_url_element is None:
            # Row has no topic link; nothing to fetch.
            continue
        topic_url = topic_url_element.get("href")
        if not topic_url:
            continue

        # Get page data. Awaiting here keeps the downloads strictly sequential.
        # urljoin leaves absolute links untouched and resolves relative ones
        # against the listing URL.
        page_soup = await getPageContent(urljoin(LATEST_PAGE_URL, topic_url))
        fetched_pages.add(page_soup)

    print(fetched_pages)
    
 if __name__ == "__main__":
    # Time the whole run. perf_counter is a monotonic, high-resolution clock,
    # so the measured interval cannot be skewed by system clock adjustments
    # the way a time.time() difference can.
    start_time = time.perf_counter()
    asyncio.run(getLatestPostsPage())
    end_time = time.perf_counter()

    # Elapsed seconds.
    print(end_time-start_time)
	import asyncio
	import time
	import aiohttp
	import html5lib
	from bs4 import BeautifulSoup, ResultSet, Tag, PageElement

	from typing import List, Dict, AnyStr, Any

	# URL of the discuss.python.org "/latest" page.
	# NOTE(review): nothing in the code visible here reads this constant;
	# getLatestPostsPage declares its own local copy of the same URL. Confirm it
	# is unused elsewhere before removing it.
	SELECTED_URL = "https://discuss.python.org/latest"

	async def getPageContent(url: str) -> BeautifulSoup | None:
	    """Download *url* and parse the response body with the html5lib parser.

	    A fresh aiohttp ClientSession is opened for the single request.

	    Returns:
	        The parsed document when the response status is 200, otherwise None.
	    """
	    async with aiohttp.ClientSession() as session:
	        async with session.get(url) as resp:
	            if resp.status == 200:
	                return BeautifulSoup((await resp.read()).decode('utf-8'), 'html5lib')
	            # Any other status produces no document.
	            return None

	async def findAllTags(soup: BeautifulSoup, name: str, class_: str) -> ResultSet[Any]:
	    """Coroutine wrapper that forwards to ``soup.find_all(name, class_=class_)``.

	    The body contains no ``await``: the lookup itself runs synchronously and
	    its result is handed back unchanged.
	    """
	    result = soup.find_all(name, class_=class_)
	    return result

	async def findTag(soup: BeautifulSoup, name: str, class_: str) -> PageElement:
	    """Coroutine wrapper that forwards to ``soup.find(name, class_=class_)``.

	    The body contains no ``await``: the lookup itself runs synchronously and
	    its result is handed back unchanged.

	    NOTE(review): ``soup.find`` presumably yields None when nothing matches,
	    which the return annotation does not express - confirm against callers.
	    """
	    result = soup.find(name, class_=class_)
	    return result

	async def getLatestPostsPage() -> None:
	    """Fetch the "/latest" listing and download every linked topic page concurrently.

	    The listing is fetched with getPageContent. For each ``tr.topic-list-item``
	    row, the topic anchor's ``href`` is read and a task fetching that page is
	    scheduled. All tasks are awaited, then the set of tasks is printed.

	    If the listing cannot be fetched (getPageContent returned None), no topic
	    pages are requested and an empty set is printed. Rows without a matching
	    anchor or without an ``href`` are skipped.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import urljoin

	    LATEST_PAGE_URL = "https://discuss.python.org/latest"
	    latest_page_soup = await getPageContent(LATEST_PAGE_URL)

	    # Without a fallback value, a failed listing fetch would leave topics_soup
	    # unbound and the loop below would raise UnboundLocalError.
	    if latest_page_soup is None:
	        topics_soup = []
	    else:
	        topics_soup = await findAllTags(latest_page_soup, "tr", class_="topic-list-item")

	    page_tasks = set()
	    for topic_soup in topics_soup:
	        # Get URL
	        topic_url_element = await findTag(topic_soup, "a", class_="title raw-link raw-topic-link")
	        if topic_url_element is None:
	            # Row has no topic link; nothing to fetch.
	            continue
	        topic_url = topic_url_element.get("href")
	        if not topic_url:
	            continue

	        # Get page data. urljoin leaves absolute links untouched and resolves
	        # relative ones against the listing URL.
	        task = asyncio.create_task(getPageContent(urljoin(LATEST_PAGE_URL, topic_url)))
	        page_tasks.add(task)

	    # Wait for every download to finish before reporting.
	    await asyncio.gather(*page_tasks)

	    print(page_tasks)

	if __name__ == "__main__":
	    # Time the whole run. perf_counter is a monotonic, high-resolution clock,
	    # so the measured interval cannot be skewed by system clock adjustments
	    # the way a time.time() difference can.
	    start_time = time.perf_counter()
	    asyncio.run(getLatestPostsPage())
	    end_time = time.perf_counter()

	    # Elapsed seconds.
	    print(end_time-start_time)