Created
August 18, 2023 18:45
-
-
Save clbarnes/33b844bf516a28f9f0125fef5f4c8af8 to your computer and use it in GitHub Desktop.
Async client for fetching HTML content from pages requiring JavaScript execution, using a pool of tabs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio as aio
from contextlib import asynccontextmanager

from playwright.async_api import async_playwright
class BrowserPool: | |
def __init__(self, n_tabs=10, executable=None) -> None: | |
self.executable = executable | |
self.n_tabs = n_tabs | |
self.tabs_remaining = n_tabs | |
self.context = None | |
self.browser = None | |
self.tab_queue = aio.Queue() | |
def _check_active(self): | |
if self.browser is None: | |
raise RuntimeError("Pool is not active") | |
async def open(self): | |
if self.context is None: | |
self.context = async_playwright() | |
playwright = await self.context.__aenter__() | |
self.browser = await playwright.chromium.launch(executable_path=self.executable) | |
return self | |
async def close(self, *args, **kwargs): | |
if self.context is None: | |
return None | |
for _ in range(self.n_tabs - self.tabs_remaining): | |
await self.tab_queue.get() | |
await self.browser.close() | |
self.browser = None | |
ret = await self.context.__aexit__(*args, **kwargs) | |
self.context = None | |
return ret | |
async def __aenter__(self): | |
return await self.open() | |
async def __aexit__(self, *args, **kwargs): | |
return await self.close(*args, **kwargs) | |
@asynccontextmanager | |
async def tab(self): | |
self._check_active() | |
if self.tabs_remaining > 0: | |
try: | |
tab = self.tab_queue.get_nowait() | |
except aio.QueueEmpty: | |
self.tabs_remaining -= 1 | |
tab = await self.browser.new_page() | |
else: | |
tab = await self.tab_queue.get() | |
try: | |
yield tab | |
finally: | |
await self.tab_queue.put(tab) | |
async def get(self, url: str, selector: str | None = None): | |
async with self.tab() as t: | |
await t.goto(url) | |
if selector is not None: | |
await t.wait_for_selector(selector, state="attached") | |
return await t.content() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment