Last active
August 28, 2025 08:22
-
-
Save cb109/a324a6bfb49f875ae5721929590909a8 to your computer and use it in GitHub Desktop.
Reusable Chrome Instance for Puppeteer using Django Model and Management Command
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# systemd unit that runs the Django "chromeworker" management command, which
# launches a Chrome instance and keeps it available for reuse.
[Unit]
Description=Chrome Browser Worker
After=network.target

[Service]
Environment=PYTHONUNBUFFERED=1
Environment=DJANGO_DEPLOYMENT_TYPE=prod
User=www-data
Group=www-data
WorkingDirectory=/opt/myproject
# manage.py is executed directly, so it must be executable and carry a shebang.
ExecStart=/opt/myproject/manage.py chromeworker
Restart=on-failure
# Stop with SIGINT instead of the default SIGTERM: the command handles the
# resulting KeyboardInterrupt to mark its ChromeWorker row as stopped.
KillSignal=SIGINT
# Terminate the service after seven days of runtime.
# NOTE(review): systemd documents this as a failure state, so Restart=on-failure
# should bring up a fresh worker afterwards - confirm on the target systemd version.
RuntimeMaxSec=7d

# Without an [Install] section "systemctl enable" cannot hook the unit into boot.
[Install]
WantedBy=multi-user.target
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Launch a Chrome and make it available through the ORM. | |
Puppeteer scripts can make use of that running browser instance by
calling .connect() against the ws_endpoint_url. This lets them skip the
startup time involved in launching a new browser from scratch.
Note: This command blocks until cancelled and tries to clean up the
database afterwards, so 'dead' ChromeWorkers won't pile up. To make
this work, set up this script as a systemd service using
KillSignal=SIGINT (which translates to a KeyboardInterrupt in here)
instead of SIGTERM, see:
https://alexandra-zaharia.github.io/posts/stopping-python-systemd-service-cleanly
""" | |
import json | |
import logging | |
import socket | |
import subprocess | |
import tempfile | |
import threading | |
import time | |
import uuid | |
from typing import List, Optional | |
from django.core.management.base import BaseCommand | |
from django.db import close_old_connections | |
from django.template.loader import render_to_string | |
from myproject.myapp.models import ChromeWorker | |
logger = logging.getLogger(__name__) | |
class LaunchChromeThread(threading.Thread):
    """Thread that launches Chrome by running a rendered puppeteer script in node.

    The script is rendered from ``TEMPLATE_NAME`` and executed through
    ``node --eval``. It is given ``remote_info_filepath`` as the location to
    write the browser's process ID and websocket endpoint to, which the
    launching thread can then read.

    Args:
        remote_info_filepath: Path handed to the script for its output file.
        headless: Whether Chrome should be started without a visible GUI.
        *args, **kwargs: Passed through to ``threading.Thread``.
    """

    TEMPLATE_NAME: str = "chromeworker/puppeteer_launch_chrome.js"
    EXECUTABLE_PATH: str = "google-chrome"
    LAUNCH_ARGUMENTS = (
        # ATTENTION: Order matters here, it seems --user-data-dir MUST come
        # before --disable-web-security, otherwise that flag is just
        # silently ignored, so do not reorder!
        "--user-data-dir=/tmp/",
        "--disable-web-security",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        # Additional render options below
        "--hide-scrollbars",
        "--browser-test",
        "--disable-background-networking",
        "--run-all-compositor-stages-before-draw",
        "--disable-new-content-rendering-timeout",
        "--disable-threaded-animation",
        "--disable-threaded-scrolling",
        "--disable-checker-imaging",
        "--disable-image-animation-resync",
        # Prevents Chrome from using /dev/shm (shared memory). Docker
        # containers often have limited shared memory (64MB default),
        # causing Chrome to crash when it runs out.
        "--disable-dev-shm-usage",
        # Disables GPU hardware acceleration. Prevents GPU-related crashes
        # in headless environments.
        "--disable-gpu",
        # Disables CPU-based rendering fallback. Can prevent hangs.
        "--disable-software-rasterizer",
        # Disables site isolation. Reduces process count and memory usage.
        "--disable-features=IsolateOrigins,site-per-process",
        # Hides automation indicators from JavaScript detection.
        "--disable-blink-features=AutomationControlled",
    )

    def __init__(
        self, remote_info_filepath: str, *args, headless: bool = True, **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.remote_info_filepath = remote_info_filepath
        self.headless = headless

    def run(self):
        """Render the launcher script and execute it with node.

        Raises:
            subprocess.CalledProcessError: If node exits with a non-zero
                status. Its captured stderr is logged first, since the
                exception message itself does not include it.
        """
        javascript_code: str = render_to_string(
            self.TEMPLATE_NAME,
            {
                "headless": self.headless,
                "launch_arguments": self.LAUNCH_ARGUMENTS,
                "executable_path": self.EXECUTABLE_PATH,
                "remote_info_filepath": self.remote_info_filepath,
            },
        )
        command: List[str] = ["node", "--eval", javascript_code]
        try:
            # This blocks, but since it's within a thread the main
            # thread can analyse its output and continue.
            subprocess.run(command, capture_output=True, check=True)
        except subprocess.CalledProcessError as error:
            # The output is captured, so without this the reason for a failed
            # launch would be lost and only show up as a timeout elsewhere.
            stderr_text = (error.stderr or b"").decode(errors="replace")
            logger.error(
                "Chrome launcher script failed with exit code %s: %s",
                error.returncode,
                stderr_text,
            )
            raise
def _wait_for_remote_info(
    filepath: str,
    worker_uuid: str,
    timeout_seconds: float = 10.0,
    poll_seconds: float = 0.5,
) -> dict:
    """Poll ``filepath`` until it contains parseable JSON and return that data.

    Raises:
        RuntimeError: If no valid JSON could be read within ``timeout_seconds``.
    """
    deadline = time.monotonic() + timeout_seconds
    while True:
        try:
            with open(filepath) as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            # The file is still missing, empty or only partially written.
            pass
        if time.monotonic() >= deadline:
            raise RuntimeError(
                f"Timed out trying to read Chrome Worker '{worker_uuid}' remote_info"
            )
        time.sleep(poll_seconds)


def launch_chrome_browser(worker_uuid: str, headless: bool = True) -> Optional[dict]:
    """Launch Chrome, register it as a ChromeWorker and block until interrupted.

    A LaunchChromeThread starts the browser and reports its process ID and
    websocket endpoint through a temporary JSON file. Once that file can be
    read, a ChromeWorker row is created for ``worker_uuid`` and this function
    blocks until an exception (e.g. KeyboardInterrupt) is raised in it.

    Args:
        worker_uuid: Identifier stored on the ChromeWorker and used in the
            temporary file name.
        headless: Whether Chrome should run without a visible GUI.

    Returns:
        Never returns normally; it only exits by raising.

    Raises:
        RuntimeError: If the remote info could not be read in time.
    """
    print(f"Launching Chrome Worker '{worker_uuid}'...")

    # The context manager removes the temp file again however we leave.
    with tempfile.NamedTemporaryFile(
        prefix=f"remote_info.{worker_uuid}.", suffix=".json"
    ) as remote_info_file:
        # Start thread that launches a Chrome instance via our puppeteer script.
        thread = LaunchChromeThread(remote_info_file.name, headless=headless)
        thread.start()

        # Read remote_info data from the output file written by the thread.
        remote_info = _wait_for_remote_info(remote_info_file.name, worker_uuid)

        # Create a ChromeWorker instance.
        worker, _ = ChromeWorker.objects.get_or_create(
            uuid=worker_uuid,
            pid=remote_info["processId"],
            hostname=socket.gethostname(),
            ws_endpoint_url=remote_info["wsEndpoint"],
        )
        logger.info("Created ChromeWorker: %s", worker)

        # Block until exit is triggered from the outside. Sleeping instead of
        # spinning keeps the process from burning a full CPU core.
        while True:
            time.sleep(1.0)
class Command(BaseCommand):
    """Management command that runs a long-lived, reusable Chrome instance."""

    help = (
        "Launch an instance of Google Chrome via puppeteer and keep it alive, "
        "allowing other services to reuse it via its browser.wsEndpoint(), so they "
        "don't have the browser startup time on every call."
    )

    def add_arguments(self, parser):
        """Register the optional ``--headful`` switch."""
        headful_help = (
            "Start launcher in non-headless mode, aka with visible browser GUI, "
            "helpful with debugging"
        )
        parser.add_argument(
            "--headful",
            dest="headful",
            action="store_true",
            default=False,
            help=headful_help,
        )

    def handle(self, *args, **options):
        """Launch the browser and mark its ChromeWorker as stopped afterwards.

        The worker is marked as stopped whether the launch call is interrupted
        via KeyboardInterrupt or fails with another exception.
        """
        run_headless: bool = not options["headful"]
        worker_uuid: str = str(uuid.uuid4())

        def mark_worker_stopped() -> None:
            # Avoid 'MySQL server has gone away' error.
            close_old_connections()
            try:
                worker = ChromeWorker.objects.get(uuid=worker_uuid)
            except ChromeWorker.DoesNotExist:
                # The worker was never created, so there is nothing to stop.
                return
            logger.warning(
                "Stopping ChromeWorker as it looks like its associated "
                "browser has been closed: %s",
                worker,
            )
            worker.stop()

        try:
            launch_chrome_browser(worker_uuid, headless=run_headless)
        except KeyboardInterrupt:
            logger.info("The chromeworker command has been stopped on purpose.")
        finally:
            mark_worker_stopped()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime

from django.db import models
from django.utils import timezone
class ChromeWorker(models.Model):
    """Represent a running Chrome instance that can be connected to.

    Note: The Chrome instance needs to run on our main host, so that
    it is reachable on the same machine handling associated
    requests.
    """

    # Result of uuid.uuid4() as a string, e.g. to associate file names.
    uuid = models.CharField(max_length=64)

    # The process ID of the associated active Chrome process.
    pid = models.PositiveIntegerField()

    # The host where the process runs on.
    hostname = models.CharField(max_length=128)

    # The Chrome debugging protocol URL used to connect to this Chrome instance.
    ws_endpoint_url = models.CharField(max_length=512)

    # Track how often calls to this worker were made.
    num_used = models.PositiveIntegerField(default=0)

    # Set by stop(); stays None until then.
    stopped_at = models.DateTimeField(blank=True, null=True, default=None)

    def __str__(self) -> str:
        return f"ChromeWorker uuid={self.uuid} pid={self.pid}"

    def stop(self):
        """Record the current time as ``stopped_at`` and persist only that field."""
        # timezone.now() is aware when USE_TZ is enabled and naive otherwise,
        # which avoids Django's naive-datetime warning of datetime.now().
        self.stopped_at = timezone.now()
        self.save(update_fields=["stopped_at"])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Launch a long-lived Chrome meant to be reused by other clients. | |
* | |
* It will write its remote URL and process ID to the given filepath for | |
* the caller to inspect. | |
*/ | |
const fs = require('fs'); | |
const puppeteer = require('puppeteer'); | |
(async () => { | |
const browser = await puppeteer.launch({ | |
headless: {% if headless %}'new'{% else %}false{% endif %}, | |
executablePath: '{{ executable_path }}', | |
args: [ | |
{% for arg in launch_arguments %} | |
'{{ arg }}',{% endfor %} | |
], | |
}); | |
const remoteInfo = { | |
processId: browser.process().pid, | |
wsEndpoint: browser.wsEndpoint(), | |
}; | |
fs.writeFileSync( | |
'{{ remote_info_filepath }}', JSON.stringify(remoteInfo, null, 2) | |
); | |
// Leave browser alive/open. | |
await browser.disconnect(); | |
})(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment