Easy Python script to run selenium web workers/browsers in parallel
""" | |
This is an adaptable example script for using selenium across multiple webbrowsers simultaneously. This makes use of | |
two queues - one to store idle webworkers and another to store data to pass to any idle webworkers in a selenium function | |
""" | |
from multiprocessing import Queue, cpu_count
from threading import Thread
from selenium import webdriver
from time import sleep
from numpy.random import randint
import logging

logger = logging.getLogger(__name__)
# Some example data to pass to the selenium processes; each item i will just cause a sleep of i seconds
# This data can be a list of any datatype that can be pickled
selenium_data = [4, 2, 3, 3, 4, 3, 4, 3, 1, 2, 3, 2, 'STOP']
# Create the two queues to hold the data and the IDs for the selenium workers
selenium_data_queue = Queue()
worker_queue = Queue()
# Create Selenium workers and assign each one a worker ID
# This ID is what gets put on the queue, as Selenium workers cannot be pickled
# By default, make one selenium worker per cpu core with cpu_count
# TODO: Change the worker creation code to be your webworker of choice e.g. PhantomJS
worker_ids = list(range(cpu_count()))
selenium_workers = {i: webdriver.Chrome() for i in worker_ids}
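# A hedged alternative to the line above (assumes the Selenium 4+ options API):
# run Chrome headless so the pool works on machines without a display.
# Uncomment to use; not part of the original gist.
# from selenium.webdriver.chrome.options import Options
# _chrome_opts = Options()
# _chrome_opts.add_argument("--headless=new")
# selenium_workers = {i: webdriver.Chrome(options=_chrome_opts) for i in worker_ids}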
for worker_id in worker_ids:
    worker_queue.put(worker_id)
def selenium_task(worker, data):
    """
    This is a demonstration selenium function that takes a worker and data and then does something with the worker and
    data.
    TODO: change the below code to be whatever it is you want your worker to do e.g. scrape webpages or run browser tests
    :param worker: A selenium web worker NOT a worker ID
    :type worker: webdriver.XXX
    :param data: Any data for your selenium function (must be pickleable)
    :rtype: None
    """
    worker.set_window_size(randint(100, 200), randint(200, 400))
    logger.info("Getting page")
    worker.get('https://ytroulette.com')
    logger.info("Sleeping")
    sleep(data)
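# A hedged sketch of a more realistic task (assumes each queue item would be a
# URL string rather than a sleep time); not part of the original gist:
# def selenium_scrape_task(worker, url):
#     worker.get(url)
#     logger.info("Page title for %s: %s", url, worker.title)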
def selenium_queue_listener(data_queue, worker_queue):
    """
    Monitor a data queue and assign new pieces of data to any available web worker
    :param data_queue: The python FIFO queue containing the data to run on the web worker
    :type data_queue: Queue
    :param worker_queue: The queue that holds the IDs of any idle workers
    :type worker_queue: Queue
    :rtype: None
    """
    logger.info("Selenium func worker started")
    while True:
        current_data = data_queue.get()
        if current_data == 'STOP':
            # If a stop is encountered then kill the current worker thread and put the stop back onto the queue
            # to poison other workers listening on the queue
            logger.warning("STOP encountered, killing worker thread")
            data_queue.put(current_data)
            break
        else:
            logger.info(f"Got the item {current_data} on the data queue")
            # Get the ID of a currently free worker from the worker queue
            worker_id = worker_queue.get()
            worker = selenium_workers[worker_id]
            # Pass the current worker and current data to your selenium function
            selenium_task(worker, current_data)
            # Put the worker back into the worker queue as it has completed its task
            worker_queue.put(worker_id)
    return
# Create one new queue listener thread per selenium worker and start them
logger.info("Starting selenium background threads")
selenium_processes = [Thread(target=selenium_queue_listener,
                             args=(selenium_data_queue, worker_queue)) for _ in worker_ids]
for p in selenium_processes:
    p.daemon = True
    p.start()
# Add each item of data to the data queue; this could be done over time so long as the selenium queue listening
# threads are still running
logger.info("Adding data to data queue")
for d in selenium_data:
    selenium_data_queue.put(d)
# Wait for all selenium queue listening threads to complete; this happens when the queue listener returns
logger.info("Waiting for Queue listener threads to complete")
for p in selenium_processes:
    p.join()
# Quit all the web workers elegantly in the background
logger.info("Tearing down web workers")
for b in selenium_workers.values():
    b.quit()
Great job! Thanks!
Thanks, this is such a helpful tool!
Wow wow woW
Thanks for this script.
How should I collect the data from each thread and write it to a CSV file?
I can't get threading.Lock() to work... :(
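One pattern that might help (a minimal sketch, not from the gist; the file name, helper name, and row format are all assumptions): share a single threading.Lock and csv.writer across the listener threads, and take the lock around each write so only one thread touches the file at a time.

import csv
import threading

csv_lock = threading.Lock()
csv_file = open('results.csv', 'w', newline='')
csv_writer = csv.writer(csv_file)

def write_row_threadsafe(row):
    # Only one thread may use the shared writer at a time
    with csv_lock:
        csv_writer.writerow(row)

selenium_task could then call write_row_threadsafe([...]) with whatever it scraped, and you would close csv_file after the join loop at the end of the script.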
This is great, thanks.
I'm also returning the results with
class ThreadWithReturnValue(Thread):
    def __init__(self, group=None, target=None, name=None,
                 args=(), kwargs={}, Verbose=None):
        Thread.__init__(self, group, target, name, args, kwargs)
        self._return = None

    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, *args):
        Thread.join(self, *args)
        return self._return
Got it from this post: StackOverflow
For this to work you need line 81 to return a value; I'm returning a list that I extend with the return value of line 78.
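A minimal usage sketch (assuming selenium_queue_listener is changed to collect its selenium_task results into a list and return it; the variable names here are illustrative, not from the gist):

listener_threads = [ThreadWithReturnValue(target=selenium_queue_listener,
                                          args=(selenium_data_queue, worker_queue))
                    for _ in worker_ids]
for t in listener_threads:
    t.start()

all_results = []
for t in listener_threads:
    # join() now hands back whatever selenium_queue_listener returned
    all_results.extend(t.join())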
This is awesome man :)
Thanks for this useful piece