Last active
February 8, 2025 04:09
-
-
Save rengler33/f8b9d3f26a518c08a414f6f86109863c to your computer and use it in GitHub Desktop.
How to Capture Network Traffic When Scraping with Selenium & Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# see rkengler.com for related blog post
# https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/
import json
import pprint

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Copy the Chrome defaults before editing them: DesiredCapabilities.CHROME is a
# shared class-level dict, so mutating it in place would change the defaults
# seen by every other user of DesiredCapabilities in this process.
capabilities = DesiredCapabilities.CHROME.copy()

# Ask chromedriver to record every "performance" log entry.
# chromedriver 75+ expects the vendor-prefixed "goog:loggingPrefs" key; older
# versions (< ~75) used the un-prefixed "loggingPrefs" key instead.
capabilities["goog:loggingPrefs"] = {"performance": "ALL"}

# Start Chrome via the chromedriver executable, passing the logging capability.
# NOTE(review): a positional driver path plus `desired_capabilities=` looks
# like the Selenium 3 calling convention; newer Selenium uses
# Service(executable_path=...) and options.set_capability(...) -- confirm
# against the installed version.
driver = webdriver.Chrome(
    r"chromedriver.exe",
    desired_capabilities=capabilities,
)
def process_browser_logs_for_network_events(logs):
    """Yield the logged messages in *logs* that describe network activity.

    Each entry's "message" value is a JSON string. It is decoded, and its
    inner "message" object is yielded when that object's "method" starts with
    "Network.response", "Network.request", or "Network.webSocket".

    Args:
        logs: Iterable of log entries, each a mapping with a JSON-encoded
            "message" value.

    Yields:
        The decoded inner "message" dict of every matching entry, in the
        order the entries appear in *logs*.
    """
    network_prefixes = (
        "Network.response",
        "Network.request",
        "Network.webSocket",
    )
    for entry in logs:
        log = json.loads(entry["message"])["message"]
        # Prefix match, as documented; a substring test would also accept
        # methods that merely contain one of these names somewhere inside.
        if log["method"].startswith(network_prefixes):
            yield log
# Load the page, read back the "performance" log, keep only the network
# events, and pretty-print them to log_entries.txt.
try:
    driver.get("https://www.rkengler.com")
    logs = driver.get_log("performance")
    events = process_browser_logs_for_network_events(logs)
    # Explicit UTF-8 so events containing non-ASCII text cannot fail on
    # platforms whose default text encoding is narrower.
    with open("log_entries.txt", "wt", encoding="utf-8") as out:
        for event in events:
            pprint.pprint(event, stream=out)
finally:
    # Always shut down the browser session, even if a step above failed;
    # otherwise the browser and driver processes are left running.
    driver.quit()
This still works great with the newest versions of Selenium and ChromeDriver. It took a bit of tweaking, but it worked out in the end. Here are the functions I wrote and use when web scraping:
-Function to initialize the driver_instance and return it:
def initialize_chrome_driver_instance(
    profile_id,
    *,
    chrome_driver_path="/home/software/chromedriver-linux64/chromedriver",
):
    """Create a Chrome WebDriver with performance (network) logging enabled.

    Args:
        profile_id: Not read anywhere in this function.
            NOTE(review): presumably meant to select the browser profile or
            debugger address -- confirm with the author.
        chrome_driver_path: Full path to the chromedriver executable. The
            default is the path that was previously hard-coded.

    Returns:
        The ``webdriver.Chrome`` instance, after ``Network.enable`` has been
        sent to it through ``execute_cdp_cmd``.
    """
    logger.info(f"{Colors.CYAN}Initializing{Colors.END}{Colors.YELLOW} chrome-driver{Colors.END} {Colors.CYAN} 'webdriver_instance' for web-automation via chrome...{Colors.END}")

    service = Service(executable_path=chrome_driver_path)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.headless = False  # Set to False if you want to see the browser while running
    # NOTE(review): `debugger_address` is not a parameter and is not defined
    # in the visible source; it must exist at module level or this line
    # raises NameError -- confirm where it comes from.
    chrome_options.add_experimental_option("debuggerAddress", debugger_address)
    # Needs to be added for network logging.
    chrome_options.add_argument("--auto-open-devtools-for-tabs")
    # Capability that turns on the "performance" log used for CDP network logging.
    chrome_options.set_capability("goog:loggingPrefs", {"performance": "ALL"})

    driver_instance = webdriver.Chrome(service=service, options=chrome_options)

    # Enable network logging.
    driver_instance.execute_cdp_cmd('Network.enable', {})

    logger.info(f"{Colors.GREEN}Successfully initialized webdriver_instance:{Colors.END}")
    logger.info(f"{Colors.MAGENTA} {driver_instance} {Colors.END}")
    logger.info(f"{Colors.BLUE}Returning;{Colors.END} {Colors.YELLOW}driver_instance{Colors.END}")
    logger.info(f"{Colors.CYAN}-{Colors.END}{Colors.CYAN} driver_instance:{Colors.END} {Colors.YELLOW}{driver_instance}{Colors.END}")
    # Print a blank line to the terminal.
    print("")
    return driver_instance
if __name__ == "__main__":
    # profile_id is a required parameter, so a bare call raises TypeError.
    # NOTE(review): None is a placeholder -- the function body shown here
    # never reads profile_id; substitute a real profile id if one is needed.
    initialize_chrome_driver_instance(profile_id=None)
# Use these imports for the above code of initializing webdriver
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-Function to capture traffic from specified url:
def intercept_traffic_check_log_auth(
    driver_instance,
    url="https://example.com",
    wait_seconds=5,
):
    """Load *url* and return the driver's "performance" log entries.

    Args:
        driver_instance: Driver object providing ``get(url)`` and
            ``get_log(name)``.
        url: Page to load. Defaults to the address that was previously
            hard-coded.
        wait_seconds: Seconds to pause after ``get()`` returns, allowing some
            time for traffic to load before the log is read. A value of 0 or
            less skips the pause.

    Returns:
        Whatever ``driver_instance.get_log("performance")`` returns, i.e. all
        the traffic captured, unfiltered.
    """
    driver_instance.get(url)
    # Allow some time for traffic to load.
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver_instance.get_log("performance")
if __name__ == "__main__":
    # intercept_traffic_check_log_auth() requires a driver, so a bare call
    # raises TypeError; build the driver first and pass it in.
    # NOTE(review): profile_id=None is a placeholder -- the initializer body
    # shown in this file never reads profile_id; substitute a real value if
    # one is needed.
    intercept_traffic_check_log_auth(
        initialize_chrome_driver_instance(profile_id=None)
    )
Thanks so much to the author of this post I found on Google: "https://www.rkengler.com/how-to-capture-network-traffic-when-scraping-with-selenium-and-python/" Amazing!
Have a blessed day ^_^
Does this bring back the actual network response? I'm seeing headers, status codes, and other metadata, but no actual response data 🥲
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi @masummuhammad, I found a way with Selenium >4. It is slightly different, but the spirit is the same.