Created
January 18, 2023 21:36
-
-
Save weiglemc/81c7befb1ee35c405105036c5632ff82 to your computer and use it in GitHub Desktop.
Python script using selenium-wire to render a webpage and capture specific requests that it generates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run the script on a set of URI-Ms:
#   python3 capture-requests.py < to-request.txt >> requests-log.txt
# process the results and generate a new list of URI-Ms that were requested:
#   awk '{if ($1 ~ /cnn\.com(:80)?[\/]+$/ && $2 == "200") print $0}' requests-log.txt | sort -t '/' -k 5 >! requests.txt
# selenium-wire installation: https://pypi.org/project/selenium-wire/#installation
import sys | |
import time | |
from seleniumwire import webdriver # Import from seleniumwire | |
def interceptor(request):
    """Abort requests for image and font assets.

    Intended for use as a selenium-wire request interceptor. A request whose
    path ends in one of the blocked extensions is aborted; every other
    request is left untouched.

    Args:
        request: the intercepted request; must expose ``path`` (str) and
            ``abort()``.
    """
    # Block PNG, JPEG and GIF images, and WOFF/WOFF2/TrueType fonts.
    blocked_suffixes = (
        '.png', '.jpg', '.jpeg', '.gif',
        '.woff', '.woff2', '.ttf',
    )
    # Lower-case the path so upper-case extensions ('.PNG') are blocked too.
    if request.path.lower().endswith(blocked_suffixes):
        request.abort()
# Regular expressions for the URLs to capture (scopes).
capture_scopes = [
    '.*www.cnn.com/$',
    '.*www.cnn.com:80/$',
    '.*www.cnn.com//$',
    '.*header.*',
    '.*zone-manager.*',
]

# Start a headless Chrome instance through selenium-wire.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')
driver = webdriver.Chrome(options=chrome_options)

# Restrict capturing to the scopes above and let the interceptor drop the
# file extensions we want to ignore.
driver.scopes = capture_scopes
driver.request_interceptor = interceptor
# Read in the URI-Ms we've already requested: the first whitespace-separated
# field of every line in requests.txt. A set keeps the membership test in the
# main loop O(1).
requested = set()
try:
    with open('requests.txt') as fp:
        for line in fp:
            fields = line.split()
            if fields:  # skip blank lines
                requested.add(fields[0])
except FileNotFoundError:
    # First run: requests.txt has not been generated yet, so there is
    # nothing to skip.
    pass
def _response_detail(response):
    """Return the value logged after the status code for *response*.

    A 3xx response reports its Location header when it has one. Otherwise
    the Content-Length header is used, then X-Archive-Orig-Content-Length,
    and finally the literal string "NoContentLength".
    """
    headers = response.headers
    # Chained comparison: the original `>= 300 | status < 400` parsed as
    # `status >= (300 | status) < 400` and missed 304, 307 and 308.
    if 300 <= response.status_code < 400 and headers['location']:
        return headers['location']
    return (headers['content-length']
            or headers['x-archive-orig-content-length']
            or "NoContentLength")


# Load every URI-M given on stdin and log the captured requests to stdout as
# "<url> <status>  <detail>"; progress messages go to stderr.
for url in sys.stdin:
    url = url.rstrip()  # remove trailing newlines
    if not url:
        continue  # ignore blank input lines

    # check to see if we already have data on this URI-M
    if url in requested:
        print(url + " already DONE", file=sys.stderr)
        sys.stderr.flush()
        continue

    print("REQ " + url, end=' ', file=sys.stderr)
    sys.stderr.flush()
    try:
        driver.get(url)
    except Exception as err:
        # A failed page load aborts the whole run; shut the browser down
        # first so it is not left running in the background.
        driver.quit()
        sys.exit(" - EXCEPTION: " + str(err))

    # Access the captured requests via the `requests` attribute
    for request in driver.requests:
        response = request.response
        if not response:
            continue  # nothing to report for a request without a response
        print(request.url, response.status_code, " ", end='')
        print(_response_detail(response))
    sys.stdout.flush()

    print(" - DONE, waiting 10 seconds... ", file=sys.stderr)
    sys.stderr.flush()
    time.sleep(10)  # wait 10 seconds between requests

    # reset driver.requests so the next URI-M starts with an empty capture list
    del driver.requests
# Use quit() rather than close(): close() only closes the current browser
# window, while quit() ends the session and stops the driver service.
driver.quit()
del driver
# avoid warnings about selenium.Service not shutting down in time
time.sleep(3)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment