weiglemc · January 18, 2023 21:36
diff --git a/capture-requests.py b/capture-requests.py
 # run the script on a set of URI-Ms:
 # python3 capture-requests.py < to-request.txt >> requests-log.txt

 # process the results and generate a new list of URI-Ms that were requested:
 # awk '{if ($1 ~ /cnn\.com(:80)?[\/]+$/ && $2 == "200") print $0}' requests-log.txt | sort -t '/' -k 5 >! requests.txt

 # https://pypi.org/project/selenium-wire/#installation
 import sys
 import time
 from seleniumwire import webdriver  # Import from seleniumwire

 def interceptor(request):
    """Abort requests whose path ends in an image or web-font extension.

    Args:
        request: the intercepted request; must expose a ``path`` string and
            an ``abort()`` method (presumably a selenium-wire request, since
            this function is installed as ``driver.request_interceptor``).
    """
    # Block PNG, JPEG and GIF images, and WOFF/WOFF2/TrueType fonts.
    # TrueType files end in '.ttf' (the previous '.tff' never matched).
    blocked_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.woff', '.woff2', '.ttf')
    if request.path.endswith(blocked_suffixes):
        request.abort()

 # Main script: reads URI-Ms from stdin, loads each one in headless Chrome and
 # prints one line per captured request that has a response:
 #   <request url> <status>   <Location | content-length | NoContentLength>
 # Progress messages go to stderr so stdout can be appended to a log file.

 # read in URI-Ms we've already requested; a set keeps each lookup O(1)
 requested = set()
 try:
    with open('requests.txt') as fp:
        for line in fp:
            # the URI-M is the first space-separated field of each line
            requested.add(line.split(" ")[0])
 except FileNotFoundError:
    # first run: nothing has been requested yet
    print("requests.txt not found, starting with an empty list", file=sys.stderr)

 chrome_options = webdriver.ChromeOptions()
 chrome_options.add_argument('headless')

 # Create a new instance of the Chrome driver
 driver = webdriver.Chrome(options=chrome_options)

 try:
    # set URLs to capture (scopes) and extensions to ignore (interceptor)
    driver.scopes = ['.*www.cnn.com/$', '.*www.cnn.com:80/$', '.*www.cnn.com//$', '.*header.*', '.*zone-manager.*']
    driver.request_interceptor = interceptor

    for url in sys.stdin:
        url = url.rstrip()  # remove trailing newlines
        if not url:
            continue  # skip blank input lines instead of asking the browser to load ""

        # check to see if we already have data on this URI-M
        if url in requested:
            print(url + " already DONE", file=sys.stderr)
            sys.stderr.flush()
            continue

        print("REQ " + url, end=' ', file=sys.stderr)
        sys.stderr.flush()
        try:
            driver.get(url)
        except Exception as err:
            # stop the whole run; the outer finally still closes the browser
            sys.exit(" - EXCEPTION: " + str(err))

        # Access requests via the `requests` attribute
        for request in driver.requests:
            response = request.response
            if not response:
                continue
            status = response.status_code
            headers = response.headers
            print(request.url, status, " ", end='')
            # Redirects report their target. The old test used `|`, which binds
            # tighter than the comparisons and missed codes such as 304/307/308.
            if 300 <= status < 400:
                print(headers['location'])
            elif headers['content-length']:
                print(headers['content-length'])
            elif headers['x-archive-orig-content-length']:
                print(headers['x-archive-orig-content-length'])
            else:
                print("NoContentLength")
            sys.stdout.flush()

        print(" - DONE, waiting 10 seconds... ", file=sys.stderr)
        sys.stderr.flush()
        time.sleep(10)  # wait 10 seconds between requests

        # reset driver.requests
        del driver.requests
 finally:
    # always release the browser, including when sys.exit() fires above
    driver.close()
    del driver

    # avoid warnings about selenium.Service not shutting down in time
    time.sleep(3)
	# run the script on a set of URI-Ms:
	# python3 capture-requests.py < to-request.txt >> requests-log.txt

	# process the results and generate a new list of URI-Ms that were requested:
	# awk '{if ($1 ~ /cnn\.com(:80)?[\/]+$/ && $2 == "200") print $0}' requests-log.txt | sort -t '/' -k 5 >! requests.txt

	# https://pypi.org/project/selenium-wire/#installation
	import sys
	import time
	from seleniumwire import webdriver # Import from seleniumwire

	def interceptor(request):
	    """Abort requests whose path ends in an image or web-font extension.

	    Args:
	        request: the intercepted request; must expose a ``path`` string and
	            an ``abort()`` method (presumably a selenium-wire request, since
	            this function is installed as ``driver.request_interceptor``).
	    """
	    # Block PNG, JPEG and GIF images, and WOFF/WOFF2/TrueType fonts.
	    # TrueType files end in '.ttf' (the previous '.tff' never matched).
	    blocked_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.woff', '.woff2', '.ttf')
	    if request.path.endswith(blocked_suffixes):
	        request.abort()

	# Main script: reads URI-Ms from stdin, loads each one in headless Chrome and
	# prints one line per captured request that has a response:
	#   <request url> <status>   <Location | content-length | NoContentLength>
	# Progress messages go to stderr so stdout can be appended to a log file.

	# read in URI-Ms we've already requested; a set keeps each lookup O(1)
	requested = set()
	try:
	    with open('requests.txt') as fp:
	        for line in fp:
	            # the URI-M is the first space-separated field of each line
	            requested.add(line.split(" ")[0])
	except FileNotFoundError:
	    # first run: nothing has been requested yet
	    print("requests.txt not found, starting with an empty list", file=sys.stderr)

	chrome_options = webdriver.ChromeOptions()
	chrome_options.add_argument('headless')

	# Create a new instance of the Chrome driver
	driver = webdriver.Chrome(options=chrome_options)

	try:
	    # set URLs to capture (scopes) and extensions to ignore (interceptor);
	    # the '*' quantifiers had been stripped from these patterns and are restored
	    driver.scopes = ['.*www.cnn.com/$', '.*www.cnn.com:80/$', '.*www.cnn.com//$', '.*header.*', '.*zone-manager.*']
	    driver.request_interceptor = interceptor

	    for url in sys.stdin:
	        url = url.rstrip()  # remove trailing newlines
	        if not url:
	            continue  # skip blank input lines instead of asking the browser to load ""

	        # check to see if we already have data on this URI-M
	        if url in requested:
	            print(url + " already DONE", file=sys.stderr)
	            sys.stderr.flush()
	            continue

	        print("REQ " + url, end=' ', file=sys.stderr)
	        sys.stderr.flush()
	        try:
	            driver.get(url)
	        except Exception as err:
	            # stop the whole run; the outer finally still closes the browser
	            sys.exit(" - EXCEPTION: " + str(err))

	        # Access requests via the `requests` attribute
	        for request in driver.requests:
	            response = request.response
	            if not response:
	                continue
	            status = response.status_code
	            headers = response.headers
	            print(request.url, status, " ", end='')
	            # Redirects report their target. The old test used an escaped `|`,
	            # which is not valid Python; even unescaped, `|` binds tighter than
	            # the comparisons and missed codes such as 304/307/308.
	            if 300 <= status < 400:
	                print(headers['location'])
	            elif headers['content-length']:
	                print(headers['content-length'])
	            elif headers['x-archive-orig-content-length']:
	                print(headers['x-archive-orig-content-length'])
	            else:
	                print("NoContentLength")
	            sys.stdout.flush()

	        print(" - DONE, waiting 10 seconds... ", file=sys.stderr)
	        sys.stderr.flush()
	        time.sleep(10)  # wait 10 seconds between requests

	        # reset driver.requests
	        del driver.requests
	finally:
	    # always release the browser, including when sys.exit() fires above
	    driver.close()
	    del driver

	    # avoid warnings about selenium.Service not shutting down in time
	    time.sleep(3)