weiglemc · January 24, 2023 19:13
diff --git a/capture-html.py b/capture-html.py
 # run from untracked/html as 
 # % python3 ../../capture-html.py < ../../cnn-to-request.txt >> ../../cnn-html-list.txt 
 import sys
 import time
 import requests

 WAIT = 10  # seconds to wait between requests
 TIMEOUT = 60  # seconds to wait for timeout
 DONE_URI_LIST = "../../cnn-html-list.txt"

 # setup requests headers
 headers = {
     'User-Agent': 'ODU WS-DL research script',
     'From': 'mweigle@odu.edu'
 }


 def load_requested(path):
     """Return the set of URI-Ms listed in the done file at *path*.

     The URI-M is the first whitespace-separated field of each line; blank
     lines are ignored.  A missing file is treated as "nothing done yet".
     """
     done = set()
     try:
         with open(path) as fp:
             for line in fp:
                 fields = line.split()
                 if fields:
                     done.add(fields[0])
     except FileNotFoundError:
         pass
     return done


 def raw_urim(url):
     """Return *url* stripped of surrounding whitespace, with ``id_`` added.

     ``id_`` is inserted in front of the first "/http" only, so a target URI
     that itself contains "/http" is not rewritten a second time.  URLs that
     already contain ``id_`` are returned unchanged (apart from stripping).
     """
     url = url.strip()
     if "id_" not in url:
         # add id_ after datetime
         url = url.replace("/http", "id_/http", 1)
     return url


 def output_filename(url):
     """Return ``cnn-DT.html`` for *url*, or None when DT cannot be found.

     DT is the 14 characters immediately before the first ``id_``; they must
     all be digits, otherwise the name could contain path separators.
     """
     idx = url.find("id_")
     if idx < 14:
         # covers "no id_ at all" (-1) and "too short to hold a datetime"
         return None
     dt = url[idx - 14:idx]
     if not dt.isdigit():
         return None
     return "cnn-" + dt + ".html"


 def main():
     """Fetch each URI-M read from stdin and save its HTML as cnn-DT.html.

     Successfully saved URI-Ms are written to stdout (redirect this to append
     to DONE_URI_LIST); progress and errors go to stderr.
     """
     # read in URI-Ms we've already requested
     requested = load_requested(DONE_URI_LIST)

     for line in sys.stdin:
         url = raw_urim(line)
         if not url:
             continue  # blank input line

         # check to see if we already have data on this URI-M
         if url in requested:
             print(url + " - already DONE", file=sys.stderr)
             sys.stderr.flush()
             continue

         # construct output filename first so we never issue a request
         # whose result could not be saved
         outfile = output_filename(url)
         if outfile is None:
             print(url + " - SKIPPED: no 14-digit datetime before id_",
                   file=sys.stderr)
             sys.stderr.flush()
             continue

         print("REQ " + url, end=' ', file=sys.stderr)
         sys.stderr.flush()

         try:
             response = requests.get(url, headers=headers, timeout=TIMEOUT)
         except requests.exceptions.RequestException as err:
             print(" - EXCEPTION: ", str(err), file=sys.stderr)
             sys.stderr.flush()
             continue

         # write content to file; explicit encoding so the result does not
         # depend on the locale
         with open(outfile, mode="w", encoding="utf-8") as f:
             f.write(response.text)

         # only now record the URI-M as done (stdout is appended to
         # DONE_URI_LIST), and remember it for duplicates later in stdin
         print(url)
         sys.stdout.flush()
         requested.add(url)

         print(" - DONE, waiting " + str(WAIT) + " seconds... ", file=sys.stderr)
         sys.stderr.flush()
         time.sleep(WAIT)  # wait WAIT seconds between requests


 if __name__ == "__main__":
     main()
	# run from untracked/html as
	# % python3 ../../capture-html.py < ../../cnn-to-request.txt >> ../../cnn-html-list.txt
	import sys
	import time
	import requests

	WAIT = 10  # seconds to wait between requests
	TIMEOUT = 60  # seconds to wait for timeout
	DONE_URI_LIST = "../../cnn-html-list.txt"

	# setup requests headers
	headers = {
	    'User-Agent': 'ODU WS-DL research script',
	    'From': 'mweigle@odu.edu'
	}


	def load_requested(path):
	    """Return the set of URI-Ms listed in the done file at *path*.

	    The URI-M is the first whitespace-separated field of each line; blank
	    lines are ignored.  A missing file is treated as "nothing done yet".
	    """
	    done = set()
	    try:
	        with open(path) as fp:
	            for line in fp:
	                fields = line.split()
	                if fields:
	                    done.add(fields[0])
	    except FileNotFoundError:
	        pass
	    return done


	def raw_urim(url):
	    """Return *url* stripped of surrounding whitespace, with ``id_`` added.

	    ``id_`` is inserted in front of the first "/http" only, so a target URI
	    that itself contains "/http" is not rewritten a second time.  URLs that
	    already contain ``id_`` are returned unchanged (apart from stripping).
	    """
	    url = url.strip()
	    if "id_" not in url:
	        # add id_ after datetime
	        url = url.replace("/http", "id_/http", 1)
	    return url


	def output_filename(url):
	    """Return ``cnn-DT.html`` for *url*, or None when DT cannot be found.

	    DT is the 14 characters immediately before the first ``id_``; they must
	    all be digits, otherwise the name could contain path separators.
	    """
	    idx = url.find("id_")
	    if idx < 14:
	        # covers "no id_ at all" (-1) and "too short to hold a datetime"
	        return None
	    dt = url[idx - 14:idx]
	    if not dt.isdigit():
	        return None
	    return "cnn-" + dt + ".html"


	def main():
	    """Fetch each URI-M read from stdin and save its HTML as cnn-DT.html.

	    Successfully saved URI-Ms are written to stdout (redirect this to append
	    to DONE_URI_LIST); progress and errors go to stderr.
	    """
	    # read in URI-Ms we've already requested
	    requested = load_requested(DONE_URI_LIST)

	    for line in sys.stdin:
	        url = raw_urim(line)
	        if not url:
	            continue  # blank input line

	        # check to see if we already have data on this URI-M
	        if url in requested:
	            print(url + " - already DONE", file=sys.stderr)
	            sys.stderr.flush()
	            continue

	        # construct output filename first so we never issue a request
	        # whose result could not be saved
	        outfile = output_filename(url)
	        if outfile is None:
	            print(url + " - SKIPPED: no 14-digit datetime before id_",
	                  file=sys.stderr)
	            sys.stderr.flush()
	            continue

	        print("REQ " + url, end=' ', file=sys.stderr)
	        sys.stderr.flush()

	        try:
	            response = requests.get(url, headers=headers, timeout=TIMEOUT)
	        except requests.exceptions.RequestException as err:
	            print(" - EXCEPTION: ", str(err), file=sys.stderr)
	            sys.stderr.flush()
	            continue

	        # write content to file; explicit encoding so the result does not
	        # depend on the locale
	        with open(outfile, mode="w", encoding="utf-8") as f:
	            f.write(response.text)

	        # only now record the URI-M as done (stdout is appended to
	        # DONE_URI_LIST), and remember it for duplicates later in stdin
	        print(url)
	        sys.stdout.flush()
	        requested.add(url)

	        print(" - DONE, waiting " + str(WAIT) + " seconds... ", file=sys.stderr)
	        sys.stderr.flush()
	        time.sleep(WAIT)  # wait WAIT seconds between requests


	if __name__ == "__main__":
	    main()
No results found