thisismattmiller · April 28, 2020 17:30
diff --git a/make_screen_shots.py b/make_screen_shots.py
 import time
 from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 import glob
 from bs4 import BeautifulSoup
 import os.path
 import random
 import shutil

 # get list of all sites
 all_html_files = list(glob.glob('index_html/*.html'))
 random.shuffle(all_html_files)


 # location of the chromedriver
 chrome_driver_binary = "/Users/thisismattmiller/Downloads/chromedriver"

 # setup these options
 chrome_options = Options()
 chrome_options.add_argument('--headless')
 chrome_options.add_argument('--start-maximized')
 chrome_options.add_argument('--disable-notifications')
 chrome_options.add_argument("--disable-popup-blocking")

 # make sure the output folders exist, otherwise every save fails inside the
 # final try block and no screenshot is ever written
 for out_dir in ('screenshots', 'html_source'):
 	os.makedirs(out_dir, exist_ok=True)


 def move_index_file(index_path, target_dir):
 	"""Move an index file from index_html/ into the folder target_dir."""
 	# only swap the first 'index_html' (the folder), a file name that happens
 	# to contain 'index_html' must stay as it is
 	destination = index_path.replace('index_html', target_dir, 1)
 	os.makedirs(os.path.dirname(destination) or '.', exist_ok=True)
 	shutil.move(index_path, destination)


 def accept_alerts(driver, max_alerts):
 	"""Click through up to max_alerts alert() boxes, stop at the first failure."""
 	for _ in range(max_alerts):
 		try:
 			driver.switch_to.alert.accept()
 		except Exception:
 			# no alert open (or it could not be accepted), nothing left to click
 			break


 def remove_lc_header(driver):
 	"""Try to remove both lc header elements, return True when both scripts ran."""
 	try:
 		driver.execute_script("document.getElementById('wm-maximized').remove()")
 		driver.execute_script("document.getElementById('wm-minimized').remove()")
 	except Exception:
 		return False
 	return True


 def find_redirect_url(driver):
 	"""Return the href of the redirect link on the current page, or None."""
 	for selector in ('.impatient a', '.wm-nav-link-div a'):
 		try:
 			return driver.execute_script(f"return document.querySelector('{selector}').href")
 		except Exception:
 			# selector is not on this page, try the next one
 			continue
 	return None


 def prepare_page(driver, url, index_path):
 	"""Load url in the driver and strip the lc header.

 	Returns True when the page is ready for a screenshot. Returns False when
 	it could not be loaded, in that case the index file has already been
 	moved to index_html_errors.
 	"""
 	# request the page
 	try:
 		driver.get(url)
 	except Exception:
 		print("this one didnt load 1:",url)
 		move_index_file(index_path, 'index_html_errors')
 		return False

 	# some pages have a lot of alert() boxes, click through all of them if there
 	accept_alerts(driver, 5)

 	time.sleep(0.5)

 	# try to remove the lc header
 	if remove_lc_header(driver):
 		print("okay removed lc header")
 		return True

 	# couldnt do it, because its not there, might be a edge case, go down the rabbit hole
 	page_source = driver.page_source
 	if 'FILE ARCHIVED ON' in page_source or '</frameset>' in page_source:
 		# the capture marker text is there, or it is a <frameset> page: use it as it is
 		return True

 	print(url,'looks like a bad capture', 'trying to do that redirect')

 	new_url = find_redirect_url(driver)
 	if new_url is None:
 		print('could not find redirect url', url)
 		move_index_file(index_path, 'index_html_errors')
 		return False

 	# str() guards against a non string href value coming back from the browser
 	if '/webarchive.loc.gov/' not in str(new_url):
 		move_index_file(index_path, 'index_html_errors')
 		return False

 	try:
 		driver.get(new_url)
 	except Exception:
 		print("this one didnt load 2:",url)
 		move_index_file(index_path, 'index_html_errors')
 		return False

 	accept_alerts(driver, 6)

 	if '</frameset>' in driver.page_source:
 		return True

 	if remove_lc_header(driver) or 'FILE ARCHIVED ON' in driver.page_source:
 		return True

 	print("this one didnt load 3:",url)
 	move_index_file(index_path, 'index_html_errors')
 	return False


 def save_capture(driver, capture_id, index_path):
 	"""Write the page HTML and a full height screenshot for the loaded page."""
 	# save the source HTML
 	# (the handle must not be called like the index path, the path is still
 	# needed below to move the file when the height cannot be measured)
 	with open(f"html_source/{capture_id}.html", "w", encoding="utf-8") as html_out:
 		html_out.write(driver.page_source)

 	# turn off the scroll bar
 	driver.execute_script("document.querySelector('html').style.overflow = 'hidden';")

 	# try to get the max height of the web page, DOESNT always work!
 	# should spend more time making this work better, getting that total_height correct is key
 	total_height = driver.execute_script("return document.body.scrollHeight")
 	if not total_height:
 		total_height = driver.execute_script("return document.documentElement.scrollHeight")

 	if not total_height:
 		move_index_file(index_path, 'index_html_errors')
 		return

 	# set the width and height
 	driver.set_window_size(1440, total_height)

 	# save the png
 	driver.save_screenshot(f'screenshots/{capture_id}.png')


 def process_index_file(driver, index_path):
 	"""Screenshot the page linked from one index file, or sort the file away."""
 	# the P1 id
 	capture_id = os.path.basename(index_path).split('.')[0]
 	print(capture_id)

 	# pull out the http location
 	with open(index_path, encoding="utf-8", errors="replace") as infile:
 		index_html_source = infile.read()
 	soup = BeautifulSoup(index_html_source, 'html.parser')

 	# find the redirect link if it is there
 	ptag = soup.find('p',attrs={'class':'wbThis'})
 	link = ptag.find('a') if ptag is not None else None
 	url = link.get('href') if link is not None else None

 	if url is None:
 		if 'Archived content not available outside of Library of Congress premises' in index_html_source:
 			move_index_file(index_path, 'index_html_restricted')
 		elif 'The Resource you requested is not in this archive' in index_html_source:
 			move_index_file(index_path, 'index_html_not_in_archive')
 		else:
 			print("No tag?")
 		return

 	print(url)

 	if not prepare_page(driver, url, index_path):
 		return

 	try:
 		save_capture(driver, capture_id, index_path)
 	except Exception as error:
 		# keep going with the next site, but say what went wrong
 		print("could not save capture:", url, error)


 # start up the driver
 driver = webdriver.Chrome(chrome_driver_binary, chrome_options=chrome_options)

 try:
 	for index_path in all_html_files:
 		process_index_file(driver, index_path)
 finally:
 	# always shut chrome down, even when the run is interrupted
 	driver.quit()
	import time
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	import glob
	from bs4 import BeautifulSoup
	import os.path
	import random
	import shutil

	# get list of all sites
	all_html_files = list(glob.glob('index_html/*.html'))
	random.shuffle(all_html_files)


	# location of the chromedriver
	chrome_driver_binary = "/Users/thisismattmiller/Downloads/chromedriver"

	# setup these options
	chrome_options = Options()
	chrome_options.add_argument('--headless')
	chrome_options.add_argument('--start-maximized')
	chrome_options.add_argument('--disable-notifications')
	chrome_options.add_argument("--disable-popup-blocking")

	# make sure the output folders exist, otherwise every save fails inside the
	# final try block and no screenshot is ever written
	for out_dir in ('screenshots', 'html_source'):
		os.makedirs(out_dir, exist_ok=True)


	def move_index_file(index_path, target_dir):
		"""Move an index file from index_html/ into the folder target_dir."""
		# only swap the first 'index_html' (the folder), a file name that happens
		# to contain 'index_html' must stay as it is
		destination = index_path.replace('index_html', target_dir, 1)
		os.makedirs(os.path.dirname(destination) or '.', exist_ok=True)
		shutil.move(index_path, destination)


	def accept_alerts(driver, max_alerts):
		"""Click through up to max_alerts alert() boxes, stop at the first failure."""
		for _ in range(max_alerts):
			try:
				driver.switch_to.alert.accept()
			except Exception:
				# no alert open (or it could not be accepted), nothing left to click
				break


	def remove_lc_header(driver):
		"""Try to remove both lc header elements, return True when both scripts ran."""
		try:
			driver.execute_script("document.getElementById('wm-maximized').remove()")
			driver.execute_script("document.getElementById('wm-minimized').remove()")
		except Exception:
			return False
		return True


	def find_redirect_url(driver):
		"""Return the href of the redirect link on the current page, or None."""
		for selector in ('.impatient a', '.wm-nav-link-div a'):
			try:
				return driver.execute_script(f"return document.querySelector('{selector}').href")
			except Exception:
				# selector is not on this page, try the next one
				continue
		return None


	def prepare_page(driver, url, index_path):
		"""Load url in the driver and strip the lc header.

		Returns True when the page is ready for a screenshot. Returns False when
		it could not be loaded, in that case the index file has already been
		moved to index_html_errors.
		"""
		# request the page
		try:
			driver.get(url)
		except Exception:
			print("this one didnt load 1:",url)
			move_index_file(index_path, 'index_html_errors')
			return False

		# some pages have a lot of alert() boxes, click through all of them if there
		accept_alerts(driver, 5)

		time.sleep(0.5)

		# try to remove the lc header
		if remove_lc_header(driver):
			print("okay removed lc header")
			return True

		# couldnt do it, because its not there, might be a edge case, go down the rabbit hole
		page_source = driver.page_source
		if 'FILE ARCHIVED ON' in page_source or '</frameset>' in page_source:
			# the capture marker text is there, or it is a <frameset> page: use it as it is
			return True

		print(url,'looks like a bad capture', 'trying to do that redirect')

		new_url = find_redirect_url(driver)
		if new_url is None:
			print('could not find redirect url', url)
			move_index_file(index_path, 'index_html_errors')
			return False

		# str() guards against a non string href value coming back from the browser
		if '/webarchive.loc.gov/' not in str(new_url):
			move_index_file(index_path, 'index_html_errors')
			return False

		try:
			driver.get(new_url)
		except Exception:
			print("this one didnt load 2:",url)
			move_index_file(index_path, 'index_html_errors')
			return False

		accept_alerts(driver, 6)

		if '</frameset>' in driver.page_source:
			return True

		if remove_lc_header(driver) or 'FILE ARCHIVED ON' in driver.page_source:
			return True

		print("this one didnt load 3:",url)
		move_index_file(index_path, 'index_html_errors')
		return False


	def save_capture(driver, capture_id, index_path):
		"""Write the page HTML and a full height screenshot for the loaded page."""
		# save the source HTML
		# (the handle must not be called like the index path, the path is still
		# needed below to move the file when the height cannot be measured)
		with open(f"html_source/{capture_id}.html", "w", encoding="utf-8") as html_out:
			html_out.write(driver.page_source)

		# turn off the scroll bar
		driver.execute_script("document.querySelector('html').style.overflow = 'hidden';")

		# try to get the max height of the web page, DOESNT always work!
		# should spend more time making this work better, getting that total_height correct is key
		total_height = driver.execute_script("return document.body.scrollHeight")
		if not total_height:
			total_height = driver.execute_script("return document.documentElement.scrollHeight")

		if not total_height:
			move_index_file(index_path, 'index_html_errors')
			return

		# set the width and height
		driver.set_window_size(1440, total_height)

		# save the png
		driver.save_screenshot(f'screenshots/{capture_id}.png')


	def process_index_file(driver, index_path):
		"""Screenshot the page linked from one index file, or sort the file away."""
		# the P1 id
		capture_id = os.path.basename(index_path).split('.')[0]
		print(capture_id)

		# pull out the http location
		with open(index_path, encoding="utf-8", errors="replace") as infile:
			index_html_source = infile.read()
		soup = BeautifulSoup(index_html_source, 'html.parser')

		# find the redirect link if it is there
		ptag = soup.find('p',attrs={'class':'wbThis'})
		link = ptag.find('a') if ptag is not None else None
		url = link.get('href') if link is not None else None

		if url is None:
			if 'Archived content not available outside of Library of Congress premises' in index_html_source:
				move_index_file(index_path, 'index_html_restricted')
			elif 'The Resource you requested is not in this archive' in index_html_source:
				move_index_file(index_path, 'index_html_not_in_archive')
			else:
				print("No tag?")
			return

		print(url)

		if not prepare_page(driver, url, index_path):
			return

		try:
			save_capture(driver, capture_id, index_path)
		except Exception as error:
			# keep going with the next site, but say what went wrong
			print("could not save capture:", url, error)


	# start up the driver
	driver = webdriver.Chrome(chrome_driver_binary, chrome_options=chrome_options)

	try:
		for index_path in all_html_files:
			process_index_file(driver, index_path)
	finally:
		# always shut chrome down, even when the run is interrupted
		driver.quit()
No results found