# Source: GitHub Gist by @thisismattmiller
# (thisismattmiller/250b18bc5d3c24beac6e79bae1b54900), created April 28, 2020.
import glob
import os.path
import random
import shutil
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
# location of the chromedriver
chrome_driver_binary = "/Users/thisismattmiller/Downloads/chromedriver"

# Directory holding the index files still to be processed, and the sibling
# directories a file is moved into once its outcome is known.
INDEX_DIR = 'index_html'
ERRORS_DIR = 'index_html_errors'
RESTRICTED_DIR = 'index_html_restricted'
NOT_IN_ARCHIVE_DIR = 'index_html_not_in_archive'

# CSS selectors tried, in order, to find the redirect link on a bad capture.
REDIRECT_SELECTORS = ('.impatient a', '.wm-nav-link-div a')


def page_id_from_path(index_path):
    """Return the page id: the file's base name up to its first dot."""
    return os.path.basename(index_path).split('.')[0]


def sibling_path(index_path, bucket):
    """Return *index_path* with its ``index_html`` directory swapped for *bucket*.

    Only the first occurrence is replaced, so a file whose own name happens to
    contain ``index_html`` keeps that name.
    """
    return index_path.replace(INDEX_DIR, bucket, 1)


def move_to_bucket(index_path, bucket):
    """Move an index file into the sibling directory *bucket*, creating it if needed."""
    destination = sibling_path(index_path, bucket)
    destination_dir = os.path.dirname(destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)
    shutil.move(index_path, destination)


def accept_pending_alerts(driver, max_alerts=20):
    """Accept alert() boxes until none is pending; return how many were accepted.

    Some pages open a lot of alert() boxes. *max_alerts* bounds the loop so a
    page that keeps re-opening alerts cannot stall the run.
    """
    accepted = 0
    while accepted < max_alerts:
        try:
            driver.switch_to.alert.accept()
        except WebDriverException:
            # Normally "no alert present"; alert handling is best effort.
            break
        accepted += 1
    return accepted


def remove_archive_header(driver):
    """Remove the 'wm-maximized' and 'wm-minimized' elements from the page.

    Returns True only when both removals succeeded, False as soon as one of
    the scripts fails (for example because the element is not in the page).
    """
    try:
        driver.execute_script("document.getElementById('wm-maximized').remove()")
        driver.execute_script("document.getElementById('wm-minimized').remove()")
    except WebDriverException:
        return False
    return True


def find_redirect_url(driver):
    """Return the href of the first redirect link found, or None if there is none."""
    for selector in REDIRECT_SELECTORS:
        try:
            return driver.execute_script(
                f"return document.querySelector('{selector}').href"
            )
        except WebDriverException:
            continue
    return None


def prepare_page(driver, url):
    """Load *url* and get the browser ready for a screenshot.

    Returns True when the loaded page should be captured, and False when the
    index file belongs in the errors directory (page did not load, no usable
    redirect, or the redirect target is not a usable capture).
    """
    # request the page
    try:
        driver.get(url)
    except WebDriverException:
        print("this one didnt load 1:", url)
        return False

    accept_pending_alerts(driver)
    time.sleep(0.5)

    # try to remove the lc header
    if remove_archive_header(driver):
        print("okay removed lc header")
        return True

    # Header not removable: the page is still captured as-is when it carries
    # the 'FILE ARCHIVED ON' marker or is a <frameset> page.
    if 'FILE ARCHIVED ON' in driver.page_source or '</frameset>' in driver.page_source:
        return True

    print(url, 'looks like a bad capture', 'trying to do that redirect')
    new_url = find_redirect_url(driver)
    if not new_url:
        print('could not find redirect url', url)
        return False
    if '/webarchive.loc.gov/' not in new_url:
        return False

    try:
        driver.get(new_url)
    except WebDriverException:
        print("this one didnt load 2:", url)
        return False

    accept_pending_alerts(driver)
    if '</frameset>' in driver.page_source:
        return True
    if remove_archive_header(driver) or 'FILE ARCHIVED ON' in driver.page_source:
        return True
    print("this one didnt load 3:", url)
    return False


def capture_page(driver, page_id, html_dir='html_source', screenshot_dir='screenshots'):
    """Save the current page's HTML and a full-height screenshot.

    Writes ``<html_dir>/<page_id>.html`` and ``<screenshot_dir>/<page_id>.png``,
    creating both directories when missing.

    Returns False, without taking a screenshot, when the page reports a height
    of 0; True otherwise. File and WebDriver errors propagate to the caller.
    """
    os.makedirs(html_dir, exist_ok=True)
    os.makedirs(screenshot_dir, exist_ok=True)

    # save the source HTML (explicit UTF-8 so any page text can be written)
    with open(os.path.join(html_dir, f"{page_id}.html"), "w", encoding="utf-8") as out:
        out.write(driver.page_source)

    # turn off the scroll bar
    driver.execute_script("document.querySelector('html').style.overflow = 'hidden';")

    # try to get the max height of the web page, DOESNT always work!
    # should spend more time making this work better, getting that total_height correct is key
    total_height = driver.execute_script("return document.body.scrollHeight")
    if total_height == 0:
        total_height = driver.execute_script("return document.documentElement.scrollHeight")
    if total_height == 0:
        return False

    # set the width and height, then save the png
    driver.set_window_size(1440, total_height)
    driver.save_screenshot(os.path.join(screenshot_dir, f"{page_id}.png"))
    return True


def process_index_file(driver, index_path):
    """Screenshot the page referenced by one index file, or sort the file away.

    Index files without a ``<p class="wbThis">`` link are moved to the
    restricted / not-in-archive directories based on their text. Files whose
    page cannot be loaded, or renders with zero height, go to the errors
    directory.
    """
    # the P1 id
    page_id = page_id_from_path(index_path)
    print(page_id)

    # Read and close the index file before anything tries to move it.
    with open(index_path, encoding="utf-8", errors="replace") as infile:
        index_html_source = infile.read()
    soup = BeautifulSoup(index_html_source, 'html.parser')

    # find the redirect link if it is there
    ptag = soup.find('p', attrs={'class': 'wbThis'})
    if ptag is None:
        if 'Archived content not available outside of Library of Congress premises' in index_html_source:
            move_to_bucket(index_path, RESTRICTED_DIR)
        elif 'The Resource you requested is not in this archive' in index_html_source:
            move_to_bucket(index_path, NOT_IN_ARCHIVE_DIR)
        else:
            print("No tag?")
        return

    link = ptag.find('a')
    url = link.get('href') if link is not None else None
    if not url:
        # Paragraph present but without a usable link: skip instead of crashing.
        print("No link in tag?", index_path)
        return
    print(url)

    if not prepare_page(driver, url):
        move_to_bucket(index_path, ERRORS_DIR)
        return

    try:
        captured = capture_page(driver, page_id)
    except (OSError, WebDriverException) as err:
        # Best effort: report it and leave the index file where it is.
        print("could not capture:", url, err)
        return
    if not captured:
        move_to_bucket(index_path, ERRORS_DIR)


def main():
    """Screenshot every page listed in ``index_html/*.html``, in random order."""
    # get list of all sites
    all_html_files = glob.glob(f'{INDEX_DIR}/*.html')
    random.shuffle(all_html_files)

    # setup these options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('--disable-notifications')
    chrome_options.add_argument("--disable-popup-blocking")

    # start up the driver
    # NOTE(review): newer Selenium 4 releases expect the driver path through a
    # Service object rather than positionally - confirm against the installed version.
    driver = webdriver.Chrome(chrome_driver_binary, options=chrome_options)
    try:
        for index_path in all_html_files:
            process_index_file(driver, index_path)
    finally:
        # Always shut the browser down, even when a page aborts the run.
        driver.quit()


if __name__ == "__main__":
    main()
# (GitHub page footer, not part of the script: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")