Created
April 28, 2020 17:30
-
-
Save thisismattmiller/250b18bc5d3c24beac6e79bae1b54900 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Walk the saved index pages in index_html/, follow the archive link found in
# each one with headless Chrome, and save the rendered page's HTML and a
# full-height PNG screenshot. Index pages that cannot be captured are moved
# into sibling folders (index_html_errors, index_html_restricted,
# index_html_not_in_archive).
#
# NOTE(review): the indentation of this script was reconstructed from its
# syntax; spots where more than one nesting was possible are marked below.

import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import glob
from bs4 import BeautifulSoup
import os.path
import random
import shutil

# CSS selectors tried, in order, to find a redirect link on a page that does
# not look like a proper capture.
REDIRECT_LINK_SELECTORS = ('.impatient a', '.wm-nav-link-div a')


def move_index_file(index_path, target_dir):
    """Move an index file to the same path with 'index_html' swapped for *target_dir*."""
    shutil.move(index_path, index_path.replace('index_html', target_dir))


def dismiss_alerts(driver, limit):
    """Accept up to *limit* alert() boxes; stop quietly at the first failure."""
    try:
        for _ in range(limit):
            driver.switch_to.alert.accept()
    except Exception:
        # Best effort: switching to an alert fails once none is left to accept.
        pass


def remove_lc_header(driver):
    """Remove the 'wm-maximized' and 'wm-minimized' elements (the lc header).

    Raises whatever the driver raises when either element is missing.
    """
    driver.execute_script("document.getElementById('wm-maximized').remove()")
    driver.execute_script("document.getElementById('wm-minimized').remove()")


def find_redirect_url(driver):
    """Return the href of the first redirect link found, or None if there is none."""
    for selector in REDIRECT_LINK_SELECTORS:
        try:
            return driver.execute_script(
                f"return document.querySelector('{selector}').href"
            )
        except Exception:
            # querySelector matched nothing for this selector; try the next one.
            continue
    return None


def load_capture(driver, url):
    """Load *url* and strip the lc header so the page can be screenshotted.

    Returns True when the page is ready to be saved, False when the index
    file should be filed under index_html_errors.
    """
    try:
        driver.get(url)
    except Exception:
        print("this one didnt load 1:", url)
        return False

    # some pages have a lot of alert() boxes, click through all of them if there
    dismiss_alerts(driver, 5)
    time.sleep(0.5)

    try:
        remove_lc_header(driver)
    except Exception:
        # couldnt do it, because its not there, might be a edge case
        pass
    else:
        print("okay removed lc header")
        return True

    if 'FILE ARCHIVED ON' in driver.page_source:
        return True
    # <frameset> pages are captured as they are
    if '</frameset>' in driver.page_source:
        return True

    print(url, 'looks like a bad capture', 'trying to do that redirect')
    new_url = find_redirect_url(driver)
    if new_url is None:
        print('could not find redirect url', url)
        return False

    # NOTE(review): the original `else: move to errors` branch is assumed to
    # pair with this webarchive.loc.gov check -- confirm against the author's
    # original indentation.
    if '/webarchive.loc.gov/' not in new_url:
        return False

    try:
        driver.get(new_url)
    except Exception:
        print("this one didnt load 2:", url)
        return False

    dismiss_alerts(driver, 6)

    if '</frameset>' in driver.page_source:
        return True
    try:
        remove_lc_header(driver)
    except Exception:
        if 'FILE ARCHIVED ON' not in driver.page_source:
            print("this one didnt load 3:", url)
            return False
    return True


def save_capture(driver, page_id):
    """Write the current page's HTML and a full-height screenshot for *page_id*.

    Returns False, without taking a screenshot, when the page height is
    reported as 0; the HTML source has already been written at that point.
    """
    # save the source HTML
    with open(f"html_source/{page_id}.html", "w", encoding="utf-8") as outfile:
        outfile.write(driver.page_source)

    # turn off the scroll bar
    driver.execute_script("document.querySelector('html').style.overflow = 'hidden';")

    # try to get the max height of the web page, DOESNT always work!
    # should spend more time making this work better, getting that
    # total_height correct is key
    total_height = driver.execute_script("return document.body.scrollHeight")
    if total_height == 0:
        total_height = driver.execute_script("return document.documentElement.scrollHeight")
    if total_height == 0:
        return False

    # set the width and height, then save the png
    driver.set_window_size(1440, total_height)
    driver.save_screenshot(f'screenshots/{page_id}.png')
    return True


# get list of all sites
all_html_files = list(glob.glob('index_html/*.html'))
random.shuffle(all_html_files)

# location of the chromedriver
chrome_driver_binary = "/Users/thisismattmiller/Downloads/chromedriver"

# setup these options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--disable-notifications')
chrome_options.add_argument("--disable-popup-blocking")

# start up the driver
# NOTE(review): the positional driver path and the `chrome_options=` keyword
# depend on the installed Selenium version -- confirm before upgrading.
driver = webdriver.Chrome(chrome_driver_binary, chrome_options=chrome_options)

try:
    for index_path in all_html_files:

        # the P1 id
        page_id = os.path.basename(index_path).split('.')[0]
        print(page_id)

        # pull out the http location
        with open(index_path) as infile:
            index_html_source = infile.read()

        soup = BeautifulSoup(index_html_source, 'html.parser')

        # find the redirect link if it is there
        ptag = soup.find('p', attrs={'class': 'wbThis'})

        if ptag is None:
            if 'Archived content not available outside of Library of Congress premises' in index_html_source:
                move_index_file(index_path, 'index_html_restricted')
            elif 'The Resource you requested is not in this archive' in index_html_source:
                move_index_file(index_path, 'index_html_not_in_archive')
            else:
                print("No tag?")
            continue

        url = ptag.find('a')['href']
        print(url)

        # request the page
        if not load_capture(driver, url):
            move_index_file(index_path, 'index_html_errors')
            continue

        try:
            saved = save_capture(driver, page_id)
        except Exception as error:
            # Saving stays best-effort, but report the failure instead of
            # swallowing it silently.
            print("could not save capture:", url, error)
            continue

        # The output file handle no longer reuses the loop variable's name,
        # so this move now receives the index file's path as intended.
        if not saved:
            move_index_file(index_path, 'index_html_errors')
finally:
    # Shut the browser down even if the loop stops on an unexpected error.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment