Scrape a website that has a delayed page load
#Big thank you to Mike Stucka for all his help on this one <3
import pandas as pd
import time, csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import UnexpectedAlertPresentException

#fac_data is created from segment.py
fac_data = pd.read_csv('../data/RSEI/jeff_co_ky_rsei_fac_list.CSV')
#the facility_id field is what the url uses to create facility-specific
#pages
fac_list = fac_data['_TRI_FACILITY_EZ.TRI_FACILITY_ID'].unique()
#if there isn't an RSEI page for a given facility, record it here so we can
#hand-check it afterward
fac_not_found = []
#this is the url that we append each facility_id to in order to get its
#facility-specific page
BASE_URL = 'https://www3.epa.gov/myem/rseimap/rseihome_sidebyside.html?facid='
#we're going to download each RSEI page as html so we don't have to fetch pages
#from the server every time we do this analysis
HTML_DIR = '../data/RSEI/html/'
CSV_DIR = '../data/RSEI/csv/'

#we basically have to run this script as though we were Chrome because
#everything is the worst
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)
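#note: webdriver.Chrome assumes a chromedriver binary that matches your
#installed Chrome is available; older selenium releases need it on your PATH,
#while selenium 4.6+ can download a matching driver on its own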

for fac in fac_list:
    #create the urls and filepaths we'll need for each facility
    fac_url = BASE_URL + fac
    fac_html = HTML_DIR + 'rsei-fac-' + fac + '.html'
    fac_csv = CSV_DIR + 'rsei-fac-' + fac + '.csv'
    try:
        #try to retrieve data from the url, but once you make contact
        #wait 10 seconds because, again, everything is the worst
        driver.get(fac_url)
        time.sleep(10)
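        #the fixed sleep works, but an explicit wait is a sturdier option.
        #A sketch, assuming the results table keeps its
        #'google-visualization-table-table' class (By and WebDriverWait are
        #already imported above):
        #WebDriverWait(driver, 30).until(
        #    lambda d: d.find_element(By.CLASS_NAME, 'google-visualization-table-table'))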
        html_src = driver.page_source
        #Write the html data we find at the url to our html file
        with open(fac_html, 'w') as o:
            o.write(html_src)
        #Parse the html data we downloaded to a csv
        with open(fac_html, 'r') as i:
            html = i.read()
        soup = BeautifulSoup(html, 'html.parser')
        #Here's where the data live
        table = soup.find('table', {'class': 'google-visualization-table-table'})
        rows = table.find_all('tr')
        #Open the spreadsheet we created for this facility and write in the column
        #headers you know we're gonna want
        with open(fac_csv, 'w', newline='') as o:
            writer = csv.DictWriter(o, fieldnames=[
                'fac_id','year','rsei_score','industry_rsei_mean',
                'county_rsei_mean','state_rsei_mean','us_rsei_mean'])
            writer.writeheader()
            #Start with the first non-header row in our html table
            for row in rows[1:]:
                cols = row.find_all('td')
                #The .string attribute is Beautiful Soup syntax
                year = cols[0].string.strip()
                fac_score = cols[1].string.strip()
                industry_mean = cols[2].string.strip()
                county_mean = cols[3].string.strip()
                state_mean = cols[4].string.strip()
                us_mean = cols[5].string.strip()
                writer.writerow({
                    'fac_id': fac,
                    'year': year,
                    'rsei_score': fac_score,
                    'industry_rsei_mean': industry_mean,
                    'county_rsei_mean': county_mean,
                    'state_rsei_mean': state_mean,
                    'us_rsei_mean': us_mean
                })
        #Probably don't need this since we pause for 10 after our initial query but
        #hell, why not sleep so we don't bombard the server
        time.sleep(1)
    #If we try to retrieve data from the url but there is none, put the facility_id
    #in our fac_not_found list so we can check that out later.
    except UnexpectedAlertPresentException:
        fac_not_found.append(fac)
        alert = driver.switch_to.alert
        alert.accept()

#we're done fetching, so shut the headless browser down
driver.quit()

#output the facilities that were not found to a csv for later analysis
fac_not_found_df = pd.DataFrame(fac_not_found, columns=['fac_id'])
fac_not_found_df.to_csv('../data/RSEI/fac_not_found.csv', index=False)
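
#A possible next step (a sketch; rsei_all_facilities.csv is just a
#placeholder name): stitch the per-facility csvs we wrote above back
#into one file for analysis
import glob
fac_files = glob.glob(CSV_DIR + 'rsei-fac-*.csv')
if fac_files:
    combined = pd.concat((pd.read_csv(f) for f in fac_files), ignore_index=True)
    combined.to_csv('../data/RSEI/rsei_all_facilities.csv', index=False)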