@akanik
Created March 6, 2019 17:16
Scrape a website that has a delayed page load
#Big thank you to Mike Stucka for all his help on this one <3
import pandas as pd
import requests, time, csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import UnexpectedAlertPresentException
#fac_data is created from segment.py
fac_data = pd.read_csv('../data/RSEI/jeff_co_ky_rsei_fac_list.CSV')
#the facility_id field is what the url uses to create facility-specific
#pages
fac_list = fac_data['_TRI_FACILITY_EZ.TRI_FACILITY_ID'].unique()
#if there isn't a RSEI page for a given facility, record it here so we can
#handcheck afterward
fac_not_found = []
#this is the base url; we append each facility_id to it to get the
#facility-specific pages
BASE_URL = 'https://www3.epa.gov/myem/rseimap/rseihome_sidebyside.html?facid='
#we're going to download each RSEI page as html so we don't have to fetch pages
#from the server every time we do this analysis
HTML_DIR = '../data/RSEI/html/'
CSV_DIR = '../data/RSEI/csv/'
#we basically have to run this script as though we were Chrome because
#everything is the worst
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)
for fac in fac_list:
    #create the urls and filepaths we'll need for each facility
    fac_url = BASE_URL + fac
    fac_html = HTML_DIR + 'rsei-fac-' + fac + '.html'
    fac_csv = CSV_DIR + 'rsei-fac-' + fac + '.csv'

    try:
        #try to retrieve data from the url, but once you make contact
        #wait 10 seconds because, again, everything is the worst
        driver.get(fac_url)
        time.sleep(10)
        html_src = driver.page_source

        #Write the html data we find at the url to our html file
        with open(fac_html, 'w') as o:
            o.write(html_src)

        #Parse the html data we downloaded to a csv
        with open(fac_html, 'r') as i:
            html = i.read()
        soup = BeautifulSoup(html, 'html.parser')

        #Here's where the data live
        table = soup.find('table', {'class': 'google-visualization-table-table'})
        rows = table.find_all('tr')

        #Open the spreadsheet we created for this facility and write in the column
        #headers you know we're gonna want
        with open(fac_csv, 'w', newline='') as o:
            writer = csv.DictWriter(o, fieldnames=[
                'fac_id','year','rsei_score','industry_rsei_mean',
                'county_rsei_mean','state_rsei_mean','us_rsei_mean'])
            writer.writeheader()

            #Start with the first nonheader row in our html table
            for row in rows[1:]:
                cols = row.find_all('td')

                #The .string attribute is Beautiful Soup syntax
                year = cols[0].string.strip()
                fac_score = cols[1].string.strip()
                industry_mean = cols[2].string.strip()
                county_mean = cols[3].string.strip()
                state_mean = cols[4].string.strip()
                us_mean = cols[5].string.strip()

                writer.writerow({
                    'fac_id': fac,
                    'year': year,
                    'rsei_score': fac_score,
                    'industry_rsei_mean': industry_mean,
                    'county_rsei_mean': county_mean,
                    'state_rsei_mean': state_mean,
                    'us_rsei_mean': us_mean
                })

        #Probably don't need this since we pause for 10 after our initial query but
        #hell, why not sleep so we don't bombard the server
        time.sleep(1)

    #If you try to retrieve data from the url but there is none, put the facility_id
    #in our fac_not_found list so we can check that out later.
    except UnexpectedAlertPresentException:
        fac_not_found.append(fac)
        alert = driver.switch_to.alert
        alert.accept()

#close the headless browser now that we're done with it
driver.quit()

#output the facilities that were not found to a csv for later analysis
fac_not_found_df = pd.DataFrame(fac_not_found, columns=['fac_id'])
fac_not_found_df.to_csv('../data/RSEI/fac_not_found.csv', index=False)
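A side note on the fixed ten-second pause: the script imports WebDriverWait and By but never uses them. An explicit wait returns as soon as the delayed table actually renders rather than always sleeping the full ten seconds. Below is a minimal sketch of that swap, not part of the original gist; it assumes the google-visualization-table-table class is the element worth waiting for and that a 15-second cap is generous enough. It would replace the driver.get / time.sleep(10) pair inside the try block.

from selenium.webdriver.support import expected_conditions as EC

driver.get(fac_url)
#block until the Google Visualization table shows up in the DOM, but give up
#after 15 seconds (an assumed cap; tune it to how slow the page really is)
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located(
        (By.CLASS_NAME, 'google-visualization-table-table')))
html_src = driver.page_source

If the table never appears, WebDriverWait raises selenium.common.exceptions.TimeoutException, which you'd want to catch alongside UnexpectedAlertPresentException and treat the same way, by adding the facility to fac_not_found.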