@akanik
Created March 6, 2019 17:16
Scrape a website that has a delayed page load
#Big thank you to Mike Stucka for all his help on this one <3
import pandas as pd
import requests, time, csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import UnexpectedAlertPresentException
#fac_data is created from segment.py
fac_data = pd.read_csv('../data/RSEI/jeff_co_ky_rsei_fac_list.CSV')
#the facility_id field is what the url uses to create facility-specific
#pages
fac_list = fac_data['_TRI_FACILITY_EZ.TRI_FACILITY_ID'].unique()
#if there isn't a RSEI page for a given facility, record it here so we can
#handcheck afterward
fac_not_found = []
#this is the base url; we append each facility_id to it to get the
#facility-specific pages
BASE_URL = 'https://www3.epa.gov/myem/rseimap/rseihome_sidebyside.html?facid='
#we're going to download each RSEI page as html so we don't have to fetch pages
#from the server every time we do this analysis
HTML_DIR = '../data/RSEI/html/'
CSV_DIR = '../data/RSEI/csv/'
#we basically have to run this script as though we were Chrome because
#everything is the worst
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)
for fac in fac_list:
    #create the urls and filepaths we'll need for each facility
    fac_url = BASE_URL + fac
    fac_html = HTML_DIR + 'rsei-fac-' + fac + '.html'
    fac_csv = CSV_DIR + 'rsei-fac-' + fac + '.csv'

    try:
        #try to retrieve data from the url, but once you make contact
        #wait 10 seconds because, again, everything is the worst
        driver.get(fac_url)
        time.sleep(10)
        html_src = driver.page_source

        #Write the html data we find at the url to our html file
        with open(fac_html, 'w') as o:
            o.write(html_src)

        #Parse the html data we downloaded to a csv
        with open(fac_html, 'r') as i:
            html = i.read()
        soup = BeautifulSoup(html, 'html.parser')

        #Here's where the data live
        table = soup.find('table', {'class': 'google-visualization-table-table'})
        rows = table.find_all('tr')

        #Open the spreadsheet we created for this facility and write in the column
        #headers you know we're gonna want
        with open(fac_csv, 'w', newline='') as o:
            writer = csv.DictWriter(o, fieldnames=[
                'fac_id','year','rsei_score','industry_rsei_mean',
                'county_rsei_mean','state_rsei_mean','us_rsei_mean'])
            writer.writeheader()

            #Start with the first nonheader row in our html table
            for row in rows[1:]:
                cols = row.find_all('td')

                #The .string attribute is Beautiful Soup syntax
                year = cols[0].string.strip()
                fac_score = cols[1].string.strip()
                industry_mean = cols[2].string.strip()
                county_mean = cols[3].string.strip()
                state_mean = cols[4].string.strip()
                us_mean = cols[5].string.strip()

                writer.writerow({
                    'fac_id': fac,
                    'year': year,
                    'rsei_score': fac_score,
                    'industry_rsei_mean': industry_mean,
                    'county_rsei_mean': county_mean,
                    'state_rsei_mean': state_mean,
                    'us_rsei_mean': us_mean
                })

        #Probably don't need this since we pause for 10 after our initial query but
        #hell, why not sleep so we don't bombard the server
        time.sleep(1)

    #If you try to retrieve data from the url but there is none, put the facility_id
    #in our fac_not_found list so we can check that out later.
    except UnexpectedAlertPresentException:
        fac_not_found.append(fac)
        alert = driver.switch_to.alert
        alert.accept()

#close the headless browser now that we're done with it
driver.quit()

#output the facilities that were not found to a csv for later analysis
fac_not_found_df = pd.DataFrame(fac_not_found, columns=['fac_id'])
fac_not_found_df.to_csv('../data/RSEI/fac_not_found.csv', index=False)
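A side note on the fixed ten-second pause: the script imports WebDriverWait and By but never uses them. An explicit wait returns as soon as the delayed table actually renders rather than always sleeping the full ten seconds. Below is a minimal sketch of that swap, not part of the original gist; it assumes the google-visualization-table-table class is the element worth waiting for and that a 15-second cap is generous enough. It would replace the driver.get / time.sleep(10) pair inside the try block.

from selenium.webdriver.support import expected_conditions as EC

driver.get(fac_url)
#block until the Google Visualization table shows up in the DOM, but give up
#after 15 seconds (an assumed cap; tune it to how slow the page really is)
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located(
        (By.CLASS_NAME, 'google-visualization-table-table')))
html_src = driver.page_source

If the table never appears, WebDriverWait raises selenium.common.exceptions.TimeoutException, which you'd want to catch alongside UnexpectedAlertPresentException and treat the same way, by adding the facility to fac_not_found.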