Script to run a Penn DOS contributor search by name and year.
# -*- coding: utf-8 -*-
import csv
import logging
import re
import sys
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
TARGET_NODE_REGEX = re.compile(r".*hlnkFilerName2$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_hlnkFilerName2
CONTRIBUTOR_NODE_REGEX = re.compile(r".*lblRecipientName2$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_lblRecipientName2
DATE_NODE_REGEX = re.compile(r".*lblCycAmountDate$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_DLCycles_ctl00_DLAmounts_ctl00_lblCycAmountDate
AMOUNT_NODE_REGEX = re.compile(r".*lblCycAmount$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_DLCycles_ctl00_DLAmounts_ctl00_lblCycAmount
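# The IDs above are auto-generated ASP.NET control IDs; the prefix (e.g.
# "ctl02") varies by row, so only the stable suffix is matched.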
# Log to both the console and a file named "log" in the working directory.
logger = logging.getLogger('simple_example')
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

fh = logging.FileHandler("log")
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
def _get_page_data(html, name, year):
    """Extract matching contribution rows from one page of results."""
    soup = BeautifulSoup(html, 'html5lib')
    trs = soup.select(
        "table#ctl00_ContentPlaceHolder1_gvResultsGrid > tbody > tr")
    data = []
    name_parts = [part.lower() for part in re.split(r'\s+', name)]
    for tr in trs[1:]:  # trs[0] is the header row
        row = [
            tr.find_all('a', id=TARGET_NODE_REGEX),
            tr.find_all('span', id=CONTRIBUTOR_NODE_REGEX),
            tr.find_all('span', id=DATE_NODE_REGEX),
            tr.find_all('span', id=AMOUNT_NODE_REGEX)]
        if not all(row):
            # A row missing any of the four nodes (e.g. the pager row)
            # marks the end of the data rows.
            break
        row = [cell[0].text.strip() for cell in row]
        lcontributor = row[1].lower()
        # Keep the row only if every part of the searched name appears
        # somewhere in the contributor name.
        if all(part in lcontributor for part in name_parts):
            data.append([name, year] + row)
    return data
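# A quick sketch of what _get_page_data expects. The fragment below is
# synthetic: the IDs are invented to satisfy the regexes above, not
# copied from the live site.
#
#   html = '''
#   <table id="ctl00_ContentPlaceHolder1_gvResultsGrid"><tbody>
#   <tr><th>header</th></tr>
#   <tr><td><a id="x_hlnkFilerName2">Some PAC</a></td>
#       <td><span id="x_lblRecipientName2">Smith, John</span></td>
#       <td><span id="x_lblCycAmountDate">1/15/2014</span></td>
#       <td><span id="x_lblCycAmount">$500.00</span></td></tr>
#   </tbody></table>'''
#   _get_page_data(html, "john smith", 2014)
#   # => [['john smith', 2014, 'Some PAC', 'Smith, John', '1/15/2014', '$500.00']]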
def search_penn_dos(driver, name, year):
    """Search the Penn DOS site for contributions made by `name` in
    `year` and return the matching rows from every results page."""
    base_url = "https://www.campaignfinanceonline.state.pa.us"
    data = []
    driver.get(base_url + "/pages/CFReportSearch.aspx")

    # Choose the "contributions made" search, fill in the contributor
    # name, and select the election year.
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_rbContrMade").click()
    driver.find_element_by_xpath("//div[@id='divSearchRdBtns']/label[3]/b").click()
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_txtContributorNameCM").clear()
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_txtContributorNameCM").send_keys(name)
    Select(driver.find_element_by_id(
        "ctl00_ContentPlaceHolder1_ddlElectionYear")).select_by_visible_text(str(year))
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearch").click()

    # If the error label is present, the search returned no results.
    try:
        driver.find_element_by_id("ctl00_ContentPlaceHolder1_lblErrorMsg")
    except NoSuchElementException:
        pass
    else:
        return data

    page = 1
    # The last row of the results grid holds the pager links.
    page_links = driver.find_elements_by_css_selector(
        "table#ctl00_ContentPlaceHolder1_gvResultsGrid > tbody > tr")[-1]
    try:
        page_links.find_element_by_link_text('2')
        has_multiple_pages = True
    except NoSuchElementException:
        has_multiple_pages = False

    while True:
        logger.info("Trying to extract data from page.")
        page_data = _get_page_data(driver.page_source, name, year)
        logger.info("Finished extracting data.")
        if page_data:
            data.extend(page_data)
        else:
            break
        if not has_multiple_pages:
            return data
        page += 1
        try:
            page_link = page_links.find_element_by_link_text(str(page))
        except NoSuchElementException:
            # No link to the next page: we were on the last one.
            break
        logger.info("Moving to next page.")
        page_link.click()
        # The grid reloads after the click, so poll until the pager
        # shows a link back to the page we just left.
        num_retries = 20
        while True:
            try:
                page_links = driver.find_elements_by_css_selector(
                    "table#ctl00_ContentPlaceHolder1_gvResultsGrid > tbody > tr")[-1]
                page_links.find_element_by_link_text(str(page - 1))
            except Exception:
                pass
            else:
                break
            time.sleep(5)
            num_retries -= 1
            if num_retries == 0:
                logger.error("Couldn't read page numbers.")
                return data
        logger.info("Page %d" % page)
    return data
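# A minimal usage sketch (assumes a local Firefox install that Selenium
# 2.43 can drive):
#
#   driver = webdriver.Firefox()
#   try:
#       rows = search_penn_dos(driver, "john smith", 2014)
#   finally:
#       driver.quit()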
if __name__ == "__main__":
    print("Please enter the list of names you'd like to search for. When "
          "you're done entering names, hit <Enter> from a blank line.")
    names = []
    # Years may be given as command-line arguments; default to 2010-2014.
    years = [int(year) for year in sys.argv[1:]]
    if not years:
        years = [2010, 2011, 2012, 2013, 2014]
    while True:
        name = sys.stdin.readline().strip()
        if name:
            names.append(name)
        else:
            break
    for name in names:
        for year in years:
            logger.info("Searching for %s, year %d." % (name, year))
            num_retries = 5
            while True:
                # Use a fresh browser for each attempt so a wedged
                # session doesn't poison the retries.
                driver = webdriver.Firefox()
                driver.set_page_load_timeout(120)
                try:
                    data = search_penn_dos(driver, name, year)
                except Exception:
                    driver.quit()
                    if num_retries == 0:
                        logger.exception("Encountered error when trying to "
                                         "search for '%s' in %d." % (name, year))
                        break
                    num_retries -= 1
                    logger.warning("Encountered error, retrying")
                    continue
                else:
                    driver.quit()
                # Append results to output.csv, writing the header row
                # only when the file is empty.
                with open('output.csv', 'a+') as f:
                    writer = csv.writer(f, delimiter=',')
                    f.seek(0)
                    if not f.read(1):
                        writer.writerow([
                            'Search-Name', 'Search-Year',
                            'Target', 'Contributor', 'Date', 'Amount'])
                    f.seek(0, 2)
                    writer.writerows(data)
                break
Requirements:

beautifulsoup4==4.3.2
html5lib==0.999
selenium==2.43.0
six==1.7.3
wsgiref==0.1.2
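Usage (a sketch; the filenames here are assumptions, not part of the gist):

    pip install -r requirements.txt
    python penn_dos_search.py 2013 2014 < names.txt

Years come from the command line (defaulting to 2010-2014), and names are
read from standard input, one per line, ending with a blank line.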