Last active: August 29, 2015 14:06
Script to run a Penn DOS contributor search by name and year.
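Usage, as wired up in the __main__ block below: election years are passed as command-line arguments (defaulting to 2010-2014 when none are given), the names to search for are read from standard input one per line until a blank line, and matching rows are appended to output.csv, with a header row written only when the file is empty. Each name/year pair is retried up to five times in a fresh Firefox session if the search raises an error.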
# -*- coding: utf-8 -*-
import csv
import logging
import re
import sys
import time
import unittest

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException

# Element-id patterns for the cells of interest in the results grid;
# an example of each full id appears below its pattern.
TARGET_NODE_REGEX = re.compile(r".*hlnkFilerName2$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_hlnkFilerName2
CONTRIBUTOR_NODE_REGEX = re.compile(r".*lblRecipientName2$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_lblRecipientName2
DATE_NODE_REGEX = re.compile(r".*lblCycAmountDate$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_DLCycles_ctl00_DLAmounts_ctl00_lblCycAmountDate
AMOUNT_NODE_REGEX = re.compile(r".*lblCycAmount$")
# ctl00_ContentPlaceHolder1_gvResultsGrid_ctl02_DLCycles_ctl00_DLAmounts_ctl00_lblCycAmount

# Log INFO and above both to the console and to a file named "log".
logger = logging.getLogger('simple_example')
logger.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
ch = logging.FileHandler("log")
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
def _get_page_data(html, name, year):
    """Parse one page of results, keeping rows whose contributor field
    contains every whitespace-separated part of `name`."""
    soup = BeautifulSoup(html, 'html5lib')
    trs = soup.select("table#ctl00_ContentPlaceHolder1_gvResultsGrid > tbody > tr")
    data = []
    for tr in trs[1:]:  # skip the header row
        row = [
            tr.find_all('a', id=TARGET_NODE_REGEX),
            tr.find_all('span', id=CONTRIBUTOR_NODE_REGEX),
            tr.find_all('span', id=DATE_NODE_REGEX),
            tr.find_all('span', id=AMOUNT_NODE_REGEX)]
        if not all(row):
            # A row missing any cell (e.g. the pager row) ends the data.
            break
        name_parts = [part.lower() for part in re.split(r'\s+', name)]
        row = [cell[0].text.strip() for cell in row]
        lcontributor = row[1].lower()
        if all(part in lcontributor for part in name_parts):
            data.append([name, year] + row)
    return data
def search_penn_dos(driver, name, year):
    """Run a "contributions made" search on the Penn DOS campaign finance
    site and return the matching rows from every results page."""
    base_url = "https://www.campaignfinanceonline.state.pa.us/"
    data = []
    driver.get(base_url + "/pages/CFReportSearch.aspx")

    # Fill in the search form: contributions made, searched by contributor
    # name, for the requested election year.
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_rbContrMade").click()
    driver.find_element_by_xpath("//div[@id='divSearchRdBtns']/label[3]/b").click()
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_txtContributorNameCM").clear()
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_txtContributorNameCM").send_keys(name)
    Select(driver.find_element_by_id("ctl00_ContentPlaceHolder1_ddlElectionYear")).select_by_visible_text(str(year))
    driver.find_element_by_id("ctl00_ContentPlaceHolder1_btnSearch").click()

    # If the error label is present, the search returned no results.
    try:
        driver.find_element_by_id("ctl00_ContentPlaceHolder1_lblErrorMsg")
    except NoSuchElementException:
        pass
    else:
        return data

    page = 1
    # The last row of the results grid holds the pager links, if any.
    page_links = driver.find_elements_by_css_selector(
        "table#ctl00_ContentPlaceHolder1_gvResultsGrid > tbody > tr")[-1]
    try:
        page_links.find_element_by_link_text('2')
        has_multiple_pages = True
    except NoSuchElementException:
        has_multiple_pages = False

    while True:
        logger.info("Trying to extract data from page.")
        page_data = _get_page_data(driver.page_source, name, year)
        logger.info("Finished extracting data.")
        if page_data:
            data.extend(page_data)
        else:
            break
        if not has_multiple_pages:
            return data
        page += 1
        try:
            page_link = page_links.find_element_by_link_text(str(page))
        except NoSuchElementException:
            # No link for the next page: this was the last one.
            break
        logger.info("Moving to next page.")
        page_link.click()
        # Wait for the grid to refresh: the previous page number turns
        # back into a link once the new page has loaded.
        num_retries = 20
        while True:
            try:
                page_links = driver.find_elements_by_css_selector(
                    "table#ctl00_ContentPlaceHolder1_gvResultsGrid > tbody > tr")[-1]
                page_links.find_element_by_link_text(str(page - 1))
            except Exception:
                pass  # grid may still be mid-refresh; try again
            else:
                break
            time.sleep(5)
            num_retries -= 1
            if num_retries == 0:
                logger.error("Couldn't read page numbers.")
                return data
        logger.info("Page %d" % page)
    return data
if __name__ == "__main__":
    print("Please enter the list of names you'd like to search for. When "
          "you're done entering names, hit <Enter> from a blank line.")
    names = []
    # Election years come from the command line; default to 2010-2014.
    years = [int(year) for year in sys.argv[1:]]
    if not years:
        years = [2010, 2011, 2012, 2013, 2014]
    # Read names from stdin, one per line, until a blank line.
    while True:
        name = sys.stdin.readline().strip()
        if name:
            names.append(name)
        else:
            break

    for name in names:
        for year in years:
            logger.info("Searching for %s, year %d." % (name, year))
            num_retries = 5
            while True:
                # Use a fresh browser session for each attempt.
                driver = webdriver.Firefox()
                driver.set_page_load_timeout(120)
                try:
                    data = search_penn_dos(driver, name, year)
                except Exception:
                    driver.quit()
                    if num_retries == 0:
                        logger.exception("Encountered error when trying to "
                                         "search for '%s' in %d." % (name, year))
                        break
                    num_retries -= 1
                    logger.warning("Encountered error, retrying")
                    continue
                else:
                    driver.quit()
                    # Append results to output.csv, writing a header row
                    # only if the file is currently empty.
                    with open('output.csv', 'a+') as f:
                        writer = csv.writer(f, delimiter=',')
                        f.seek(0)
                        if not f.read(1):
                            writer.writerow([
                                'Search-Name', 'Search-Year',
                                'Target', 'Contributor', 'Date', 'Amount'])
                        f.seek(0, 2)
                        writer.writerows(data)
                    break
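The search function can also be driven directly, outside the interactive __main__ block. The lines below are a minimal sketch, not part of the gist: the name, year, and output filename are placeholders, and it assumes search_penn_dos is in scope (the snippet appended to the script, or the script imported under whatever filename it is saved as) along with the pinned selenium 2.43.0 API used above.

# Sketch only: "Jane Example", 2014, and single_search.csv are placeholders.
import csv
from selenium import webdriver

driver = webdriver.Firefox()
driver.set_page_load_timeout(120)
try:
    rows = search_penn_dos(driver, "Jane Example", 2014)
finally:
    driver.quit()

with open('single_search.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['Search-Name', 'Search-Year',
                     'Target', 'Contributor', 'Date', 'Amount'])
    writer.writerows(rows)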
Pinned dependencies (a separate requirements file in the same gist):
beautifulsoup4==4.3.2
html5lib==0.999
selenium==2.43.0
six==1.7.3
wsgiref==0.1.2
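These are the versions the script was written against. With this list saved as requirements.txt (the conventional filename; the gist does not show the actual one), they can be installed with pip install -r requirements.txt.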