Skip to content

Instantly share code, notes, and snippets.

@banterability
Created October 7, 2010 22:22
Show Gist options
  • Select an option

  • Save banterability/616028 to your computer and use it in GitHub Desktop.

Select an option

Save banterability/616028 to your computer and use it in GitHub Desktop.
Rough sketch of a parser for California election data feeds
import os
import shutil
import urllib2
import zipfile
from lxml import etree # requires lxml & friends
# TODO:
# - Total vote counts
# - County-level results
# Where the zipped feed is fetched from, which archive member holds the
# results, and the scratch directory used while downloading/extracting.
DOWNLOAD_URL = "http://www.sos.ca.gov/media/10gg/november2010-sample-xml.zip"
DATA_FILE = "X10GG_510.xml"
TMP_DIR = "election_tmp"

# Keyed by the first four characters of a contest ID. 'type' decides which
# handler processes the contest ('candidate' or 'measure'); 'description' is
# the human-readable office or contest category.
CONTEST_TYPES = dict(
    (code, {'description': description, 'type': kind})
    for code, description, kind in (
        ('0200', 'Governor', 'candidate'),
        ('0300', 'Lieutenant Governor', 'candidate'),
        ('0400', 'Secretary of State', 'candidate'),
        ('0500', 'State Controller', 'candidate'),
        ('0600', 'State Treasurer', 'candidate'),
        ('0700', 'Attorney General', 'candidate'),
        ('0800', 'Insurance Commissioner', 'candidate'),
        ('0900', 'Board of Equalization', 'candidate'),
        ('1000', 'U.S. Senate', 'candidate'),
        ('1100', 'U.S. Representative in Congress', 'candidate'),
        ('1200', 'State Senate', 'candidate'),
        ('1300', 'State Assembly', 'candidate'),
        ('1400', 'Supreme Court Justices', 'measure'),
        ('1500', 'Courts of Appeal Justices', 'measure'),
        ('1600', 'Superintendent of Public Instruction', 'candidate'),
        ('1900', 'Ballot Measures', 'measure'),
    )
)
#### Helpers ####
def download_file(url):
    """
    Download ``url`` into TMP_DIR and return the path of the saved copy.

    The local file keeps the last path segment of the URL as its name.
    TMP_DIR is created if it does not exist yet. Errors raised by urllib2
    or by the file system propagate to the caller.
    """
    # Tolerate a working directory left behind by an earlier, aborted run
    # instead of failing in os.mkdir.
    if not os.path.isdir(TMP_DIR):
        os.mkdir(TMP_DIR)
    local_path = os.path.join(TMP_DIR, url.split('/')[-1])

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'kpcc-hancock/1.0 +http://www.scpr.org')
    remote_file = urllib2.build_opener().open(request)
    try:
        # Write in binary mode: the response body is stored verbatim, and
        # text mode would mangle a zip archive on platforms that translate
        # newlines.
        with open(local_path, 'wb') as local_file:
            local_file.write(remote_file.read())
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        remote_file.close()
    return local_path
def unzip_and_extract(data_archive):
    """
    Copy the DATA_FILE member of the zip at ``data_archive`` into TMP_DIR.

    If the archive has no member named DATA_FILE nothing is written.
    Returns None.
    """
    # Zip archives are binary data, so the container must be opened with
    # 'rb'; text mode can corrupt or truncate it on some platforms.
    with open(data_archive, 'rb') as archive_file:
        archive = zipfile.ZipFile(archive_file)
        try:
            if DATA_FILE in archive.namelist():
                target = os.path.join(TMP_DIR, DATA_FILE)
                with open(target, 'wb') as outfile:
                    outfile.write(archive.read(DATA_FILE))
        finally:
            archive.close()
def process_candidate(contest_package):
    """
    Print the results of a contest with candidates.

    ``contest_package`` is a dict; this function reads its "name" entry
    (printed as the heading) and its "contest" entry, an element that
    supports ``findall``/``findtext``. For every ``TotalVotes/Selection``
    under the contest it prints the candidate name with affiliation, then
    the vote percentage (the ``CountMetric`` whose Id is "PVR") and the
    ``ValidVotes`` count. Missing values are printed as ``None``.

    TODO: Fix return type (currently prints and returns None).
    """
    contest = contest_package["contest"]
    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("%s:" % contest_package["name"])

    # Walk all available candidates
    for selection in contest.findall('TotalVotes/Selection'):
        candidate_name = selection.findtext(
            "Candidate/CandidateIdentifier/CandidateName")
        # Party affiliation
        party = selection.findtext("Candidate/Affiliation/Type")
        # TODO: Get incumbency status
        print(" %s (%s)" % (candidate_name, party))

        # Vote count and share of the vote for this candidate
        votes = selection.findtext("ValidVotes")
        percent = selection.findtext("CountMetric[@Id='PVR']")
        print(" - %s%% (%s)" % (percent, votes))
def process_measure(contest_package):
    """
    Print the results of a contest with yes/no options.

    ``contest_package`` is a dict; this function reads:
      - "contest": an element supporting ``find``/``findall``/``findtext``
      - "name": the contest name used in the heading
      - "info_dict": its "description" decides the heading format
      - "id": for "Ballot Measures", characters 7-11 are parsed as the
        proposition number

    For every ``TotalVotes/Selection`` it prints the option identifier,
    then the matching percentage and the ``ValidVotes`` count. Options
    other than "Yes"/"No" (or a selection without a ``ProposalItem``) are
    printed with a percentage of ``None`` rather than raising or reusing
    the previous option's figure.

    TODO: Fix return type (currently prints and returns None).
    """
    contest = contest_package["contest"]

    # If contest is a prop, extract the proposition number from the contest ID
    if contest_package["info_dict"]["description"] == "Ballot Measures":
        print("Prop %s: %s" % (
            int(contest_package["id"][7:12]),
            contest_package["name"]))
    else:
        print("%s:" % contest_package["name"])

    # Percentages live on the contest totals, not on each selection, so
    # look them up once and key them by the option they belong to.
    percent_by_option = {
        "Yes": contest.findtext('TotalVotes/CountMetric[@Id="PYV"]'),
        "No": contest.findtext('TotalVotes/CountMetric[@Id="PNV"]'),
    }

    # Walk all available responses (should just be yes and no)
    for selection in contest.findall('TotalVotes/Selection'):
        item = selection.find("Candidate/ProposalItem")
        if item is not None:
            option = item.get("ReferendumOptionIdentifier")
        else:
            option = None
        print(" %s" % option)
        votes = selection.findtext("ValidVotes")
        print(" - %s%% (%s)" % (percent_by_option.get(option), votes))
#### Main #####
# Fetch the feed: download the archive, extract the XML file, and parse it.
try:
    unzip_and_extract(download_file(DOWNLOAD_URL))
    # Read raw bytes so the XML parser can apply the document's own
    # encoding declaration instead of a platform-dependent text decoding.
    with open(os.path.join(TMP_DIR, DATA_FILE), 'rb') as response:
        results = etree.fromstring(response.read())
finally:
    # Erase the working directory whether or not the steps above succeeded;
    # ignore_errors keeps a cleanup problem (e.g. the directory was never
    # created) from masking the original exception.
    shutil.rmtree(TMP_DIR, ignore_errors=True)

contests = results.findall('.//Contests/Contest')
for contest in contests:
    # Extract the contest ID to decide how to handle things
    contest_id = contest.find('ContestIdentifier').get("Id", "Not defined")

    # Compare the first four characters against the mapping dictionary.
    # Prefixes missing from CONTEST_TYPES (including the "Not defined"
    # fallback) are reported and skipped rather than aborting the run
    # with a KeyError.
    contest_type_dict = CONTEST_TYPES.get(contest_id[0:4])
    if contest_type_dict is None:
        print("Skipping contest %s: unrecognized contest type" % contest_id)
        continue
    contest_type = contest_type_dict["type"]

    # Package contest plus previously accessed data for helper functions
    contest_package = {
        'contest': contest,
        'id': contest_id,
        'info_dict': contest_type_dict,
        'name': contest.findtext('ContestIdentifier/ContestName')}

    # Hand off...
    if contest_type == "measure":
        process_measure(contest_package)
    elif contest_type == "candidate":
        process_candidate(contest_package)
    # Blank separator between contests (same output as `print "\n"`).
    print("\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment