Created
October 7, 2010 22:22
-
-
Save banterability/616028 to your computer and use it in GitHub Desktop.
Rough sketch of a parser for California election data feeds
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import shutil | |
| import urllib2 | |
| import zipfile | |
| from lxml import etree # requires lxml & friends | |
| # TODO: | |
| # - Total vote counts | |
| # - County-level results | |
# Zipped results feed to fetch, and the XML member inside it that gets parsed.
DOWNLOAD_URL = "http://www.sos.ca.gov/media/10gg/november2010-sample-xml.zip"
DATA_FILE = "X10GG_510.xml"

# Scratch directory that holds the downloaded archive and the extracted XML.
TMP_DIR = "election_tmp"

# Keyed by the first four characters of a contest ID. Each entry carries a
# human-readable description and the kind of handler the contest needs:
# 'candidate' (people on the ballot) or 'measure' (yes/no options).
CONTEST_TYPES = dict(
    (prefix, {'description': description, 'type': kind})
    for prefix, description, kind in (
        ('0200', 'Governor', 'candidate'),
        ('0300', 'Lieutenant Governor', 'candidate'),
        ('0400', 'Secretary of State', 'candidate'),
        ('0500', 'State Controller', 'candidate'),
        ('0600', 'State Treasurer', 'candidate'),
        ('0700', 'Attorney General', 'candidate'),
        ('0800', 'Insurance Commissioner', 'candidate'),
        ('0900', 'Board of Equalization', 'candidate'),
        ('1000', 'U.S. Senate', 'candidate'),
        ('1100', 'U.S. Representative in Congress', 'candidate'),
        ('1200', 'State Senate', 'candidate'),
        ('1300', 'State Assembly', 'candidate'),
        ('1400', 'Supreme Court Justices', 'measure'),
        ('1500', 'Courts of Appeal Justices', 'measure'),
        ('1600', 'Superintendent of Public Instruction', 'candidate'),
        ('1900', 'Ballot Measures', 'measure'),
    )
)
| #### Helpers #### | |
def download_file(url):
    """
    Download `url` into TMP_DIR and return the path of the saved file.

    The local file is named after the last '/'-separated segment of `url`.
    TMP_DIR is created when it does not exist yet.
    """
    # A run that died before cleanup can leave the directory behind; a bare
    # os.mkdir() would then raise and block every later run.
    if not os.path.isdir(TMP_DIR):
        os.mkdir(TMP_DIR)
    local_path = os.path.join(TMP_DIR, url.split('/')[-1])

    request = urllib2.Request(url)
    request.add_header('User-Agent', 'kpcc-hancock/1.0 +http://www.scpr.org')
    remote_file = urllib2.build_opener().open(request)
    try:
        # Binary mode: the result is handed to zipfile by the caller, and a
        # text-mode handle rewrites newline bytes on Windows.
        with open(local_path, 'wb') as local_file:
            local_file.write(remote_file.read())
    finally:
        # Release the connection even when the read or write fails.
        remote_file.close()
    return local_path
def unzip_and_extract(data_archive, target_name=None, dest_dir=None):
    """
    Extract a single member from the zip file at `data_archive`.

    data_archive -- path of the zip archive to read.
    target_name  -- exact member name to extract; defaults to DATA_FILE.
    dest_dir     -- existing directory to write the member into; defaults
                    to TMP_DIR.

    Other members of the archive are ignored. Nothing is written when the
    archive has no member called `target_name`.
    """
    if target_name is None:
        target_name = DATA_FILE
    if dest_dir is None:
        dest_dir = TMP_DIR

    # Let ZipFile open the path itself so the archive is read in binary mode.
    archive = zipfile.ZipFile(data_archive, 'r')
    try:
        if target_name in archive.namelist():
            with open(os.path.join(dest_dir, target_name), 'wb') as outfile:
                outfile.write(archive.read(target_name))
    finally:
        # Close the archive even if reading or writing the member fails.
        archive.close()
def process_candidate(contest_package):
    """
    Print the results of a contest that has candidates.

    Reads the "contest" element and "name" from `contest_package`, prints
    the contest name, then prints two lines per `TotalVotes/Selection`
    element: the candidate name with affiliation type, and the 'PVR'
    percentage with the valid-vote count. Values missing from the XML are
    printed as None.

    TODO: Fix return type.
    """
    root = contest_package["contest"]
    print("%s:" % contest_package["name"])

    # One Selection element per available candidate
    for selection in root.findall('TotalVotes/Selection'):
        name = selection.findtext("Candidate/CandidateIdentifier/CandidateName")
        # Party affiliation
        party = selection.findtext("Candidate/Affiliation/Type")
        # TODO: Get incumbency status
        print(" %s (%s)" % (name, party))

        # Share of the vote in this contest, then the raw vote count
        percent = selection.findtext("CountMetric[@Id='PVR']")
        vote_count = selection.findtext("ValidVotes")
        print(" - %s%% (%s)" % (percent, vote_count))
def process_measure(contest_package):
    """
    Print the results of a contest with yes/no options.

    Reads "contest", "id", "name" and "info_dict" from `contest_package`.
    Prints a heading, then two lines per `TotalVotes/Selection` element:
    the option identifier, and its percentage with the valid-vote count.
    The percentage is the contest's 'PYV' metric for "Yes" and 'PNV' for
    "No"; any other (or missing) option identifier is shown with a
    percentage of None.

    TODO: Fix return type.
    """
    contest = contest_package["contest"]

    # If contest is a prop, extract the proposition number from the contest ID
    if contest_package["info_dict"]["description"] == "Ballot Measures":
        print("Prop %s: %s" % (
            int(contest_package["id"][7:12]),
            contest_package["name"]))
    else:
        print("%s:" % contest_package["name"])

    # Percentages live on the contest, not on the individual options, so map
    # them by option identifier up front. Looking each option up in this map
    # avoids reusing the previous option's percentage (or hitting an unbound
    # local) when an identifier is neither "Yes" nor "No".
    percent_by_option = {
        "Yes": contest.findtext('TotalVotes/CountMetric[@Id="PYV"]'),
        "No": contest.findtext('TotalVotes/CountMetric[@Id="PNV"]'),
    }

    # Get all available responses (Should just be yes and no)
    for selection in contest.findall('TotalVotes/Selection'):
        item = selection.find("Candidate/ProposalItem")
        # Compare against None explicitly: an element without children is
        # falsy, so a plain truth test would misfire.
        if item is not None:
            option = item.get("ReferendumOptionIdentifier")
        else:
            option = None
        print(" %s" % option)

        votes = selection.findtext("ValidVotes")
        print(" - %s%% (%s)" % (percent_by_option.get(option), votes))
#### Main #####

# Download the archive, extract the XML feed and parse it. The working
# directory is erased in `finally` so a failed download, extraction or parse
# does not leave it behind for the next run to trip over.
try:
    unzip_and_extract(download_file(DOWNLOAD_URL))
    # Read raw bytes and leave decoding to the XML parser.
    with open(os.path.join(TMP_DIR, DATA_FILE), 'rb') as response:
        results = etree.fromstring(response.read())
finally:
    if os.path.isdir(TMP_DIR):
        shutil.rmtree(TMP_DIR)

for contest in results.findall('.//Contests/Contest'):
    # Extract the contest ID to decide how to handle things
    contest_id = contest.find('ContestIdentifier').get("Id", "Not defined")

    # Compare the first four digits against mapping dictionary.
    # NOTE(review): an ID whose prefix is not in CONTEST_TYPES (including the
    # "Not defined" fallback above) raises KeyError here and stops the run.
    contest_type_dict = CONTEST_TYPES[contest_id[0:4]]
    contest_type = contest_type_dict["type"]

    # Package contest plus previously accessed data for helper functions
    contest_package = {
        'contest': contest,
        'id': contest_id,
        'info_dict': contest_type_dict,
        'name': contest.findtext('ContestIdentifier/ContestName'),
    }

    # Hand off...
    if contest_type == "measure":
        process_measure(contest_package)
    elif contest_type == "candidate":
        process_candidate(contest_package)

    # Blank space between contests
    print("\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment