Last active
June 26, 2018 12:15
-
-
Save quandyfactory/e12b697615d2356d77759881cc3131b0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import io
import os
try:
    # Python 2 location of the parser class.
    from HTMLParser import HTMLParser
except ImportError:
    # Python 3 moved it into the html package.
    from html.parser import HTMLParser

import requests
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text nodes.

    Feed it HTML with ``feed()`` and read the accumulated text back with
    ``get_data()``.
    """

    def __init__(self):
        # Run the real base-class constructor rather than calling reset()
        # directly: the constructor may set up state that reset() does not
        # (Python 3 stores ``convert_charrefs`` there and feed() fails
        # without it).  The explicit-class call is used instead of super()
        # because Python 2's HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of *html* with every tag removed."""
    stripper = MLStripper()
    stripper.feed(html)
    # Same join that MLStripper.get_data() performs.
    return ''.join(stripper.fed)
# Directory the riding pages are saved to and read back from.  The home
# directory is resolved here because open() and os.listdir() do not expand
# a leading "~" themselves.
path = os.path.join(os.path.expanduser('~'), 'Desktop', 'election_results')
# Scheme and host that the site-relative riding URLs are appended to.
base_url = 'https://www.elections.on.ca'
# Every riding's results page sits at the same site-relative location and
# differs only in its slug.  The slugs and the bilingual "English / French"
# riding names were extracted from the select options in the main results
# webpage.
_RESULTS_PAGE = '/content/ngw/en/election-results/%s.html'
_RIDING_PAGES = [
    ('ajax', 'Ajax / Ajax'),
    ('algoma-manitoulin', 'Algoma—Manitoulin / Algoma—Manitoulin'),
    ('aurora-oak-ridges-richmond-hill', 'Aurora—Oak Ridges—Richmond Hill / Aurora—Oak Ridges—Richmond Hill'),
    ('barrie-innisfil', 'Barrie—Innisfil / Barrie—Innisfil'),
    ('barrie-springwater-oro-medonte', 'Barrie—Springwater—Oro-Medonte / Barrie—Springwater—Oro-Medonte'),
    ('bay-of-quinte', 'Bay of Quinte / Baie de Quinte'),
    ('beaches-east-york', 'Beaches—East York / Beaches—East York'),
    ('brampton-centre', 'Brampton Centre / Brampton-Centre'),
    ('brampton-east', 'Brampton East / Brampton-Est'),
    ('brampton-north', 'Brampton North / Brampton-Nord'),
    ('brampton-south', 'Brampton South / Brampton-Sud'),
    ('brampton-west', 'Brampton West / Brampton-Ouest'),
    ('brantford-brant', 'Brantford—Brant / Brantford—Brant'),
    ('bruce-grey-owen-sound', 'Bruce—Grey—Owen Sound / Bruce—Grey—Owen Sound'),
    ('burlington', 'Burlington / Burlington'),
    ('cambridge', 'Cambridge / Cambridge'),
    ('carleton', 'Carleton / Carleton'),
    ('chatham-kent-leamington', 'Chatham-Kent—Leamington / Chatham-Kent—Leamington'),
    ('davenport', 'Davenport / Davenport'),
    ('don-valley-east', 'Don Valley East / Don Valley-Est'),
    ('don-valley-north', 'Don Valley North / Don Valley-Nord'),
    ('don-valley-west', 'Don Valley West / Don Valley-Ouest'),
    ('dufferin-caledon', 'Dufferin—Caledon / Dufferin—Caledon'),
    ('durham', 'Durham / Durham'),
    ('eglinton-lawrence', 'Eglinton—Lawrence / Eglinton—Lawrence'),
    ('elgin-middlesex-london', 'Elgin—Middlesex—London / Elgin—Middlesex—London'),
    ('essex', 'Essex / Essex'),
    ('etobicoke-centre', 'Etobicoke Centre / Etobicoke-Centre'),
    ('etobicoke-lakeshore', 'Etobicoke—Lakeshore / Etobicoke—Lakeshore'),
    ('etobicoke-north', 'Etobicoke North / Etobicoke-Nord'),
    ('flamborough-glanbrook', 'Flamborough—Glanbrook / Flamborough—Glanbrook'),
    ('glengarry-prescott-russell', 'Glengarry—Prescott—Russell / Glengarry—Prescott—Russell'),
    ('guelph', 'Guelph / Guelph'),
    ('haldimand-norfolk', 'Haldimand—Norfolk / Haldimand—Norfolk'),
    ('haliburton-kawartha-lakes-brock', 'Haliburton—Kawartha Lakes—Brock / Haliburton—Kawartha Lakes—Brock'),
    ('hamilton-centre', 'Hamilton Centre / Hamilton-Centre'),
    ('hamilton-east-stoney-creek', 'Hamilton East—Stoney Creek / Hamilton-Est—Stoney Creek'),
    ('hamilton-mountain', 'Hamilton Mountain / Hamilton Mountain'),
    ('hamilton-west-ancaster-dundas', 'Hamilton West—Ancaster—Dundas / Hamilton-Ouest—Ancaster—Dundas'),
    ('hastings-lennox-and-addington', 'Hastings—Lennox and Addington / Hastings—Lennox and Addington'),
    ('humber-river-black-creek', 'Humber River—Black Creek / Humber River—Black Creek'),
    ('huron-bruce', 'Huron—Bruce / Huron—Bruce'),
    ('kanata-carleton', 'Kanata—Carleton / Kanata—Carleton'),
    ('kenora-rainy-river', 'Kenora—Rainy River / Kenora—Rainy River'),
    ('king-vaughan', 'King—Vaughan / King—Vaughan'),
    ('kingston-and-the-islands', 'Kingston and the Islands / Kingston et les Îles'),
    ('kitchener-centre', 'Kitchener Centre / Kitchener-Centre'),
    ('kitchener-conestoga', 'Kitchener—Conestoga / Kitchener—Conestoga'),
    ('kitchener-south-hespeler', 'Kitchener South—Hespeler / Kitchener-Sud—Hespeler'),
    ('lambton-kent-middlesex', 'Lambton—Kent—Middlesex / Lambton—Kent—Middlesex'),
    ('lanark-frontenac-kingston', 'Lanark—Frontenac—Kingston / Lanark—Frontenac—Kingston'),
    ('leeds-grenville-thousand-islands-and-rideau-lakes', 'Leeds—Grenville—Thousand Islands and Rideau Lakes / Leeds—Grenville—Thousand Islands et Rideau Lakes'),
    ('london-fanshawe', 'London—Fanshawe / London—Fanshawe'),
    ('london-north-centre', 'London North Centre / London-Centre-Nord'),
    ('london-west', 'London West / London-Ouest'),
    ('markham-stouffville', 'Markham—Stouffville / Markham—Stouffville'),
    ('markham-thornhill', 'Markham—Thornhill / Markham—Thornhill'),
    ('markham-unionville', 'Markham—Unionville / Markham—Unionville'),
    ('milton', 'Milton / Milton'),
    ('mississauga-centre', 'Mississauga Centre / Mississauga-Centre'),
    ('mississauga-east-cooksville', 'Mississauga East—Cooksville / Mississauga-Est—Cooksville'),
    ('mississauga-erin-mills', 'Mississauga—Erin Mills / Mississauga—Erin Mills'),
    ('mississauga-lakeshore', 'Mississauga—Lakeshore / Mississauga—Lakeshore'),
    ('mississauga-malton', 'Mississauga—Malton / Mississauga—Malton'),
    ('mississauga-streetsville', 'Mississauga—Streetsville / Mississauga—Streetsville'),
    ('nepean', 'Nepean / Nepean'),
    ('newmarket-aurora', 'Newmarket—Aurora / Newmarket—Aurora'),
    ('niagara-centre', 'Niagara Centre / Niagara-Centre'),
    ('niagara-falls', 'Niagara Falls / Niagara Falls'),
    ('niagara-west', 'Niagara West / Niagara-Ouest'),
    ('nickel-belt', 'Nickel Belt / Nickel Belt'),
    ('nipissing', 'Nipissing / Nipissing'),
    ('northumberland-peterborough-south', 'Northumberland—Peterborough South / Northumberland—Peterborough-Sud'),
    ('oakville', 'Oakville / Oakville'),
    ('oakville-north-burlington', 'Oakville North—Burlington / Oakville-Nord—Burlington'),
    ('orl-ans', 'Orléans / Orléans'),
    ('oshawa', 'Oshawa / Oshawa'),
    ('ottawa-centre', 'Ottawa Centre / Ottawa-Centre'),
    ('ottawa-south', 'Ottawa South / Ottawa-Sud'),
    ('ottawa-vanier', 'Ottawa—Vanier / Ottawa—Vanier'),
    ('ottawa-west-nepean', 'Ottawa West—Nepean / Ottawa-Ouest—Nepean'),
    ('oxford', 'Oxford / Oxford'),
    ('parkdale-high-park', 'Parkdale—High Park / Parkdale—High Park'),
    ('parry-sound-muskoka', 'Parry Sound—Muskoka / Parry Sound—Muskoka'),
    ('perth-wellington', 'Perth—Wellington / Perth—Wellington'),
    ('peterborough-kawartha', 'Peterborough—Kawartha / Peterborough—Kawartha'),
    ('pickering-uxbridge', 'Pickering—Uxbridge / Pickering—Uxbridge'),
    ('renfrew-nipissing-pembroke', 'Renfrew—Nipissing—Pembroke / Renfrew—Nipissing—Pembroke'),
    ('richmond-hill', 'Richmond Hill / Richmond Hill'),
    ('st--catharines', 'St. Catharines / St. Catharines'),
    ('sarnia-lambton', 'Sarnia—Lambton / Sarnia—Lambton'),
    ('sault-ste--marie', 'Sault Ste. Marie / Sault Ste. Marie'),
    ('scarborough-agincourt', 'Scarborough—Agincourt / Scarborough—Agincourt'),
    ('scarborough-centre', 'Scarborough Centre / Scarborough-Centre'),
    ('scarborough-guildwood', 'Scarborough—Guildwood / Scarborough—Guildwood'),
    ('scarborough-north', 'Scarborough North / Scarborough-Nord'),
    ('scarborough-rouge-park', 'Scarborough—Rouge Park / Scarborough—Rouge Park'),
    ('scarborough-southwest', 'Scarborough Southwest / Scarborough-Sud-Ouest'),
    ('simcoe-grey', 'Simcoe—Grey / Simcoe—Grey'),
    ('simcoe-north', 'Simcoe North / Simcoe-Nord'),
    ('spadina-fort-york', 'Spadina—Fort York / Spadina—Fort York'),
    ('stormont-dundas-south-glengarry', 'Stormont—Dundas—South Glengarry / Stormont—Dundas—Sud-Glengarry'),
    ('sudbury', 'Sudbury / Sudbury'),
    ('thornhill', 'Thornhill / Thornhill'),
    ('thunder-bay-atikokan', 'Thunder Bay—Atikokan / Thunder Bay—Atikokan'),
    ('thunder-bay-superior-north', 'Thunder Bay—Superior North / Thunder Bay—Supérieur-Nord'),
    ('timiskaming-cochrane', 'Timiskaming—Cochrane / Timiskaming—Cochrane'),
    ('timmins', 'Timmins / Timmins'),
    ('toronto-centre', 'Toronto Centre / Toronto-Centre'),
    ('toronto-danforth', 'Toronto—Danforth / Toronto—Danforth'),
    ('toronto-st--paul-s', 'Toronto—St. Paul\'s / Toronto—St. Paul\'s'),
    ('university-rosedale', 'University—Rosedale / University—Rosedale'),
    ('vaughan-woodbridge', 'Vaughan—Woodbridge / Vaughan—Woodbridge'),
    ('waterloo', 'Waterloo / Waterloo'),
    ('wellington-halton-hills', 'Wellington—Halton Hills / Wellington—Halton Hills'),
    ('whitby', 'Whitby / Whitby'),
    ('willowdale', 'Willowdale / Willowdale'),
    ('windsor-tecumseh', 'Windsor—Tecumseh / Windsor—Tecumseh'),
    ('windsor-west', 'Windsor West / Windsor-Ouest'),
    ('york-centre', 'York Centre / York-Centre'),
    ('york-simcoe', 'York—Simcoe / York—Simcoe'),
    ('york-south-weston', 'York South—Weston / York-Sud—Weston'),
    ('kiiwetinoong', 'Kiiwetinoong / Kiiwetinoong'),
    ('mushkegowuk-james-bay', 'Mushkegowuk—James Bay / Mushkegowuk—Baie James'),
]
# Tuple of (site-relative riding URL, riding name) pairs, in the order above.
ridings = tuple((_RESULTS_PAGE % slug, name) for slug, name in _RIDING_PAGES)
def get_riding_files(directory=None, proxies=None):
    """Grab each riding result HTML file and save it into a local directory.

    One file is written per entry in ``ridings``.  It is named after the
    English half of the riding name (the text before " / "), with em dashes
    replaced by ASCII hyphens, plus an ``.html`` extension.

    Args:
        directory: Folder to save the pages into; created if it does not
            exist.  Defaults to the module-level ``path``.
        proxies: Optional proxy mapping handed straight to ``requests.get``.
            ``None`` (the default) sends the request without explicit proxies.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never saved as if it were a results page.
    """
    if directory is None:
        directory = path
    directory = os.path.expanduser(directory)
    if not os.path.isdir(directory):
        os.makedirs(directory)
    for riding_url, riding_name in ridings:
        url = '%s%s' % (base_url, riding_url)
        english_name = riding_name.replace('—', '-').split(' / ')[0]
        filename = os.path.join(directory, '%s.html' % english_name)
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, proxies=proxies, timeout=30)
        response.raise_for_status()
        # Binary mode: the payload is already encoded to UTF-8 bytes.
        with open(filename, 'wb') as myfile:
            myfile.write(response.text.encode('utf8'))
def clean_riding_files(directory=None):
    """Reduce each saved riding page to just its results table, in place.

    Every ``.html`` file in the directory is rewritten so that it holds only
    the content of the first ``<table class="table" align="center">``
    element, re-wrapped in a bare ``<table>`` tag, with tab characters and
    ``<em>`` tags removed and the language-divider ``<span>`` replaced by a
    plain " / ".

    Files that do not contain the results-table marker are left untouched,
    so running this twice (or over a folder holding other pages) does not
    abort part-way through.

    Args:
        directory: Folder holding the downloaded pages.  Defaults to the
            module-level ``path``.
    """
    if directory is None:
        directory = path
    directory = os.path.expanduser(directory)
    marker1 = '<table class="table" align="center">'
    marker2 = '</table>'
    replacements = (
        ('\t', ''),
        ('<span class="eresults-language-divider"> /</span>', ' / '),
        ('<em>', ''),
        ('</em>', ''),
    )
    for name in os.listdir(directory):
        file_path = os.path.join(directory, name)
        if not name.endswith('.html') or not os.path.isfile(file_path):
            continue
        # io.open decodes/encodes UTF-8 at the file boundary on both
        # Python 2 and 3, so everything in between is plain text.
        with io.open(file_path, 'r', encoding='utf8') as myfile:
            text = myfile.read()
        _, found, remainder = text.partition(marker1)
        if not found:
            # No results table here (e.g. the file was already cleaned).
            continue
        table_content = remainder.partition(marker2)[0]
        text = '<table>\n%s\n</table>' % table_content
        for old, new in replacements:
            text = text.replace(old, new)
        with io.open(file_path, 'w', encoding='utf8') as writefile:
            writefile.write(text)
def _sql_quote(value):
    """Return value as a single-quoted MySQL string literal."""
    # Backslash is an escape character in MySQL's default SQL mode, so it is
    # doubled along with the quote character itself.
    return "'%s'" % value.replace('\\', '\\\\').replace("'", "''")


def _parse_result_rows(table_html):
    """Return one list of non-empty, stripped cell texts per table-body row."""
    _, found, body = table_html.partition('<tbody>')
    if not found:
        return []
    rows = []
    # Splitting on '<tr' keeps each row's markup together, so tags are
    # stripped one row at a time and no in-band row separator is needed.
    for fragment in body.split('<tr')[1:]:
        row_text = strip_tags('<tr' + fragment).replace('\n', '\t')
        cells = [cell.strip() for cell in row_text.split('\t')]
        rows.append([cell for cell in cells if cell])
    return rows


def _emit(text):
    """Print text; on Python 2 as UTF-8 bytes rather than via implicit ASCII."""
    if str is bytes and not isinstance(text, str):
        text = text.encode('utf8')
    print(text)


def make_sql(directory=None):
    """Write a MySQL insert query covering every cleaned results file.

    Each ``.html`` file in the directory contributes one value tuple per
    table-body row: the riding (the file name without ``.html``), the
    candidate, the party (the text before " / "), the vote count with
    thousands separators removed and the percentage without its "%" sign.
    Every value is emitted as an escaped string literal.  Rows with fewer
    than four cells, and files without a ``<tbody>``, are skipped.

    The riding name is printed as each file is processed, followed by the
    finished statement.

    Args:
        directory: Folder holding the cleaned pages.  Defaults to the
            module-level ``path``.

    Returns:
        The complete insert statement that was printed.
    """
    if directory is None:
        directory = path
    directory = os.path.expanduser(directory)
    query = "insert into ontario_election_2018_results (riding, candidate, party, votes, percentage) values "
    query_values = []
    for f in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, f)
        if not f.endswith('.html') or not os.path.isfile(file_path):
            continue
        riding = f[:-len('.html')]
        if isinstance(riding, bytes):
            # Python 2 lists file names as byte strings; decode so the name
            # can be combined with the text read from the file.
            riding = riding.decode('utf8', 'replace')
        _emit(riding)
        with io.open(file_path, 'r', encoding='utf8') as myfile:
            text = myfile.read()
        for cells in _parse_result_rows(text):
            if len(cells) < 4:
                continue
            candidate = cells[0]
            party = cells[1].split(' / ')[0]
            votes = cells[2].replace(',', '')  # strip out comma from numbers
            percentage = cells[3].replace('%', '')  # strip out percent symbol
            fields = (riding, candidate, party, votes, percentage)
            query_values.append('(%s)' % ', '.join(_sql_quote(field) for field in fields))
    statement = '%s\n%s' % (query, ', \n'.join(query_values))
    _emit(statement)
    return statement
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment