Created
March 13, 2018 18:40
-
-
Save righthandabacus/bc40a8af2336076a4cf3bd0d38817d16 to your computer and use it in GitHub Desktop.
收集投票率數據並寫入google spreadsheet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from __future__ import print_function | |
import datetime | |
import os | |
import subprocess | |
import time | |
import urlparse | |
from StringIO import StringIO | |
from lxml import etree | |
import gspread # for accessing google spreadsheet | |
from oauth2client.service_account import ServiceAccountCredentials | |
FNULL = open(os.devnull, 'w') | |
LINKS = [ | |
("Hong Kong Island",'http://www.elections.gov.hk/legco2018by/eng/tt_gc_LC1.html'), | |
("Kowloon West",'http://www.elections.gov.hk/legco2018by/eng/tt_gc_LC2.html'), | |
("New Territories East",'http://www.elections.gov.hk/legco2018by/eng/tt_gc_LC5.html'), | |
("Architectural, Surveying, Planning And Landscape",'http://www.elections.gov.hk/legco2018by/eng/tt_fc_ASnP.html') | |
] | |
def getTable(url): | |
'''Craw page, get array data | |
''' | |
main_html = subprocess.check_output(['curl',url], stderr=FNULL) | |
parser = etree.HTMLParser() | |
dom = etree.parse(StringIO(main_html), parser) | |
# voter count | |
e = dom.xpath(r"//table/tr/td[text()[contains(.,'Number of Registered')]]") | |
assert(e) | |
assert(e[0].getchildren()) # comment tags | |
votercount = e[0].getchildren()[0].tail | |
assert(votercount) | |
votercount = int(''.join(c for c in votercount if c != ',')) | |
# turn out data | |
tabledom = dom.xpath(r"//table/tr/td[text()[contains(.,'Number of Registered')]]/../..") | |
assert(tabledom) # This table must exist | |
numbertab = tabledom[0].getnext() # table of data | |
array = [] | |
for row in numbertab.xpath(r"//tr"): | |
rowdata = [cell.text.strip() for cell in row.getchildren() if cell.text] | |
if len(rowdata) != 3 or not all(rowdata): | |
continue # shouldn't be any empty stuff | |
rowdata[0] = rowdata[0].split()[-1] | |
rowdata[1] = ''.join(c for c in rowdata[1] if c != ',') # remove thousand separateor | |
if not rowdata[1].isdigit(): | |
continue | |
else: | |
rowdata[1] = int(rowdata[1]) | |
array.append(rowdata[:2]) | |
# return data | |
return votercount, array | |
def main(): | |
'''Crawl and update google sheet | |
''' | |
alldata = {} | |
for name, url in LINKS: | |
#print('Crawling %s' % url) | |
alldata[name.lower()] = getTable(url) | |
# format for output (row1: header) | |
outrow1 = ["Constituency" | |
,"Hong Kong Island" | |
,"Kowloon West" | |
,"New Territories East" | |
,"Architectural, Surveying, Planning And Landscape" | |
] | |
emptyrow = [''] * len(outrow1) | |
outarray = [outrow1, emptyrow[:]] | |
# row 2: voter count | |
for i,cell in enumerate(outrow1): | |
if i == 0: | |
outarray[1][0] = 'Number of Registered Electors' | |
elif cell.lower() in alldata: | |
outarray[1][i] = alldata[cell.lower()][0] | |
# row 3: empty | |
outarray.append(emptyrow[:]) | |
# row 4 | |
outarray.append(emptyrow[:]) | |
outarray[3][0] = 'Time' | |
# row 5 + beyond | |
for i, cell in enumerate(outrow1): | |
if cell.lower() not in alldata: continue # no data, empty anyway | |
array = alldata[cell.lower()][1] | |
for row in array: | |
if row[0] not in [r[0] for r in outarray]: | |
outarray.append(emptyrow[:]) # new row | |
outarray[-1][0] = row[0] | |
timerow = [r for r in outarray if r[0] == row[0]] | |
assert(timerow) # pick the row that match the time, must exist | |
timerow[0][i] = row[1] | |
# authenticate google client | |
scope = ['https://spreadsheets.google.com/feeds'] | |
creds = ServiceAccountCredentials.from_json_keyfile_name('service_account.json', scope) | |
client = gspread.authorize(creds) | |
myclient = client.open("2018 Progressive Turnout Rate") | |
sheet = [w for w in myclient.worksheets() if w.title == '2018 By-Elections'][0] | |
# write to google sheet | |
for rowid, sheetrow in enumerate(outarray): | |
for colid, sheetcell in enumerate(sheetrow): | |
sheet.update_cell(rowid+1, colid+1, sheetcell) | |
# Print, in TSV | |
print('Updated sheet to the following:') | |
print("\n".join( | |
"\t".join(str(cell) for cell in row) | |
for row in outarray | |
)) | |
print(datetime.datetime.now()) | |
if __name__ == '__main__': | |
main() | |
# vim:set ts=4 sw=4 et: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment