@righthandabacus
Created March 13, 2018 18:40
Collect turnout data and write it into a google spreadsheet
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import datetime
import os
import subprocess
import time
import urlparse
from StringIO import StringIO
from lxml import etree
import gspread # for accessing google spreadsheet
from oauth2client.service_account import ServiceAccountCredentials
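# Note: this is Python 2 code; urlparse and StringIO are Python 2 modules
# (their Python 3 counterparts are urllib.parse and io.StringIO)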
FNULL = open(os.devnull, 'w')
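# Constituency names and their turnout pages on elections.gov.hk for the
# 2018 Legislative Council by-election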
LINKS = [
    ("Hong Kong Island", 'http://www.elections.gov.hk/legco2018by/eng/tt_gc_LC1.html'),
    ("Kowloon West", 'http://www.elections.gov.hk/legco2018by/eng/tt_gc_LC2.html'),
    ("New Territories East", 'http://www.elections.gov.hk/legco2018by/eng/tt_gc_LC5.html'),
    ("Architectural, Surveying, Planning And Landscape", 'http://www.elections.gov.hk/legco2018by/eng/tt_fc_ASnP.html'),
]
def getTable(url):
    '''Crawl a turnout page, return (registered voter count, turnout table)
    '''
    main_html = subprocess.check_output(['curl', url], stderr=FNULL)
    parser = etree.HTMLParser()
    dom = etree.parse(StringIO(main_html), parser)
    # voter count
    e = dom.xpath(r"//table/tr/td[text()[contains(.,'Number of Registered')]]")
    assert(e)
    assert(e[0].getchildren()) # comment tags
    votercount = e[0].getchildren()[0].tail
    assert(votercount)
    votercount = int(''.join(c for c in votercount if c != ','))
    # turnout data
    tabledom = dom.xpath(r"//table/tr/td[text()[contains(.,'Number of Registered')]]/../..")
    assert(tabledom) # This table must exist
    numbertab = tabledom[0].getnext() # table of data
    array = []
    for row in numbertab.xpath(r"//tr"): # '//' is absolute: scans every row on the page, filtered below
        rowdata = [cell.text.strip() for cell in row.getchildren() if cell.text]
        if len(rowdata) != 3 or not all(rowdata):
            continue # shouldn't be any empty stuff
        rowdata[0] = rowdata[0].split()[-1] # keep only the last token (the time label)
        rowdata[1] = ''.join(c for c in rowdata[1] if c != ',') # remove thousand separator
        if not rowdata[1].isdigit():
            continue
        else:
            rowdata[1] = int(rowdata[1])
        array.append(rowdata[:2])
    # return data
    return votercount, array
def main():
    '''Crawl and update google sheet
    '''
    alldata = {}
    for name, url in LINKS:
        #print('Crawling %s' % url)
        alldata[name.lower()] = getTable(url)
    # format for output (row 1: header)
    outrow1 = ["Constituency"
              ,"Hong Kong Island"
              ,"Kowloon West"
              ,"New Territories East"
              ,"Architectural, Surveying, Planning And Landscape"
              ]
    emptyrow = [''] * len(outrow1)
    outarray = [outrow1, emptyrow[:]]
    # row 2: voter count
    for i, cell in enumerate(outrow1):
        if i == 0:
            outarray[1][0] = 'Number of Registered Electors'
        elif cell.lower() in alldata:
            outarray[1][i] = alldata[cell.lower()][0]
    # row 3: empty
    outarray.append(emptyrow[:])
    # row 4
    outarray.append(emptyrow[:])
    outarray[3][0] = 'Time'
    # row 5 and beyond: one row per reporting time, one column per constituency
    for i, cell in enumerate(outrow1):
        if cell.lower() not in alldata:
            continue # no data, empty anyway
        array = alldata[cell.lower()][1]
        for row in array:
            if row[0] not in [r[0] for r in outarray]:
                outarray.append(emptyrow[:]) # new row
                outarray[-1][0] = row[0]
            timerow = [r for r in outarray if r[0] == row[0]]
            assert(timerow) # pick the row that matches the time, must exist
            timerow[0][i] = row[1]
    # authenticate google client (the spreadsheet must be shared with the service account in service_account.json)
    scope = ['https://spreadsheets.google.com/feeds']
    creds = ServiceAccountCredentials.from_json_keyfile_name('service_account.json', scope)
    client = gspread.authorize(creds)
    myclient = client.open("2018 Progressive Turnout Rate")
    sheet = [w for w in myclient.worksheets() if w.title == '2018 By-Elections'][0]
    # write to google sheet, cell by cell
    for rowid, sheetrow in enumerate(outarray):
        for colid, sheetcell in enumerate(sheetrow):
            sheet.update_cell(rowid+1, colid+1, sheetcell)
    # Print the same data in TSV
    print('Updated sheet to the following:')
    print("\n".join(
        "\t".join(str(cell) for cell in row)
        for row in outarray
    ))
    print(datetime.datetime.now())

if __name__ == '__main__':
    main()
# vim:set ts=4 sw=4 et: