Oyster Card backup script for ScraperWiki.com Vault
# This is a very basic script to back up Oyster card data to a ScraperWiki vault
# Notes:
# 1) You need an Oyster card that has been registered on tfl.gov.uk
# 2) This script requires you to enter your username and password (think about what that means before progressing, and do so at your own risk)
# 3) This script should be run in a PRIVATE SCRAPERWIKI VAULT ONLY (https://scraperwiki.com/pricing/), not in a public scraper, or the world will know your password
import scraperwiki
import mechanize
import lxml.html
import csv

username = 'YOUR TFL USERNAME'
password = 'YOUR TFL PASSWORD'

# set up the browser
br = mechanize.Browser()
#br.set_all_readonly(False) # allow everything to be written to
br.set_handle_robots(False) # ignore robots.txt
br.set_handle_refresh(False) # can sometimes hang without this
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
response = br.open('https://oyster.tfl.gov.uk/oyster/entry.do')

# get the login form
br.form = list(br.forms())[0]
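# (Assumption: the login form is the first form on the entry page, and
# j_username / j_password are the stock Java servlet security field names.)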
username_input = br.form.find_control("j_username")
password_input = br.form.find_control("j_password")

# enter the credentials and submit
username_input.value = username
password_input.value = password
response = br.submit()
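
# Optional guard, left commented out because it rests on an assumption:
# if the login fails, TfL redisplays the sign-in form, so the j_username
# field would still be present in the response. Uncomment to fail fast:
#if 'j_username' in response.read():
#    raise Exception('Login appears to have failed - check your credentials')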

# find the journey history link
journey_history_link = None
for link in br.links():
    if link.text == 'Journey history':
        journey_history_link = link
        break

# if there is no history link, raise an exception
if journey_history_link is None:
    raise Exception('Failed to find journey history link')

# go to the journey history page
response = br.follow_link(journey_history_link)

# mechanize doesn't seem to like the html here, so use lxml to find the download link
root = lxml.html.fromstring(response.read())
download_link = root.cssselect("form#jhDownloadForm input")[0]
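# The input's onclick handler has the form:
#   document.jhDownloadForm.action="<relative csv url>";document.jhDownloadForm.submit();
# so stripping the surrounding JavaScript leaves just the relative URL.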
download_href = download_link.attrib['onclick'].replace('document.jhDownloadForm.action="', '').replace('";document.jhDownloadForm.submit();', '')
download_href = 'https://oyster.tfl.gov.uk' + download_href

# download the csv
response = br.open(download_href)

# read and save
csv_reader = csv.DictReader(response.read().splitlines())
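# Each CSV row has Date, Start Time, End Time and Journey/Action columns;
# together they form the unique key for each saved record.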
for row in csv_reader:
    row['Journey Action'] = row['Journey/Action'] # rename this column, as the database objects to the slash
    del row['Journey/Action']
    scraperwiki.sqlite.save(unique_keys=['Start Time', 'End Time', 'Date', 'Journey Action'], data=row)
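
# Optional sanity check: a minimal sketch, assuming the ScraperWiki Classic
# scraperwiki.sqlite.select() helper and its default 'swdata' table name.
saved = scraperwiki.sqlite.select("count(*) as journeys from swdata")
print "Backed up %d journeys" % saved[0]['journeys']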