Skip to content

Instantly share code, notes, and snippets.

@geekman
Created June 5, 2012 14:45
Show Gist options
  • Select an option

  • Save geekman/2875439 to your computer and use it in GitHub Desktop.

Select an option

Save geekman/2875439 to your computer and use it in GitHub Desktop.
Scrapes MRT station information from sgwiki.com for FareBot
#!/usr/bin/python
#
# scrapes MRT information from sgwiki.com for FareBot
# outputs Java statements to be copied & pasted into EZLinkTransitData.java
#
import urllib, urllib2
import re
# Base of every wiki request; the urlencoded query string is appended to it.
WIKI_URL = 'http://sgwiki.com/index.php?'

# Set to True to also print comments about skipped stations.
VERBOSE = False

# Wiki page titles of the MRT lines to scrape, in output order.
MRT_LINES = [
    'North East Line',
    'Circle Line',
    'East West Line',
    'North South Line',
]

# Hardcoded alternate abbreviations, keyed by the abbreviation used on the
# wiki. These are not in the wiki and have to be found manually.
# Each value is an (alternate abbreviation, comment) pair.
ALTERNATE_ABBREVIATIONS = {
    # DBN and OTN found by Sean Cross
    'DBG': ('DBN', 'NEL gantry'),  # Dhoby Ghaut
    'OTP': ('OTN', 'NEL gantry'),  # Outram Park
}

# Matches a [[wiki link]]; group 1 is everything between the double brackets
# (including any "|label" part).
WIKI_LINK_RE = re.compile(r'\[\[([^]]+)]]')
def fetch_page(title, max_redirects=5):
    """Fetch the raw wiki markup of a page, following wiki redirects.

    The page is requested with ``action=raw``, so a redirect arrives as
    ``#REDIRECT [[Target]]`` markup and has to be followed here.

    title -- page title; spaces are converted to underscores.
    max_redirects -- number of redirects that may be followed before
        giving up (guards against redirect loops).

    Returns the raw markup as read from the HTTP response.

    Raises ValueError if a redirect target cannot be parsed or the redirect
    chain exceeds max_redirects. Errors raised by urllib2.urlopen propagate
    unchanged.
    """
    for _ in range(max_redirects + 1):
        page_name = title.replace(' ', '_')
        params = urllib.urlencode({'title': page_name, 'action': 'raw'})
        resp = urllib2.urlopen(urllib2.Request(WIKI_URL + params))
        try:
            wiki = resp.read()
        finally:
            # always release the connection, even if read() fails
            resp.close()

        # the redirect keyword is matched case-insensitively
        if wiki.lstrip()[:9].upper() != '#REDIRECT':
            return wiki

        link = WIKI_LINK_RE.search(wiki)
        # drop any "|label" or "#section" suffix; only the title is requested
        target = link.group(1).split('|')[0].split('#')[0].strip() if link else ''
        if not target:
            raise ValueError('unable to parse redirect')
        title = target

    raise ValueError('too many redirects')
def fetch_station(page_name):
    """Fetch a station page and extract selected fields of its infobox.

    Scans every ``{{...}}`` template on the page and uses the first one whose
    name starts with ``Infobox Station``. From it, the fields Abbreviation,
    Coordinates, Code 2, Status and Name are collected when present.

    page_name -- wiki page title of the station.

    Returns a dict mapping field name to its stripped value (possibly missing
    some of the fields), or None if the page has no station infobox.
    Exceptions from fetch_page propagate.
    """
    wanted = ('Abbreviation', 'Coordinates', 'Code 2', 'Status', 'Name')
    wiki = fetch_page(page_name)

    # NOTE: "[^}]+" stops at the first "}", so a template nested inside the
    # infobox cuts the match short and later fields are not seen.
    for tmpl_match in re.finditer(r'\{\{([^}]+)\}\}', wiki, re.DOTALL):
        tmpl_fields = [s.strip() for s in tmpl_match.group(1).split('|')]
        if not tmpl_fields[0].startswith('Infobox Station'):
            continue

        fields = {}
        for f in tmpl_fields[1:]:
            # split first, then compare the stripped key, so that both
            # "Name=X" and "Name = X" are recognised
            key, sep, value = f.partition('=')
            key = key.strip()
            if sep and key in wanted:
                fields[key] = value.strip()
        return fields

    return None
def fetch_line(page_name):
    """Fetch an MRT line page and list the station pages linked from its tables.

    Every wiki table (``{| ... |}``) on the page is scanned for ``[[links]]``;
    a link is kept when its target (the part before any ``|label``) ends with
    ``Station``.

    page_name -- wiki page title of the MRT line.

    Returns a list of station page titles in order of first appearance, each
    title listed once. Exceptions from fetch_page propagate.
    """
    wiki = fetch_page(page_name)
    stations = []
    seen = set()

    # non-greedy match up to the first "|}" so that tables containing "}"
    # (e.g. embedded templates) are still scanned
    for table_match in re.finditer(r'\{\|(.*?)\|\}', wiki, re.DOTALL):
        # find links within the table
        for match in WIKI_LINK_RE.finditer(table_match.group(1)):
            station = match.group(1).split('|')[0].strip()
            # skip repeats so the caller does not fetch the same page twice
            if station.endswith('Station') and station not in seen:
                seen.add(station)
                stations.append(station)

    return stations
def print_station_def(station_info, visited=None):
    """Print the Java ``put(...)`` statement(s) for one station.

    station_info -- dict of infobox fields; 'Name', 'Code 2', 'Abbreviation'
        and 'Status' are required.
    visited -- list of abbreviations already printed; abbreviations printed
        by this call are appended to it. A fresh list is used when omitted.

    Nothing is printed for a station whose status is not 'opened' or whose
    abbreviation is already in visited, unless VERBOSE is set, in which case
    a ``//`` comment is printed instead. If a required field is missing, a
    ``//`` comment naming the missing fields is printed and the station is
    skipped. Returns None.
    """
    if visited is None:
        visited = []

    required = ('Name', 'Code 2', 'Abbreviation', 'Status')
    missing = [key for key in required if key not in station_info]
    if missing:
        # report and skip instead of aborting the whole run with a KeyError
        print('// station %s is missing fields: %s'
              % (station_info.get('Name', '(unknown)'), ', '.join(missing)))
        return

    # the Name field holds the name in different languages separated by a
    # line-break tag; keep the first one (accepts <br>, <br/>, <br />)
    station_name = re.split(r'(?i)<br\s*/?>', station_info['Name'])[0].strip()
    # normalise "A/B" and "A / B" alike to "A / B"
    codes = ' / '.join(c.strip() for c in station_info['Code 2'].split('/'))
    abbr = station_info['Abbreviation']

    if station_info['Status'].lower() != 'opened':
        if VERBOSE:
            print('// station %s is not yet opened' % station_name)
        return

    if abbr in visited:
        if VERBOSE:
            print('// station %s already visited' % station_name)
        return

    # the last two arguments are fixed placeholder values; the scraped
    # 'Coordinates' field is not used here
    fmt_str = 'put("%s", new MRTStation("%s", "%s", "%s", "1.17", "103.5"));'
    visited.append(abbr)
    print(fmt_str % (abbr, station_name, codes, abbr))

    # is there an alternate abbreviation for this station?
    if abbr in ALTERNATE_ABBREVIATIONS:
        alt_abbr, comment = ALTERNATE_ABBREVIATIONS[abbr]
        visited.append(alt_abbr)
        print((fmt_str + ' // %s')
              % (alt_abbr, station_name, codes, alt_abbr, comment))
if __name__ == '__main__':
    # Script entry point: for each MRT line, print a "// <line>" header and
    # then the Java statements for its stations. Fetch failures are reported
    # as "//" comments so that one bad page does not abort the whole run.

    # shared across all lines so a station listed on several lines is
    # printed only once
    visited = []
    for line in MRT_LINES:
        print('// %s' % line)

        try:
            stations = fetch_line(line)
        # URLError also covers HTTPError (its subclass); ValueError is raised
        # by fetch_page for an unparseable redirect
        except (urllib2.URLError, ValueError) as e:
            print('// unable to fetch %s: %s' % (line, str(e)))
            stations = []

        for station in stations:
            try:
                s = fetch_station(station)
            except (urllib2.URLError, ValueError) as e:
                print('// unable to fetch %s: %s' % (station, str(e)))
                continue

            if not s:
                print('// unable to parse %s' % station)
            else:
                print_station_def(s, visited)

        # blank line between MRT lines
        print('')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment