Created
June 5, 2012 14:45
-
-
Save geekman/2875439 to your computer and use it in GitHub Desktop.
Scrapes MRT information from sgwiki.com for FareBot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python | |
| # | |
| # scrapes MRT information from sgwiki.com for FareBot | |
| # outputs Java statements to be copied & pasted into EZLinkTransitData.java | |
| # | |
| import urllib, urllib2 | |
| import re | |
# Base URL of the wiki's index.php endpoint; an urlencoded query string is
# appended directly to it for every request.
WIKI_URL = 'http://sgwiki.com/index.php?'

# Set to True to also print "//" comments about stations that are skipped.
VERBOSE = False

# Wiki page titles of the MRT lines to scrape, in the order they are output.
MRT_LINES = [
    'North East Line', 'Circle Line', 'East West Line', 'North South Line',
]

# Hardcoded alternate abbreviations, keyed by the abbreviation from the wiki.
# They are not in the wiki -- they have to be found manually.
# Each value is (alternate abbreviation, comment appended to the output line).
ALTERNATE_ABBREVIATIONS = {
    # DBN and OTN found by Sean Cross
    'DBG': ('DBN', 'NEL gantry'),  # Dhoby Ghaut
    'OTP': ('OTN', 'NEL gantry'),  # Outram Park
}

# Matches a wiki link "[[...]]" and captures everything between the brackets
# (link target plus any "|label" part).
WIKI_LINK_RE = re.compile(r'\[\[([^]]+)]]')
def fetch_page(title, max_redirects=5):
    """Fetch the raw wiki markup of a page, following wiki-level redirects.

    The page is requested with ``action=raw``, so the server returns the
    markup itself and a redirect page comes back as ``#REDIRECT [[Target]]``
    text that has to be followed here.

    Args:
        title: page title; spaces are replaced with underscores.
        max_redirects: number of redirect hops that may still be followed
            before giving up (guards against redirect cycles).

    Returns:
        The raw markup of the (final) page as returned by the server.

    Raises:
        ValueError: if a redirect target cannot be parsed, or more than
            ``max_redirects`` redirects are chained.
        urllib2.HTTPError / urllib2.URLError: propagated from ``urlopen``.
    """
    page_name = title.replace(' ', '_')
    params = urllib.urlencode({'title': page_name, 'action': 'raw'})
    resp = urllib2.urlopen(urllib2.Request(WIKI_URL + params))
    try:
        wiki = resp.read()
    finally:
        # always release the connection, even if read() fails
        resp.close()

    # MediaWiki accepts the redirect keyword in any letter case
    if wiki.lstrip()[:9].upper() != '#REDIRECT':
        return wiki

    if max_redirects <= 0:
        raise ValueError('too many redirects')

    link = WIKI_LINK_RE.search(wiki)
    # a redirect may carry a "#section" anchor or "|label"; only the page
    # title itself can be requested
    target = link.group(1).split('|')[0].split('#')[0].strip() if link else ''
    if not target:
        raise ValueError('unable to parse redirect')
    return fetch_page(target, max_redirects - 1)
def fetch_station(page_name):
    """Fetch a station page and extract selected fields from its infobox.

    Every ``{{...}}`` template on the page is examined, and the first one
    whose name starts with ``Infobox Station`` is used. From it, only the
    ``Abbreviation``, ``Coordinates``, ``Code 2``, ``Status`` and ``Name``
    parameters are collected; whitespace around the ``=`` is tolerated.

    Args:
        page_name: wiki page title of the station.

    Returns:
        A dict mapping the field names found (a subset of the five above)
        to their stripped values, or None if the page has no
        ``Infobox Station`` template. The dict may be empty.

    Raises:
        Whatever ``fetch_page`` raises.
    """
    wanted = ('Abbreviation', 'Coordinates', 'Code 2', 'Status', 'Name')
    wiki = fetch_page(page_name)

    # NOTE: the pattern stops at the first "}", so a template nested inside
    # the infobox truncates it; likewise a "|" inside a value (e.g. a piped
    # wiki link) splits that value. Both limitations are inherited.
    for tmpl_match in re.finditer(r'\{\{([^}]+)\}\}', wiki, re.DOTALL):
        tmpl_fields = [s.strip() for s in tmpl_match.group(1).split('|')]
        if not tmpl_fields[0].startswith('Infobox Station'):
            continue

        fields = {}
        for field in tmpl_fields[1:]:
            key, sep, value = field.partition('=')
            key = key.strip()
            if sep and key in wanted:
                fields[key] = value.strip()
        return fields

    return None
def fetch_line(page_name):
    """Return the station page titles linked from the tables of a line page.

    Args:
        page_name: wiki page title of the MRT line.

    Returns:
        A list, in order of appearance, of the targets of all wiki links
        found inside ``{| ... |}`` tables whose target ends with
        ``Station``. Duplicates are kept.
    """
    markup = fetch_page(page_name)

    table_bodies = (
        found.group(1)
        for found in re.finditer(r'{\|([^}]+)\|}', markup, re.DOTALL)
    )
    # the link target is the part before any "|label"
    link_targets = (
        link.group(1).split('|')[0]
        for body in table_bodies
        for link in WIKI_LINK_RE.finditer(body)
    )
    return [target for target in link_targets if target.endswith('Station')]
def print_station_def(station_info, visited=None):
    """Print the Java ``put(...)`` statement(s) for one station.

    Args:
        station_info: dict of infobox fields; ``Name``, ``Code 2``,
            ``Abbreviation`` and ``Status`` are required.
        visited: list of abbreviations already emitted. It is appended to
            in place, so the caller can share it across calls. A fresh
            list is used when omitted.

    Returns:
        None. Output goes to stdout:
          * a ``//`` comment if required fields are missing;
          * nothing (or a ``//`` comment when VERBOSE) if the station's
            status is not "opened" or its abbreviation was already visited;
          * otherwise one ``put(...)`` line, plus a second one when the
            abbreviation has an entry in ALTERNATE_ABBREVIATIONS.
    """
    if visited is None:
        visited = []

    # the Name field may hold several names separated by <br>;
    # only the first one is used
    station_name = station_info.get('Name', '(unnamed)').split('<br>')[0].strip()

    # an incomplete infobox must not abort the whole scrape with a KeyError
    required = ('Name', 'Code 2', 'Abbreviation', 'Status')
    missing = [key for key in required if key not in station_info]
    if missing:
        print('// station %s is missing fields: %s'
              % (station_name, ', '.join(missing)))
        return

    codes = station_info['Code 2'].replace('/', ' / ')
    abbr = station_info['Abbreviation']

    if station_info['Status'].lower() != 'opened':
        if VERBOSE:
            print('// station %s is not yet opened' % station_name)
        return

    if abbr in visited:
        if VERBOSE:
            print('// station %s already visited' % station_name)
        return

    # NOTE(review): every station is emitted with the same fixed
    # "1.17", "103.5" strings as the last two arguments; the Coordinates
    # field is not used here -- presumably placeholders, confirm.
    fmt_str = 'put("%s", new MRTStation("%s", "%s", "%s", "1.17", "103.5"));'
    visited.append(abbr)
    print(fmt_str % (abbr, station_name, codes, abbr))

    # is there an alternate abbreviation for this station?
    if abbr in ALTERNATE_ABBREVIATIONS:
        alt_abbr, comment = ALTERNATE_ABBREVIATIONS[abbr]
        visited.append(alt_abbr)
        print((fmt_str + ' // %s')
              % (alt_abbr, station_name, codes, alt_abbr, comment))
if __name__ == '__main__':
    # shared across all lines, so a station reached from several lines is
    # only emitted the first time its abbreviation is seen
    visited = []
    for line in MRT_LINES:
        print('// %s' % line)
        for station in fetch_line(line):
            # keep the try body to the one call that performs the fetch;
            # ValueError is what fetch_page raises for a redirect it
            # cannot parse, and should not abort the remaining stations
            try:
                s = fetch_station(station)
            except (urllib2.HTTPError, ValueError) as e:
                print('// unable to fetch %s: %s' % (station, str(e)))
                continue

            if not s:
                print('// unable to parse %s' % station)
            else:
                print_station_def(s, visited)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment