geekman · June 5, 2012 14:45
diff --git a/scrape_mrt.py b/scrape_mrt.py
 #!/usr/bin/python
 #
 # scrapes MRT information from sgwiki.com for FareBot
 # outputs Java statements to be copied & pasted into EZLinkTransitData.java
 #

 import urllib, urllib2
 import re

 # base URL of the wiki's index.php; urlencoded query parameters get appended
 WIKI_URL = 'http://sgwiki.com/index.php?'

 # set to True to also print "// ..." notes about stations that are skipped
 VERBOSE = False

 # wiki page titles of the lines to scrape, in the order they are output
 MRT_LINES = [
 	'North East Line',
 	'Circle Line',
 	'East West Line',
 	'North South Line',
 ]

 # Extra abbreviations, keyed by the abbreviation listed on the wiki:
 #   wiki abbreviation -> (alternate abbreviation, comment for the output)
 # These are not in the wiki -- they have to be found manually.
 ALTERNATE_ABBREVIATIONS = {
 	# DBN and OTN found by Sean Cross
 	'DBG': ('DBN', 'NEL gantry'),  # Dhoby Ghaut
 	'OTP': ('OTN', 'NEL gantry'),  # Outram Park
 }

 # matches a [[wiki link]]; group 1 is everything between the brackets
 WIKI_LINK_RE = re.compile(r'\[\[([^]]+)]]')


 def fetch_page(title, max_redirects=5):
 	"""Fetch the raw wiki markup of the page called `title`.

 	Spaces in `title` are replaced with underscores and the page is
 	requested from WIKI_URL with action=raw.  Since the raw markup is read,
 	redirects are not resolved by the server: a page whose text starts with
 	the #REDIRECT keyword (in any letter case) is followed here by fetching
 	the target of the first [[link]] found in it.

 	`max_redirects` limits how many redirects are followed in a row, so
 	pages that redirect to each other cannot recurse without bound.

 	Returns the page text as read from the response.
 	Raises ValueError if a redirect page has no parsable link or the
 	redirect limit is exhausted; errors raised by urllib2.urlopen (such as
 	urllib2.HTTPError) propagate unchanged.
 	"""
 	page_name = title.replace(' ', '_')
 	params = urllib.urlencode({'title': page_name, 'action': 'raw'})
 	req = urllib2.Request(WIKI_URL + params)

 	resp = urllib2.urlopen(req)
 	try:
 		wiki = resp.read()
 	finally:
 		# always release the connection, even if the read fails
 		resp.close()

 	# need to handle redirects here, since we're reading the raw markup
 	if wiki.strip().upper().startswith('#REDIRECT'):
 		if max_redirects <= 0:
 			raise ValueError('too many redirects')

 		link = WIKI_LINK_RE.search(wiki)
 		if not link:
 			raise ValueError('unable to parse redirect')

 		return fetch_page(link.group(1), max_redirects - 1)

 	return wiki

 def fetch_station(page_name):
 	"""Fetch a station page and pull selected fields out of its infobox.

 	Every {{...}} template in the page markup is examined in order, and the
 	first one whose name starts with 'Infobox Station' is parsed.  The
 	template text is split on '|' and each 'key=value' piece whose key is
 	one of Abbreviation, Coordinates, Code 2, Status or Name is kept.
 	Whitespace around the key and the value is ignored.

 	Returns a dict mapping field name to value, holding only the fields
 	that were present (so it may be empty), or None if the page has no
 	'Infobox Station' template.
 	"""
 	wanted = ('Abbreviation', 'Coordinates', 'Code 2', 'Status', 'Name')

 	wiki = fetch_page(page_name)

 	# look at all templates, not just the first: the infobox need not be
 	# the first template on the page
 	for tmpl_match in re.finditer('{{([^}]+)}}', wiki):
 		tmpl_fields = [s.strip() for s in tmpl_match.group(1).split('|')]
 		if not tmpl_fields[0].startswith('Infobox Station'):
 			continue

 		fields = {}
 		for f in tmpl_fields:
 			# split at the first '=' only; values may contain '=' themselves
 			key, sep, value = f.partition('=')
 			key = key.strip()
 			if sep and key in wanted:
 				fields[key] = value.strip()
 		return fields

 	return None

 def fetch_line(page_name):
 	"""Return the station page names linked from the tables of a line page.

 	The page markup is fetched with fetch_page().  Within every {| ... |}
 	span, each [[link]] is reduced to the part before the first '|', and
 	the ones ending in 'Station' are collected in the order they appear.
 	Duplicates are not removed.
 	"""
 	markup = fetch_page(page_name)

 	found = []
 	for tbl in re.finditer(r'{\|([^}]+)\|}', markup, re.DOTALL):
 		# link target is whatever precedes the optional '|label' part
 		targets = (
 			m.group(1).split('|')[0]
 			for m in WIKI_LINK_RE.finditer(tbl.group(1))
 		)
 		found.extend(t for t in targets if t.endswith('Station'))

 	return found

 def print_station_def(station_info, visited=None):
 	"""Print the Java put(...) statement(s) for one station.

 	`station_info` is a dict that must contain the keys 'Name', 'Code 2',
 	'Abbreviation' and 'Status' (a missing key raises KeyError).
 	`visited` is a list of abbreviations that were already printed; the
 	abbreviations printed by this call are appended to it.  When omitted,
 	a fresh list is used for this call only.

 	Nothing is printed for a station whose status is not 'opened'
 	(compared case-insensitively) or whose abbreviation is already in
 	`visited`, except for a '// ...' note when VERBOSE is set.
 	"""
 	if visited is None:
 		visited = []

 	# the Name field may hold several names separated by <br>
 	# (per the original note: the name in different languages); keep the first
 	name = station_info['Name'].split('<br>')[0].strip()
 	codes = station_info['Code 2'].replace('/', ' / ')
 	abbr = station_info['Abbreviation']

 	if station_info['Status'].lower() != 'opened':
 		if VERBOSE:
 			print('// station %s is not yet opened' % name)
 		return

 	if abbr in visited:
 		if VERBOSE:
 			print('// station %s already visited' % name)
 		return

 	# the last two constructor arguments are fixed literals
 	template = 'put("%s", new MRTStation("%s", "%s", "%s", "1.17", "103.5"));'
 	visited.append(abbr)
 	print(template % (abbr, name, codes, abbr))

 	# is there an alternate abbreviation for this station?
 	if abbr in ALTERNATE_ABBREVIATIONS:
 		alt_abbr, comment = ALTERNATE_ABBREVIATIONS[abbr]
 		visited.append(alt_abbr)
 		print((template + ' // %s') % (alt_abbr, name, codes, alt_abbr, comment))



 if __name__ == '__main__':
 	# abbreviations printed so far, shared across all lines so that a
 	# station appearing on several lines is only output once
 	seen = []
 	for line_name in MRT_LINES:
 		print('// %s' % line_name)
 		for station_page in fetch_line(line_name):
 			try:
 				info = fetch_station(station_page)
 			except urllib2.HTTPError as err:
 				print('// unable to fetch %s: %s' % (station_page, str(err)))
 				continue

 			if info:
 				print_station_def(info, seen)
 			else:
 				# no infobox found, or none of the wanted fields in it
 				print('// unable to parse %s' % station_page)
 		# blank line between lines
 		print('')
	#!/usr/bin/python
	#
	# scrapes MRT information from sgwiki.com for FareBot
	# outputs Java statements to be copied & pasted into EZLinkTransitData.java
	#

	import urllib, urllib2
	import re

	# base URL of the wiki's index.php; urlencoded query parameters get appended
	WIKI_URL = 'http://sgwiki.com/index.php?'

	# set to True to also print "// ..." notes about stations that are skipped
	VERBOSE = False

	# wiki page titles of the lines to scrape, in the order they are output
	MRT_LINES = [
		'North East Line',
		'Circle Line',
		'East West Line',
		'North South Line',
	]

	# Extra abbreviations, keyed by the abbreviation listed on the wiki:
	#   wiki abbreviation -> (alternate abbreviation, comment for the output)
	# These are not in the wiki -- they have to be found manually.
	ALTERNATE_ABBREVIATIONS = {
		# DBN and OTN found by Sean Cross
		'DBG': ('DBN', 'NEL gantry'),  # Dhoby Ghaut
		'OTP': ('OTN', 'NEL gantry'),  # Outram Park
	}

	# matches a [[wiki link]]; group 1 is everything between the brackets
	WIKI_LINK_RE = re.compile(r'\[\[([^]]+)]]')


	def fetch_page(title, max_redirects=5):
		"""Fetch the raw wiki markup of the page called `title`.

		Spaces in `title` are replaced with underscores and the page is
		requested from WIKI_URL with action=raw.  Since the raw markup is
		read, redirects are not resolved by the server: a page whose text
		starts with the #REDIRECT keyword (in any letter case) is followed
		here by fetching the target of the first [[link]] found in it.

		`max_redirects` limits how many redirects are followed in a row, so
		pages that redirect to each other cannot recurse without bound.

		Returns the page text as read from the response.
		Raises ValueError if a redirect page has no parsable link or the
		redirect limit is exhausted; errors raised by urllib2.urlopen (such
		as urllib2.HTTPError) propagate unchanged.
		"""
		page_name = title.replace(' ', '_')
		params = urllib.urlencode({'title': page_name, 'action': 'raw'})
		req = urllib2.Request(WIKI_URL + params)

		resp = urllib2.urlopen(req)
		try:
			wiki = resp.read()
		finally:
			# always release the connection, even if the read fails
			resp.close()

		# need to handle redirects here, since we're reading the raw markup
		if wiki.strip().upper().startswith('#REDIRECT'):
			if max_redirects <= 0:
				raise ValueError('too many redirects')

			link = WIKI_LINK_RE.search(wiki)
			if not link:
				raise ValueError('unable to parse redirect')

			return fetch_page(link.group(1), max_redirects - 1)

		return wiki

	def fetch_station(page_name):
		"""Fetch a station page and pull selected fields out of its infobox.

		Every {{...}} template in the page markup is examined in order, and
		the first one whose name starts with 'Infobox Station' is parsed.
		The template text is split on '|' and each 'key=value' piece whose
		key is one of Abbreviation, Coordinates, Code 2, Status or Name is
		kept.  Whitespace around the key and the value is ignored.

		Returns a dict mapping field name to value, holding only the fields
		that were present (so it may be empty), or None if the page has no
		'Infobox Station' template.
		"""
		wanted = ('Abbreviation', 'Coordinates', 'Code 2', 'Status', 'Name')

		wiki = fetch_page(page_name)

		# look at all templates, not just the first: the infobox need not be
		# the first template on the page
		for tmpl_match in re.finditer('{{([^}]+)}}', wiki):
			# fields are separated by a plain '|' (no backslash)
			tmpl_fields = [s.strip() for s in tmpl_match.group(1).split('|')]
			if not tmpl_fields[0].startswith('Infobox Station'):
				continue

			fields = {}
			for f in tmpl_fields:
				# split at the first '=' only; values may contain '='
				key, sep, value = f.partition('=')
				key = key.strip()
				if sep and key in wanted:
					fields[key] = value.strip()
			return fields

		return None

	def fetch_line(page_name):
		"""Return the station page names linked from the tables of a line page.

		The page markup is fetched with fetch_page().  Within every
		{| ... |} span, each [[link]] is reduced to the part before the
		first '|', and the ones ending in 'Station' are collected in the
		order they appear.  Duplicates are not removed.
		"""
		wiki = fetch_page(page_name)

		stations = []

		# '\|' in the raw pattern is a literal pipe: match '{|' ... '|}'
		for table_match in re.finditer(r'{\|([^}]+)\|}', wiki, re.DOTALL):
			table = table_match.group(1)
			# find links within the table
			for match in WIKI_LINK_RE.finditer(table):
				# link target is whatever precedes the optional '|label'
				link_split = match.group(1).split('|')
				station = link_split[0]
				if station.endswith('Station'):
					stations.append(station)

		return stations

	def print_station_def(station_info, visited=None):
		"""Print the Java put(...) statement(s) for one station.

		`station_info` is a dict that must contain the keys 'Name',
		'Code 2', 'Abbreviation' and 'Status' (a missing key raises
		KeyError).  `visited` is a list of abbreviations that were already
		printed; the abbreviations printed by this call are appended to it.
		When omitted, a fresh list is used for this call only.

		Nothing is printed for a station whose status is not 'opened'
		(compared case-insensitively) or whose abbreviation is already in
		`visited`, except for a '// ...' note when VERBOSE is set.
		"""
		if visited is None:
			visited = []

		# name in diff languages, separated by <br>; keep the first one
		station_name = station_info['Name'].split('<br>')[0]
		station_name = station_name.strip()

		codes = station_info['Code 2'].replace('/', ' / ')

		abbr = station_info['Abbreviation']

		if station_info['Status'].lower() != 'opened':
			if VERBOSE:
				print('// station %s is not yet opened' % station_name)
		elif abbr not in visited:
			# the last two constructor arguments are fixed literals
			fmt_str = 'put("%s", new MRTStation("%s", "%s", "%s", "1.17", "103.5"));'
			visited.append(abbr)
			print(fmt_str % (abbr, station_name, codes, abbr))

			# is there an alternate abbreviation for this station?
			if abbr in ALTERNATE_ABBREVIATIONS:
				alt_abbr, comment = ALTERNATE_ABBREVIATIONS[abbr]
				visited.append(alt_abbr)
				print((fmt_str + ' // %s') %
						(alt_abbr, station_name, codes, alt_abbr, comment))
		else:
			if VERBOSE:
				print('// station %s already visited' % station_name)



	if __name__ == '__main__':
		# abbreviations printed so far, shared across all lines so that a
		# station appearing on several lines is only output once
		visited = []
		for line in MRT_LINES:
			print('// %s' % line)
			for station in fetch_line(line):
				try:
					s = fetch_station(station)
					if not s:
						# no infobox found, or none of the wanted fields in it
						print('// unable to parse %s' % station)
					else:
						print_station_def(s, visited)
				except urllib2.HTTPError as e:
					print('// unable to fetch %s: %s' % (station, str(e)))
			# blank line between lines
			print('')
No results found