Created July 2, 2013 11:24
A scraper for WhatsHappening.sg
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# RSS feed parser/scraper for WhatsHappening.sg

import urllib2
import csv
import re
import cStringIO
import codecs
import hashlib
import json

from urllib2 import URLError, HTTPError
from bs4 import BeautifulSoup
# Unicode DictWriter object
# http://stackoverflow.com/a/5838817
class DictUnicodeWriter(object):

    def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, D):
        self.writer.writerow({k: v.encode("utf-8") for k, v in D.items()})
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # Write to the target stream
        self.stream.write(data)
        # Empty the queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for D in rows:
            self.writerow(D)

    def writeheader(self):
        self.writer.writeheader()
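
# A minimal usage sketch for DictUnicodeWriter (the filename and field names are
# hypothetical, purely to show the call pattern):
#
#   f = open('example.csv', 'w')
#   w = DictUnicodeWriter(f, ['Name', 'City'])
#   w.writeheader()
#   w.writerow({'Name': u'Caf\xe9 Kopi', 'City': u'Singapore'})
#   f.close()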

class WhatsHappening(object):
    '''WhatsHappening RSS feed scraper'''

    # Define the RSS endpoint.
    # Sourced from http://www.whatshappening.sg/events/index.php?com=rss
    # with the following options checked:
    #   Exhibitions
    #   Fashion
    #   Group / Community Events
    #   Shopping
    #   Travel / Outdoor
    #   Art / Stage
    #   Charity
    #   Dining / Food
    #   Family
    #   Garage sales
    #   Housing
    #   Music
    #   Pets
    #   Sports
    #   Singapore
    src = "http://www.whatshappening.sg/events/rss.php?cID=6,21,4,18,22,19,14,10,8,17,9,20,11,7&c=Singapore"

    # Default start / end times
    DEFAULT_START = "09:00 AM"
    DEFAULT_END = "09:00 PM"

    def __init__(self, src='', afile='activities.csv', vfile='venues.csv', with_repeats=True):
        self.src = src if src else self.src
        if not afile:
            raise Exception("'afile' should not be None")
        self.__afile = open(afile, 'w')
        self.__vfile = open(vfile, 'w')
        self.repeats = with_repeats
        # Output files are opened for writing ('w'), i.e. truncated, not appended to
        self.__af = DictUnicodeWriter(self.__afile, [
            'Activity',
            'Description',
            'Price',
            'Start',
            'End',
            'Contact',
            'Email',
            'Link',
            'Hash',
            'Venue'
        ], restval='', extrasaction='ignore')
        self.__af.writeheader()
        self.__vf = DictUnicodeWriter(self.__vfile, [
            'Name',
            'Desc',
            'Hours',
            'Contact',
            'Email',
            'Address',
            'Lat',
            'Lng',
            'Hash'
        ], restval='', extrasaction='ignore')
        self.__vf.writeheader()

    def __parse_time(self, timestring):
        '''Parse a string representing the time duration of an event, and return a start/end time pair'''
        # Let's make some assumptions about the event's start/end times
        start = self.DEFAULT_START
        end = self.DEFAULT_END
        print "Timestring: %s" % timestring
        # We begin to match the various timestring formats below.
        # Each should be unique enough that matching one means the rest do not match.
        # If the time string is of the standard "09:00 AM - 10:00 PM" format
        res = re.search(r"(?P<start>\d{2}:\d{2} (AM|PM)).+(?P<end>\d{2}:\d{2} (AM|PM))", timestring)
        if res and res.groupdict():
            start = res.groupdict().get('start', start)
            end = res.groupdict().get('end', end)
        # Check for lines like "Begins at 08:00 AM" or "Starts at 09:00 AM"
        # Match case-insensitively
        res = re.search(r"^(start|begin).+(?P<start>\d{2}:\d{2} (AM|PM))", timestring, re.I)
        if res and res.groupdict():
            start = res.groupdict()['start']
        # Sanity check to make sure we have start/end times
        if not start or not end:
            raise Exception("Could not parse time: %s" % timestring)
        return [start, end]
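
    # A few illustrative inputs for __parse_time() above (the sample strings are
    # hypothetical, not taken from the live feed):
    #   "09:00 AM - 10:00 PM"  -> ["09:00 AM", "10:00 PM"]
    #   "Begins at 08:00 AM"   -> ["08:00 AM", DEFAULT_END]
    #   "All day"              -> [DEFAULT_START, DEFAULT_END]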

    def __parse_series(self, seriesurl):
        '''Parse a series URL on whatshappening.sg, and return a list of Start/End datetime dicts'''
        series = BeautifulSoup(urllib2.urlopen(seriesurl).read())
        # A WhatsHappening.sg series listing page is made up of multiple pairs of <div>s,
        # all siblings without hierarchy, so here we go.
        # Get all times on the page.
        # The .replace() call was there to handle the unicode character \xa0, which prevents us
        # from splitting the string properly, so we replaced it with a regular space.
        #times = [i.text.strip().replace(u'\xa0', u' ').split(" - ") for i in series.select('div[class^="eventListTime"]')]
        times = [i.text.strip() for i in series.select('div[class^="eventListTime"]')]
        times = map(self.__parse_time, times)  # Parse _ALL_ the times
        # Get all dates on the page
        dates = [i.find("a").text for i in series.select('div[class^="eventListTitle"]')]
        # Put them all together in the format
        # [
        #   {'Start': "<date> <start_time>", 'End': "<date> <end_time>"},
        #   {'Start': "<date> <start_time>", 'End': "<date> <end_time>"},
        #   ...
        # ]
        event_series = [{'Start': "%s %s" % (i[0], i[1][0]), 'End': "%s %s" % (i[0], i[1][1])} for i in zip(dates, times)]
        return event_series

    def __parse_item(self, itemurl):
        '''Parses an individual item on whatshappening.sg'''
        item = BeautifulSoup(urllib2.urlopen(itemurl).read())  # Grab and parse the item
        # Start grabbing info
        name = item.select('.summary')[0].text if item.select('.summary') else ''
        desc = item.select('.description')[0].text if item.select('.description') else ''
        # Grab and process the dates
        date_start = item.select('.dtstart')[0].text if item.select('.dtstart') else ''
        time_range = item.select('.eventDetailTime')[0].text if item.select('.eventDetailTime') else ''
        start, end = self.__parse_time(time_range)
        # Combine the datetime strings
        start = "%s %s" % (date_start, start)
        end = "%s %s" % (date_start, end)
        # Scrape the price
        price = re.search(r"Cost: (?P<cost>.+)$", item.select('.eventDetailPrice')[0].text).groupdict()['cost'] if item.select('.eventDetailPrice') else 'Unknown'
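        # e.g. an element text of "Cost: Free" (a hypothetical value) yields price == "Free";
        # items without an .eventDetailPrice element fall back to 'Unknown'.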
        venue = {
            'Name': item.select('.org')[0].text.replace('"', '') if item.select('.org') else '',
            'street': item.select('.street-address')[0].text if item.select('.street-address') else '',
            'postal': item.select('.postal-code')[0].text if item.select('.postal-code') else ''
        }
        venue['Address'] = "%s Singapore %s" % (venue['street'], venue['postal'])
        # Construct the venue hash
        m = hashlib.new('md5')
        m.update(venue['Name'].encode('utf-8'))
        venue['Hash'] = m.hexdigest()
        contact = {
            # The regex first trawls for the string that contains "Phone:", then parses it
            # to extract the phone number component of that string.
            # The regex "(?P<phone>(\+65(\ |-)?)?\d{4}(\ |-)?\d{4})" extracts phone numbers which:
            #  - may or may not begin with +65
            #  - may or may not have a space or dash after +65
            #  - may or may not have a space or dash after the first 4 digits
            'phone': re.search(r"(?P<phone>(\+65(\ |-)?)?\d{4}(\ |-)?\d{4})", item.find_all(text=re.compile('Phone:'))[0]).groupdict()['phone'] if item.find_all(text=re.compile('Phone:')) else '',
            # Email is a little trickier because the site very smartly does not list the lister's
            # email in the source, choosing instead to write it post-load via Javascript.
            # Conveniently enough, that script is embedded right in the page and can be parsed
            # with the regex below.
            # The first .find_all() selector trawls for the string 'var ename', which holds the
            # username component of the email.
            # The text result of that search is then further parsed through the regex to extract
            # the 'ename' and 'edomain' components from the embedded Javascript, which we can
            # then recombine into the full email address.
            'email': re.search(r"var ename = '(?P<ename>.+)';.+var edomain = '(?P<edomain>.+)';", item.find_all(text=re.compile("var ename"))[0], re.DOTALL).groupdict() if item.find_all(text=re.compile("var ename")) else ''
        }
        # A little fix:
        # contact['email'] is currently the groupdict() result from the regex match, so recombine
        # the components into a proper email address.
        contact['email'] = "%s@%s" % (contact['email']['ename'], contact['email']['edomain']) if contact['email'] and contact['email']['ename'] and contact['email']['edomain'] else ''
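        # For reference, the embedded script looks roughly like this (values hypothetical):
        #   var ename = 'organiser'; ... var edomain = 'example.com';
        # which the code above recombines into 'organiser@example.com'.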
        # Grab the website link
        url = item.select('a[href^="http://www.whatshappening.sg/events/link/index.php"]')[0]['href'] if item.select('a[href^="http://www.whatshappening.sg/events/link/index.php"]') else ''
        link = ''
        if url:
            try:
                link = urllib2.urlopen(url).url
            except Exception, e:
                pass  # Oh well
        # A 'seriesList' URL indicates that this is a repeated event, so we should trawl it
        event_series = []
        if self.repeats:  # Grab repeats only if the flag is set
            if item.select('a[href^="http://www.whatshappening.sg/events/index.php?com=serieslist"]'):
                seriesurl = item.select('a[href^="http://www.whatshappening.sg/events/index.php?com=serieslist"]')[0]['href']
                event_series = self.__parse_series(seriesurl)
        # Construct the info struct
        info = {
            'Activity': name,
            'Description': desc,
            'Price': price,
            'Start': start,
            'End': end,
            'Contact': contact['phone'],
            'Email': contact['email'],
            'Link': link
        }
        # Construct the activity hash
        main = hashlib.new("md5")
        hash_string = "%s %s" % (info['Activity'], venue['Name'])
        main.update(hash_string.encode('utf-8'))
        info['main_hash'] = main.hexdigest()
        sub = hashlib.new("md5")
        sub.update(("%s - %s" % (info['Start'], info['End'])).encode('utf-8'))
        info['sub_hash'] = sub.hexdigest()
        info['Hash'] = "%s-%s" % (info['main_hash'], info['sub_hash'])
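        # Note: main_hash identifies the activity at a given venue, while sub_hash covers one
        # specific start/end occurrence, so repeat events written out in parse() share the
        # same main_hash but get distinct combined Hash values.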
        # Add the venue hash
        info['Venue'] = venue['Hash']
        return [info, event_series, venue]

    def parse(self):
        ERROR = []
        soup = BeautifulSoup(urllib2.urlopen(self.src).read())  # Grab and parse the feed
        for g in soup.find_all('guid'):  # Iterate through all <guid> items
            try:
                info, event_series, venue = self.__parse_item(g.text)
                self.__af.writerow(info)
                self.__vf.writerow(venue)
                # DEBUG
                print json.dumps(info, indent=4)
                print json.dumps(venue, indent=4)
                if event_series and self.repeats:
                    # Write a new entry for each repeat event
                    for e in event_series:
                        info['Start'] = e['Start']
                        info['End'] = e['End']
                        sub = hashlib.new("md5")
                        sub.update(("%s - %s" % (info['Start'], info['End'])).encode('utf-8'))
                        info['sub_hash'] = sub.hexdigest()
                        info['Hash'] = "%s-%s" % (info['main_hash'], info['sub_hash'])
                        self.__af.writerow(info)
            except URLError, e:
                #print g.text
                if hasattr(e, 'reason'):
                    print 'We failed to reach a server.'
                    print 'Reason: ', e.reason
                elif hasattr(e, 'code'):
                    print 'The server couldn\'t fulfill the request.'
                    print 'Error code: ', e.code
                #raise e
                ERROR.append(g.text)
            # except Exception as e:
            #     print g
            #     print e
            #     print dir(e)
            #     #raise e
            finally:
                pass
                # print "\n\n"
                # print 20 * ""
                # print "\n"
        print ERROR
        self.__afile.close()
        self.__vfile.close()


if __name__ == '__main__':
    ws = WhatsHappening()
    ws.parse()
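
    # A usage sketch with non-default options (these filenames are only examples,
    # not files this gist ships with):
    #   ws = WhatsHappening(afile='my_activities.csv', vfile='my_venues.csv',
    #                       with_repeats=False)
    #   ws.parse()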