#!/usr/bin/env python
# -*- coding: utf-8 -*-
# RSS feed parser/scraper for WhatsHappening.sg
import urllib2
import csv
import re
import cStringIO
import codecs
import hashlib
import json
from urllib2 import URLError, HTTPError
from bs4 import BeautifulSoup
# Unicode DictWriter object
# http://stackoverflow.com/a/5838817
class DictUnicodeWriter(object):
    def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, D):
        self.writer.writerow({k: v.encode("utf-8") for k, v in D.items()})
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # Write to the target stream
        self.stream.write(data)
        # Empty the queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for D in rows:
            self.writerow(D)

    def writeheader(self):
        self.writer.writeheader()
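
# A minimal usage sketch for DictUnicodeWriter (illustrative only; the file
# name and field names here are hypothetical and not used by the scraper):
#
#   with open('out.csv', 'w') as f:
#       w = DictUnicodeWriter(f, ['Name', 'City'])
#       w.writeheader()
#       w.writerow({'Name': u'Caf\xe9 Z', 'City': u'Singapore'})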
class WhatsHappening(object):
    '''WhatsHappening RSS feed scraper'''

    # Define the RSS endpoint
    # Sourced from http://www.whatshappening.sg/events/index.php?com=rss
    # with the following options checked:
    #   Exhibitions
    #   Fashion
    #   Group / Community Events
    #   Shopping
    #   Travel / Outdoor
    #   Art / Stage
    #   Charity
    #   Dining / Food
    #   Family
    #   Garage sales
    #   Housing
    #   Music
    #   Pets
    #   Sports
    #   Singapore
    src = "http://www.whatshappening.sg/events/rss.php?cID=6,21,4,18,22,19,14,10,8,17,9,20,11,7&c=Singapore"

    # Default start / end times
    DEFAULT_START = "09:00 AM"
    DEFAULT_END = "09:00 PM"
    def __init__(self, src='', afile='activities.csv', vfile='venues.csv', with_repeats=True):
        self.src = src if src else self.src
        self.repeats = with_repeats

        if not afile or not vfile:
            raise Exception("'afile' and 'vfile' must not be None")

        # Output files are opened for writing (truncated), not appending
        self.__afile = open(afile, 'w')
        self.__vfile = open(vfile, 'w')

        self.__af = DictUnicodeWriter(self.__afile, [
            'Activity',
            'Description',
            'Price',
            'Start',
            'End',
            'Contact',
            'Email',
            'Link',
            'Hash',
            'Venue'
        ], restval='', extrasaction='ignore')
        self.__af.writeheader()

        self.__vf = DictUnicodeWriter(self.__vfile, [
            'Name',
            'Desc',
            'Hours',
            'Contact',
            'Email',
            'Address',
            'Lat',
            'Lng',
            'Hash'
        ], restval='', extrasaction='ignore')
        self.__vf.writeheader()
    def __parse_time(self, timestring):
        '''Parse a string representing the time duration of an event, and return a start/end time pair'''
        # Let's make some assumptions about the event's start/end times
        start = self.DEFAULT_START
        end = self.DEFAULT_END

        print "Timestring: %s" % timestring

        # We begin to match the various timestring formats below.
        # Each should be unique enough that matching one means the rest do not match.

        # If the time string is of the standard "09:00 AM - 10:00 PM" format
        res = re.search(r"(?P<start>\d{2}:\d{2} (AM|PM)).+(?P<end>\d{2}:\d{2} (AM|PM))", timestring)
        if res and res.groupdict():
            start = res.groupdict().get('start', start)
            end = res.groupdict().get('end', end)

        # Check for lines like "Begins at 08:00 AM" or "Starts at 09:00 AM",
        # matched case-insensitively
        res = re.search(r"^(start|begin).+(?P<start>\d{2}:\d{2} (AM|PM))", timestring, re.I)
        if res and res.groupdict():
            start = res.groupdict()['start']

        # Sanity check to make sure we have start/end times
        if not start or not end:
            raise Exception("Could not parse time: %s" % timestring)

        return [start, end]
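
    # Illustrative __parse_time behaviour (inputs hypothetical; unmatched parts
    # fall back to DEFAULT_START / DEFAULT_END):
    #   "09:00 AM - 10:00 PM"  ->  ["09:00 AM", "10:00 PM"]
    #   "Starts at 08:00 AM"   ->  ["08:00 AM", "09:00 PM"]   (end defaults)
    #   "Every day"            ->  ["09:00 AM", "09:00 PM"]   (both default)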
    def __parse_series(self, seriesurl):
        '''Parses a series URL on whatshappening.sg, and returns a list of dicts of start/end datetimes'''
        series = BeautifulSoup(urllib2.urlopen(seriesurl).read())

        # A WhatsHappening.sg series listing page is made up of multiple pairs of <div>s,
        # all siblings without hierarchy, so here we go

        # Get all times on the page
        # (An earlier version also replaced the unicode character \xa0, which prevented us
        # from splitting the string properly, with a regular space:
        # times = [i.text.strip().replace(u'\xa0', u' ').split(" - ") for i in series.select('div[class^="eventListTime"]')]
        # __parse_time now handles the raw strings instead.)
        times = [i.text.strip() for i in series.select('div[class^="eventListTime"]')]
        times = map(self.__parse_time, times)  # Parse _ALL_ the times

        # Get all dates on the page
        dates = [i.find("a").text for i in series.select('div[class^="eventListTitle"]')]

        # Put them all together in the format
        # [
        #     {'Start': "<date> <start_time>", 'End': "<date> <end_time>"},
        #     ...
        # ]
        event_series = [{'Start': "%s %s" % (i[0], i[1][0]), 'End': "%s %s" % (i[0], i[1][1])} for i in zip(dates, times)]
        return event_series
    def __parse_item(self, itemurl):
        '''Parses an individual item on whatshappening.sg'''
        item = BeautifulSoup(urllib2.urlopen(itemurl).read())  # Grab and parse the item

        # Start grabbing info
        name = item.select('.summary')[0].text if item.select('.summary') else ''
        desc = item.select('.description')[0].text if item.select('.description') else ''

        # Grab and process the dates
        date_start = item.select('.dtstart')[0].text if item.select('.dtstart') else ''
        time_range = item.select('.eventDetailTime')[0].text if item.select('.eventDetailTime') else ''
        start, end = self.__parse_time(time_range)

        # Combine the datetime strings
        start = "%s %s" % (date_start, start)
        end = "%s %s" % (date_start, end)

        # Scrape the price
        price = re.search(r"Cost: (?P<cost>.+)$", item.select('.eventDetailPrice')[0].text).groupdict()['cost'] if item.select('.eventDetailPrice') else 'Unknown'

        venue = {
            'Name': item.select('.org')[0].text.replace('"', '') if item.select('.org') else '',
            'street': item.select('.street-address')[0].text if item.select('.street-address') else '',
            'postal': item.select('.postal-code')[0].text if item.select('.postal-code') else ''
        }
        venue['Address'] = "%s Singapore %s" % (venue['street'], venue['postal'])

        # Construct the venue hash
        m = hashlib.new('md5')
        m.update(venue['Name'].encode('utf-8'))
        venue['Hash'] = m.hexdigest()

        contact = {
            # First trawl for the string that contains "Phone:", then parse it with a regex
            # to extract the phone number component of that string.
            # The regex "(?P<phone>(\+65(\ |-)?)?\d{4}(\ |-)?\d{4})" extracts phone numbers which:
            #   - may or may not begin with +65
            #   - may or may not have a space or dash after +65
            #   - may or may not have a space or dash after the first 4 digits
            'phone': re.search(r"(?P<phone>(\+65(\ |-)?)?\d{4}(\ |-)?\d{4})", item.find_all(text=re.compile('Phone:'))[0]).groupdict()['phone'] if item.find_all(text=re.compile('Phone:')) else '',

            # Email is a little trickier because the site very smartly does not list the
            # lister's email in the source, choosing instead to write it post-load via JavaScript.
            # Conveniently enough, that script is embedded right in the page and can be parsed
            # with the regex below.
            # The first .find_all() selector trawls for the string 'var ename', which holds the
            # username component of the email. The text result of that search is then parsed
            # through the regex to extract the 'ename' and 'edomain' components from the embedded
            # JavaScript, which we can then recombine into the full email address.
            'email': re.search(r"var ename = '(?P<ename>.+)';.+var edomain = '(?P<edomain>.+)';", item.find_all(text=re.compile("var ename"))[0], re.DOTALL).groupdict() if item.find_all(text=re.compile("var ename")) else ''
        }

        # A little fix: contact['email'] is currently the groupdict() result from the regex
        # match. We need to recombine the components into a proper email address.
        contact['email'] = "%s@%s" % (contact['email']['ename'], contact['email']['edomain']) if contact['email'] and contact['email']['ename'] and contact['email']['edomain'] else ''

        # Grab the website link, following the redirect to the real URL
        url = item.select('a[href^="http://www.whatshappening.sg/events/link/index.php"]')[0]['href'] if item.select('a[href^="http://www.whatshappening.sg/events/link/index.php"]') else ''
        link = ''
        if url:
            try:
                link = urllib2.urlopen(url).url
            except Exception:
                pass  # Oh well

        # The 'seriesList' URL indicates that this is a repeated event, so we should trawl it
        event_series = []
        if self.repeats:  # Grab repeats only if the flag is set
            if item.select('a[href^="http://www.whatshappening.sg/events/index.php?com=serieslist"]'):
                seriesurl = item.select('a[href^="http://www.whatshappening.sg/events/index.php?com=serieslist"]')[0]['href']
                event_series = self.__parse_series(seriesurl)

        # Construct the info struct
        info = {
            'Activity': name,
            'Description': desc,
            'Price': price,
            'Start': start,
            'End': end,
            'Contact': contact['phone'],
            'Email': contact['email'],
            'Link': link
        }

        # Construct the activity hash: md5(activity + venue) identifies the event itself,
        # md5(start - end) identifies the specific occurrence
        main = hashlib.new("md5")
        hash_string = "%s %s" % (info['Activity'], venue['Name'])
        main.update(hash_string.encode('utf-8'))
        info['main_hash'] = main.hexdigest()

        sub = hashlib.new("md5")
        sub.update(("%s - %s" % (info['Start'], info['End'])).encode('utf-8'))
        info['sub_hash'] = sub.hexdigest()

        info['Hash'] = "%s-%s" % (info['main_hash'], info['sub_hash'])

        # Add the venue hash
        info['Venue'] = venue['Hash']

        return [info, event_series, venue]
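
    # A sketch of what __parse_item returns (values hypothetical):
    #   info         -> {'Activity': ..., 'Start': '<date> <time>', 'End': '<date> <time>',
    #                    'Hash': '<main_hash>-<sub_hash>', 'Venue': '<venue Hash>', ...}
    #   event_series -> [{'Start': ..., 'End': ...}, ...] when the event repeats, else []
    #   venue        -> {'Name': ..., 'Address': ..., 'Hash': ...}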
    def parse(self):
        ERROR = []
        soup = BeautifulSoup(urllib2.urlopen(self.src).read())  # Grab and parse the feed

        for g in soup.find_all('guid'):  # Iterate through all <guid> items
            try:
                info, event_series, venue = self.__parse_item(g.text)
                self.__af.writerow(info)
                self.__vf.writerow(venue)

                # DEBUG
                print json.dumps(info, indent=4)
                print json.dumps(venue, indent=4)

                if event_series and self.repeats:
                    # Write a new entry for each repeat event, recomputing the
                    # occurrence (sub) hash for each new start/end pair
                    for e in event_series:
                        info['Start'] = e['Start']
                        info['End'] = e['End']
                        sub = hashlib.new("md5")
                        sub.update(("%s - %s" % (info['Start'], info['End'])).encode('utf-8'))
                        info['sub_hash'] = sub.hexdigest()
                        info['Hash'] = "%s-%s" % (info['main_hash'], info['sub_hash'])
                        self.__af.writerow(info)
            except URLError, e:
                # print g.text
                if hasattr(e, 'reason'):
                    print 'We failed to reach a server.'
                    print 'Reason: ', e.reason
                elif hasattr(e, 'code'):
                    print 'The server couldn\'t fulfill the request.'
                    print 'Error code: ', e.code
                # raise e
                ERROR.append(g.text)
            # except Exception as e:
            #     print g
            #     print e
            #     print dir(e)
            #     # raise e

        print ERROR

        self.__afile.close()
        self.__vfile.close()
if __name__ == '__main__':
    ws = WhatsHappening()
    ws.parse()
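
# Running this script scrapes the feed at WhatsHappening.src and writes
# 'activities.csv' and 'venues.csv' to the current directory; pass
# with_repeats=False to WhatsHappening() to skip expanding repeated-event
# series into one row per occurrence.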