Created July 2, 2013 11:24
A scraper for WhatsHappening.sg
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# RSS feed parser/scraper for WhatsHappening.sg

import urllib2
import csv
import re
import cStringIO
import codecs
import hashlib
import json

from urllib2 import URLError, HTTPError
from bs4 import BeautifulSoup
# Unicode DictWriter object
# http://stackoverflow.com/a/5838817
class DictUnicodeWriter(object):

    def __init__(self, f, fieldnames, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.DictWriter(self.queue, fieldnames, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, D):
        self.writer.writerow({k: v.encode("utf-8") for k, v in D.items()})
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and re-encode it into the target encoding
        data = self.encoder.encode(data)
        # Write to the target stream
        self.stream.write(data)
        # Empty the queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for D in rows:
            self.writerow(D)

    def writeheader(self):
        self.writer.writeheader()
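
# A minimal usage sketch for DictUnicodeWriter (the filename and field names are
# hypothetical, purely to show the call pattern):
#
#   f = open('example.csv', 'w')
#   w = DictUnicodeWriter(f, ['Name', 'City'])
#   w.writeheader()
#   w.writerow({'Name': u'Caf\xe9 Kopi', 'City': u'Singapore'})
#   f.close()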

class WhatsHappening(object):
    '''WhatsHappening RSS feed scraper'''

    # Define the RSS endpoint.
    # Sourced from http://www.whatshappening.sg/events/index.php?com=rss
    # with the following options checked:
    #   Exhibitions
    #   Fashion
    #   Group / Community Events
    #   Shopping
    #   Travel / Outdoor
    #   Art / Stage
    #   Charity
    #   Dining / Food
    #   Family
    #   Garage sales
    #   Housing
    #   Music
    #   Pets
    #   Sports
    #   Singapore
    src = "http://www.whatshappening.sg/events/rss.php?cID=6,21,4,18,22,19,14,10,8,17,9,20,11,7&c=Singapore"

    # Default start / end times
    DEFAULT_START = "09:00 AM"
    DEFAULT_END = "09:00 PM"

    def __init__(self, src='', afile='activities.csv', vfile='venues.csv', with_repeats=True):
        self.src = src if src else self.src
        if not afile:
            raise Exception("'afile' should not be None")
        self.__afile = open(afile, 'w')
        self.__vfile = open(vfile, 'w')
        self.repeats = with_repeats
        # Output files are opened for writing ('w'), i.e. truncated, not appended to
        self.__af = DictUnicodeWriter(self.__afile, [
            'Activity',
            'Description',
            'Price',
            'Start',
            'End',
            'Contact',
            'Email',
            'Link',
            'Hash',
            'Venue'
        ], restval='', extrasaction='ignore')
        self.__af.writeheader()
        self.__vf = DictUnicodeWriter(self.__vfile, [
            'Name',
            'Desc',
            'Hours',
            'Contact',
            'Email',
            'Address',
            'Lat',
            'Lng',
            'Hash'
        ], restval='', extrasaction='ignore')
        self.__vf.writeheader()

    def __parse_time(self, timestring):
        '''Parse a string representing the time duration of an event, and return a start/end time pair'''
        # Let's make some assumptions about the event's start/end times
        start = self.DEFAULT_START
        end = self.DEFAULT_END
        print "Timestring: %s" % timestring
        # We begin to match the various timestring formats below.
        # Each should be unique enough that matching one means the rest do not match.
        # If the time string is of the standard "09:00 AM - 10:00 PM" format
        res = re.search(r"(?P<start>\d{2}:\d{2} (AM|PM)).+(?P<end>\d{2}:\d{2} (AM|PM))", timestring)
        if res and res.groupdict():
            start = res.groupdict().get('start', start)
            end = res.groupdict().get('end', end)
        # Check for lines like "Begins at 08:00 AM" or "Starts at 09:00 AM"
        # Match case-insensitively
        res = re.search(r"^(start|begin).+(?P<start>\d{2}:\d{2} (AM|PM))", timestring, re.I)
        if res and res.groupdict():
            start = res.groupdict()['start']
        # Sanity check to make sure we have start/end times
        if not start or not end:
            raise Exception("Could not parse time: %s" % timestring)
        return [start, end]
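
    # A few illustrative inputs for __parse_time() above (the sample strings are
    # hypothetical, not taken from the live feed):
    #   "09:00 AM - 10:00 PM"  -> ["09:00 AM", "10:00 PM"]
    #   "Begins at 08:00 AM"   -> ["08:00 AM", DEFAULT_END]
    #   "All day"              -> [DEFAULT_START, DEFAULT_END]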

    def __parse_series(self, seriesurl):
        '''Parse a series URL on whatshappening.sg, and return a list of Start/End datetime dicts'''
        series = BeautifulSoup(urllib2.urlopen(seriesurl).read())
        # A WhatsHappening.sg series listing page is made up of multiple pairs of <div>s,
        # all siblings without hierarchy, so here we go.
        # Get all times on the page.
        # The .replace() call was there to handle the unicode character \xa0, which prevents us
        # from splitting the string properly, so we replaced it with a regular space.
        #times = [i.text.strip().replace(u'\xa0', u' ').split(" - ") for i in series.select('div[class^="eventListTime"]')]
        times = [i.text.strip() for i in series.select('div[class^="eventListTime"]')]
        times = map(self.__parse_time, times)  # Parse _ALL_ the times
        # Get all dates on the page
        dates = [i.find("a").text for i in series.select('div[class^="eventListTitle"]')]
        # Put them all together in the format
        # [
        #   {'Start': "<date> <start_time>", 'End': "<date> <end_time>"},
        #   {'Start': "<date> <start_time>", 'End': "<date> <end_time>"},
        #   ...
        # ]
        event_series = [{'Start': "%s %s" % (i[0], i[1][0]), 'End': "%s %s" % (i[0], i[1][1])} for i in zip(dates, times)]
        return event_series

    def __parse_item(self, itemurl):
        '''Parses an individual item on whatshappening.sg'''
        item = BeautifulSoup(urllib2.urlopen(itemurl).read())  # Grab and parse the item
        # Start grabbing info
        name = item.select('.summary')[0].text if item.select('.summary') else ''
        desc = item.select('.description')[0].text if item.select('.description') else ''
        # Grab and process the dates
        date_start = item.select('.dtstart')[0].text if item.select('.dtstart') else ''
        time_range = item.select('.eventDetailTime')[0].text if item.select('.eventDetailTime') else ''
        start, end = self.__parse_time(time_range)
        # Combine the datetime strings
        start = "%s %s" % (date_start, start)
        end = "%s %s" % (date_start, end)
        # Scrape the price
        price = re.search(r"Cost: (?P<cost>.+)$", item.select('.eventDetailPrice')[0].text).groupdict()['cost'] if item.select('.eventDetailPrice') else 'Unknown'
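        # e.g. an element text of "Cost: Free" (a hypothetical value) yields price == "Free";
        # items without an .eventDetailPrice element fall back to 'Unknown'.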
        venue = {
            'Name': item.select('.org')[0].text.replace('"', '') if item.select('.org') else '',
            'street': item.select('.street-address')[0].text if item.select('.street-address') else '',
            'postal': item.select('.postal-code')[0].text if item.select('.postal-code') else ''
        }
        venue['Address'] = "%s Singapore %s" % (venue['street'], venue['postal'])
        # Construct the venue hash
        m = hashlib.new('md5')
        m.update(venue['Name'].encode('utf-8'))
        venue['Hash'] = m.hexdigest()
        contact = {
            # The regex first trawls for the string that contains "Phone:", then parses it
            # to extract the phone number component of that string.
            # The regex "(?P<phone>(\+65(\ |-)?)?\d{4}(\ |-)?\d{4})" extracts phone numbers which:
            #  - may or may not begin with +65
            #  - may or may not have a space or dash after +65
            #  - may or may not have a space or dash after the first 4 digits
            'phone': re.search(r"(?P<phone>(\+65(\ |-)?)?\d{4}(\ |-)?\d{4})", item.find_all(text=re.compile('Phone:'))[0]).groupdict()['phone'] if item.find_all(text=re.compile('Phone:')) else '',
            # Email is a little trickier because the site very smartly does not list the lister's
            # email in the source, choosing instead to write it post-load via Javascript.
            # Conveniently enough, that script is embedded right in the page and can be parsed
            # with the regex below.
            # The first .find_all() selector trawls for the string 'var ename', which holds the
            # username component of the email.
            # The text result of that search is then further parsed through the regex to extract
            # the 'ename' and 'edomain' components from the embedded Javascript, which we can
            # then recombine into the full email address.
            'email': re.search(r"var ename = '(?P<ename>.+)';.+var edomain = '(?P<edomain>.+)';", item.find_all(text=re.compile("var ename"))[0], re.DOTALL).groupdict() if item.find_all(text=re.compile("var ename")) else ''
        }
        # A little fix:
        # contact['email'] is currently the groupdict() result from the regex match, so recombine
        # the components into a proper email address.
        contact['email'] = "%s@%s" % (contact['email']['ename'], contact['email']['edomain']) if contact['email'] and contact['email']['ename'] and contact['email']['edomain'] else ''
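        # For reference, the embedded script looks roughly like this (values hypothetical):
        #   var ename = 'organiser'; ... var edomain = 'example.com';
        # which the code above recombines into 'organiser@example.com'.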
        # Grab the website link
        url = item.select('a[href^="http://www.whatshappening.sg/events/link/index.php"]')[0]['href'] if item.select('a[href^="http://www.whatshappening.sg/events/link/index.php"]') else ''
        link = ''
        if url:
            try:
                link = urllib2.urlopen(url).url
            except Exception, e:
                pass  # Oh well
        # A 'seriesList' URL indicates that this is a repeated event, so we should trawl it
        event_series = []
        if self.repeats:  # Grab repeats only if the flag is set
            if item.select('a[href^="http://www.whatshappening.sg/events/index.php?com=serieslist"]'):
                seriesurl = item.select('a[href^="http://www.whatshappening.sg/events/index.php?com=serieslist"]')[0]['href']
                event_series = self.__parse_series(seriesurl)
        # Construct the info struct
        info = {
            'Activity': name,
            'Description': desc,
            'Price': price,
            'Start': start,
            'End': end,
            'Contact': contact['phone'],
            'Email': contact['email'],
            'Link': link
        }
        # Construct the activity hash
        main = hashlib.new("md5")
        hash_string = "%s %s" % (info['Activity'], venue['Name'])
        main.update(hash_string.encode('utf-8'))
        info['main_hash'] = main.hexdigest()
        sub = hashlib.new("md5")
        sub.update(("%s - %s" % (info['Start'], info['End'])).encode('utf-8'))
        info['sub_hash'] = sub.hexdigest()
        info['Hash'] = "%s-%s" % (info['main_hash'], info['sub_hash'])
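        # Note: main_hash identifies the activity at a given venue, while sub_hash covers one
        # specific start/end occurrence, so repeat events written out in parse() share the
        # same main_hash but get distinct combined Hash values.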
        # Add the venue hash
        info['Venue'] = venue['Hash']
        return [info, event_series, venue]

    def parse(self):
        ERROR = []
        soup = BeautifulSoup(urllib2.urlopen(self.src).read())  # Grab and parse the feed
        for g in soup.find_all('guid'):  # Iterate through all <guid> items
            try:
                info, event_series, venue = self.__parse_item(g.text)
                self.__af.writerow(info)
                self.__vf.writerow(venue)
                # DEBUG
                print json.dumps(info, indent=4)
                print json.dumps(venue, indent=4)
                if event_series and self.repeats:
                    # Write a new entry for each repeat event
                    for e in event_series:
                        info['Start'] = e['Start']
                        info['End'] = e['End']
                        sub = hashlib.new("md5")
                        sub.update(("%s - %s" % (info['Start'], info['End'])).encode('utf-8'))
                        info['sub_hash'] = sub.hexdigest()
                        info['Hash'] = "%s-%s" % (info['main_hash'], info['sub_hash'])
                        self.__af.writerow(info)
            except URLError, e:
                #print g.text
                if hasattr(e, 'reason'):
                    print 'We failed to reach a server.'
                    print 'Reason: ', e.reason
                elif hasattr(e, 'code'):
                    print 'The server couldn\'t fulfill the request.'
                    print 'Error code: ', e.code
                #raise e
                ERROR.append(g.text)
            # except Exception as e:
            #     print g
            #     print e
            #     print dir(e)
            #     #raise e
            finally:
                pass
                # print "\n\n"
                # print 20 * ""
                # print "\n"
        print ERROR
        self.__afile.close()
        self.__vfile.close()


if __name__ == '__main__':
    ws = WhatsHappening()
    ws.parse()
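
    # A usage sketch with non-default options (these filenames are only examples,
    # not files this gist ships with):
    #   ws = WhatsHappening(afile='my_activities.csv', vfile='my_venues.csv',
    #                       with_repeats=False)
    #   ws.parse()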