Skip to content

Instantly share code, notes, and snippets.

@bshillingford
Last active December 16, 2015 05:39
Show Gist options
  • Save bshillingford/5385633 to your computer and use it in GitHub Desktop.
SSC course scrape Python module
#!/usr/bin/python2.7
import lxml, lxml.etree, urllib2, re
# UBC session selectors sent with every request: year plus session code
# ('W' = winter, 'S' = summer — presumably; confirm against the SSC site).
SESSYR=2012
SESSCD='W'
# Last URL fetched by read_endpoint(); kept only for interactive debugging.
debug_last_url = None
def read_endpoint(dept=None, course=None, section=None):
    """Fetch one SSC course-schedule XML endpoint and return the parsed root.

    Called with no arguments it returns the department list; with only
    *dept*, that department's courses; with *dept* and *course*, that
    course's sections. Requesting an individual *section* is unsupported
    and raises. Records the fetched URL in the module-level
    ``debug_last_url`` as a debugging aid.
    """
    global debug_last_url
    base = "https://courses.students.ubc.ca/cs/servlets/SRVCourseSchedule?sessyr=%d&sesscd=%s&" % (SESSYR, SESSCD)
    if dept is None:
        query = "output=2&req=0"
    elif course is None:
        query = "output=5&req=2&dept=%s" % dept
    elif section is None:
        query = "output=5&req=4&dept=%s&course=%s" % (dept, course)
    else:
        raise Exception("not implemented")
    url = base + query
    debug_last_url = url # DEBUG
    raw = urllib2.urlopen(url).read().decode('utf8', 'ignore')
    # workaround for anthropology text bugs: strip control characters the
    # server emits that are illegal in XML
    raw = re.sub("[\x00-\x08\x0b\x0c\x0e-\x1f]", "", raw)
    return lxml.etree.XML(raw)
def scrape_depts():
    """Yield a dict of XML attributes for each department on the SSC."""
    root = read_endpoint()
    for node in root:
        yield dict(node.items())
def scrape_courses(dept):
    """Yield a dict of XML attributes for each course in *dept*.

    *dept* is either a department key string or a dict as yielded by
    scrape_depts(), in which case its 'key' entry is used.
    """
    # isinstance, not type(...) == dict, so dict subclasses also work
    if isinstance(dept, dict):
        dept = dept['key']
    for elem_course in read_endpoint(dept):
        yield dict(elem_course.items())
def scrape_sections(dept, course):
    """Yield one dict per section of *course* in department *dept*.

    Each yielded dict holds the section element's attributes plus a
    'teachingunits' list of dicts; a teaching unit that has a <meetings>
    child also gets a 'meetings' list of dicts. *dept* and *course* may be
    key strings or dicts from the other scrape_* generators.
    """
    # isinstance, not type(...) == dict, so dict subclasses also work
    if isinstance(dept, dict):
        dept = dept['key']
    if isinstance(course, dict):
        course = course['key']
    for elem_section in read_endpoint(dept, course):
        section = dict(elem_section.items())
        teachingunits = []
        elem_units = elem_section.find("teachingunits")
        # Guard against a section with no <teachingunits> element: the
        # original iterated find()'s result directly and would raise
        # TypeError on None.
        if elem_units is not None:
            for elem_teachingunit in elem_units:
                teachingunit = dict(elem_teachingunit.items())
                elem_meetings = elem_teachingunit.find("meetings")
                if elem_meetings is not None:
                    teachingunit['meetings'] = [
                        dict(elem_meeting.items()) for elem_meeting in elem_meetings
                    ]
                teachingunits.append(teachingunit)
        section['teachingunits'] = teachingunits
        yield section
if __name__ == "__main__":
    # Dump every section of every course in every department to sections.json.
    try:
        import simplejson as json
    except ImportError:
        import json  # stdlib fallback; same dump() interface
    allsections = []
    for dept in scrape_depts():
        for course in scrape_courses(dept['key']):
            for section in scrape_sections(dept['key'], course):
                # Tag each section with its parents so the output is flat.
                section["course"] = course["key"]
                section["dept"] = dept["key"]
                allsections.append(section)
    # `with` guarantees the file is closed even if dump() raises; opening
    # only after the scrape also avoids leaving an empty file on failure.
    with open("sections.json", "w") as f:
        json.dump(allsections, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment