# nyu.OpenCourseSearch.org Scraper
import cookielib, urllib2, urllib, BeautifulSoup, re, time, pickle
#from courses.helpers import *
""" | |
This class scrapes NYU's course registration listings on Albert, | |
and writes the pickled data out to a specified file directory. | |
Shouldn't require a working NYU Net ID to work, but adding it is worth a shot | |
if something seems broken. | |
Can whip up a command-line version if anyone wants it. | |
>>> from scraper import Scraper | |
>>> s = Scraper('/opt/storage/') | |
>>> s.run() | |
>>> s = Scraper('/opt/storage/', '$username', '$password') | |
>>> s.run() | |
# (optional) scrape only a slice of the subjects | |
>>> s = Scraper('/opt/storage/', '$username', '$password', 0, 10) | |
>>> s.run() | |
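
# (optional) load a scraped section's pickle back from disk;
# '$section_id' is a placeholder for the link id passed to process_section()
>>> import pickle
>>> classes = pickle.load(open('/opt/storage/$section_id.txt'))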

Data is pickled per-course category as follows:

classes = {
    '$course_id': {
        'class_name': 'Animal Minds',
        'classification': 'ANST-UA',
        'college': 'College of Arts and Science',
        'component': 'Lecture',
        'course_name': 'Topics in AS',
        'description': 'This course analyzes the ways that...',
        'grading': 'CAS Graded',
        'is_open': 'Open',
        'level': 'Undergraduate',
        'loc_code': 'WS',
        'meet_data': '09/06/2011 - 12/23/2011 Mon,Wed 11.00 AM - 12.15 PM with Sebo, Jeffrey',
        'notes': 'Open only to ANST minors during the first...',
        'number': '600',
        'section': '001',
        'session': '09/06/2011 - 12/16/2011',
    },
    ...
}
"""


class Scraper:

    def __init__(self, storage_dir, username=None, password=None, start=0, end=None):
        self.ICSID = ''
        self.sections = None
        self.section_data = {}
        self.classes = {}
        self.hold = None
        self.DUMP_DIR = storage_dir
        self.username = username
        self.password = password
        self.start_index = start
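        # note: 'end' is a count of sections to scrape, not an absolute
        # index, so the slice runs from start to start + end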
        if end:
            self.end_index = start + end
        else:
            self.end_index = None
        self.login_url = 'https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/EMPL/?cmd=login'
        self.scrape_url = 'https://sis.nyu.edu/psc/csprod/EMPLOYEE/CSSS/c/NYU_SR.NYU_CLS_SRCH.GBL'
        self.last_section = ''
        self.longs = []
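        # one cookie-aware opener is shared by every request so the
        # PeopleSoft session survives across login, search, and detail pages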
        self.cj = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))

    def _make_connection(self):
        try:
            if self.username and self.password:
                self.log_in()
            print "touching home..."
            self.access_home()
        except:
            print "Server is not responding. Try quitting and running again..."
            return
        print "retrieving course sections..."
        self.get_all_sections()

    def run(self, forever=False):
        self._make_connection()
        if self.end_index:
            end = self.end_index
        else:
            end = len(self.sections)
        while True:
            for i in range(self.start_index, end):
                print "Section %s." % i
                self.process_section(i)
            if not forever:
                return

    def log_in(self):
        params = {
            'timezoneOffset': '240',
            'httpPort': '',
            'userid': self.username,
            'pwd': self.password,
            'Submit': 'Sign In'
        }
        data = urllib.urlencode(params)
        # open the log in page
        r = self.opener.open(self.login_url)
        # log in
        r = self.opener.open(self.login_url, data)
        page = r.read()
        print "logged in."

    def access_home(self):
        r = self.opener.open(self.scrape_url)
        page = r.read()
        soup = BeautifulSoup.BeautifulSoup(page)
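        # ICSID is a hidden per-session PeopleSoft token; every POST below
        # has to echo it back, so grab it once here and reuse it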
        self.ICSID = soup.find('input', {'type': 'hidden', 'name': 'ICSID'})['value']
        print "Browsing key found: %s" % self.ICSID

    def get_home(self):
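        # the IC* fields mimic a PeopleSoft component-processor postback;
        # ICAction names the server-side control being "clicked"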
        params = {
            'ICAction': 'NYU_CLS_DERIVED_BACK',
            'ICChanged': '-1',
            'ICElementNum': '0',
            'ICFocus': '',
            'ICResubmit': '0',
            'ICSID': self.ICSID,
            'ICSaveWarningFilter': '0',
            'ICStateNum': '1',
            'ICType': 'Panel',
            'ICXPos': '0',
            'ICYPos': '0',
            'NYU_CLS_DERIVED_DESCR100': '',
            'NYU_CLS_DERIVED_DESCR100_JOB_POST1': '',
            'NYU_CLS_WRK_NYU_FALL': 'Y',
            'NYU_CLS_WRK_NYU_FALL$chk': 'Y',
        }
        data = urllib.urlencode(params)
        r = self.opener.open(self.scrape_url, data)
        page = r.read()
        return BeautifulSoup.BeautifulSoup(page)

    def get_all_sections(self):
        """
        Gathers the list of all colleges and their corresponding class sections.
        Writes the full list out to 'college-classifications.txt':
        data = [
            ["College of Arts and Science", "Biology", "BIOL-UA"],
            ...
        ]
        """
        soup = self.get_home()
        divs = soup.findAll('a', {'class': 'SSSAZLINK'})
        ret = []
        for i in divs:
            ret.append([i['id'], i.contents[0]])
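        # the first link in the A-Z list is not an actual subject listing,
        # which is presumably why it gets dropped below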
        self.sections = ret[1:]
        print "%s sections loaded" % (len(self.sections))
        # get colleges, classifications
        data = []
        ret = []
        colleges = soup.findAll('td', {'class': 'SSSGROUPBOXLEFTLABEL'})
        for c in colleges:
            college = c.contents[1]
            college = college.replace('&nbsp;', '')
            table = c.findNext('table', {'class': 'SSSGROUPBOXLEFT'})
            classifications = table.findAll('a', {'class': 'SSSAZLINK'})
            for cls in classifications:
                name = " ".join(["%s" % i for i in cls.contents])
                name = name.replace('<br />', '')
                name = name.replace('&amp;', '&')
                try:
                    course_name, course_code = re.search('(.*)\s\((.+)\)', name).groups()
                    data.append([college, course_name, course_code])
                except:
                    print name
        f = open('%scollege-classifications.txt' % (self.DUMP_DIR), 'w')
        pickle.dump(data, f)
        f.close()

    def _confirm_long_listing(self, ICStateNum):
        # basically, press "yes" to confirm showing more than 100 results
        params = {
            'ICAction': "#ICYes",
            'ICChanged': '-1',
            'ICElementNum': '0',
            'ICFocus': '',
            'ICResubmit': '0',
            'ICSID': self.ICSID,
            'ICSaveWarningFilter': '0',
            'ICStateNum': ICStateNum,
            'ICType': 'Panel',
            'ICXPos': '0',
            'ICYPos': '0',
            'NYU_CLS_DERIVED_DESCR100': '',
            'NYU_CLS_DERIVED_DESCR100_JOB_POST1': '',
            'NYU_CLS_WRK_NYU_FALL': 'Y',
            'NYU_CLS_WRK_NYU_FALL$chk': 'Y',
        }
        data = urllib.urlencode(params)
        r = self.opener.open(self.scrape_url, data)
        page = r.read()
        return BeautifulSoup.BeautifulSoup(page)

    def get_section_listing(self, id, current_only='Y'):
        params = {
            'ICAction': id,
            'ICChanged': '-1',
            'ICElementNum': '0',
            'ICFocus': '',
            'ICResubmit': '0',
            'ICSID': self.ICSID,
            'ICSaveWarningFilter': '0',
            'ICStateNum': '1',
            'ICType': 'Panel',
            'ICXPos': '0',
            'ICYPos': '0',
            'NYU_CLS_WRK_NYU_FALL': current_only,
            'NYU_CLS_WRK_NYU_FALL$chk': current_only,
        }
        data = urllib.urlencode(params)
        r = self.opener.open(self.scrape_url, data)
        page = r.read()
        soup = BeautifulSoup.BeautifulSoup(page)
        ICStateNum = 1  # default
        if page.find("This search will return more than 100 results and may take a while to process.") != -1:
            print "confirming long listing..."
            #time.sleep(4)
            self.longs.append(id)
            ICStateNum = int(soup.find('input', {'type': 'hidden', 'name': 'ICStateNum'})['value'])
            print 'ICStateNum found: %s' % ICStateNum
            soup = self._confirm_long_listing(ICStateNum)
            print "long listing retrieved."
        if id not in self.section_data:
            self.section_data[id] = {}
        self._list_courses_in_section(id, soup, current_only)
        print "%s classes gathered from %s" % (len(self.section_data[id].keys()), id)
        self.last_section = id
        return ICStateNum

    def _list_courses_in_section(self, id, soup, current_only):
        ret = {}
        classes = soup.findAll('span',
            {'style': "background-color: white; font-family: arial; color: black; font-size: 16px; font-weight: normal"})
        for c in classes:
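            # course headers look like "ANST-UA 600 Animal Minds"; splitting
            # on the first run of digits yields classification, number, name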
            try:
                bits = [i.strip() for i in re.split(' (\d+)', c.find('b').contents[0])]
                classification = bits[0]
                number = bits[1]
                name = " ".join(bits[2:])
            except:
                classification, number, name = None, None, None
            description, meta, college, units, level = '', '', '', '', ''
            try:
                description = c.find('div', {'class': 'courseDescription'}).find('p').contents[0]
            except: pass
            try:
                meta = c.findNext('span', {'class': 'SSSTEXTBLUE'}).contents[0].split('|')
            except: pass
            try:
                college = meta[0].strip()
            except: pass
            try:
                units = meta[1].split(':')[1].strip()
            except: pass
            try:
                level = meta[2].split(':')[1].strip()
            except: pass
            try:
                index = c.findNext('td', {'class': 'SSSGROUPBOXRIGHTLABEL'}).findNext('a')['name'].split('$')[1]
            except:
                # when there is a "more description" link
                index = None
            self.section_data[id]["%s-%s" % (classification, number)] = {
                'classification': classification,
                'name': name,
                'description': description,
                'number': number,
                'college': college,
                'units': units,
                'level': level,
                'index': index}

    def process_section(self, id):
        try:
            id = self.sections[id][0]
        except:
            return
        hold = len(self.classes.keys())
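        # PeopleSoft bumps ICStateNum on each postback, so the detail
        # requests below have to use the listing request's state number + 1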
        ICStateNum = self.get_section_listing(id) + 1
        keys = {}
        for key in self.section_data[id].keys():
            # uniquify it
            keys[key] = 1
        total = len(keys.keys())
        for i, key in enumerate(sorted(keys.keys())):
            ind = self.section_data[id][key]['index']
            if ind:
                print " grabbing course %s of %s..." % (i + 1, total)
                self.get_detail_for_course_in_section(id, ind, ICStateNum)
        print "%s classes processed" % (len(self.classes.keys()) - hold)
        print "%s total classes scraped" % len(self.classes.keys())
        f = open('%s%s.txt' % (self.DUMP_DIR, id), 'w')
        pickle.dump(self.classes, f)
        f.close()
        self.classes = {}
        # django helper method, loads to db
        #load_listing(id)

    def get_detail_for_course_in_section(self, id, ind, ICStateNum):
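        # expand a single course's detail view; NYU_CLS_DERIVED_TERM$<n> is
        # the per-row link id the results page assigns to the nth course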
        params = {
            'ICAction': 'NYU_CLS_DERIVED_TERM$' + ind,
            'ICChanged': '-1',
            'ICElementNum': '0',
            'ICFocus': '',
            'ICResubmit': '0',
            'ICSID': self.ICSID,
            'ICSaveWarningFilter': '0',
            'ICStateNum': "%s" % ICStateNum,
            'ICType': 'Panel',
            'ICXPos': '0',
            'ICYPos': '1332',
            'NYU_CLS_DERIVED_DESCR100': '',
            'NYU_CLS_DERIVED_DESCR100_JOB_POST1': '',
            'NYU_CLS_WRK_NYU_FALL$chk': 'N',
        }
        data = urllib.urlencode(params)
        r = self.opener.open(self.scrape_url, data)
        page = r.read()
        soup = BeautifulSoup.BeautifulSoup(page)
        try:
            self.summarize_detail(id, soup)
        except:
            print "failed to summarize a course in %s" % id

    def summarize_detail(self, id, soup):
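        # each PSGROUPBOX table on the detail page is one scheduled section
        # of the course; parse its fields and key the result by class number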
        tables = soup.find('table', {'class': 'PSLEVEL3SCROLLAREABODY'}).findAll('table', {'class': 'PSGROUPBOX', 'width': '531'})
        for table in tables:
            notes = ''
            # default the other parsed fields too, so a missing span can't
            # raise a NameError or leak values from the previous table
            classnum = session = section = is_open = grading = loccode = component = ''
            datacell = table.find('td', {'style': 'background-color: white; font-family: arial; font-size: 12px;'})
            self.datacell = datacell
            if ("%s" % datacell).find("<b>Topic: </b>") > 0:
                # Topics class. Special case.
                classification, number = re.split('\s+',
                    re.search('([-\w]+\s+\d+)', "%s" % datacell.find('p')).groups()[0])
                class_name = re.search('<b>Topic: </b>(.*)<p>', "%s" % datacell).groups()[0].strip()
            else:
                # Normal case.
                bits = re.split('\s+', re.search('([-\w\s.]+\s+\d+)', "%s" % datacell).groups()[0])
                number = bits[-1]
                classification = " ".join(bits[:-1])
                class_name = ''
            try:
                units = re.search('([\w]+( - )?[\w]+ units)', "%s" % datacell).groups()[0]
            except:
                try:
                    # leftover units from a previous section of the same class; reuse them
                    if units:
                        pass
                    else:
                        units = ""
                except:
                    units = self.section_data[id]["%s-%s" % (classification, number)]['units']
            for i in datacell.findAll('span'):
                if i.contents:
                    if i.contents[0].__unicode__() == "Class#:":
                        classnum = i.nextSibling.strip(' |')
                    if i.contents[0].__unicode__() == "Session:":
                        session = i.nextSibling.strip(' |')
                    if i.contents[0].__unicode__() == "Section:":
                        section = i.nextSibling.strip(' |\r\n')
                    if i.contents[0].__unicode__() == "Class Status:":
                        is_open = i.nextSibling.nextSibling.contents[0]
                    elif i.contents[0].__unicode__() == "Grading:":
                        grading = i.nextSibling.strip()
                    elif i.contents[0].__unicode__() == "<b>Course Location Code: </b>":
                        loccode = i.nextSibling.strip(' |')
                        component = i.nextSibling.nextSibling.nextSibling.strip()
                    elif i.contents[0].__unicode__() == "<b>Notes: </b>":
                        notes = i.nextSibling.strip()
            try:
                meet_data = re.search('(\d{2}/\d{2}/\d{4}[^<]*(at[^<]*)?(with[^<]*)?)\r', "%s" % datacell).groups()[0].strip(' \r\n')
            except:
                self.save = datacell
                meet_data = ''
            self.classes[classnum] = {
                'classification': "%s" % classification,
                'number': "%s" % number,
                'is_open': "%s" % is_open,
                'session': "%s" % session,
                'section': "%s" % section,
                'grading': "%s" % grading,
                'loc_code': "%s" % loccode,
                'component': "%s" % component,
                'meet_data': "%s" % meet_data,
                'notes': "%s" % notes,
                'level': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['level'],
                'course_name': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['name'],
                'college': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['college'],
                'description': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['description'],
                'class_name': "%s" % class_name,
                'units': "%s" % units,
            }
        return True
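

# The docstring offers to "whip up a command-line version"; here is a minimal
# sketch of what that could look like. The argument order and the usage string
# are assumptions for illustration, not part of the original interface.
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        print "usage: python scraper.py STORAGE_DIR [USERNAME PASSWORD [START COUNT]]"
        sys.exit(1)
    storage_dir = sys.argv[1]
    username = sys.argv[2] if len(sys.argv) > 2 else None
    password = sys.argv[3] if len(sys.argv) > 3 else None
    start = int(sys.argv[4]) if len(sys.argv) > 4 else 0
    count = int(sys.argv[5]) if len(sys.argv) > 5 else None
    Scraper(storage_dir, username, password, start, count).run()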