Skip to content

Instantly share code, notes, and snippets.

@shazeline
Created November 27, 2013 04:22

Revisions

  1. shazeline created this gist Nov 27, 2013.
    64 changes: 64 additions & 0 deletions scraper_raw.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,64 @@
    import re
    import requests
    from bs4 import BeautifulSoup

    BASE = 'http://www.registrar.ucla.edu/schedule/'

    def get_dept_url(term, dept):
        """Return the ``crsredir.aspx`` URL under BASE for a term and department.

        ``term`` fills the ``termsel`` query parameter and ``dept`` fills
        ``subareasel``.
        """
        # NOTE(review): the values are interpolated verbatim (no percent-encoding),
        # so a value containing '&' or '=' would alter the query string -- confirm
        # that the scraped option values never contain such characters.
        query_path = 'crsredir.aspx?termsel=%s&subareasel=%s' % (term, dept)
        return BASE + query_path

    def get_course_url(term, dept, id):
        """Return the ``detselect.aspx`` URL under BASE for a single course.

        ``term`` fills the ``termsel`` query parameter, ``dept`` fills
        ``subareasel`` and ``id`` fills ``idxcrs``.
        """
        # NOTE(review): the values are interpolated verbatim (no percent-encoding),
        # so a value containing '&' or '=' would alter the query string -- confirm
        # that the scraped option values never contain such characters.
        query_path = 'detselect.aspx?termsel=%s&subareasel=%s&idxcrs=%s' % (term, dept, id)
        return BASE + query_path

    def cell_text(cell):
        """Return the strings yielded by ``cell.stripped_strings`` joined by spaces.

        ``cell`` is any object exposing an iterable ``stripped_strings``
        attribute; an empty iterable produces the empty string.
        """
        return ' '.join(cell.stripped_strings)

    def parse_table(table):
        """Convert a table element into a list of rows of cell text.

        Every ``tr`` found under ``table`` becomes one list. Within a row, the
        elements whose tag name matches ``t[dh]`` are collected and only every
        other one (positions 0, 2, 4, ...) is kept, each reduced to text with
        ``cell_text``.

        Returns a list of lists of strings, one inner list per row.
        """
        # Compile once instead of once per row.
        cell_tag = re.compile('t[dh]')
        data = []
        for row in table.find_all('tr'):
            cells = row.find_all(cell_tag)
            # NOTE(review): presumably the skipped odd-position cells are spacer
            # cells in the page layout -- confirm against the live markup.
            # Slicing the result list (rather than subscripting ``map``) keeps
            # this working on Python 3, where ``map`` returns an iterator.
            data.append([cell_text(cell) for cell in cells[::2]])
        return data

    # Fetch the schedule home page and collect the ``value`` attribute of every
    # <option> element on it (None for options without that attribute).
    response = requests.get(BASE + 'schedulehome.aspx')
    soup = BeautifulSoup(response.text)
    values = [option.get('value') for option in soup.find_all('option')]

    # NOTE(review): assumes the first five options are terms and all remaining
    # options are department codes -- confirm against the live page.
    terms = values[0:5]
    dept_codes = values[5:]

    # NOTE(review): the name implies the second term option is the winter
    # term -- confirm; this is position-dependent.
    winter = terms[1]
    dept_urls = [get_dept_url(winter, dept_code) for dept_code in dept_codes]

    # For each department: fetch its page, read the course ids from its <option>
    # values, then fetch every course page and print selected columns of the
    # table at index 8.
    for dept_code, dept_url in zip(dept_codes, dept_urls):
        response = requests.get(dept_url)
        soup = BeautifulSoup(response.text)
        course_ids = [option.get('value') for option in soup.find_all('option')]
        course_urls = [get_course_url(winter, dept_code, course_id)
                       for course_id in course_ids]

        for course_url in course_urls:
            response = requests.get(course_url)
            soup = BeautifulSoup(response.text)
            tables = soup.find_all('table')
            # tables[8] is read below, so at least nine tables must be present.
            # NOTE(review): ``break`` abandons the remaining courses of this
            # department on the first short page -- confirm that is intended
            # rather than ``continue``.
            if len(tables) < 9:
                break
            # NOTE(review): presumably the ninth table holds the enrollment
            # data, as the variable name suggests -- confirm against the page.
            enrollment_table = tables[8]
            parsed_table = parse_table(enrollment_table)
            print(course_url)
            # Skip the first parsed row and print columns 1, 2 and 13 of the rest.
            for row in parsed_table[1:]:
                try:
                    print(row[1] + ' ' + row[2] + ':\t' + row[13])
                except (IndexError, UnicodeError):
                    # Rows with fewer than 14 kept cells (or text the output
                    # stream cannot encode) are reported instead of aborting.
                    print('whoops!')
            print('==========================')