Skip to content

Instantly share code, notes, and snippets.

@danthedaniel
Last active August 20, 2018 21:17
Show Gist options
  • Save danthedaniel/3a95503a0cf388a046c3d662d00f133e to your computer and use it in GitHub Desktop.
Save danthedaniel/3a95503a0cf388a046c3d662d00f133e to your computer and use it in GitHub Desktop.
"""Drexel Web Term Master Schedule scraper."""
import bs4 as bs
import sqlite3
import requests
from datetime import datetime
def db_setup(conn):
    """Create the ``classes`` table and its indices if they do not exist.

    Every statement uses ``IF NOT EXISTS``, so calling this on a database
    that was already set up is harmless.

    Parameters
    ----------
    conn : sqlite connection
        Database connection.
    """
    statements = (
        """CREATE TABLE IF NOT EXISTS classes (
            subject_code TEXT,
            course_num TEXT,
            instr_type TEXT,
            instr_method TEXT,
            section TEXT,
            crn INTEGER,
            full BOOLEAN,
            title TEXT,
            days TEXT,
            start_time INTEGER,
            end_time INTEGER,
            instructor TEXT
        )""",
        "CREATE INDEX IF NOT EXISTS crn ON classes (crn)",
        "CREATE INDEX IF NOT EXISTS start_time ON classes (start_time)",
        "CREATE INDEX IF NOT EXISTS end_time ON classes (end_time)",
    )
    # The connection's context manager commits when the block succeeds and
    # rolls back if any statement raises, so the schema is never left
    # half-applied inside an open transaction.
    with conn:
        for statement in statements:
            conn.execute(statement)
def get_college(root, href, conn):
    """Handle a college's majors.

    Requests the college page, locates the element with class
    ``collegePanel`` and passes every link inside it to ``get_field``.
    A page without that element is skipped.

    Parameters
    ----------
    root : str
        The web address root.
    href : str
        Absolute path of the page to request.
    conn : sqlite connection
        Database connection.
    """
    # Without a timeout a stalled server would hang the scraper forever.
    content = requests.get(root + href, timeout=30).content
    panel = bs.BeautifulSoup(content, "lxml").find(class_="collegePanel")
    if panel is None:
        # find() returns None when nothing matches; there is nothing to walk.
        return

    # href=True keeps only anchors that carry an href attribute, so the
    # string concatenation in get_field never receives None.
    for link in panel.find_all("a", href=True):
        get_field(root, link["href"], conn)
def get_field(root, href, conn):
    """Handle a major's classes.

    Requests the page, locates the element with class ``tableHeader`` and
    passes each tag that follows it as a sibling to ``get_section``.
    A page without that element is skipped.

    Parameters
    ----------
    root : str
        The web address root.
    href : str
        Absolute path of the page to request.
    conn : sqlite connection
        Database connection.
    """
    # Without a timeout a stalled server would hang the scraper forever.
    content = requests.get(root + href, timeout=30).content
    header = bs.BeautifulSoup(content, "lxml").find(class_="tableHeader")
    if header is None:
        # find() returns None when nothing matches; no sections to read.
        return

    for sibling in header.next_siblings:
        # Filter out stray strings (e.g. whitespace between the tags)
        if isinstance(sibling, bs.element.Tag):
            get_section(sibling, conn)
def get_section(section, conn):
    """Insert a class section into the database.

    Rows that do not contain exactly 11 ``td`` cells are ignored. On a
    successful insert the row is committed and a short progress line is
    printed.

    Parameters
    ----------
    section : bs4.element.Tag
        A table row from the class section lists.
    conn : sqlite connection
        Database connection.
    """
    cells = section.find_all("td")

    # Only use this tag if it has all of the required fields
    if len(cells) != 11:
        return

    # Extract each cell's text once; several values are used twice below.
    text = [cell.get_text() for cell in cells]

    # A missing <p> or a missing title attribute means "not marked FULL"
    # rather than a crash.
    status = cells[5].find("p")
    is_full = status is not None and status.get("title") == "FULL"
    time_start, time_end = get_time_range(text[9])

    # Commit per row (as before); the context manager additionally rolls
    # back if the insert fails.
    with conn:
        conn.execute(
            "INSERT INTO classes VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
            (
                text[0],       # subject_code
                text[1],       # course_num
                text[2],       # instr_type
                text[3],       # instr_method
                text[4],       # section
                int(text[5]),  # crn
                is_full,       # full
                text[6],       # title
                text[8],       # days (cell 7 is not stored)
                time_start,    # start_time
                time_end,      # end_time
                text[10],      # instructor
            )
        )

    print("{}{} - {}".format(text[0], text[1], text[4]))
def get_time_range(times):
    """From a string, read in a time range.

    Leading and trailing whitespace around the string and around each
    individual time is ignored.

    Parameters
    ----------
    times : str
        String in the format: "08:00 AM - 09:00 AM"

    Returns
    -------
    Tuple of 2 integers (minutes past midnight for start and end) or
    (None, None) if couldn't parse.
    """
    time_range = times.strip().split(" - ")
    # A string without the " - " separator yields a single part; bail out
    # here instead of hitting an IndexError after the first time parsed.
    if len(time_range) < 2:
        return (None, None)

    try:
        time_start = datetime.strptime(time_range[0].strip(), "%I:%M %p").time()
        time_end = datetime.strptime(time_range[1].strip(), "%I:%M %p").time()
    except ValueError:
        # strptime raises ValueError when the text does not match the format
        return (None, None)

    return (time_as_minutes(time_start), time_as_minutes(time_end))


def time_as_minutes(time):
    """Convert a datetime.time to an integer number of minutes.

    Parameters
    ----------
    time : datetime.time
        The time object to convert from.

    Returns
    -------
    Integer number of minutes past midnight.
    """
    return time.hour * 60 + time.minute
def main():
    """Read the Drexel Web TMS into a SQLite database.

    Fetches the term start page, then passes every link found inside its
    ``sideLeft`` element to ``get_college``. Rows are written to
    ``spring.sqlite`` in the current working directory.

    Raises
    ------
    RuntimeError
        If the start page has no element with id ``sideLeft``.
    """
    root = "https://duapp2.drexel.edu"
    # This is the root address for a term in the Web TMS
    start_url = "/webtms_du/app?component=quarterTermDetails&page=Home&service=direct&sp=ZH4sIAAAAAAAAAFvzloG1uIhBPjWlVC%2BlKLUiNUcvs6hErzw1qSS3WC8lsSRRLyS1KJcBAhiZGJh9GNgTk0tCMnNTSxhEfLISyxL1iwtz9EECxSWJuQXWPgwcJUAtzvkpQBVCEBU5iXnp%2BsElRZl56TB5l9Ti5EKGOgamioKCEgY2IwNDc2NToJHBBSBVCoGliUVAZQqG5rqGFgD84zXupgAAAA%3D%3D"

    # Without a timeout a stalled server would hang the scraper forever.
    start_content = requests.get(root + start_url, timeout=30).content
    sidebar = bs.BeautifulSoup(start_content, "lxml").find(id="sideLeft")
    if sidebar is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(
            "start page has no element with id 'sideLeft'; nothing to scrape"
        )

    conn = sqlite3.connect('spring.sqlite')
    try:
        db_setup(conn)
        # href=True skips anchors that have no href attribute.
        for link in sidebar.find_all("a", href=True):
            get_college(root, link["href"], conn)
    finally:
        # Release the database file even if a request or insert fails.
        conn.close()
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not on import.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment