Last active
August 20, 2018 21:17
-
-
Save danthedaniel/3a95503a0cf388a046c3d662d00f133e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Drexel Web Term Master Schedule scraper.""" | |
import bs4 as bs | |
import sqlite3 | |
import requests | |
from datetime import datetime | |
def db_setup(conn):
    """Ensure the ``classes`` table and its lookup indices exist.

    Every statement uses ``IF NOT EXISTS``, so calling this on an already
    initialised database is harmless.

    Parameters
    ----------
    conn : sqlite connection
        Database connection. The schema changes are committed on it.
    """
    schema_statements = (
        """CREATE TABLE IF NOT EXISTS classes (
            subject_code TEXT,
            course_num TEXT,
            instr_type TEXT,
            instr_method TEXT,
            section TEXT,
            crn INTEGER,
            full BOOLEAN,
            title TEXT,
            days TEXT,
            start_time INTEGER,
            end_time INTEGER,
            instructor TEXT
        )""",
        "CREATE INDEX IF NOT EXISTS crn ON classes (crn)",
        "CREATE INDEX IF NOT EXISTS start_time ON classes (start_time)",
        "CREATE INDEX IF NOT EXISTS end_time ON classes (end_time)",
    )

    cursor = conn.cursor()
    for statement in schema_statements:
        cursor.execute(statement)
    conn.commit()
def get_college(root, href, conn):
    """Handle a college's majors.

    Requests the college page and calls ``get_field`` for every link found
    inside the element with class ``collegePanel``.

    Parameters
    ----------
    root : str
        The web address root.
    href : str
        Absolute path of the page to request.
    conn : sqlite connection
        Database connection.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(root + href, timeout=30)
    response.raise_for_status()

    panel = bs.BeautifulSoup(response.content, "lxml").find(
        class_="collegePanel"
    )
    if panel is None:
        # find() returns None when nothing matches; skip this page rather
        # than crash the whole scrape with an AttributeError.
        print("Skipping {}: no collegePanel found".format(href))
        return

    # href=True drops anchors without a target, which would otherwise
    # pass None into the ``root + href`` concatenation downstream.
    for link in panel.find_all("a", href=True):
        get_field(root, link["href"], conn)
def get_field(root, href, conn):
    """Handle a major's classes.

    Requests the major's page, locates the element with class
    ``tableHeader`` and passes each tag that follows it (its siblings) to
    ``get_section``.

    Parameters
    ----------
    root : str
        The web address root.
    href : str
        Absolute path of the page to request.
    conn : sqlite connection
        Database connection.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(root + href, timeout=30)
    response.raise_for_status()

    header = bs.BeautifulSoup(response.content, "lxml").find(
        class_="tableHeader"
    )
    if header is None:
        # find() returns None when nothing matches; skip this page rather
        # than crash on ``None.next_siblings``.
        print("Skipping {}: no tableHeader found".format(href))
        return

    for sibling in header.next_siblings:
        # Filter out stray strings (whitespace between tags)
        if isinstance(sibling, bs.element.Tag):
            get_section(sibling, conn)
def get_section(section, conn):
    """Insert a class section into the database.

    Rows that do not contain exactly 11 ``td`` cells are ignored. The
    inserted row is committed immediately and a short progress line is
    printed.

    Parameters
    ----------
    section : bs4.element.Tag
        A table row from the class section lists.
    conn : sqlite connection
        Database connection.
    """
    cells = section.find_all("td")

    # Only use this tag if it has all of the required fields
    if len(cells) != 11:
        return

    # Extract each cell's text once instead of on every use.
    texts = [cell.get_text() for cell in cells]

    # The CRN cell flags a full section via <p title="FULL">. Treat a
    # missing <p> or missing title as "not full" instead of raising
    # AttributeError / KeyError.
    crn_marker = cells[5].find("p")
    is_full = crn_marker is not None and crn_marker.get("title") == "FULL"

    time_start, time_end = get_time_range(texts[9])

    # Cell 7 is not stored; the value order matches the column order of
    # the ``classes`` table.
    conn.cursor().execute(
        "INSERT INTO classes VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
        (
            texts[0],
            texts[1],
            texts[2],
            texts[3],
            texts[4],
            int(texts[5]),
            is_full,
            texts[6],
            texts[8],
            time_start,
            time_end,
            texts[10],
        )
    )
    # Commit per row so an interrupted scrape keeps what it has gathered.
    conn.commit()

    print("{}{} - {}".format(texts[0], texts[1], texts[4]))
def get_time_range(times):
    """From a string, read in a time range.

    Parameters
    ----------
    times : str
        String in the format: "08:00 AM - 09:00 AM". Surrounding
        whitespace is ignored.

    Returns
    -------
    Tuple of 2 integers (minutes past midnight for start and end) or
    (None, None) if couldn't parse.
    """
    time_range = times.strip().split(" - ")

    # A value with no " - " separator (e.g. a lone start time) used to
    # raise IndexError on time_range[1]; report it as unparseable instead.
    if len(time_range) < 2:
        return (None, None)

    try:
        time_start = datetime.strptime(
            time_range[0].strip(), "%I:%M %p"
        ).time()
        time_end = datetime.strptime(
            time_range[1].strip(), "%I:%M %p"
        ).time()
    except ValueError:
        # strptime raises ValueError for anything not matching HH:MM AM/PM
        return (None, None)

    return (time_as_minutes(time_start), time_as_minutes(time_end))
def time_as_minutes(time):
    """Express a time of day as whole minutes since midnight.

    Only the hour and minute fields are used.

    Parameters
    ----------
    time : datetime.time
        The time object to convert from.

    Returns
    -------
    Integer number of minutes past midnight.
    """
    hours, minutes = time.hour, time.minute
    return 60 * hours + minutes
def main():
    """Read the Drexel Web TMS into a SQLite database.

    Fetches the term start page, then calls ``get_college`` for every link
    inside the element with id ``sideLeft``. Results are written to
    ``spring.sqlite`` in the current working directory.

    Raises
    ------
    requests.exceptions.RequestException
        If the start page request times out, fails, or returns an HTTP
        error status.
    RuntimeError
        If the start page has no element with id ``sideLeft``.
    """
    root = "https://duapp2.drexel.edu"
    # This is the root address for a term in the Web TMS
    start_url = "/webtms_du/app?component=quarterTermDetails&page=Home&service=direct&sp=ZH4sIAAAAAAAAAFvzloG1uIhBPjWlVC%2BlKLUiNUcvs6hErzw1qSS3WC8lsSRRLyS1KJcBAhiZGJh9GNgTk0tCMnNTSxhEfLISyxL1iwtz9EECxSWJuQXWPgwcJUAtzvkpQBVCEBU5iXnp%2BsElRZl56TB5l9Ti5EKGOgamioKCEgY2IwNDc2NToJHBBSBVCoGliUVAZQqG5rqGFgD84zXupgAAAA%3D%3D"

    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(root + start_url, timeout=30)
    response.raise_for_status()

    sidebar = bs.BeautifulSoup(response.content, "lxml").find(id="sideLeft")
    if sidebar is None:
        # find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError further down.
        raise RuntimeError(
            "Could not find the sidebar (id='sideLeft') on the term page"
        )

    conn = sqlite3.connect('spring.sqlite')
    try:
        db_setup(conn)

        # href=True drops anchors without a target, which would otherwise
        # pass None into the URL concatenation.
        for link in sidebar.find_all("a", href=True):
            get_college(root, link["href"], conn)
    finally:
        # Always release the database file, even if scraping fails midway.
        conn.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment