Last active
September 20, 2021 19:55
-
-
Save yoavst/de2a2d2e9ade70fa452d1ec84170fe12 to your computer and use it in GitHub Desktop.
Timetable downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import io | |
import itertools | |
import json | |
from pyquery import PyQuery as pq | |
from selenium import webdriver | |
from selenium.webdriver.chrome.webdriver import WebDriver | |
# Maps the semester label found in a results-table row to the numeric code
# stored under the 'semester' key of each extracted lesson.
SEMESTER = {
    u"קיץ": 0,  # summer
    u"א'": 1,  # semester A
    u"ב'": 2,  # semester B
}
# region Utils | |
def exists(s):
    """Return True unless *s* is None or the empty string."""
    if s is None:
        return False
    return s != ""
def group(seq, sep_lambda):
    """Split *seq* into lists, cutting at every element accepted by *sep_lambda*.

    Separator elements are dropped. A list is yielded for every separator
    encountered plus one final list, so empty lists appear for leading,
    trailing or consecutive separators.
    """
    bucket = []
    for item in seq:
        if not sep_lambda(item):
            bucket.append(item)
            continue
        # Separator reached: hand out what was collected and start afresh.
        yield bucket
        bucket = []
    yield bucket
def flatmap(func, *iterable):
    """Map *func* over the given iterable(s) and chain the resulting iterables.

    Each call to *func* must return an iterable; the items of those
    iterables are produced one after another.
    """
    mapped = map(func, *iterable)
    return itertools.chain.from_iterable(mapped)
def clean_str(s):
    """Collapse every run of whitespace in *s* into a single space.

    Leading and trailing whitespace is removed. Returns None when *s* is
    falsy (None or an empty string).
    """
    if s:
        words = s.split()
        return ' '.join(words)
    return None
# endregion | |
def extract_courses_from_group(courses_group):
    """Turn one group of result-table rows into a list of lesson dicts.

    Row 1 of the group supplies the course id and name, row 2 the faculty,
    and rows 4 up to (but excluding) the last one describe single lessons.

    :param courses_group: sequence of table-row elements for one course.
    :return: list of dicts with the keys 'name', 'id', 'faculty',
        'semester', 'end', 'start', 'day', 'room', 'building', 'type' and
        'lecturer'. Empty when the group has fewer than 5 rows.
    :raises KeyError: if a row's semester label is not a key of SEMESTER.
    """
    if len(courses_group) < 5:
        return []

    group_courses = []

    header_cells = pq(courses_group[1]).children()
    course_info = pq(header_cells[0]).contents()
    course_id = course_info[0].strip()
    name = header_cells[1].text
    faculty = pq(pq(courses_group[2]).children()[-1]).text()

    last_teacher = None
    for item in courses_group[4:-1]:
        children = pq(item).children()
        # Cells 0..6 are read below, so a usable row needs at least 7 cells
        # (the previous bound of 6 let children[6] raise IndexError).
        if len(children) < 7:
            continue
        # Type, building, room, day, hours and semester are all required;
        # rows missing any of them are skipped.
        if not all(exists(children[i].text) for i in range(1, 7)):
            continue

        # An empty lecturer cell means "same lecturer as the previous row".
        last_teacher = clean_str(pq(children[0]).text().strip() or last_teacher)
        hours = children[5].text.split('-')
        group_courses.append({
            'name': name,
            'id': course_id,
            'faculty': faculty,
            'semester': SEMESTER[children[6].text.strip()],
            'end': hours[1],
            'start': hours[0],
            # 1-based offset of the day's first letter from the Hebrew letter alef.
            'day': ord(children[4].text[0]) - ord(u'א') + 1,
            'room': children[3].text.replace(' ', ''),
            'building': children[2].text,
            'type': children[1].text,
            'lecturer': last_teacher
        })
    return group_courses
def parse_results_page(html):
    """Parse one results page and return every lesson found on it.

    The rows of the fourth child table of ``#frmgrid`` (minus the first
    row) are split into groups at rows whose text is empty; each group is
    passed to extract_courses_from_group and the results are concatenated.

    :param html: HTML markup of the results page.
    :return: list of lesson dicts.
    """
    document = pq(html)
    rows = document('#frmgrid > table:nth-child(4)')('tr')[1:]

    def is_blank_row(row):
        return len(pq(row).text()) == 0

    row_groups = list(group(rows, is_blank_row))
    return [
        course
        for row_group in row_groups
        for course in extract_courses_from_group(row_group)
    ]
def load_courses(driver, year, semester, department_values):
    """
    Run one search on the course-search page and collect every result page.

    :type driver: WebDriver
    :param driver: browser session used to drive the search form.
    :param year: year whose ``str()`` is matched against the option values
        of the ``lstYear1`` list.
    :param semester: value whose ``str()`` is matched against the values of
        the ``ckSem`` inputs.
    :param department_values: dict mapping a list number N (element id
        ``lstDep<N>``) to the option value to select in that list.
    :return: list of lesson dicts from all result pages, or None when a
        requested department option could not be found.
    """
    # Locator-based lookups are used instead of the find_element_by_* helpers,
    # which are deprecated and absent from newer Selenium 4 releases.
    from selenium.webdriver.common.by import By

    driver.get("https://www.ims.tau.ac.il/tal/kr/Search_P.aspx")

    # Choose departments
    for dep_id, dep_value in department_values.items():
        dep_list = driver.find_element(By.ID, "lstDep" + str(dep_id))
        for dep_opt in dep_list.find_elements(By.TAG_NAME, "option"):
            if dep_opt.get_attribute("value") == dep_value:
                dep_opt.click()
                break
        else:
            # No option carried the requested value: give up on this search.
            print("Fail to select departments for {} -> {}, exiting".format(dep_id, dep_value))
            return None

    # Choose year
    year_list = driver.find_element(By.ID, "lstYear1")
    for year_opt in year_list.find_elements(By.TAG_NAME, "option"):
        if year_opt.get_attribute("value") == str(year):
            year_opt.click()
            break

    # Choose semester
    for sem_opt in driver.find_elements(By.NAME, "ckSem"):
        if sem_opt.get_attribute("value") == str(semester):
            sem_opt.click()
            break

    # Start searching
    driver.find_element(By.ID, "search1").click()

    # Go over all the pages
    results = []
    while True:
        html_source_code = driver.execute_script("return document.body.innerHTML;")
        results.extend(parse_results_page(html_source_code))

        # Move to next page; the absence of a "next" element ends the loop.
        next_page_btn = driver.find_elements(By.ID, "next")
        if not next_page_btn:
            break
        next_page_btn[0].click()
    return results
def main():
    """Download the timetable in three searches and write timetable.json.

    The department selection is split over three load_courses calls that
    reuse some of the same list numbers with different values.
    NOTE(review): presumably each list accepts one value per search - confirm.
    """
    driver = webdriver.Chrome()
    try:
        courses1 = load_courses(driver, 2021, 1, {
            1: '08',
            2: '05',
            3: '10',
            4: '04',
            5: '06-16',
            6: '03',
            7: '14',
            8: '12',
            9: '01',
            10: '11',
            11: '2171',
            12: '1880-1882',
            13: '1843',
            14: '2120'
        })
        courses2 = load_courses(driver, 2021, 1, {
            10: '07',
            11: '2172',
            12: '1883'
        })
        courses3 = load_courses(driver, 2021, 1, {
            10: '15',
        })
    finally:
        # Always close the browser, even when a search raises.
        driver.quit()

    # load_courses returns None when a department could not be selected (it
    # has already printed the reason); treat that batch as empty instead of
    # failing on list concatenation.
    courses = []
    for batch in (courses1, courses2, courses3):
        courses.extend(batch or [])

    data = json.dumps(courses, indent=4, ensure_ascii=False)
    # On Python 2 json.dumps may return a byte string; io.open in text mode
    # needs text. On Python 3 the result is already text.
    if isinstance(data, bytes):
        data = data.decode('utf-8')
    with io.open('timetable.json', 'w', encoding='utf-8') as f:
        f.write(data)
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment