Created
July 15, 2023 15:40
-
-
Save GorgeousOne/89fda2203740d231487b92e80fe22949 to your computer and use it in GitHub Desktop.
Summaery 2023 calendar event scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""visits the event list of the summaery 2023 and scrapes all events into one big icalendar file. | |
update the url and year in the future""" | |
from datetime import datetime | |
import pytz | |
from icalendar import Calendar, Event | |
from selenium.webdriver.common.by import By | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
def get_event_start_end_place(summaery_event):
    """Read start, end and the optional place from the event's calendar info element.

    The list items of the ``cal_maininfo`` element are used in order: the
    first holds the start, the second the end and an optional third the place.
    The calendar date itself is taken from get_event_date().

    Returns a ``(start, end, place)`` tuple. Both datetimes are localized to
    Europe/Berlin; ``place`` is None when there is no third list item.
    """
    event_date = get_event_date(summaery_event)
    info_items = summaery_event.find_element(
        By.CLASS_NAME, "cal_maininfo"
    ).find_elements(By.CSS_SELECTOR, "li")

    # keep only the date/time part: cut the first 8 and the last 4 characters
    # (presumably a label and a unit suffix - confirm against the page)
    start_text = f"{event_date} {info_items[0].text[8:-4]}"
    # same idea for the end entry, where the first 6 characters are cut
    end_text = info_items[1].text[6:-4]
    # a short end string carries no date of its own, so the event date is prepended
    if len(end_text) < 10:
        end_text = f"{event_date} {end_text}"

    place = None
    if len(info_items) > 2:
        place = info_items[2].text[5:]

    pattern = "%d.%m.%Y %H.%M"
    berlin = pytz.timezone('Europe/Berlin')
    naive_start = datetime.strptime(start_text, pattern)
    naive_end = datetime.strptime(end_text, pattern)
    return berlin.localize(naive_start), berlin.localize(naive_end), place
def get_event_date(summaery_event, year=2023):
    """Build the event's date string from the big date element on the left.

    Args:
        summaery_event: element of one listed event; it must contain child
            elements with the classes "month" and "day_of_month".
        year: year appended to the date, because the year is not read from
            the page. Defaults to 2023; pass another value to reuse the
            scraper for a later edition.

    Returns:
        The date as "DD.MM.YYYY", with day and month zero-padded to two
        characters.
    """
    month = summaery_event.find_element(By.CLASS_NAME, "month").text
    day = summaery_event.find_element(By.CLASS_NAME, "day_of_month").text
    return f"{day.zfill(2)}.{month.zfill(2)}.{year}"
def get_event_title_url(summaery_event):
    """Return the event's title and the url for more info as a ``(title, url)`` tuple.

    Both values come from the anchor inside the event's ``h2`` heading: the
    title is the anchor's text, the url its ``href`` attribute.
    """
    heading = summaery_event.find_element(By.CSS_SELECTOR, "h2")
    link = heading.find_element(By.CSS_SELECTOR, "a")
    link_text = link.text
    return link_text, link.get_attribute("href")
def get_event_teaser(summaery_event):
    """Return the event's description text taken from its ``teasertext`` element."""
    teaser_elem = summaery_event.find_element(By.CLASS_NAME, "teasertext")
    full_text = teaser_elem.text
    # the last four characters are dropped
    # (presumably a trailing "more" marker - confirm against the page)
    return full_text[:-4]
def add_event_to_cal(cal, start, end, place, title, url, teaser):
    """Create a calendar event with all the info and add it to the calendar.

    Besides the scraped fields the event gets a UID and a DTSTAMP, which
    RFC 5545 requires on every VEVENT; some calendar clients reject or
    duplicate events that lack them.

    Args:
        cal: calendar component the new event is added to.
        start: start of the event (written as DTSTART).
        end: end of the event (written as DTEND).
        place: location of the event; skipped when empty or None.
        title: event title (written as SUMMARY).
        url: link with more info about the event.
        teaser: description text of the event.
    """
    import uuid  # local import: only needed for the UID below

    event = Event()
    # derive the UID from url and start so that re-running the scraper
    # produces the same UID for the same event instead of a new one
    event.add("uid", str(uuid.uuid5(uuid.NAMESPACE_URL, f"{url}#{start.isoformat()}")))
    event.add("dtstamp", datetime.now(pytz.utc))
    event.add("summary", title)
    event.add("dtstart", start)
    event.add("dtend", end)
    if place:
        event.add("location", place)
    event.add("url", url)
    event.add("description", teaser)
    cal.add_component(event)
if __name__ == "__main__":
    # Scrape every event of the summaery event list and write them all into
    # one iCalendar file named summaery.ics in the working directory.
    opts = Options()
    opts.add_argument("user-agent=firefox")
    driver = webdriver.Chrome(options=opts)
    page_url = "https://www.uni-weimar.de/de/universitaet/aktuell/jaehrliche-veranstaltungen/jaehrliche-veranstaltungen-2023/summaery/veranstaltungen/"

    cal = Calendar()
    # PRODID and VERSION are required calendar properties (RFC 5545)
    cal.add("prodid", "-//Summaery event scraper//uni-weimar.de//")
    cal.add("version", "2.0")

    try:
        driver.get(page_url)
        # find all events listed on the page
        summaery_events = driver.find_elements(By.CLASS_NAME, "summaeryLiveEvent")
        for i, event in enumerate(summaery_events):
            add_event_to_cal(
                cal,
                *get_event_start_end_place(event),
                *get_event_title_url(event),
                get_event_teaser(event))
            print(i + 1, "/", len(summaery_events))
    finally:
        # always close the browser, even when scraping an event fails
        driver.quit()

    # write them into one big icalendar file
    with open("summaery.ics", "wb") as f:
        f.write(cal.to_ical())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment