GorgeousOne · July 10, 2025 12:19
diff --git a/summaery_scraper.py b/summaery_scraper.py
 '''Scraper that downloads the Summaery events into an .ics calendar file for importing into a calendar app or similar'''

 import os
 import pytz
 import re
 import requests
 from bs4 import BeautifulSoup
 from datetime import datetime
 from icalendar import Calendar, Event

 def get_event_start_end_place(event_entry):
 	'''Read start, end and optional place of one event from its calendar info list.

 	The <li> items inside the element with class "cal_maininfo" are used: the
 	first holds the start time, the second the end time (optionally with its
 	own "dd.mm.yyyy" date) and an optional third the place, prefixed "Ort:".

 	Returns a tuple (start, end, place). start and end are datetimes localized
 	to Europe/Berlin; place is a stripped string, or None without a third item.

 	Raises ValueError if a start or end time cannot be found.
 	'''
 	date_string = get_event_date(event_entry)
 	infos = event_entry.find(class_='cal_maininfo').find_all('li')
 	if len(infos) < 2:
 		raise ValueError('event entry has no start and end time items')

 	# The lookarounds stop the time pattern from matching the "dd.mm" part of a
 	# full date like "12.07.2025", which would otherwise be read as 12:07.
 	time_pattern = r'(?<![\d.])(\d{1,2}\.\d{2})(?!\.?\d)'
 	date_pattern = r'(?<![\d.])(\d{1,2}\.\d{1,2}\.\d{4})(?!\d)'

 	start_match = re.search(time_pattern, infos[0].text)
 	end_match = re.search(time_pattern, infos[1].text)
 	if start_match is None or end_match is None:
 		raise ValueError(
 			f'no start/end time found in {infos[0].text!r} / {infos[1].text!r}')

 	# use the end item's own date if it names one, otherwise the event date
 	end_date_match = re.search(date_pattern, infos[1].text)
 	end_date = end_date_match.group(1) if end_date_match else date_string

 	place = re.sub(r'Ort:', '', infos[2].text).strip() if len(infos) > 2 else None

 	time_format = '%d.%m.%Y %H.%M'
 	germany = pytz.timezone('Europe/Berlin')
 	start = datetime.strptime(date_string + ' ' + start_match.group(1), time_format)
 	end = datetime.strptime(end_date + ' ' + end_match.group(1), time_format)

 	return germany.localize(start), germany.localize(end), place


 def get_event_date(event_entry):
 	'''Build the event date string "dd.mm.<year>" from the entry's day and month elements.

 	Day and month are read from the elements with class "day_of_month" and
 	"month" and left-padded with zeros to two characters; the year is taken
 	from the module-level `year`.
 	'''
 	month, day = (
 		event_entry.find(class_=css_class).text.strip().zfill(2)
 		for css_class in ('month', 'day_of_month')
 	)
 	return f'{day}.{month}.{year}'


 def get_event_title_url(event_entry):
 	'''Read the event title and the absolute URL of its detail page.

 	The title link is the first <a> inside the first <h2> of the entry.

 	Returns a tuple (title, url). The link's href is resolved against
 	https://www.uni-weimar.de/, so root-relative, page-relative and already
 	absolute hrefs all yield a usable URL.
 	'''
 	# local import so the function does not depend on a change to the file's import block
 	from urllib.parse import urljoin

 	title_elem = event_entry.find('h2').find('a')
 	title = title_elem.text.strip()
 	url = urljoin('https://www.uni-weimar.de/', title_elem['href'])
 	return title, url


 def get_event_teaser(event_entry):
 	'''Return the stripped text of the entry's "teasertext" element without a trailing word "mehr".'''
 	teaser_elem = event_entry.find(class_='teasertext')
 	raw_text = teaser_elem.text.strip()
 	# drop the word "mehr" only when it is the very last word of the text
 	without_more = re.sub(r'\bmehr\b$', '', raw_text)
 	return without_more.strip()

 def add_event_to_cal(cal, start, end, place, title, url, teaser):
 	'''Create a calendar event from the given values and add it to the calendar.

 	The event gets summary, dtstart, dtend, url and description; a location
 	is only set when `place` is truthy.
 	'''
 	properties = [('summary', title), ('dtstart', start), ('dtend', end)]
 	if place:
 		properties.append(('location', place))
 	properties += [('url', url), ('description', teaser)]

 	entry = Event()
 	for name, value in properties:
 		entry.add(name, value)
 	cal.add_component(entry)


 year = 2025
 # the page URL embeds the year; it is built from `year` so the two cannot drift apart
 # NOTE(review): assumes other years use the same URL scheme - confirm before changing `year`
 page_url = f'https://www.uni-weimar.de/de/universitaet/aktuell/veranstaltungskalender/highlights-des-jahres/{year}/summaery/veranstaltungen/'

 if __name__ == '__main__':
 	filepath = f'website_{year}.html'

 	# download the page only once; later runs reuse the local copy
 	if not os.path.exists(filepath):
 		print('downloading website')
 		response = requests.get(page_url, timeout=30)
 		# fail before caching, so an error page is never stored as the local copy
 		response.raise_for_status()
 		with open(filepath, 'wb') as f:
 			f.write(response.content)
 		print('saved website')

 	with open(filepath, 'r', encoding='utf-8') as f:
 		print('loading html')
 		html_string = f.read()

 	# collapse every run of two or more whitespace characters into one space before parsing
 	html_string = re.sub(r'\s\s+', ' ', html_string)
 	dom = BeautifulSoup(html_string, 'html.parser')

 	# find all events listed on the page
 	event_entries = dom.find_all(class_='summaeryLiveEvent')
 	print(f'Found {len(event_entries)} events')

 	cal = Calendar()
 	# PRODID and VERSION are required calendar properties (RFC 5545)
 	cal.add('prodid', '-//summaery_scraper//uni-weimar.de//')
 	cal.add('version', '2.0')

 	for i, entry in enumerate(event_entries, start=1):
 		title, url = get_event_title_url(entry)
 		add_event_to_cal(
 			cal,
 			*get_event_start_end_place(entry),
 			title,
 			url,
 			get_event_teaser(entry))
 		print(i, '/', len(event_entries), title)

 	# write them into one big icalendar file
 	with open(f'summaery_{year}.ics', 'wb') as f:
 		f.write(cal.to_ical())
	'''Scraper that downloads the Summaery events into an .ics calendar file for importing into a calendar app or similar'''

	import os
	import pytz
	import re
	import requests
	from bs4 import BeautifulSoup
	from datetime import datetime
	from icalendar import Calendar, Event

	def get_event_start_end_place(event_entry):
		'''Read start, end and optional place of one event from its calendar info list.

		The <li> items inside the element with class "cal_maininfo" are used: the
		first holds the start time, the second the end time (optionally with its
		own "dd.mm.yyyy" date) and an optional third the place, prefixed "Ort:".

		Returns a tuple (start, end, place). start and end are datetimes localized
		to Europe/Berlin; place is a stripped string, or None without a third item.

		Raises ValueError if a start or end time cannot be found.
		'''
		date_string = get_event_date(event_entry)
		infos = event_entry.find(class_='cal_maininfo').find_all('li')
		if len(infos) < 2:
			raise ValueError('event entry has no start and end time items')

		# The lookarounds stop the time pattern from matching the "dd.mm" part of a
		# full date like "12.07.2025", which would otherwise be read as 12:07.
		time_pattern = r'(?<![\d.])(\d{1,2}\.\d{2})(?!\.?\d)'
		date_pattern = r'(?<![\d.])(\d{1,2}\.\d{1,2}\.\d{4})(?!\d)'

		start_match = re.search(time_pattern, infos[0].text)
		end_match = re.search(time_pattern, infos[1].text)
		if start_match is None or end_match is None:
			raise ValueError(
				f'no start/end time found in {infos[0].text!r} / {infos[1].text!r}')

		# use the end item's own date if it names one, otherwise the event date
		end_date_match = re.search(date_pattern, infos[1].text)
		end_date = end_date_match.group(1) if end_date_match else date_string

		place = re.sub(r'Ort:', '', infos[2].text).strip() if len(infos) > 2 else None

		time_format = '%d.%m.%Y %H.%M'
		germany = pytz.timezone('Europe/Berlin')
		start = datetime.strptime(date_string + ' ' + start_match.group(1), time_format)
		end = datetime.strptime(end_date + ' ' + end_match.group(1), time_format)

		return germany.localize(start), germany.localize(end), place


	def get_event_date(event_entry):
		'''Build the event date string "dd.mm.<year>" from the entry's day and month elements.

		Day and month are read from the elements with class "day_of_month" and
		"month" and left-padded with zeros to two characters; the year is taken
		from the module-level `year`.
		'''
		month = event_entry.find(class_='month').text.strip()
		day = event_entry.find(class_='day_of_month').text.strip()
		return day.zfill(2) + '.' + month.zfill(2) + '.' + str(year)


	def get_event_title_url(event_entry):
		'''Read the event title and the absolute URL of its detail page.

		The title link is the first <a> inside the first <h2> of the entry.

		Returns a tuple (title, url). The link's href is resolved against
		https://www.uni-weimar.de/, so root-relative, page-relative and already
		absolute hrefs all yield a usable URL.
		'''
		# local import so the function does not depend on a change to the file's import block
		from urllib.parse import urljoin

		title_elem = event_entry.find('h2').find('a')
		title = title_elem.text.strip()
		url = urljoin('https://www.uni-weimar.de/', title_elem['href'])
		return title, url


	def get_event_teaser(event_entry):
		'''Return the stripped text of the entry's "teasertext" element without a trailing word "mehr".'''
		teaser = event_entry.find(class_='teasertext').text.strip()
		# drop the word "mehr" only when it is the very last word of the text
		return re.sub(r'\bmehr\b$', '', teaser).strip()

	def add_event_to_cal(cal, start, end, place, title, url, teaser):
		'''Create a calendar event from the given values and add it to the calendar.

		The event gets summary, dtstart, dtend, url and description; a location
		is only set when `place` is truthy.
		'''
		event = Event()
		event.add('summary', title)
		event.add('dtstart', start)
		event.add('dtend', end)

		if place:
			event.add('location', place)

		event.add('url', url)
		event.add('description', teaser)
		cal.add_component(event)


	year = 2025
	# the page URL embeds the year; it is built from `year` so the two cannot drift apart
	# NOTE(review): assumes other years use the same URL scheme - confirm before changing `year`
	page_url = f'https://www.uni-weimar.de/de/universitaet/aktuell/veranstaltungskalender/highlights-des-jahres/{year}/summaery/veranstaltungen/'

	if __name__ == '__main__':
		filepath = f'website_{year}.html'

		# download the page only once; later runs reuse the local copy
		if not os.path.exists(filepath):
			print('downloading website')
			response = requests.get(page_url, timeout=30)
			# fail before caching, so an error page is never stored as the local copy
			response.raise_for_status()
			with open(filepath, 'wb') as f:
				f.write(response.content)
			print('saved website')

		with open(filepath, 'r', encoding='utf-8') as f:
			print('loading html')
			html_string = f.read()

		# collapse every run of two or more whitespace characters into one space before parsing
		html_string = re.sub(r'\s\s+', ' ', html_string)
		dom = BeautifulSoup(html_string, 'html.parser')

		# find all events listed on the page
		event_entries = dom.find_all(class_='summaeryLiveEvent')
		print(f'Found {len(event_entries)} events')

		cal = Calendar()
		# PRODID and VERSION are required calendar properties (RFC 5545)
		cal.add('prodid', '-//summaery_scraper//uni-weimar.de//')
		cal.add('version', '2.0')

		for i, entry in enumerate(event_entries, start=1):
			title, url = get_event_title_url(entry)
			add_event_to_cal(
				cal,
				*get_event_start_end_place(entry),
				title,
				url,
				get_event_teaser(entry))
			print(i, '/', len(event_entries), title)

		# write them into one big icalendar file
		with open(f'summaery_{year}.ics', 'wb') as f:
			f.write(cal.to_ical())
No results found