Created
June 20, 2023 14:08
-
-
Save cboulanger/3e41bb05ff074e29b95e6d66d23cf64d to your computer and use it in GitHub Desktop.
Create an HTML overview of conference sessions, grouped by track, from an easychair.org conference programme page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dateparser | |
import re | |
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
from collections import defaultdict | |
# Placeholder configuration — replace the "..." values with the real
# easychair.org URLs and the desired page title before running.
url = ".../program.html"  # easychair programme page (dates + sessions)
track_url = ".../tracks.html"  # easychair page listing the track titles
css_url = "..../program.css"  # stylesheet linked from the generated HTML page
page_title = "XXX - Tracks und Sessions"  # <title> of the generated HTML page
def download_session_data(url, csv_path='conference_data.csv'):
    """Download an easychair programme page and extract its sessions to CSV.

    The programme page interleaves ``<div class="date">`` headers with
    ``<div class="session ...">`` entries; each session div carries an
    ``<a name="session:ID">`` anchor and a heading of the form
    ``<interval> Session <id>: Track <nr>: <title>`` (assumed from the
    regex below — confirm against the actual easychair markup).

    Args:
        url: URL of the easychair ``program.html`` page.
        csv_path: Output CSV file. Default preserves the original
            hard-coded filename for backward compatibility.

    Writes columns: session_id, session, date, interval, track, title.
    """
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["session_id", "session", "date", "interval", "track", "title"])
        # Date divs precede the sessions they apply to, so carry the most
        # recently seen date forward; stays None until the first date div.
        date = None
        for div in soup.find_all('div'):
            if div.get('class') == ['date']:
                parsed = dateparser.parse(div.text)
                # dateparser returns None on unparseable text; keep the
                # previous date rather than crashing on .strftime().
                if parsed is not None:
                    date = parsed.strftime('%d.%m.%Y')
            elif div.get('class') and 'session' in div.get('class'):
                anchor = div.find('a')
                heading_div = div.find('div', class_='heading')
                if anchor is None or heading_div is None:
                    # Malformed session div: skip instead of raising
                    # AttributeError on .get()/.text.
                    continue
                session_id = (anchor.get('name') or '').replace('session:', '')
                matches = re.search(
                    r'(.+)\s*Session\s*(\w+)\s*:\s*Track\s*(\w+):\s*(.+)',
                    heading_div.text)
                if matches:
                    interval, session, track, title = matches.groups()
                    writer.writerow([session_id, session, date,
                                     interval.strip(), track, title])
def create_track_overview(track_url, program_url, page_title, css_url,
                          csv_path='conference_data.csv',
                          html_path='sessions_by_track.html'):
    """Build an HTML overview of sessions grouped by track.

    Reads the CSV produced by :func:`download_session_data`, groups the
    sessions by track number, and writes an HTML page with one table per
    track, each session linking back into the programme page.

    Args:
        track_url: URL of the easychair ``tracks.html`` page (track titles).
        program_url: URL of the programme page the session links target.
        page_title: Contents of the generated page's ``<title>``.
        css_url: Stylesheet URL linked from the generated page.
        csv_path: CSV input; default matches ``download_session_data``.
        html_path: HTML output file; default preserves the original name.
    """
    # Download and parse the track page.
    response = requests.get(track_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Track titles appear as <h3><a>...</a></h3>; number them 1..n in
    # document order — assumed to match easychair's track numbering
    # (TODO confirm against the actual tracks page).
    tracks = {}
    i = 1
    for h3 in soup.find_all('h3'):
        a = h3.find('a')
        if a:
            tracks[str(i)] = a.text.strip()
            i += 1

    # Group the CSV rows by track number; skip rows with no id/track.
    sessions = defaultdict(list)
    with open(csv_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # skip header
        for row in reader:
            session_id, session, date, interval, track, title = row
            if session_id != "" and track != "":
                sessions[track].append((date, interval, title, session_id, session))

    # Sort chronologically. BUG FIX: dates are 'dd.mm.yyyy' strings, so a
    # plain lexicographic sort orders by day before month/year (e.g.
    # 01.07.2023 before 30.06.2023). Reversing the split date to
    # (yyyy, mm, dd) yields a correct chronological key; the remaining
    # fields keep the original tie-breaking order.
    for track in sessions:
        sessions[track].sort(key=lambda e: (e[0].split('.')[::-1],) + e[1:])

    # Assemble the HTML page (list + join avoids quadratic string +=).
    parts = ['<html><head>',
             '<meta charset="UTF-8">',
             f'<title>{page_title}</title>',
             f'<link rel="stylesheet" type="text/css" href="{css_url}">',
             '<style>td { padding-right: 20px; }</style>',
             '</head><body>']
    for track in sorted(sessions.keys()):
        # Fall back to the raw track number if tracks.html had no title
        # for it, instead of raising KeyError.
        parts.append(f'<h2>{tracks.get(track, track)}</h2>')
        parts.append('<table style="">')
        for date, interval, title, session_id, session in sessions[track]:
            parts.append(
                f'<tr><td>{date}</td><td>{interval}</td><td>{session}</td>'
                f'<td><a href="{program_url}#session:{session_id}">{title}</a></td></tr>')
        parts.append('</table>')
    parts.append('</body></html>')

    # Write the HTML to a file.
    with open(html_path, 'w', encoding='utf-8') as file:
        file.write(''.join(parts))
# Run as a script: scrape the programme into the CSV, then build the
# per-track HTML overview from it. Guarded so importing this module for
# its functions does not trigger network I/O.
if __name__ == "__main__":
    download_session_data(url)
    create_track_overview(track_url, url, page_title=page_title, css_url=css_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment