Parse an EasyChair conference programme to create a separate track overview. This requires a consistent naming scheme for the session headings (see the example below) and conference-specific URLs, so the script cannot be used as-is.
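For illustration, the heading regex in the script expects session headings of the form "<interval> Session <session>: Track <number>: <title>", for example (hypothetical): "09:00-10:30 Session 1A: Track 2: Computational Legal Studies".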
import csv
import re
from collections import defaultdict
from datetime import datetime

import dateparser
import requests
from bs4 import BeautifulSoup

url = "https://easychair.org/smart-program/<conference_name>/program.html"
track_url = "https://easychair.org/smart-program/<conference_name>/de_tracks.html"
css_url = "https://easychair.org/smart-program/<conference_name>/program.css"
page_title = "Conference Name - Tracks and Sessions"
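# NB: replace <conference_name> in the URLs above with the conference's
# EasyChair identifier before running.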

def download_session_data(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # Initialize CSV writer
    with open('conference_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["session_id", "session", "date", "interval", "track", "title"])
        # Initialize date
        date = None
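        # The programme page interleaves date headers and session blocks, so the
        # most recently seen date applies to all sessions that follow it.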
        for div in soup.find_all('div'):
            # Check if div contains date
            if div.get('class') == ['date']:
                # Guard against dateparser returning None on unparseable text
                parsed = dateparser.parse(div.text)
                if parsed:
                    date = parsed.strftime('%d.%m.%Y')
            # Check if div is a session
            elif div.get('class') and 'session' in div.get('class'):
                session_id = div.find('a').get('name').replace('session:', '')
                heading = div.find('div', class_='heading').text
                matches = re.search(r'(.+)\s*Session\s*(\w+)\s*:\s*Track\s*(\w+):\s*(.+)', heading)
                if matches:
                    interval, session, track, title = matches.groups()
                    writer.writerow([session_id, session, date, interval.strip(), track, title])
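
# After download_session_data() runs, conference_data.csv contains one row per
# parsed session, e.g. (hypothetical values):
#   session_id,session,date,interval,track,title
#   23,1A,09.11.2023,09:00-10:30,2,Computational Legal Studies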

def create_track_overview(track_url, program_url, page_title, css_url):
    # Download and parse the track page
    response = requests.get(track_url)
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    # Extract track titles
    tracks = {}
    i = 1
    for h3 in soup.find_all('h3'):
        a = h3.find('a')
        if a:
            tracks[str(i)] = a.text.strip()
            i += 1
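    # Note: tracks are keyed by their position on the tracks page; this assumes
    # the "Track <n>" numbers in the session headings follow the same order.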
    # Parse the CSV file
    sessions = defaultdict(list)
    with open('conference_data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip header
        for row in reader:
            session_id, session, date, interval, track, title = row
            if session_id and track and date:
                sessions[track].append((date, interval, title, session_id, session))
    # Sort sessions chronologically; the dd.mm.yyyy date strings would not sort
    # correctly as plain text across month boundaries
    for track in sessions:
        sessions[track].sort(key=lambda s: (datetime.strptime(s[0], '%d.%m.%Y'), s[1]))
    # Create the HTML page
    html = '<html><head>'
    html += '<meta charset="UTF-8">'
    html += f'<title>{page_title}</title>'
    html += f'<link rel="stylesheet" type="text/css" href="{css_url}">'
    html += '<style>td { padding-right: 20px; }</style>'
    html += '</head><body>'
    # Sort track keys numerically so that e.g. track 10 comes after track 2
    for track in sorted(sessions.keys(), key=int):
        session_list = sessions[track]
        # Fall back to the raw track number if no title was found for it
        html += f'<h2>{tracks.get(track, track)}</h2>'
        html += '<table>'
        for date, interval, title, session_id, session in session_list:
            html += f'<tr><td>{date}</td><td>{interval}</td><td>{session}</td><td><a href="{program_url}#session:{session_id}">{title}</a></td></tr>'
        html += '</table>'
    html += '</body></html>'
    # Write the HTML to a file
    with open('sessions_by_track.html', 'w', encoding='utf-8') as file:
        file.write(html)
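
# Run both steps: scrape the programme into conference_data.csv, then build
# sessions_by_track.html from it.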
download_session_data(url)
create_track_overview(track_url, url, page_title=page_title, css_url=css_url)

#%%