Skip to content

Instantly share code, notes, and snippets.

@cboulanger
Created November 9, 2023 10:13
Show Gist options
  • Save cboulanger/dc369a8f29ec0c46f4e2ef764d749807 to your computer and use it in GitHub Desktop.
Parse an EasyChair conference programme to create a separate track overview. This requires a consistent naming scheme for the sessions. Cannot be used as-is.
import dateparser
import re
import requests
from bs4 import BeautifulSoup
import csv
from collections import defaultdict
# Programme overview page of the EasyChair "smart program" (replace <conference_name>).
url = "https://easychair.org/smart-program/<conference_name>/program.html"
# Page listing the conference tracks; used to map track numbers to track titles.
track_url = "https://easychair.org/smart-program/<conference_name>/de_tracks.html"
# Stylesheet of the original programme, reused by the generated overview page.
css_url = "https://easychair.org/smart-program/<conference_name>/program.css"
# <title> of the generated HTML overview page.
page_title = "Conference Name - Tracks und Sessions"
def download_session_data(url):
    """Download the EasyChair programme page and dump session rows to conference_data.csv.

    Walks the programme HTML in document order, remembering the most recent
    'date' div so that every session div that follows is tagged with the day
    it occurs on. Requires the session headings to follow the naming scheme
    "<interval> Session <id>: Track <n>: <title>"; headings that do not match
    are silently skipped.

    :param url: URL of the EasyChair smart-program page (program.html)
    :raises requests.HTTPError: if the page cannot be fetched
    """
    # Fail fast on network problems instead of hanging or parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    with open('conference_data.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["session_id", "session", "date", "interval", "track", "title"])
        # Carries the last seen date down to the session divs that follow it.
        date = None
        for div in soup.find_all('div'):
            if div.get('class') == ['date']:
                # A new conference day starts; remember it for subsequent sessions.
                date = dateparser.parse(div.text).strftime('%d.%m.%Y')
            elif div.get('class') and 'session' in div.get('class'):
                # EasyChair anchors sessions as <a name="session:<id>">.
                session_id = div.find('a').get('name').replace('session:', '')
                heading = div.find('div', class_='heading').text
                # Expected heading scheme: "<interval> Session <id>: Track <n>: <title>"
                matches = re.search(r'(.+)\s*Session\s*(\w+)\s*:\s*Track\s*(\w+):\s*(.+)', heading)
                if matches:
                    interval, session, track, title = matches.groups()
                    writer.writerow([session_id, session, date, interval.strip(), track, title])
def create_track_overview(track_url, program_url, page_title, css_url):
    """Build sessions_by_track.html from the track page and conference_data.csv.

    Reads the track titles from the EasyChair tracks page (numbered in document
    order, matching the track numbers used in the session headings), groups the
    session rows written by download_session_data() by track, and writes one
    HTML table per track with links back into the programme page.

    :param track_url: URL of the EasyChair tracks page (de_tracks.html)
    :param program_url: URL of the programme page, used as link target
    :param page_title: contents of the generated page's <title>
    :param css_url: stylesheet URL referenced by the generated page
    :raises requests.HTTPError: if the tracks page cannot be fetched
    """
    # Download and parse the track page; fail fast on network problems.
    response = requests.get(track_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')

    # Extract track titles: tracks are numbered 1..n in document order,
    # counting only <h3> elements that actually contain an anchor.
    tracks = {}
    anchors = (h3.find('a') for h3 in soup.find_all('h3'))
    for i, a in enumerate((a for a in anchors if a), start=1):
        tracks[str(i)] = a.text.strip()

    # Group the CSV rows (written by download_session_data) by track number.
    sessions = defaultdict(list)
    with open('conference_data.csv', 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # skip header
        for row in reader:
            session_id, session, date, interval, track, title = row
            if session_id != "" and track != "":
                sessions[track].append((date, interval, title, session_id, session))

    # Sort sessions chronologically. BUG FIX: the dates are 'dd.mm.yyyy'
    # strings, so a plain sort would order by day-of-month first; sort on a
    # (year, month, day, interval) key instead.
    def _chronological(entry):
        day, month, year = entry[0].split('.')
        return (year, month, day, entry[1])

    for track in sessions:
        sessions[track].sort(key=_chronological)

    # Assemble the HTML page from parts and join once at the end.
    parts = [
        '<html><head>',
        '<meta charset="UTF-8">',
        f'<title>{page_title}</title>',
        f'<link rel="stylesheet" type="text/css" href="{css_url}">',
        '<style>td { padding-right: 20px; }</style>',
        '</head><body>',
    ]
    for track in sorted(sessions.keys()):
        # Fall back to the raw track number if the tracks page did not list it,
        # instead of crashing on an inconsistent numbering.
        parts.append(f'<h2>{tracks.get(track, f"Track {track}")}</h2>')
        parts.append('<table style="">')
        for date, interval, title, session_id, session in sessions[track]:
            parts.append(
                f'<tr><td>{date}</td><td>{interval}</td><td>{session}</td>'
                f'<td><a href="{program_url}#session:{session_id}">{title}</a></td></tr>'
            )
        parts.append('</table>')
    parts.append('</body></html>')

    # Write the HTML to a file.
    with open('sessions_by_track.html', 'w', encoding='utf-8') as file:
        file.write(''.join(parts))
# Step 1: scrape the programme page and write the session rows to conference_data.csv.
download_session_data(url)
# Step 2: read the CSV back, fetch the track titles, and generate sessions_by_track.html.
create_track_overview(track_url, url, page_title= page_title, css_url=css_url)
#%%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment