Last active
December 1, 2022 09:55
-
-
Save leifdenby/c5a4e5f8c147d13863c02ba6e0438035 to your computer and use it in GitHub Desktop.
Parser for ECMWF event pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import dateutil.parser | |
import pandas as pd | |
def parse_event_details(url, timeout=30.0):
    """
    Parse event details from an ECMWF event page and return title, timings,
    speaker name, speaker affiliation, talk pdf and video URLs as a
    pandas.DataFrame.

    An event page URL example could be
    https://events.ecmwf.int/event/304/timetable/

    Parameters
    ----------
    url : str
        URL of the event timetable page to download and parse.
    timeout : float, optional
        Seconds to wait for the server before giving up; forwarded to
        ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed table row. ``start_time``, ``end_time`` and
        ``title`` are always set; ``speaker_name``, ``speaker_affiliation``,
        ``talk_pdf_url`` and ``talk_video_url`` are only set for rows where
        the corresponding element exists in the page.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status code.
    ValueError
        If the downloaded page contains no ``<table>`` element.
    """
    # Function-scope import so this block does not depend on edits to the
    # file-level import section.
    from urllib.parse import urljoin

    # Rows carrying any of these CSS classes are not individual entries.
    skip_row_classes = ("day-header", "non-empty-session")
    # Output column name -> CSS class of the <a> element holding the link.
    url_selectors = {
        "talk_pdf_url": "icon-file-pdf",
        "talk_video_url": "icon-link",
    }
    # Output column name -> CSS class that distinguishes the time <span>.
    time_selectors = {
        "start_time": "start-time",
        "end_time": "end-time",
    }

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    table_el = soup.find("table")
    if table_el is None:
        raise ValueError(f"no <table> element found at {url}")

    data = []
    for table_row in table_el.find_all("tr"):
        row_classes = table_row.attrs.get("class", [])
        if any(skip in row_classes for skip in skip_row_classes):
            continue

        entry_data = {}

        time_col = table_row.find(name="td", attrs={"class": "time"})
        for column, time_class in time_selectors.items():
            # Match on the distinguishing class only: passing a *list* of
            # classes to Beautiful Soup matches elements having ANY of them,
            # so ["timetable-time", "end-time"] could return the start span.
            time_el = time_col.find("span", attrs={"class": time_class})
            entry_data[column] = dateutil.parser.parse(
                time_el.attrs["data-time"]
            )

        title_el = table_row.find("div", attrs={"class": "title"})
        entry_data["title"] = title_el.text.strip()

        el_speaker_list = table_row.find(
            "div", attrs={"class": "speaker-list"}
        )
        if el_speaker_list is not None:
            # The name sits in a <span> nested inside a <span> matched by
            # an empty class filter (kept exactly as in the original).
            name_wrapper = el_speaker_list.find("span", attrs={"class": ""})
            entry_data["speaker_name"] = name_wrapper.find("span").text

            # A speaker may be listed without an affiliation; leave the
            # column unset for this row rather than aborting the parse.
            el_affiliation = el_speaker_list.find(
                "span", attrs={"class": "affiliation"}
            )
            if el_affiliation is not None:
                # [1:-1] drops the first and last character -- presumably
                # the parentheses around the affiliation; verify on a page.
                entry_data["speaker_affiliation"] = (
                    el_affiliation.text.strip()[1:-1]
                )

        for item_name, item_class in url_selectors.items():
            el_url = table_row.find("a", attrs={"class": item_class})
            if el_url is not None:
                # Resolve against the page actually served: absolute links
                # pass through unchanged, relative ones get the right host.
                entry_data[item_name] = urljoin(
                    response.url, el_url.attrs["href"]
                )

        data.append(entry_data)

    return pd.DataFrame(data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment