Skip to content

Instantly share code, notes, and snippets.

@lokal-profil
Last active August 18, 2019 08:45
Show Gist options
  • Select an option

  • Save lokal-profil/24eee88b065a30b0587545feefa28731 to your computer and use it in GitHub Desktop.

Select an option

Save lokal-profil/24eee88b065a30b0587545feefa28731 to your computer and use it in GitHub Desktop.
Parser converting the Wikimania 2019 schedule XML to TSV (the TSV output format may have become outdated since this was written)
import sys
from xml.etree import ElementTree
import datetime
from collections import defaultdict
# strftime pattern for the "day" column (weekday name only); a longer
# alternative would be '%A %d %B'.
DAY_TITLE_FORMAT = '%A'

# Template for a session-block identifier, e.g. saturday-session-block-1.
BLOCK_FORMAT = '{day}-session-block-{num}'

# Template for one output row: tab-separated columns ending in a newline.
# Columns without a placeholder are emitted empty, and one column is the
# constant TRUE.
# NOTE: the name is a misspelling of TSV_FORMAT; it is kept as-is because
# create_csv_item refers to it under this name.
TSV_FORMAR = (
    '{id}\t{day}\t{title}\t"{abstract}"\t"{persons_list}"\t{persons}\t{room}'
    '\t\t{block}\t{space}\t{start}\t{end}\t{link}'
    '\t\tTRUE\t"{tags}"\t\t\t\n'
)

# Per conference date: 'index' is the key under which make_block_data stores
# the parsed times, and 'end_times' are the HH:MM boundaries that
# build_block_string compares an event's end time against, in order.
BLOCK_ENDS = {
    '2019-08-16': {
        'index': 1,
        'end_times': ('12:00', '14:30'),
    },
    '2019-08-17': {
        'index': 2,
        'end_times': ('11:00', '13:00', '15:30'),
    },
    '2019-08-18': {
        'index': 3,
        'end_times': ('11:00', '13:00', '15:30'),
    },
}
def get_next_day(days):
    """
    Pick the day element with the earliest 'date' attribute.

    Dates are compared as parsed '%Y-%m-%d' values. When two days share a
    date the one appearing first in *days* is kept. Returns None when
    *days* is empty.
    """
    earliest = None
    for candidate in days:
        if earliest is not None:
            earliest_date = strptime(earliest.get('date'), '%Y-%m-%d')
            candidate_date = strptime(candidate.get('date'), '%Y-%m-%d')
            if candidate_date >= earliest_date:
                # Not strictly earlier: keep the current pick.
                continue
        earliest = candidate
    return earliest
def strptime(*args):
    """
    Shorthand for datetime.datetime.strptime.

    All positional arguments (normally the text and its format) are
    forwarded unchanged and the resulting datetime is returned.
    """
    parse = datetime.datetime.strptime
    return parse(*args)
def build_day_string(day):
    """
    Format a day element's 'date' attribute using DAY_TITLE_FORMAT.

    The attribute is parsed as '%Y-%m-%d' and rendered with strftime.
    """
    raw_date = day.get('date')
    parsed = strptime(raw_date, '%Y-%m-%d')
    return parsed.strftime(DAY_TITLE_FORMAT)
def build_block_string(day, end, block_data):
    """
    Build the session-block identifier for an event ending at *end*.

    *block_data* is an ordered sequence of block end times. The block
    number is the 1-based position of the first entry that *end* does not
    exceed; if *end* is later than every entry the number is one past the
    last position. The result is BLOCK_FORMAT filled with the lower-cased
    day string and that number.
    """
    matching_positions = (
        position
        for position, boundary in enumerate(block_data, 1)
        if end <= boundary
    )
    # Fallback when the event ends after all configured boundaries.
    block_num = next(matching_positions, len(block_data) + 1)
    return BLOCK_FORMAT.format(
        day=build_day_string(day).lower(),
        num=block_num,
    )
def get_next_event(events):
    """
    Get the next event from a list of events.

    Events are ordered by start time ('%H:%M') first and by title
    alphabetically second. On a complete tie the event appearing first in
    *events* is kept. Returns None when *events* is empty.
    """
    chosen = None
    for candidate in events:
        if chosen is None:
            chosen = candidate
            continue

        candidate_start = strptime(candidate.find('start').text, '%H:%M')
        chosen_start = strptime(chosen.find('start').text, '%H:%M')
        candidate_title = candidate.find('title').text
        chosen_title = chosen.find('title').text

        if candidate_start > chosen_start:
            continue
        if candidate_start == chosen_start \
                and not candidate_title < chosen_title:
            continue
        chosen = candidate
    return chosen
def create_csv_item(event, day, block_data):
    """
    Render one schedule event as a single row of the TSV output.

    Args:
        event: the event element. Its 'title', 'room', 'start' and
            'duration' children are required ('start' and 'duration' as
            HH:MM text). 'abstract', 'persons', 'space', 'links/link' and
            'identifiers' are treated as optional.
        day: the day element the event belongs to; its 'date' attribute
            provides the day column and the block name.
        block_data: ordered block end times for that day, handed to
            build_block_string.

    Returns:
        The row as a string, produced by TSV_FORMAR (ends with a newline).

    Raises:
        AttributeError: if a required child element is missing.
        ValueError: if 'start' or 'duration' is not in HH:MM form.
    """
    title = event.find('title').text

    # The abstract column used to be filled from <title>. Read the event's
    # own <abstract> instead and only fall back to the title when there is
    # none, so inputs without abstracts produce the same rows as before.
    # NOTE(review): assumes the schedule XML names this child 'abstract' --
    # confirm against the feed.
    abstract_element = event.find('abstract')
    if abstract_element is not None and abstract_element.text:
        abstract = abstract_element.text
    else:
        abstract = title

    presenters = get_all_presenters(event.find('persons'))

    space_element = event.find('space')
    space = space_element.text if space_element is not None else ''

    start = event.find('start').text
    start_time = strptime(start, '%H:%M')
    # The duration is written as HH:MM too, so parse it the same way and
    # turn its hour/minute parts into an offset from the start.
    duration_time = strptime(event.find('duration').text, '%H:%M')
    end_time = start_time + datetime.timedelta(
        hours=duration_time.hour, minutes=duration_time.minute
    )

    # Events without a link or identifiers get an empty column instead of
    # aborting the whole export with an AttributeError.
    link_element = event.find('links/link')
    link = link_element.get('href', '') if link_element is not None else ''

    identifiers_element = event.find('identifiers')
    if identifiers_element is not None:
        # not the right place for this, maybe abuse pathways
        tags = '\r'.join(identifiers_element.attrib.keys())
    else:
        tags = ''

    # Line breaks inside a cell are written as '\r' because '\n' ends the row.
    parameters = {
        'id': event.attrib.get('id'),
        'day': build_day_string(day),
        'title': title,
        'abstract': abstract.replace('\n', '\r'),
        'persons_list': '\r'.join(presenters),
        'persons': ', '.join(presenters),
        'room': event.find('room').text,
        'space': space,
        'start': start,
        'end': end_time.strftime('%H:%M'),
        'block': build_block_string(day, end_time, block_data),
        'link': link,
        'tags': tags,
    }
    return TSV_FORMAR.format(**parameters)
def get_all_presenters(persons_element):
    """
    List the text of every direct 'person' child of *persons_element*.

    Returns an empty list when *persons_element* is None or has no
    'person' children. Order follows the document.
    """
    people = () if persons_element is None else persons_element.findall('person')
    return [person.text for person in people]
def make_block_data():
    """
    Build the lookup of block end times from BLOCK_ENDS.

    Returns a dict mapping each entry's 'index' to the list of its
    'end_times' parsed as '%H:%M'.
    """
    return {
        entry.get('index'): [
            strptime(end_time, '%H:%M') for end_time in entry.get('end_times')
        ]
        for entry in BLOCK_ENDS.values()
    }
if __name__ == '__main__':
    # Script entry point: convert the schedule XML named on the command
    # line into a .tsv file next to it, one row per event, ordered by day
    # and then by start time / title.

    # Local import: only the entry point needs it.
    import os.path

    if len(sys.argv) < 2:
        sys.exit('usage: {} SCHEDULE_XML'.format(sys.argv[0]))
    in_file = sys.argv[1]

    # splitext removes only the final extension. Splitting on every '.'
    # and keeping the first piece truncated names such as './data/x.xml'
    # (to '') or 'a.b.xml' (to 'a').
    out_file = '{}.tsv'.format(os.path.splitext(in_file)[0])

    block_data = make_block_data()

    # Parse before opening the output so that an unreadable or malformed
    # input does not leave an empty .tsv behind.
    root = ElementTree.parse(in_file).getroot()
    days = root.findall('.//day')

    with open(out_file, 'w') as f:
        # block_data is keyed by the 'index' values of BLOCK_ENDS, so the
        # n-th day in date order uses entry n.
        # NOTE(review): a day beyond the configured indexes passes None on
        # to create_csv_item.
        day_index = 1
        while days:
            day = get_next_day(days)
            days.remove(day)
            events = day.findall('.//event')
            while events:
                event = get_next_event(events)
                events.remove(event)
                f.write(
                    create_csv_item(event, day, block_data.get(day_index)))
            day_index += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment