Last active
August 18, 2019 08:45
-
-
Save lokal-profil/24eee88b065a30b0587545feefa28731 to your computer and use it in GitHub Desktop.
Parser that converts the Wikimania 2019 schedule XML to TSV (the TSV output format may have become outdated since then)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| from xml.etree import ElementTree | |
| import datetime | |
| from collections import defaultdict | |
# strftime pattern used for day titles; a longer alternative is '%A %d %B'.
DAY_TITLE_FORMAT = '%A'

# Template for block identifiers, e.g. saturday-session-block-1.
BLOCK_FORMAT = '{day}-session-block-{num}'

# Template for one tab-separated output row, terminated by a newline.
# Consecutive tabs produce columns that are intentionally left empty.
TSV_FORMAR = (
    '{id}\t{day}\t{title}\t"{abstract}"\t"{persons_list}"\t{persons}'
    '\t{room}\t\t{block}\t{space}\t{start}\t{end}\t{link}'
    '\t\tTRUE\t"{tags}"\t\t\t\n'
)

# Keyed by ISO date; every entry carries an 'index' and the 'HH:MM'
# values listed under 'end_times'.
BLOCK_ENDS = {
    '2019-08-16': {
        'index': 1,
        'end_times': ('12:00', '14:30'),
    },
    '2019-08-17': {
        'index': 2,
        'end_times': ('11:00', '13:00', '15:30'),
    },
    '2019-08-18': {
        'index': 3,
        'end_times': ('11:00', '13:00', '15:30'),
    },
}
def get_next_day(days):
    """
    Get the chronologically earliest day from an iterable of days.

    Every day must provide ``get('date')`` returning a ``YYYY-MM-DD``
    string. If several days share the earliest date, the first one
    encountered is returned. Returns None if ``days`` is empty.
    """
    def parsed_date(day):
        return datetime.datetime.strptime(day.get('date'), '%Y-%m-%d')

    # min() parses each date exactly once and keeps the first of equal
    # candidates, matching a strict "<" comparison.
    return min(days, key=parsed_date, default=None)
def strptime(*args):
    """
    Shorthand for ``datetime.datetime.strptime``.

    All positional arguments are forwarded unchanged and the parsed
    ``datetime`` is returned.
    """
    parse = datetime.datetime.strptime
    return parse(*args)
def build_day_string(day):
    """
    Build the title string for a day.

    The day's ``date`` value (``YYYY-MM-DD``) is parsed and then
    rendered with DAY_TITLE_FORMAT.
    """
    return strptime(day.get('date'), '%Y-%m-%d').strftime(DAY_TITLE_FORMAT)
def build_block_string(day, end, block_data):
    """
    Build the block identifier for something ending at ``end``.

    The block number is the 1-based position of the first entry in
    ``block_data`` that ``end`` does not exceed. If ``end`` is later
    than every entry, the number is one more than the number of entries.
    The result is BLOCK_FORMAT filled with the lower-cased output of
    build_day_string and that number.
    """
    past_last = len(block_data) + 1
    block_num = next(
        (
            position
            for position, limit in enumerate(block_data, 1)
            if end <= limit
        ),
        past_last,
    )
    return BLOCK_FORMAT.format(
        day=build_day_string(day).lower(),
        num=block_num,
    )
def get_next_event(events):
    """
    Get the next event from an iterable of events.

    Events are compared first by start time (``HH:MM`` text of the
    ``start`` sub-element) and then alphabetically by the text of the
    ``title`` sub-element. If several events compare equal, the first
    one encountered is returned. Returns None if ``events`` is empty.
    """
    def sort_key(event):
        start = datetime.datetime.strptime(event.find('start').text, '%H:%M')
        # Tuples compare element by element, so the title only matters
        # when two start times are identical.
        return start, event.find('title').text

    return min(events, key=sort_key, default=None)
def create_csv_item(event, day, block_data):
    """
    Build the TSV row for a single event.

    ``event`` is the event element; its ``title``, ``room``, ``start``
    and ``duration`` sub-elements are required (a missing one raises
    AttributeError). ``abstract``, ``persons``, ``space``,
    ``links/link`` and ``identifiers`` are optional and yield empty
    columns when absent. ``day`` and ``block_data`` are handed on to
    build_day_string and build_block_string.

    Returns TSV_FORMAR filled in with the event's values.
    """
    def optional_text(path):
        """Return the text of the sub-element at path, '' if there is none."""
        element = event.find(path)
        if element is None or element.text is None:
            return ''
        return element.text

    # Presenters without any text would make str.join fail, so drop them.
    presenters = [
        name for name in get_all_presenters(event.find('persons')) if name
    ]

    start = event.find('start').text
    start_time = strptime(start, '%H:%M')
    # The duration is written as HH:MM too, so it is parsed like a clock
    # time and its hour/minute parts are turned into a timedelta.
    duration_time = strptime(event.find('duration').text, '%H:%M')
    end_time = start_time + datetime.timedelta(
        hours=duration_time.hour, minutes=duration_time.minute
    )

    link_element = event.find('links/link')
    identifiers_element = event.find('identifiers')

    parameters = {
        'id': event.attrib.get('id'),
        'day': build_day_string(day),
        'title': event.find('title').text,
        # Line breaks become carriage returns so that the quoted cell
        # stays on a single output row.
        'abstract': optional_text('abstract').replace('\n', '\r'),
        'persons_list': '\r'.join(presenters),
        'persons': ', '.join(presenters),
        'room': event.find('room').text,
        'space': optional_text('space'),
        'start': start,
        'end': end_time.strftime('%H:%M'),
        'block': build_block_string(day, end_time, block_data),
        'link': '' if link_element is None else link_element.get('href', ''),
        # The tags are the attribute names of the identifiers element.
        # Not the right place for this, maybe abuse pathways.
        'tags': (
            '' if identifiers_element is None
            else '\r'.join(identifiers_element.attrib.keys())
        ),
    }
    return TSV_FORMAR.format(**parameters)
def get_all_presenters(persons_element):
    """
    List the text of every ``person`` directly below persons_element.

    Returns an empty list if persons_element is None.
    """
    people = () if persons_element is None else persons_element.findall('person')
    return [person.text for person in people]
def make_block_data():
    """
    Turn BLOCK_ENDS into a lookup of parsed end times.

    Returns a dict that maps the 'index' of every BLOCK_ENDS entry to
    the list of its 'end_times', each parsed with the ``%H:%M`` format.
    """
    return {
        entry.get('index'): [
            strptime(stamp, '%H:%M') for stamp in entry.get('end_times')
        ]
        for entry in BLOCK_ENDS.values()
    }
if __name__ == '__main__':
    # Convert the schedule XML named on the command line into a TSV file
    # with the same name but a .tsv extension.
    import os

    if len(sys.argv) < 2:
        sys.exit('Usage: {} <schedule.xml>'.format(sys.argv[0]))
    in_file = sys.argv[1]
    # splitext only strips the final extension, so paths such as
    # "./schedule.xml" or "my.schedule.xml" keep the rest of their name.
    out_file = '{}.tsv'.format(os.path.splitext(in_file)[0])

    # Parse before opening the output, so that an unreadable input does
    # not leave an empty output file behind.
    root = ElementTree.parse(in_file).getroot()
    block_data = make_block_data()
    days = root.findall('.//day')

    with open(out_file, 'w', encoding='utf-8') as f:
        # Days are consumed in date order; day_index is the 1-based
        # position in that order and selects the entry of block_data.
        day_index = 1
        while days:
            day = get_next_day(days)
            days.remove(day)
            events = day.findall('.//event')
            # Events are written in the order get_next_event picks them.
            while events:
                event = get_next_event(events)
                events.remove(event)
                f.write(
                    create_csv_item(event, day, block_data.get(day_index))
                )
            day_index += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment