Created
December 13, 2020 00:42
-
-
Save tomwhoiscontrary/8ae91b8ff57aec373574cb61c03f25df to your computer and use it in GitHub Desktop.
A Python script that roughly tabulates the Sci-Fi London 2020 short films programme from a downloaded copy of the webpage.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
import contextlib
import csv
import html.parser
import re
# Splits the flattened text of one table cell into three groups:
#   1. the title: a run of upper-case letters, digits, spaces and ( ) & !
#   2. the parenthesised metadata that follows the title
#   3. everything after the closing parenthesis (used as the description)
text_pattern = re.compile(r'([A-Z ()&!0-9]+) \(([^)]+)\) (.*)')

# Splits the metadata captured above, which looks like
# "Dir: <director>, <country>, <language>, <N> mins", into four groups:
# director, country, language and the running time in minutes (digits only).
meta_pattern = re.compile(r'Dir: ([^,]+),? ([A-Za-z ]+), ([^0-9]+), ([0-9]+) ?mins?.?')
def parse(s, pattern):
    """Match *pattern* at the start of *s* and return its captured groups.

    Args:
        s: The string to examine.
        pattern: A compiled regular expression.

    Returns:
        The tuple of captured groups from ``pattern.match(s)``.

    Raises:
        ValueError: If *pattern* does not match at the start of *s*. The
            message names both the text and the pattern so a failure can be
            traced to the right expression.
    """
    m = pattern.match(s)
    if m is None:
        raise ValueError(f'{s!r} does not match {pattern.pattern!r}')
    return m.groups()
class ShortsParser(html.parser.HTMLParser, contextlib.AbstractContextManager):
    """HTML parser that reports one film record per table row.

    While a row is being read the parser remembers:

    * ``index``   - the number taken from a ``<tr>`` class of the form
      ``row-<n>`` (only the first class name is inspected);
    * ``img_src`` - the ``src`` of the most recent ``<img>`` tag;
    * ``text``    - all character data seen since a ``<td class="column-2">``
      was opened, or ``None`` if no such cell has been opened in this row.

    When a ``</tr>`` is reached the collected text is split with
    ``text_pattern`` and ``meta_pattern`` and the result is passed to the
    ``output`` callable as ``output(index, img_src, title, director,
    country, language, length, description)``.

    The parser is a context manager; leaving the ``with`` block calls
    ``close()``.
    """

    def __init__(self, output):
        """Create a parser that hands each parsed film to *output*."""
        super().__init__()
        self.output = output
        # Initialise all per-row state so that no handler can hit a missing
        # attribute, whatever order the tags arrive in.
        self.index = None
        self.img_src = None
        self.text = None

    def handle_starttag(self, tag, attrs_list):
        """Record the row index, image source, or start of the text cell."""
        attrs = dict(attrs_list)
        # Tags without a class attribute (or with an empty one) are common;
        # treat them as having an empty class instead of raising KeyError.
        css_class = attrs.get('class') or ''
        if tag == 'tr' and css_class.startswith('row-'):
            self.index = int(css_class.split(' ')[0].split('-')[1])
        elif tag == 'img':
            # Ignore images that carry no src attribute.
            if 'src' in attrs:
                self.img_src = attrs['src']
        elif tag == 'td' and css_class == 'column-2':
            self.text = ''

    def handle_data(self, data):
        """Append character data to the text cell, if one has been opened."""
        if self.text is not None:
            self.text += data

    def handle_endtag(self, tag):
        """At the end of a row, parse the collected text and emit the film.

        Raises:
            ValueError: If the collected text or its metadata does not match
                the expected layout (raised by ``parse``).
        """
        if tag != 'tr':
            return
        # Take the collected text and reset it, so that a later row without a
        # column-2 cell cannot re-emit this row's text.
        raw_text, self.text = self.text, None
        if raw_text is None:
            # No column-2 cell in this row (e.g. a header row): nothing to emit.
            return
        # Collapse every run of whitespace (including newlines) to one space.
        text = ' '.join(raw_text.split())
        title_upper, meta, description = parse(text, text_pattern)
        title = title_upper.title()
        director, country, language, length_str = parse(meta, meta_pattern)
        length = int(length_str)
        self.output(self.index, self.img_src, title, director, country, language, length, description)

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the parser on leaving the ``with`` block."""
        self.close()
# Read shorts.html from the current directory and write one CSV row per film
# to shorts.csv. The encoding is given explicitly so the result does not
# depend on the platform's locale default.
# NOTE(review): assumes the saved page is UTF-8 - confirm against the download.
with open('shorts.csv', 'w', newline='', encoding='utf-8') as out_file:
    out = csv.writer(out_file)
    out.writerow(('index', 'image', 'title', 'director', 'country', 'language', 'length', 'description'))

    def write_film(index, img_src, title, director, country, language, length, description):
        """Print progress for one film and append it to the CSV file."""
        print(index, title)
        # The image column is written as a spreadsheet =IMAGE("...") formula
        # rather than as a bare URL.
        out.writerow((str(index), f'=IMAGE("{img_src}")', title, director, country, language, str(length), description))

    with ShortsParser(write_film) as parser:
        with open('shorts.html', encoding='utf-8') as in_file:
            # Feed the page line by line; the parser buffers incomplete tags.
            for line in in_file:
                parser.feed(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment