Skip to content

Instantly share code, notes, and snippets.

@tomwhoiscontrary
Created December 13, 2020 00:42
Show Gist options
  • Save tomwhoiscontrary/8ae91b8ff57aec373574cb61c03f25df to your computer and use it in GitHub Desktop.
Save tomwhoiscontrary/8ae91b8ff57aec373574cb61c03f25df to your computer and use it in GitHub Desktop.
A Python script to roughly tabulate the Sci-Fi London 2020 short films programme from a downloaded copy of the webpage.
#! /usr/bin/env python3
import html.parser
import contextlib
import re
import csv
# Splits the whitespace-normalised text of a table cell into three groups:
#   1. the title: a run of capitals, digits, spaces and the characters ( ) & !
#   2. the parenthesised metadata that follows it (anything up to the first ')')
#   3. everything after the metadata, used as the description
text_pattern = re.compile(r'([A-Z ()&!0-9]+) \(([^)]+)\) (.*)')
# Splits the metadata captured above into four groups:
#   1. director  - text after 'Dir: ' up to the first comma (the comma itself is optional)
#   2. country   - letters and spaces only
#   3. language  - anything that contains no digits
#   4. length    - digits, followed by 'min' or 'mins' and at most one trailing character
meta_pattern = re.compile(r'Dir: ([^,]+),? ([A-Za-z ]+), ([^0-9]+), ([0-9]+) ?mins?.?')
def parse(s, pattern):
    """Match *s* against *pattern* from its start and return the captured groups.

    Raises:
        ValueError: carrying *s* as its argument when the pattern does not match.
    """
    found = pattern.match(s)
    if found is None:
        raise ValueError(s)
    return found.groups()
class ShortsParser(html.parser.HTMLParser, contextlib.AbstractContextManager):
    """Streaming HTML parser that reports one film per table row.

    A film row is a ``<tr>`` whose ``class`` starts with ``row-`` and that
    contains a ``<td class="column-2">`` cell.  When such a row closes, the
    collected cell text is split with ``text_pattern`` / ``meta_pattern`` and
    ``output`` is called as::

        output(index, img_src, title, director, country, language, length, description)

    Rows without a ``row-N`` class or without a ``column-2`` cell (header
    rows, for example) are skipped instead of crashing or re-emitting data
    left over from an earlier row.

    Used as a context manager, the parser is closed on exit so that any
    buffered input is processed.
    """

    def __init__(self, output):
        """Create a parser that hands each parsed film to *output*."""
        super().__init__()
        self.output = output
        self._reset_row()

    def _reset_row(self):
        """Drop everything collected for the current row."""
        self.index = None    # number taken from the row's "row-N" class
        self.img_src = ''    # src of the last <img> seen in the row
        self.text = None     # None means "not capturing text"

    def handle_starttag(self, tag, attrs_list):
        """Track row boundaries, image sources and the start of the text cell."""
        attrs = dict(attrs_list)
        # The attribute may be absent, or present without a value (None).
        css_class = attrs.get('class') or ''
        if tag == 'tr':
            # A new row must never inherit state from the previous one.
            self._reset_row()
            if css_class.startswith('row-'):
                self.index = int(css_class.split(' ')[0].split('-')[1])
        elif tag == 'img':
            src = attrs.get('src')
            if src:
                self.img_src = src
        elif tag == 'td' and css_class == 'column-2':
            self.text = ''

    def handle_data(self, data):
        """Append character data while inside (or after) the text cell of a row."""
        if self.text is not None:
            self.text += data

    def handle_endtag(self, tag):
        """Parse and emit the film once its row closes.

        Raises:
            ValueError: if the row text or its metadata does not have the
                expected layout; the offending text is the exception argument.
        """
        if tag != 'tr':
            return
        index, img_src, raw_text = self.index, self.img_src, self.text
        # Reset first so a parse failure cannot leak state into the next row.
        self._reset_row()
        if index is None or raw_text is None:
            return  # not a film row
        # Collapse all runs of whitespace (including newlines) to single spaces.
        text = ' '.join(raw_text.split())
        title_upper, meta, description = parse(text, text_pattern)
        director, country, language, length_str = parse(meta, meta_pattern)
        self.output(index, img_src, title_upper.title(), director, country,
                    language, int(length_str), description)

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the parser; exceptions from the ``with`` body are not suppressed."""
        self.close()
# Convert the downloaded programme page (shorts.html) into shorts.csv, one
# row per film.  Both files are opened explicitly as UTF-8 so that
# non-ASCII names are not mangled by, or rejected under, the platform's
# default encoding.
with open('shorts.csv', 'w', newline='', encoding='utf-8') as out_file:
    out = csv.writer(out_file)
    out.writerow(('index', 'image', 'title', 'director', 'country', 'language', 'length', 'description'))

    def write_film(index, img_src, title, director, country, language, length, description):
        """Echo progress to stdout and append the film as one CSV row."""
        print(index, title)
        # The image column is written as a spreadsheet =IMAGE("...") formula.
        out.writerow((str(index), f'=IMAGE("{img_src}")', title, director, country, language, str(length), description))

    # The parser is the outer context so it is closed (flushing any buffered
    # input) after the input file, while the CSV file is still open.
    with ShortsParser(write_film) as parser:
        with open('shorts.html', encoding='utf-8') as in_file:
            for line in in_file:
                parser.feed(line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment