tomwhoiscontrary · December 13, 2020 00:42
diff --git a/summarise-shorts.py b/summarise-shorts.py
 #! /usr/bin/env python3

 import html.parser
 import contextlib
 import re
 import csv

 # Splits a film's whitespace-collapsed cell text: "TITLE (metadata) description".
 # Groups: (1) the title, made only of capitals, digits, spaces and ()&!;
 # (2) the contents of a parenthesised block containing no ')';
 # (3) everything after that block.
 text_pattern = re.compile(r'([A-Z ()&!0-9]+) \(([^)]+)\) (.*)')
 # Splits the metadata block: "Dir: <director>,? <country>, <language>, <N> min(s)".
 # Groups: (1) director, (2) country, (3) language, (4) the length in minutes as digits.
 meta_pattern = re.compile(r'Dir: ([^,]+),? ([A-Za-z ]+), ([^0-9]+), ([0-9]+) ?mins?.?')

 def parse(s, pattern):
 	"""Match *s* against *pattern* from its start and return the captured groups.

 	Raises ValueError carrying *s* when the pattern does not match.
 	"""
 	found = pattern.match(s)
 	if found is None:
 		raise ValueError(s)
 	return found.groups()

 class ShortsParser(html.parser.HTMLParser, contextlib.AbstractContextManager):
 	"""HTML parser that reports one film for each table row it finishes.

 	While a row streams past, the parser remembers the row number taken from a
 	``row-N`` class on the ``<tr>``, the ``src`` of the most recent ``<img>``,
 	and all character data seen from ``<td class="column-2">`` up to the
 	closing ``</tr>``. At ``</tr>`` the text is split into title, metadata and
 	description and handed to ``output(index, img_src, title, director,
 	country, language, length, description)``.

 	Used as a context manager, the parser is closed on exit.
 	"""

 	def __init__(self, output):
 		"""Create a parser that calls *output* once per parsed film row."""
 		super().__init__()
 		self.output = output
 		# None means "not inside a film's text cell"; self.index and
 		# self.img_src are created when their tags are first seen.
 		self.text = None

 	def handle_starttag(self, tag, attrs_list):
 		"""Record the row index, image source, or the start of the text cell."""
 		attrs = dict(attrs_list)
 		# An attribute may be absent or valueless (None); treat both as empty
 		# instead of failing with KeyError on unrelated tags.
 		css_class = attrs.get('class') or ''
 		if tag == 'tr' and css_class.startswith('row-'):
 			# 'row-12 odd' -> 12
 			self.index = int(css_class.split(' ')[0].split('-')[1])
 		elif tag == 'img' and attrs.get('src'):
 			self.img_src = attrs['src']
 		elif tag == 'td' and css_class == 'column-2':
 			self.text = ''

 	def handle_data(self, data):
 		"""Append character data to the current film text, if one is open."""
 		if self.text is not None:
 			self.text += data

 	def handle_endtag(self, tag):
 		"""On ``</tr>``, parse the collected text and emit the film.

 		Rows that never opened a ``column-2`` cell are skipped. Raises
 		ValueError (from ``parse``) when the text or its metadata does not
 		fit the expected layout.
 		"""
 		if tag != 'tr' or self.text is None:
 			return
 		# Collapse all runs of whitespace, then clear the buffer so this row's
 		# text cannot leak into a later row that lacks its own text cell.
 		text = ' '.join(self.text.split())
 		self.text = None
 		title_upper, meta, description = parse(text, text_pattern)
 		title = title_upper.title()
 		director, country, language, length_str = parse(meta, meta_pattern)
 		length = int(length_str)

 		self.output(self.index, self.img_src, title, director, country, language, length, description)

 	def __exit__(self, exc_type, exc_value, traceback):
 		"""Close the parser when the ``with`` block ends; exceptions propagate."""
 		self.close()


 # Convert shorts.html into shorts.csv, one CSV row per film reported by ShortsParser.
 # Both files are opened as UTF-8 explicitly so the result does not depend on the
 # platform's default text encoding.
 with open('shorts.csv', 'w', newline='', encoding='utf-8') as out_file:
 	out = csv.writer(out_file)
 	out.writerow(('index', 'image', 'title', 'director', 'country', 'language', 'length', 'description'))

 	def write_film(index, img_src, title, director, country, language, length, description):
 		"""Echo progress to stdout and append one film as a CSV row."""
 		print(index, title)
 		# The image column is written as an =IMAGE("...") formula, presumably so a
 		# spreadsheet shows the picture itself rather than its URL.
 		out.writerow((str(index), f'=IMAGE("{img_src}")', title, director, country, language, str(length), description))

 	with ShortsParser(write_film) as parser:
 		with open('shorts.html', encoding='utf-8') as in_file:
 			# Feed the document line by line; the parser buffers incomplete markup.
 			for line in in_file:
 				parser.feed(line)
	#! /usr/bin/env python3

	import html.parser
	import contextlib
	import re
	import csv

	# Splits a film's whitespace-collapsed cell text: "TITLE (metadata) description".
	# Groups: (1) the title, made only of capitals, digits, spaces and ()&!;
	# (2) the contents of a parenthesised block containing no ')';
	# (3) everything after that block.
	text_pattern = re.compile(r'([A-Z ()&!0-9]+) \(([^)]+)\) (.*)')
	# Splits the metadata block: "Dir: <director>,? <country>, <language>, <N> min(s)".
	# Groups: (1) director, (2) country, (3) language, (4) the length in minutes as digits.
	meta_pattern = re.compile(r'Dir: ([^,]+),? ([A-Za-z ]+), ([^0-9]+), ([0-9]+) ?mins?.?')

	def parse(s, pattern):
		"""Match *s* against *pattern* from its start and return the captured groups.

		Raises ValueError carrying *s* when the pattern does not match.
		"""
		found = pattern.match(s)
		if found is None:
			raise ValueError(s)
		return found.groups()

	class ShortsParser(html.parser.HTMLParser, contextlib.AbstractContextManager):
		"""HTML parser that reports one film for each table row it finishes.

		While a row streams past, the parser remembers the row number taken from a
		``row-N`` class on the ``<tr>``, the ``src`` of the most recent ``<img>``,
		and all character data seen from ``<td class="column-2">`` up to the
		closing ``</tr>``. At ``</tr>`` the text is split into title, metadata and
		description and handed to ``output(index, img_src, title, director,
		country, language, length, description)``.

		Used as a context manager, the parser is closed on exit.
		"""

		def __init__(self, output):
			"""Create a parser that calls *output* once per parsed film row."""
			super().__init__()
			self.output = output
			# None means "not inside a film's text cell"; self.index and
			# self.img_src are created when their tags are first seen.
			self.text = None

		def handle_starttag(self, tag, attrs_list):
			"""Record the row index, image source, or the start of the text cell."""
			attrs = dict(attrs_list)
			# An attribute may be absent or valueless (None); treat both as empty
			# instead of failing with KeyError on unrelated tags.
			css_class = attrs.get('class') or ''
			if tag == 'tr' and css_class.startswith('row-'):
				# 'row-12 odd' -> 12
				self.index = int(css_class.split(' ')[0].split('-')[1])
			elif tag == 'img' and attrs.get('src'):
				self.img_src = attrs['src']
			elif tag == 'td' and css_class == 'column-2':
				self.text = ''

		def handle_data(self, data):
			"""Append character data to the current film text, if one is open."""
			if self.text is not None:
				self.text += data

		def handle_endtag(self, tag):
			"""On ``</tr>``, parse the collected text and emit the film.

			Rows that never opened a ``column-2`` cell are skipped. Raises
			ValueError (from ``parse``) when the text or its metadata does not
			fit the expected layout.
			"""
			if tag != 'tr' or self.text is None:
				return
			# Collapse all runs of whitespace, then clear the buffer so this row's
			# text cannot leak into a later row that lacks its own text cell.
			text = ' '.join(self.text.split())
			self.text = None
			title_upper, meta, description = parse(text, text_pattern)
			title = title_upper.title()
			director, country, language, length_str = parse(meta, meta_pattern)
			length = int(length_str)

			self.output(self.index, self.img_src, title, director, country, language, length, description)

		def __exit__(self, exc_type, exc_value, traceback):
			"""Close the parser when the ``with`` block ends; exceptions propagate."""
			self.close()


	# Convert shorts.html into shorts.csv, one CSV row per film reported by ShortsParser.
	# Both files are opened as UTF-8 explicitly so the result does not depend on the
	# platform's default text encoding.
	with open('shorts.csv', 'w', newline='', encoding='utf-8') as out_file:
		out = csv.writer(out_file)
		out.writerow(('index', 'image', 'title', 'director', 'country', 'language', 'length', 'description'))

		def write_film(index, img_src, title, director, country, language, length, description):
			"""Echo progress to stdout and append one film as a CSV row."""
			print(index, title)
			# The image column is written as an =IMAGE("...") formula, presumably so a
			# spreadsheet shows the picture itself rather than its URL.
			out.writerow((str(index), f'=IMAGE("{img_src}")', title, director, country, language, str(length), description))

		with ShortsParser(write_film) as parser:
			with open('shorts.html', encoding='utf-8') as in_file:
				# Feed the document line by line; the parser buffers incomplete markup.
				for line in in_file:
					parser.feed(line)