Created
June 17, 2021 15:01
-
-
Save blarghmatey/3b75adcdb101e47d10b95ff960022051 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from convert import (read_transcript, parse_srt_transcript, parse_text_transcript, | |
render_srt_transcript_to_vtt, render_text_transcript_to_html) | |
import click | |
from click import Path | |
@click.command()
@click.option('--filename', '-f', type=Path(exists=True), required=True,
              help='Source transcript to be converted')
@click.option('--output', '-o', type=Path(exists=False),
              help='Destination for writing the converted transcript')
def convert_transcript(filename, output=None):
    """Convert a transcript file to its web-friendly counterpart.

    .srt files are rendered as WebVTT (.vtt) and .txt files as an HTML
    fragment (.html). When --output is omitted the result is written next
    to the source file, with only the extension swapped.
    """
    type_map = {'srt': 'vtt', 'txt': 'html'}
    func_map = {'in': {'srt': parse_srt_transcript,
                       'txt': parse_text_transcript},
                'out': {'vtt': render_srt_transcript_to_vtt,
                        'html': render_text_transcript_to_html}}

    # Split on the LAST dot only, so directory or file names that happen to
    # contain the extension text are never rewritten.
    stem, dot, extension = filename.rpartition('.')
    ftype = extension.lower()
    if not dot or ftype not in type_map:
        supported = ', '.join(f'.{ext}' for ext in sorted(type_map))
        raise click.UsageError(
            f'Unsupported transcript type for {filename!r}; '
            f'expected one of: {supported}')
    out_type = type_map[ftype]

    # Read and parse before opening the destination, so a parse failure
    # does not leave an empty output file behind.
    transcript = read_transcript(filename)
    parsed = func_map['in'][ftype](transcript)
    rendered = func_map['out'][out_type](parsed)

    destination = output or f'{stem}.{out_type}'
    with open(destination, 'w', encoding='utf-8') as converted:
        converted.write(rendered)


if __name__ == '__main__':
    # Run the click command only when executed as a script, not on import.
    convert_transcript()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from parsy import decimal_digit, whitespace, seq, test_char, peek, generate, regex | |
from parsy import string as pstring | |
from yattag import Doc, indent | |
from urllib.parse import quote | |
import re | |
import textwrap | |
def create_text_parser():
    """Build the parsy parsers used for plain-text transcripts.

    Returns:
        A ``(section_start, paragraph)`` tuple.

        ``section_start`` yields a dict with ``speaker`` (a list of name
        words) and ``timestamp`` (a list of ints, one per colon-separated
        segment), and consumes the whitespace after the timestamp.

        ``paragraph`` yields all text up to, but not including, the next
        newline; it yields ``''`` when there is nothing to consume.
    """
    # str.isalpha() is already False for whitespace characters, so the
    # alphabetic test alone decides whether a character belongs to a word.
    letter = test_char(lambda ch: ch.isalpha(), 'A non space character')
    word = letter.many().concat()
    name = word.skip(whitespace).many().desc('Speaker Name')

    # One or two digits per segment, each optionally followed by a colon.
    time_segment = decimal_digit.times(1, 2).concat().map(int)
    time_sep = pstring(':')
    timestamp = (
        time_segment.skip(time_sep.optional())
        .many()
        .skip(whitespace)
        .desc('Timestamp')
    )

    section_start = seq(speaker=name, timestamp=timestamp)

    line_char = test_char(lambda ch: ch != '\n', 'Paragraph text')
    paragraph = line_char.many().concat()
    return section_start, paragraph
def create_srt_parser():
    """Build the parsy parsers that together recognise one SRT cue.

    Returns:
        A ``(cue_header, time_range, speech)`` tuple.

        ``cue_header`` yields the cue number as a string of digits and
        consumes the whitespace that follows it.

        ``time_range`` yields a dict with ``start``, ``sep`` and ``end``
        (timestamps in ``HH:MM:SS,mmm`` form around ``' --> '``) and
        consumes the newline that ends the timing line.

        ``speech`` yields a dict with ``name`` (the matched
        ``"Speaker: "`` prefix including the colon and space, or ``None``)
        and ``par`` (the rest of the line).
    """
    cue_header = decimal_digit.at_least(1).skip(whitespace).concat()
    # NOTE(review): the prefix ends at the first ': ' on the line, so cue
    # text such as "Note: ..." is also captured as a speaker name.
    name = regex(r'(.*?): ').optional().desc('Speaker Name')
    timestamp = regex(r'\d{2}:\d{2}:\d{2},\d{3}').desc('Timestamp')
    time_range = seq(start=timestamp, sep=pstring(
        ' --> '), end=timestamp).skip(pstring('\n'))
    paragraph = test_char(lambda c: c != '\n',
                          'Paragraph text').many().concat()
    speech = seq(name=name, par=paragraph)
    return cue_header, time_range, speech
def read_transcript(fpath):
    """Return the full text of the transcript file at *fpath*.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. A leading byte-order mark, if present, is dropped so that it
    cannot end up in front of the first cue number or speaker name.

    Args:
        fpath: Path of the transcript file to read.

    Returns:
        The decoded file contents as a single string.
    """
    # 'utf-8-sig' behaves like 'utf-8' but silently skips a leading BOM.
    with open(fpath, 'r', encoding='utf-8-sig') as source:
        return source.read()
@generate
def text_section():
    """Parse one section of a plain-text transcript.

    A section is a header (speaker name plus timestamp) followed by one or
    more paragraphs. Paragraphs are collected until the text that follows
    is another header, or until no whitespace separates the paragraph from
    what comes next (in practice, the end of the input).

    Returns:
        ``(header, paragraphs)``, where ``header`` is the dict produced by
        the ``section_start`` parser and ``paragraphs`` is a list of
        non-empty paragraph strings.
    """
    section_start, paragraph = create_text_parser()
    # Whitespace between paragraphs; '' when there is none.
    gap = whitespace.optional().map(lambda ws: ws if ws is not None else '')
    # Look ahead for the next header without consuming it.
    next_header = peek(section_start.optional())

    header = yield section_start
    pars = []
    while True:
        par = yield paragraph
        # `paragraph` succeeds with '' when nothing is left on the line
        # (e.g. trailing whitespace at the end of the file); skip those
        # so no empty paragraph reaches the renderer.
        if par:
            pars.append(par)
        ws = yield gap
        upcoming = yield next_header
        if upcoming is not None or not ws:
            break
    return header, pars
@generate
def srt_section():
    """Parse one SRT cue: number line, timing line and one line of text.

    Returns:
        ``(header, timecue, transcript)``: the cue number string, the
        ``start``/``sep``/``end`` timing dict and the ``name``/``par``
        speech dict, as produced by the parsers from
        ``create_srt_parser``.
    """
    cue_header, time_range, speech = create_srt_parser()
    header = yield cue_header
    timecue = yield time_range
    transcript = yield speech
    # The separator after the cue text is optional so that the last cue
    # still parses when the file does not end with a newline.
    yield whitespace.optional()
    return header, timecue, transcript
def parse_text_transcript(transcript):
    """Parse a plain-text transcript into a list of sections.

    Args:
        transcript: The full transcript text.

    Returns:
        A list with one ``text_section`` result per section, in order.
    """
    all_sections = text_section.many()
    return all_sections.parse(transcript)
def parse_srt_transcript(transcript):
    """Parse an SRT transcript into a list of cues.

    Args:
        transcript: The full SRT text.

    Returns:
        A list with one ``srt_section`` result per cue, in order.
    """
    all_cues = srt_section.many()
    return all_cues.parse(transcript)
def render_text_transcript_to_html(parsed_transcript):
    """Render parsed text-transcript sections as a collapsible HTML block.

    The whole transcript is wrapped in a ``<details>`` element. Each
    section becomes an ``<h4>`` holding the speaker name and a link whose
    ``href`` is ``?t=<timestamp>``, followed by one ``<p>`` per paragraph.

    Args:
        parsed_transcript: Iterable of ``(header, paragraphs)`` pairs,
            where ``header`` has ``speaker`` (list of words) and
            ``timestamp`` (list of ints).

    Returns:
        The indented HTML markup as a string.
    """
    doc, tag, text, line = Doc().ttl()
    with tag('details'):
        with tag('summary'):
            text('Click here to read the unedited transcript...')
        for header, paragraphs in parsed_transcript:
            speaker_label = ' '.join(header['speaker']) + ' '
            # Zero-pad every segment, e.g. [1, 5] -> '01:05'.
            timestring = ':'.join(
                f'{segment:02d}' for segment in header['timestamp'])
            link_target = f'?t={quote(timestring)}'
            with tag('h4'):
                text(speaker_label)
                line('a', timestring, href=link_target)
            for paragraph in paragraphs:
                line('p', paragraph)
    markup = doc.getvalue()
    return indent(markup)
def render_srt_transcript_to_vtt(parsed_transcript):
    """Render parsed SRT cues as a WebVTT document.

    Timestamps are converted from the SRT ``HH:MM:SS,mmm`` form to the
    ``HH:MM:SS.mmm`` form. Each cue's text is prefixed with a
    ``<v Speaker>`` voice tag; a cue without its own speaker inherits the
    most recent one, and cues before any speaker is known are emitted
    without a voice tag.

    Args:
        parsed_transcript: Iterable of ``(cue_header, time_range, speech)``
            triples. ``time_range`` has string values under ``start``,
            ``sep`` and ``end``; ``speech`` has ``name`` (a string such as
            ``'Alice: '``, or ``None``) and ``par`` (the cue text).

    Returns:
        The document as a string: the ``WEBVTT`` header, a blank line,
        then each cue followed by a blank line.
    """
    cues = []
    last_speaker = None
    for cue_header, time_range, speech in parsed_transcript:
        # Carry the previous speaker forward when the cue names none.
        speaker = speech['name'] or last_speaker
        last_speaker = speaker

        # The parsed name still carries its ': ' separator; it may also be
        # None when no speaker has been seen yet.
        label = speaker.strip(': ') if speaker else ''
        voice = f'<v {label}>' if label else ''

        start = time_range['start'].replace(',', '.')
        end = time_range['end'].replace(',', '.')
        cues.append(
            f"{cue_header}\n"
            f"{start}{time_range['sep']}{end}\n"
            f"{voice}{speech['par']}\n"
            "\n"
        )
    return 'WEBVTT\n\n' + ''.join(cues)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment