Skip to content

Instantly share code, notes, and snippets.

@Pokechu22
Created May 28, 2025 05:26
Show Gist options
  • Save Pokechu22/3e2cbee501b6f7487335ad35bb54d588 to your computer and use it in GitHub Desktop.
Save Pokechu22/3e2cbee501b6f7487335ad35bb54d588 to your computer and use it in GitHub Desktop.
Split WebVTT cues that use cue timestamps (https://www.w3.org/TR/webvtt1/#webvtt-cue-timestamp) into multiple cues for VLC
# SPDX-License-Identifier: CC0-1.0
# Splits WebVTT cues that use cue timestamps (https://www.w3.org/TR/webvtt1/#webvtt-cue-timestamp) into multiple cues
# This is mainly relevant for auto-generated YouTube captions, which generally aren't split on sentence boundaries, so showing the whole cue at once produces bad results
import webvtt # https://github.com/glut23/webvtt-py/ - pip install webvtt-py
import sys
import os
def split_cue_text(raw_text, start, end, is_timestamp):
    """Split one cue's raw text at its inline cue timestamp tags.

    The text is revealed cumulatively: each returned piece contains all of
    the text that precedes the timestamp ending it, and the final piece
    contains the whole text (with the timestamp tags removed and every other
    tag kept verbatim).

    Args:
        raw_text: Cue text, possibly containing ``<...>`` tags.
        start: Start time of the original cue.
        end: End time of the original cue.
        is_timestamp: Callable given the text between ``<`` and ``>``;
            a truthy result means the tag is a cue timestamp.

    Returns:
        A list of ``(start, end, text)`` tuples in display order. There is
        always at least one tuple; its last element ends at ``end``.
    """
    pieces = []
    # Whatever precedes the first '<' is plain text and can never be a tag,
    # even if it happens to contain a '>'.
    head, *segments = raw_text.split('<')
    shown = head
    for segment in segments:
        # partition() splits on the first '>' only, so additional '>'
        # characters later in the segment stay part of the text.
        tag_data, closer, rest = segment.partition('>')
        if closer and is_timestamp(tag_data):
            # Close the piece shown so far at this timestamp; the next
            # piece starts there.
            pieces.append((start, tag_data, shown))
            start = tag_data
            shown += rest
        else:
            # Not a timestamp tag (or a lone '<' with no closing '>'):
            # restore the '<' consumed by split() and keep it unchanged.
            shown += '<' + segment
    pieces.append((start, end, shown))
    return pieces


def main(argv):
    """Rewrite a WebVTT file so cues with inline timestamps become several cues.

    Args:
        argv: Command-line arguments including the program name. With two
            file arguments, the first is read and the second is written.
            With one, that file is renamed to ``<name>.old``, which is then
            read, and the result is written under the original name.
    """
    if len(argv) > 2:
        in_file = argv[1]
        out_file = argv[2]
    elif len(argv) == 2:
        out_file = argv[1]
        in_file = out_file + '.old'
        os.rename(out_file, in_file)
    else:
        sys.exit('usage: IN_FILE OUT_FILE, or a single FILE to convert in place')

    caption_data = webvtt.read(in_file)
    old_captions = list(caption_data.captions)
    caption_data.captions.clear()
    is_timestamp = webvtt.models.Timestamp.PATTERN.fullmatch
    for caption in old_captions:
        if '<' not in caption.raw_text:
            # No tags at all: keep the original cue object untouched.
            caption_data.captions.append(caption)
            continue
        pieces = split_cue_text(caption.raw_text, caption.start, caption.end, is_timestamp)
        for piece_start, piece_end, piece_text in pieces:
            caption_data.captions.append(
                webvtt.models.Caption(piece_start, piece_end, piece_text, caption.identifier)
            )
    caption_data.save(out_file)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment