Created
May 28, 2025 05:26
-
-
Save Pokechu22/3e2cbee501b6f7487335ad35bb54d588 to your computer and use it in GitHub Desktop.
Split WebVTT cues that use cue timestamps (https://www.w3.org/TR/webvtt1/#webvtt-cue-timestamp) into multiple cues for VLC
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-License-Identifier: CC0-1.0
# Splits WebVTT cues that use cue timestamps (https://www.w3.org/TR/webvtt1/#webvtt-cue-timestamp) into multiple cues.
# This is mainly relevant for auto-generated YouTube captions, which generally aren't split on sentence boundaries, so showing the whole cue at once produces bad results.
import webvtt # https://github.com/glut23/webvtt-py/ - pip install webvtt-py | |
import sys | |
import os | |
def split_cue_text(raw_text, start, end, is_timestamp):
    """Split one cue's raw text at its inline cue-timestamp tags.

    Each ``<...>`` tag whose content satisfies ``is_timestamp`` ends the
    current segment and starts a new one at that timestamp.  The text is
    cumulative: every segment carries all text seen so far, so later
    segments extend earlier ones.  Any other tag (for example ``<c>``) is
    copied through to the text unchanged.

    Args:
        raw_text: The cue text, including its markup.
        start: Start timestamp of the original cue.
        end: End timestamp of the original cue.
        is_timestamp: Callable taking a tag's content (without the angle
            brackets) and returning a truthy value when it is a timestamp.

    Returns:
        A list of ``(start, end, text)`` tuples, in order.  The last tuple
        always ends at ``end``.
    """
    # Whatever precedes the first '<' is plain text and cannot be a tag,
    # even if it happens to contain a '>'.
    head, *tagged_parts = raw_text.split('<')
    text = head
    segments = []
    for part in tagged_parts:
        # Cut at the FIRST '>' only, so a later '>' in the text stays text
        # instead of breaking the unpacking.
        tag, closer, rest = part.partition('>')
        if not closer:
            # A '<' that never closes is not a tag; keep it verbatim.
            text += '<' + part
        elif is_timestamp(tag):
            segments.append((start, tag, text))
            start = tag
            text += rest
        else:
            text += '<' + tag + '>' + rest
    segments.append((start, end, text))
    return segments


def main(argv):
    """Rewrite a WebVTT file so that cue timestamps become separate cues.

    Usage:
        script IN_FILE OUT_FILE   read IN_FILE, write OUT_FILE
        script FILE               rename FILE to FILE.old, then write FILE

    Args:
        argv: The command line, with the program name at index 0.
    """
    if len(argv) < 2:
        sys.exit('usage: {} IN_FILE [OUT_FILE]'.format(argv[0] if argv else 'script'))

    if len(argv) > 2:
        in_file = argv[1]
        out_file = argv[2]
    else:
        # In-place mode: keep the original next to the output as a backup.
        out_file = argv[1]
        in_file = out_file + '.old'
        os.rename(out_file, in_file)

    caption_data = webvtt.read(in_file)
    # Copy the cues first; the list on caption_data is rebuilt below.
    old_captions = list(caption_data.captions)
    caption_data.captions.clear()

    is_timestamp = webvtt.models.Timestamp.PATTERN.fullmatch
    for caption in old_captions:
        if '<' not in caption.raw_text:
            # No markup at all: keep the cue object as it is.
            caption_data.captions.append(caption)
            continue
        pieces = split_cue_text(caption.raw_text, caption.start, caption.end, is_timestamp)
        for piece_start, piece_end, piece_text in pieces:
            caption_data.captions.append(
                webvtt.models.Caption(piece_start, piece_end, piece_text, caption.identifier)
            )

    caption_data.save(out_file)


if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment