Created
December 5, 2022 14:29
-
-
Save cyphar/0a8295521e085755f0f09851148e9cb7 to your computer and use it in GitHub Desktop.
A very (possibly too) lax WebVTT-to-SubRip converter powered by AWK
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/zsh
# vtt2srt: a very (possibly too) lax vtt-to-srt converter
# Copyright (C) 2022 Aleksa Sarai <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# While tools like ffmpeg can convert WebVTT subtitles (.vtt) files to SubRip
# (.srt) if the WebVTT file is correctly formatted, it seems that minor issues
# with the WebVTT cause ffmpeg to give up and output nothing. This is less than
# ideal (especially since it seems that Netflix produces WebVTT files that are
# broken in this way) and ultimately we only need the core bits of information
# from WebVTT in order to produce working SubRip files (in fact, the two
# formats are so similar it's a shame that WebVTT wasn't designed to be a
# backwards-compatible superset of SubRip, though obviously there are reasons
# this wasn't done).
#
# In any case, this is a small awk-powered script which scrapes the minimum
# amount of needed information from the WebVTT file (caring very little about
# the format -- other than the most basic requirement that all segments are
# separated by an empty line) and then outputs an SRT with the same contents
# (with all HTML tags stripped).
# Strict mode: abort on failing commands, on use of unset parameters, and on
# a failure in any stage of a pipeline.
set -Eeuo pipefail

# IN_SOURCE is "1" when zsh reports that this code is being read from a
# sourced file (ZSH_EVAL_CONTEXT ends in ":file") and empty otherwise. It is
# consulted by bail() to decide whether exiting is appropriate.
if [[ "${ZSH_EVAL_CONTEXT:-}" =~ :file$ ]]; then
  IN_SOURCE=1
else
  IN_SOURCE=
fi
# bail: report a fatal error.
# Arguments: the words of the message; they are joined with spaces.
# Outputs:   the message, followed by a newline, on stderr.
# Returns:   exits the shell with status 1 unless IN_SOURCE is non-empty,
#            in which case it returns 0 so that the sourcing shell survives.
function bail() {
  # printf instead of echo so that a message which looks like an option or
  # contains backslashes is printed verbatim.
  printf '%s\n' "$*" >&2
  [[ -n "${IN_SOURCE:-}" ]] || exit 1
}
# Exactly two positional arguments are required: the WebVTT file to read and
# the SubRip file to write.
(( $# == 2 )) || bail "usage: $0 <vtt-in> <srt-out>"
vtt="$1"
srt="$2"
# vtt2srt_filter: convert WebVTT to SubRip.
# Inputs:  WebVTT text on stdin (blocks separated by empty lines; LF or CRLF).
# Outputs: SubRip text on stdout. Cues are numbered from 1 and separated by
#          one empty line; HTML-like tags and cue settings are dropped.
function vtt2srt_filter() {
  # NOTE: the awk program is a single-quoted shell string, so it must not
  # contain any single-quote characters (comments included). Only POSIX awk
  # features are used so that gawk is not required.
  awk '
    # Turn one WebVTT timestamp into a SubRip one: SubRip uses "," as the
    # decimal separator and always carries an hours field, whereas WebVTT
    # uses "." and may omit the hours.
    function srt_time(ts) {
      gsub(/\./, ",", ts)
      if (ts !~ /^[0-9]+:[0-9]+:[0-9]+/)
        ts = "00:" ts
      return ts
    }

    # Drop a trailing carriage return so that CRLF files are handled like
    # LF files (otherwise "empty" lines would never look empty).
    { sub(/\r$/, "") }

    # After a blank line, the current block has ended.
    /^$/ { in_subtitle = ignore_block = 0; next }

    # Skip WEBVTT, STYLE and NOTE blocks since no styling is carried over.
    # This keeps files which interleave such blocks with cues working. The
    # keyword must be a whole word, and the check is not applied to cue
    # text, so a subtitle line that starts with "NOTE" is kept.
    !in_subtitle && /^(NOTE|STYLE|WEBVTT)([^A-Za-z0-9_]|$)/ { ignore_block = 1 }
    ignore_block { next }

    # Inside a cue every line is subtitle text: print it with anything that
    # looks like an HTML tag removed. This is a plain regex, so text that
    # contains a literal ">" inside a tag is not handled.
    in_subtitle {
      gsub(/<[^>]*>/, "")
      print
      next
    }

    # Only the timing line of a cue may contain "-->". Emit the cue number
    # and the converted start/end times; cue settings after the end time
    # are ignored because SubRip has no equivalent.
    /-->/ {
      if (cue_number > 0)
        print ""
      cue_number++
      print cue_number
      print srt_time($1), "-->", srt_time($3)
      in_subtitle = 1
    }
  '
}

vtt2srt_filter <"$vtt" >"$srt"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment