Created
April 25, 2026 02:10
-
-
Save ericboehs/01124f74d9bc04f808c6b7d46dbf8e68 to your computer and use it in GitHub Desktop.
vtt-to-md: Convert WebVTT transcripts to anchored markdown with audio-fragment-linked timestamps
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env ruby | |
| # Convert WebVTT to anchored markdown with HH:MM:SS audio-fragment links. | |
| require 'optparse' | |
VERSION = '0.1.0'

# Collected CLI state; every flag below only records a value here.
options = { audio: nil, output: nil, title: nil, no_audio: false }

# CLI definition. The banner documents the output contract (per-utterance
# anchors plus media-fragment timestamp links); all real work happens later.
cli = OptionParser.new do |o|
  o.banner = <<~BANNER
    Usage: vtt-to-md [options] <input.vtt>
    Converts a WebVTT transcript to markdown with per-utterance HTML anchors
    (`<a id="tHHMMSS"></a>`) and timestamps that link to the corresponding
    moment in the audio file via HTML5 media-fragment URIs (`audio.m4a#t=Xs`).
    Consecutive cues from the same speaker are merged.
    By default the audio sibling (same basename, .m4a) is auto-discovered and
    output is written to the matching .md next to the .vtt.
  BANNER
  o.separator ''
  o.separator 'Options:'
  o.on('-a', '--audio PATH', 'Audio file basename to link to (default: sibling .m4a)') { |v| options[:audio] = v }
  o.on('--no-audio', 'Skip audio linking even if a sibling .m4a exists') { options[:no_audio] = true }
  o.on('-o', '--output PATH', 'Output path; use - for stdout (default: sibling .md)') { |v| options[:output] = v }
  o.on('-t', '--title TITLE', 'Human title (default: Title-Cased filename without date)') { |v| options[:title] = v }
  o.on('-h', '--help', 'Show this help') { puts o; exit 0 }
  o.on('-v', '--version', 'Show version') { puts "vtt-to-md #{VERSION}"; exit 0 }
end
cli.parse!

# Exactly one positional argument (the .vtt path) is required.
if ARGV.empty?
  puts cli
  exit 1
end
input = ARGV[0]
abort "vtt-to-md: input not found: #{input}" unless File.exist?(input)

dir = File.dirname(input)
stem = File.basename(input, '.vtt')

# Audio resolution: an explicit --audio wins; otherwise auto-discover a
# sibling .m4a sharing the stem. --no-audio (applied last) suppresses
# linking even when a flag or sibling was found.
audio = options[:audio]
if audio.nil? && !options[:no_audio]
  candidate = File.join(dir, "#{stem}.m4a")
  if File.exist?(candidate)
    audio = "#{stem}.m4a"
  else
    warn "vtt-to-md: no sibling audio at #{candidate} -- emitting timestamps without audio links (use --audio PATH to override)"
  end
end
audio = nil if options[:no_audio]

# Title: drop a leading YYYY-MM-DD- prefix, then Title-Case the remainder.
title = options[:title] ||
        stem.sub(/^\d{4}-\d{2}-\d{2}-/, '').gsub('-', ' ').split.map(&:capitalize).join(' ')

# '-' routes output to stdout; otherwise write the sibling (or given) .md.
output_path = options[:output] || File.join(dir, "#{stem}.md")
out = output_path == '-' ? $stdout : File.open(output_path, 'w')

vtt = File.read(input)
# Date for the heading comes from the filename prefix when present.
date = File.basename(input)[/^\d{4}-\d{2}-\d{2}/] || 'unknown date'
# Document header: "# DATE — TITLE", a provenance note, and (when audio
# links are enabled) a short explanation of how the timestamp links work.
out.puts "# #{date} — #{title}"
out.puts
out.puts "> Source: Microsoft Teams meeting transcript (`.vtt`). Generated from `#{File.basename(input)}`."
unless audio.nil?
  out.puts ">"
  out.puts "> Each timestamp links to the corresponding moment in the meeting audio (`#{audio}`). The audio file is not committed; if you have a local copy alongside this markdown, the links will jump there directly in IINA / VLC / Safari."
end
out.puts
out.puts '---'
out.puts
# Parse the transcript into cue hashes: :anchor (t-prefixed HHMMSS id),
# :label (HH:MM:SS display), :seconds (offset for the #t= fragment),
# :speaker (nil when absent), and :text.
#
# Normalize first: Teams exports frequently use CRLF line endings, which
# would otherwise defeat the blank-line block split below (/\n\n+/ never
# matches "\r\n\r\n", yielding zero cues). A leading UTF-8 BOM is stripped
# for the same reason.
cues = []
normalized = vtt.sub(/\A\uFEFF/, '').gsub(/\r\n?/, "\n")
normalized.split(/\n\n+/).each do |chunk|
  lines = chunk.strip.split("\n")
  # The header line may carry trailing text ("WEBVTT - ..."), so match by prefix.
  next if lines.empty? || lines.first.start_with?('WEBVTT')
  lines.shift if lines.first =~ /^\d+$/ # optional numeric cue identifier
  # Cue timing line. Hours are optional per the WebVTT spec (MM:SS.mmm);
  # a missing hours capture becomes 0 via nil.to_i.
  next unless lines.first =~ /^(?:(\d{2,}):)?(\d{2}):(\d{2})\.\d{3}\s+-->/
  h = Regexp.last_match(1).to_i
  m = Regexp.last_match(2).to_i
  s = Regexp.last_match(3).to_i
  text = lines[1..].join(' ').strip
  speaker = nil
  # <v Speaker>...</v> voice span; the closing tag is optional in WebVTT
  # when the span covers the whole cue, so it is optional here too.
  if text =~ %r{\A<v\s*([^>]*)>(.*?)(?:</v>)?\z}m
    raw = Regexp.last_match(1).strip
    speaker = raw.empty? ? nil : raw
    text = Regexp.last_match(2).strip
  end
  next if text.empty?
  cues << {
    anchor: format('t%02d%02d%02d', h, m, s),
    label: format('%02d:%02d:%02d', h, m, s),
    seconds: h * 3600 + m * 60 + s,
    speaker: speaker,
    text: text
  }
end
# Collapse runs of consecutive cues from the same speaker (including runs
# where no speaker was detected, i.e. both nil) into one utterance. The
# first cue of each run keeps its anchor/label/seconds; later texts are
# appended. Cues are dup'd so the original array is never mutated.
merged = []
cues.each do |cue|
  previous = merged.last
  if previous && previous[:speaker] == cue[:speaker]
    previous[:text] = "#{previous[:text]} #{cue[:text]}"
  else
    merged << cue.dup
  end
end
# Emit one anchored entry per merged utterance:
#   <a id="tHHMMSS"></a>
#   **[HH:MM:SS](audio.m4a#t=SECS) — Speaker**
#   > utterance text
# When no audio target is set the timestamp is plain text instead of a link.
merged.each do |entry|
  out.puts %(<a id="#{entry[:anchor]}"></a>)
  stamp =
    if audio
      "[#{entry[:label]}](#{audio}#t=#{entry[:seconds]})"
    else
      entry[:label]
    end
  heading = +"**#{stamp}"
  heading << " — #{entry[:speaker]}" if entry[:speaker]
  out.puts "#{heading}**"
  out.puts
  out.puts "> #{entry[:text]}"
  out.puts
end
out.close unless out == $stdout
warn "vtt-to-md: wrote #{output_path}" unless output_path == '-'
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
vtt-to-md
A small Ruby script that converts WebVTT transcripts (e.g. Microsoft Teams meeting exports) into clean, anchored markdown with timestamps that deep-link into the corresponding moment of the audio recording.
Each utterance gets:
- An HTML anchor (`<a id="tHHMMSS"></a>`) you can link to from anywhere
- A timestamp that deep-links into the audio (`audio.m4a#t=109`) — clicking it in IINA / VLC / Safari jumps straight to that second
- Speaker attribution (parsed from `<v Speaker>...</v>` tags), with consecutive cues from the same speaker auto-merged

Why
Teams gives you a `.vtt` and a `.m4a`. Neither is fun to navigate or reference. This produces a third sibling file — the `.md` — that is searchable, copy-pasteable into notes/docs/PRs, and (when the `.m4a` is alongside it) one click away from the actual audio.

Install
Dependencies
Ruby standard library only (`optparse`).

Usage
Output sample
Flags
- `-a, --audio PATH` — audio file basename to link to (default: sibling `.m4a`)
- `--no-audio` — skip audio linking even if a sibling `.m4a` exists
- `-o, --output PATH` — output path; use `-` for stdout (default: sibling `.md`)
- `-t, --title TITLE` — human title (default: Title-Cased filename without date)
- `-h, --help` — show this help
- `-v, --version` — show version

Defaults
- Audio: if `<stem>.m4a` exists next to the `.vtt`, it is used automatically. Pass `--audio PATH` to override, or `--no-audio` to skip.
- Output: sibling `.md` (e.g. `foo.vtt` -> `foo.md`). Pass `-o PATH`, or `-o -` for stdout.
- Title: `2026-04-16-foo-sync.vtt` becomes `Foo Sync`. Pass `--title` to override.
- Date: taken from a `YYYY-MM-DD-` prefix on the filename, if present.