Skip to content

Instantly share code, notes, and snippets.

Created February 8, 2024 11:06
Show Gist options
  • Save Airbus5717/ec899febf2c151fea2fb3ccce2f2bcc0 to your computer and use it in GitHub Desktop.
Save Airbus5717/ec899febf2c151fea2fb3ccce2f2bcc0 to your computer and use it in GitHub Desktop.
Find all files in a directory tree that end with .vtt and write a cleaned-up .txt version of each one.
# source
import os
def find_files(directory, extension):
    """
    Recursively finds all files with a specific extension in a directory and its subdirectories.

    Parameters:
    - directory (str): The directory to start the search from.
    - extension (str): The file extension to search for (e.g., '.txt', '.jpg', etc.).

    Returns:
    - file_list (list): A list of file paths matching the specified extension.
      Each path is the walked directory joined with the file name, so it is
      relative whenever `directory` is relative. An empty list is returned
      when nothing matches.
    """
    file_list = []
    # The sub-directory list yielded by os.walk is not needed here; the walk
    # itself descends into every sub-directory.
    for root, _dirs, files in os.walk(directory):
        file_list.extend(
            os.path.join(root, name) for name in files if name.endswith(extension)
        )
    return file_list
# Module-level scan: these statements run as soon as the file is executed or
# imported, and main() later iterates over the resulting `found_files` list.
directory_path = './'  # search root: the current working directory
file_extension = '.vtt'  # extension passed to find_files() as the suffix filter
found_files = find_files(directory_path, file_extension)
print("Found files with extension '{}':".format(file_extension))
# The listing announced above is currently disabled; main() prints each path
# as it processes it instead.
# for file_path in found_files:
#     print(file_path)
Convert YouTube subtitles (vtt) to human readable text.
Download only subtitles from YouTube with youtube-dl:
youtube-dl --skip-download --convert-subs vtt <video_url>
Note that the default subtitle format provided by YouTube is ass, which is hard
to process with a simple regex. Luckily youtube-dl can convert ass to vtt, which
is easier to process.
To convert all vtt files inside a directory:
find . -name "*.vtt" -exec python {} \;
import sys
import re
def remove_tags(text):
    """
    Remove vtt markup tags

    Strips inline cue markup from the raw file contents, shortens each cue
    timing line to its leading two ``NN:NN`` fields, and empties lines that
    contain only whitespace.

    Parameters:
    - text (str): Full contents of a .vtt file.

    Returns:
    - str: The text with markup removed; line structure is otherwise kept.
    """
    # NOTE(review): the contents of this list were missing from the copy under
    # review. These patterns cover closing </c>, opening <c> / <c.colorXXXXXX>
    # and inline <HH:MM:SS.mmm> word-timing tags -- confirm against the
    # original script before relying on them.
    tags = [
        r'</c>',
        r'<c(\.color\w+)?>',
        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
    ]
    for pat in tags:
        text = re.sub(pat, '', text)

    # extract timestamp, only keep HH:MM
    # NOTE(review): the replacement and subject arguments were also missing;
    # substituting capture group 1 is what the comment above and the single
    # capturing group in the pattern imply -- confirm.
    text = re.sub(
        r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%',
        r'\g<1>',
        text,
    )

    # Turn whitespace-only lines into truly empty lines so later stages can
    # detect them with a plain == "" comparison.
    text = re.sub(r'^\s+$', '', text, flags=re.MULTILINE)
    return text
def remove_header(lines):
    """
    Remove vtt file header

    Everything up to and including the last header marker line is dropped.
    A marker is a line that is exactly '##' or 'Language: en'. When both are
    present the one furthest down wins, so a '##' line that follows the
    'Language: en' line (and anything between them) is removed as well
    instead of leaking into the output.

    Parameters:
    - lines (list): Lines of the vtt file, without line endings.

    Returns:
    - list: The lines after the header; the input unchanged (as a copy) when
      no marker is found.
    """
    pos = -1
    for mark in ('##', 'Language: en',):
        try:
            # Keep the furthest marker rather than whichever is tested last.
            pos = max(pos, lines.index(mark))
        except ValueError:
            # Marker not present in this file.
            continue
    return lines[pos + 1:]
def merge_duplicates(lines):
    """
    Remove duplicated subtitles. Duplicates are always adjacent.

    Empty lines are dropped. A timestamp line (exactly ``NN:NN``) is emitted
    only when it differs from the previously emitted timestamp, and a caption
    line only when it differs from the previously emitted caption.

    Parameters:
    - lines (iterable of str): Lines with markup already removed.

    Yields:
    - str: The de-duplicated timestamp and caption lines, in input order.
    """
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    is_timestamp = re.compile(r'^\d{2}:\d{2}$').match
    last_timestamp = ''
    last_cap = ''
    for line in lines:
        # NOTE(review): the statements that skip blank lines and separate the
        # timestamp branch from the caption branch were missing from the copy
        # under review; this is the only control flow consistent with the two
        # `last_*` trackers -- confirm.
        if line == "":
            continue
        if is_timestamp(line):
            if line != last_timestamp:
                yield line
                last_timestamp = line
        elif line != last_cap:
            yield line
            last_cap = line
def merge_short_lines(lines):
    """
    Join consecutive short caption lines into longer lines.

    Caption lines are accumulated in a buffer while the combined length stays
    under 80 characters. An empty line or a timestamp line (exactly
    ``NN:NN``) acts as a separator: any buffered caption text is emitted
    first, so it stays under the timestamp it belongs to, and then the
    separator is emitted prefixed with a newline.

    Parameters:
    - lines (iterable of str): De-duplicated timestamp and caption lines.

    Yields:
    - str: Merged caption lines and newline-prefixed separator lines. No
      empty caption entries are produced, and nothing is yielded for empty
      input.
    """
    # Raw string: '\d' in a plain string literal is an invalid escape sequence.
    is_timestamp = re.compile(r'^\d{2}:\d{2}$').match
    buffer = ''
    for line in lines:
        if line == "" or is_timestamp(line):
            # Flush pending text before the separator; otherwise captions
            # end up after the *next* timestamp.
            if buffer:
                yield buffer
                buffer = ''
            yield '\n' + line
            continue

        if not buffer:
            buffer = line
        elif len(line + buffer) < 80:
            buffer += ' ' + line
        else:
            yield buffer
            buffer = line
    if buffer:
        yield buffer
def main():
    """
    Convert every file in the module-level `found_files` list to plain text.

    For each path the file is read as UTF-8, passed through remove_tags,
    remove_header, merge_duplicates and merge_short_lines, and the result is
    written to a file with the same path but a .txt extension. The source
    and destination paths are printed as progress output.
    """
    for file in found_files:
        print("file path: "+ file)
        with open(file, 'r', encoding='utf-8') as f:
            # NOTE(review): the right-hand side of this assignment was missing
            # from the copy under review; reading the whole file is what the
            # later remove_tags(text) / text.splitlines() calls require.
            text = f.read()

        vtt_file_name = file
        # Escape the dot so only a literal ".vtt" suffix is replaced.
        txt_name = re.sub(r'\.vtt$', '.txt', vtt_file_name)
        print("text file path: "+ txt_name)

        text = remove_tags(text)
        lines = text.splitlines()
        lines = remove_header(lines)
        lines = merge_duplicates(lines)
        lines = merge_short_lines(lines)

        # Write with the same encoding the input was read with, so non-ASCII
        # captions do not depend on the platform default encoding.
        with open(txt_name, 'w', encoding='utf-8') as f:
            # NOTE(review): the loop body was missing from the copy under
            # review; one entry per output line is assumed -- confirm.
            f.writelines(line + "\n" for line in lines)
if __name__ == "__main__":
    # NOTE(review): the guarded statement was missing from the copy under
    # review; main() is the only entry point defined in this file.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment