Skip to content

Instantly share code, notes, and snippets.

@c2h2
Created July 30, 2024 17:10
Show Gist options
  • Save c2h2/00af656b46fabfcca8287a7468d888d6 to your computer and use it in GitHub Desktop.
Save c2h2/00af656b46fabfcca8287a7468d888d6 to your computer and use it in GitHub Desktop.
Convert VTT or SRT subtitle files to plain-text (.txt) files, stripping all timestamps.
import re
import sys
import os
import glob
# Cue timing line in either dialect: SRT writes "," before the milliseconds
# and always has hours; WebVTT writes "." and may omit the hours. Matching is
# anchored at the start of the line only, so trailing WebVTT cue settings
# (e.g. "align:start position:0%") are covered by dropping the whole line.
_CUE_TIMING_RE = re.compile(
    r'\s*(?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3}\s+-->\s+'
    r'(?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3}'
)

# A line that holds nothing but digits (candidate SRT sequence number).
_CUE_NUMBER_RE = re.compile(r'\s*\d+\s*$')


def _extract_caption_text(content):
    """Return only the caption text of SRT/WebVTT *content*.

    Drops the WebVTT header block, cue timing lines, sequence numbers that
    directly precede a timing line, blank lines, and lines identical to the
    previously kept line. The result is newline-joined and stripped.
    """
    lines = content.split('\n')

    # A WebVTT file starts with a "WEBVTT" header block that runs until the
    # first blank line; it is metadata, not caption text. Also stop at a
    # timing line so a file missing the blank line does not lose its first cue.
    start = 0
    if lines[0].startswith('WEBVTT'):
        while (
            start < len(lines)
            and lines[start].strip()
            and not _CUE_TIMING_RE.match(lines[start])
        ):
            start += 1

    body = lines[start:]
    kept = []
    # Pair every line with the one after it so a digits-only line is treated
    # as a sequence number only when a timing line follows; a caption that
    # merely consists of digits (e.g. "42") is kept.
    for line, following in zip(body, body[1:] + ['']):
        if not line.strip() or _CUE_TIMING_RE.match(line):
            continue
        if _CUE_NUMBER_RE.match(line) and _CUE_TIMING_RE.match(following):
            continue
        if kept and kept[-1] == line:
            # Consecutive cues often repeat the same text; keep it once.
            continue
        kept.append(line)
    return '\n'.join(kept).strip()


def srt_vtt_to_text(file_path: str) -> None:
    """Convert one SRT or WebVTT subtitle file to a plain-text file.

    The output is written next to the input, with the extension replaced by
    ``.txt``. Timestamps, sequence numbers, the WebVTT header, blank lines and
    neighboring duplicate lines are removed.

    Args:
        file_path: Path of the ``.srt`` or ``.vtt`` file to convert.

    Returns:
        None. A success or error message is printed to stdout; I/O and
        decoding errors are reported rather than raised, so a batch run
        continues with the next file.
    """
    txt_file = os.path.splitext(file_path)[0] + '.txt'
    try:
        # "utf-8-sig" transparently drops a leading byte-order mark; with
        # plain "utf-8" the BOM sticks to the first line, so the first
        # sequence number is not recognized and the BOM leaks into the output.
        with open(file_path, 'r', encoding='utf-8-sig') as source:
            content = source.read()
        text = _extract_caption_text(content)
        with open(txt_file, 'w', encoding='utf-8') as target:
            target.write(text)
    except (OSError, UnicodeError) as e:
        print(f"An error occurred with {file_path}: {e}")
    else:
        print(f"Successfully converted {file_path} to {txt_file}")
def main(directory):
    """Convert every subtitle file found beneath *directory*.

    The tree is searched recursively, first for ``*.srt`` and then for
    ``*.vtt`` files, and each match is handed to ``srt_vtt_to_text``. If
    *directory* is not an existing directory, a message is printed and
    nothing is converted.
    """
    if os.path.isdir(directory):
        subtitle_paths = []
        # Collect all SRT matches before any VTT matches, so files are
        # processed in the same order as two separate glob calls would give.
        for pattern in ('*.srt', '*.vtt'):
            subtitle_paths.extend(
                glob.glob(os.path.join(directory, '**', pattern), recursive=True)
            )
        for subtitle_path in subtitle_paths:
            srt_vtt_to_text(subtitle_path)
    else:
        print(f"Directory {directory} does not exist.")
# Script entry point: expects exactly one command-line argument, the directory
# to scan; any other argument count prints the usage line instead.
if __name__ == "__main__":
    if len(sys.argv) == 2:
        directory = sys.argv[1]
        main(directory)
    else:
        print("Usage: python srt_to_text.py <directory>")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment