Skip to content

Instantly share code, notes, and snippets.

@amrutadotorg
Last active May 20, 2025 20:08
Show Gist options
  • Save amrutadotorg/99b9be9813e84d2150fccd6d452fdfab to your computer and use it in GitHub Desktop.
Save amrutadotorg/99b9be9813e84d2150fccd6d452fdfab to your computer and use it in GitHub Desktop.
#!/bin/bash
# VTT Processor Ubuntu Installer Script
#
# First half of the installer: installs the system and Python dependencies,
# then starts writing the application script into
# $HOME_DIR/vtt_processor/vtt_processor.py via the heredoc that follows.
echo "===== VTT File Processor Installer ====="
echo "This script will set up everything needed to run the VTT Processor application."
echo ""

# Get username for desktop shortcut
USERNAME=$(whoami)
# Use the account's real home directory; "/home/<user>" is wrong for root and
# for accounts whose home lives elsewhere. Fall back to the old guess only if
# HOME is unset.
HOME_DIR="${HOME:-/home/$USERNAME}"

# Create application directory
echo "Creating application directory..."
if ! mkdir -p "$HOME_DIR/vtt_processor"; then
    echo "Error: could not create $HOME_DIR/vtt_processor" >&2
    exit 1
fi

# Install dependencies
echo "Installing required system packages..."
sudo apt update
sudo apt install -y python3 python3-pip python3-tk git

echo "Installing required Python packages..."
if ! pip3 install webvtt-py; then
    # Keep going so the script file and shortcut are still created, but make
    # the failure visible instead of silently ignoring it.
    echo "Warning: failed to install webvtt-py; the application will not start until it is installed." >&2
fi

# Handle TKinterDnD2 installation specially
echo "Installing TKinterDnD2..."
if pip3 install tkinterdnd2; then
    echo "TKinterDnD2 installed successfully from PyPI"
else
    echo "Trying to install TKinterDnD2 from source..."
    # Build in a fresh temporary directory: a fixed /tmp/tkinterdnd2 left over
    # from an earlier run would make `git clone` fail. The && chain stops at
    # the first failing step so `pip3 install .` never runs in the wrong place.
    BUILD_DIR=$(mktemp -d)
    if ! ( cd "$BUILD_DIR" \
            && git clone https://github.com/pmgagne/tkinterdnd2 \
            && cd tkinterdnd2 \
            && pip3 install . ); then
        echo "Warning: failed to install TKinterDnD2 from source." >&2
    fi
    rm -rf "$BUILD_DIR"
    cd "$HOME_DIR"
fi

# Create the Python script file
echo "Creating the VTT processor script..."
cat > "$HOME_DIR/vtt_processor/vtt_processor.py" << 'ENDSCRIPT'
#!/usr/bin/env python3
import webvtt
import re
import os
import tkinter as tk
from tkinter import filedialog, messagebox, ttk
import tkinterdnd2 as tkdnd
class VTTProcessorApp:
    """Tkinter front end for extracting, cleaning and replacing VTT caption text.

    Files can be dropped onto the window or chosen through the two buttons.
    Every completed operation is appended to the text area at the bottom of
    the window; failures are reported in an error message box.
    """

    def __init__(self, master):
        """Configure the main window, register drag-and-drop and build the UI.

        Args:
            master: Root window. It must provide ``drop_target_register`` and
                ``dnd_bind`` (``main`` passes a ``tkinterdnd2`` ``Tk``).
        """
        self.master = master
        master.title("VTT File Processor")
        master.geometry("600x500")

        # Paths from the most recent drop event
        self.dropped_files = []

        # Style
        self.style = ttk.Style()
        self.style.configure("TLabel", font=("Arial", 10))
        self.style.configure("TButton", font=("Arial", 10))

        # Drag and Drop Setup
        self.master.drop_target_register(tkdnd.DND_FILES)
        self.master.dnd_bind("<<Drop>>", self.handle_drop)

        # Create UI Elements
        self.create_widgets()

    def create_widgets(self):
        """Build the drop area, the two operation buttons and the result log."""
        # Drag and Drop Area
        self.drop_frame = tk.Frame(
            self.master,
            width=500,
            height=200,
            bg="light gray",
            borderwidth=2,
            relief=tk.RIDGE,
        )
        self.drop_frame.pack(pady=20, padx=20, fill=tk.X)
        # Keep the requested height instead of shrinking to fit the label.
        self.drop_frame.pack_propagate(False)
        self.drop_label = tk.Label(
            self.drop_frame,
            text="Drag and Drop VTT and Text Files Here",
            bg="light gray",
            font=("Arial", 12),
        )
        self.drop_label.pack(expand=True)

        # Operation Buttons Frame
        button_frame = tk.Frame(self.master)
        button_frame.pack(pady=10)

        # Buttons for different operations
        ttk.Button(
            button_frame, text="Extract VTT Text", command=self.extract_vtt_text_gui
        ).pack(side=tk.LEFT, padx=5)
        ttk.Button(
            button_frame, text="Replace VTT Text", command=self.replace_vtt_text_gui
        ).pack(side=tk.LEFT, padx=5)

        # Result Display
        self.result_text = tk.Text(self.master, height=10, width=70, wrap=tk.WORD)
        self.result_text.pack(pady=10, padx=20, fill=tk.X)

    def handle_drop(self, event):
        """Route dropped files to the matching operation.

        ``event.data`` is split into individual paths with the Tcl
        ``splitlist`` helper of the root window. Routing rules:

        * at least one ``.vtt`` and one ``.txt``: offer to replace the text of
          the first VTT file with the first text file;
        * only ``.vtt`` files: open the action dialog for each of them;
        * only ``.txt`` files: clean each of them;
        * anything else: show an informational message.
        """
        self.dropped_files = self.master.tk.splitlist(event.data)
        if not self.dropped_files:
            return

        # Categorize dropped files
        vtt_files = [f for f in self.dropped_files if f.lower().endswith(".vtt")]
        text_files = [f for f in self.dropped_files if f.lower().endswith(".txt")]

        if vtt_files and text_files:
            self.prompt_vtt_replacement(vtt_files[0], text_files[0])
        elif vtt_files:
            # Dropping several VTT files used to be rejected as "unsupported".
            for vtt_file in vtt_files:
                self.process_vtt_file(vtt_file)
        elif text_files:
            for text_file in text_files:
                self.process_text_file(text_file)
        else:
            messagebox.showinfo("Unsupported Files", "Please drop .vtt or .txt files")

    def prompt_vtt_replacement(self, vtt_file, text_file):
        """Ask whether the text of ``vtt_file`` should be replaced by ``text_file``.

        On confirmation the result is written next to the VTT file with a
        ``_replaced.vtt`` suffix. The dialog stays open if the replacement
        fails so the error can be read and the action cancelled.
        """
        dialog = tk.Toplevel(self.master)
        dialog.title("Replace VTT Text")
        dialog.geometry("300x200")
        tk.Label(
            dialog, text="Replace VTT text with dropped text file?", wraplength=250
        ).pack(pady=10)

        def do_replace():
            try:
                # Always clean the text file during replacement
                output = os.path.splitext(vtt_file)[0] + "_replaced.vtt"
                replace_vtt_text(vtt_file, text_file, output, clean_text=True)
                self.update_result_text(
                    f"Replaced text in {vtt_file} with cleaned {text_file}"
                )
                dialog.destroy()
            except Exception as e:
                messagebox.showerror("Error", str(e))

        def cancel():
            dialog.destroy()

        ttk.Button(dialog, text="Replace", command=do_replace).pack(pady=5)
        ttk.Button(dialog, text="Cancel", command=cancel).pack(pady=5)

    def process_vtt_file(self, vtt_file):
        """Open a dialog offering to extract or replace the text of ``vtt_file``.

        Output files are written next to ``vtt_file`` using the ``_text.txt``
        (extract) or ``_replaced.vtt`` (replace) suffix.
        """
        dialog = tk.Toplevel(self.master)
        dialog.title("VTT File Action")
        dialog.geometry("300x200")

        def do_extract():
            output = os.path.splitext(vtt_file)[0] + "_text.txt"
            self.extract_vtt_text_single(vtt_file, output)
            dialog.destroy()

        def do_replace():
            text_file = filedialog.askopenfilename(
                title="Select Replacement Text File",
                filetypes=[("Text Files", "*.txt")],
            )
            if text_file:
                output = os.path.splitext(vtt_file)[0] + "_replaced.vtt"
                self.replace_vtt_text_single(vtt_file, text_file, output)
            dialog.destroy()

        tk.Label(dialog, text="Choose an action for VTT file:").pack(pady=10)
        # Name the file so several dialogs from one drop can be told apart.
        tk.Label(dialog, text=os.path.basename(vtt_file), wraplength=250).pack()
        ttk.Button(dialog, text="Extract Text", command=do_extract).pack(pady=5)
        ttk.Button(dialog, text="Replace Text", command=do_replace).pack(pady=5)

    def process_text_file(self, text_file):
        """Clean ``text_file`` into a sibling file with a ``_cleaned.txt`` suffix."""
        output = os.path.splitext(text_file)[0] + "_cleaned.txt"
        self.clean_text_file_single(text_file, output)

    def extract_vtt_text_single(self, vtt_file, output_file):
        """Extract the caption text of one VTT file and log where it was written.

        ``extract_vtt_text`` returns a list of paths when the output was split
        and a single path otherwise; both cases are reported. Errors are shown
        in a message box instead of being raised.
        """
        try:
            actual_output_files = extract_vtt_text(vtt_file, output_file)
            if isinstance(actual_output_files, list):
                # Multiple files scenario
                output_message = f"Extracted text from {vtt_file}, split into {len(actual_output_files)} files:\n"
                output_message += "\n".join(actual_output_files)
            else:
                # Single file scenario
                output_message = (
                    f"Extracted text from {vtt_file} to {actual_output_files}"
                )
            self.update_result_text(output_message)
        except Exception as e:
            messagebox.showerror("Error", str(e))

    def clean_text_file_single(self, text_file, output_file):
        """Clean one text file into ``output_file``; show errors in a message box."""
        try:
            clean_text_file(text_file, output_file)
            self.update_result_text(f"Cleaned text file: {text_file}")
        except Exception as e:
            messagebox.showerror("Error", str(e))

    def replace_vtt_text_single(self, vtt_file, text_file, output_file):
        """Replace the caption text of one VTT file; show errors in a message box.

        The text file is always cleaned before it is used.
        """
        try:
            replace_vtt_text(vtt_file, text_file, output_file, clean_text=True)
            self.update_result_text(
                f"Replaced text in {vtt_file} with cleaned {text_file}"
            )
        except Exception as e:
            messagebox.showerror("Error", str(e))

    def update_result_text(self, message):
        """Append ``message`` to the result area and scroll it into view."""
        self.result_text.insert(tk.END, message + "\n")
        self.result_text.see(tk.END)

    def extract_vtt_text_gui(self):
        """Ask for a VTT file and an output path, then extract the caption text.

        Cancelling either file dialog aborts the operation.
        """
        vtt_file = filedialog.askopenfilename(
            title="Select VTT File", filetypes=[("VTT Files", "*.vtt")]
        )
        if not vtt_file:
            return
        output_file = filedialog.asksaveasfilename(
            defaultextension=".txt", filetypes=[("Text Files", "*.txt")]
        )
        if output_file:
            self.extract_vtt_text_single(vtt_file, output_file)

    def replace_vtt_text_gui(self):
        """Ask for a VTT file, a text file and an output path, then replace.

        Cancelling any of the three file dialogs aborts the operation.
        """
        vtt_file = filedialog.askopenfilename(
            title="Select VTT File", filetypes=[("VTT Files", "*.vtt")]
        )
        if not vtt_file:
            return
        text_file = filedialog.askopenfilename(
            title="Select Replacement Text File",
            filetypes=[("Text Files", "*.txt")],
        )
        if not text_file:
            return
        output_file = filedialog.asksaveasfilename(
            defaultextension=".vtt", filetypes=[("VTT Files", "*.vtt")]
        )
        if output_file:
            # Go through the shared helper so a failure (for example a line
            # count mismatch) is shown to the user like in every other path,
            # rather than escaping from the button callback.
            self.replace_vtt_text_single(vtt_file, text_file, output_file)
def clean_text_file(input_file_path, output_file_path=None):
    """
    Cleans a text file by removing leading line numbers, &nbsp; entities,
    surrounding whitespace and runs of consecutive spaces. Lines that end up
    empty are dropped.

    A leading line number is one or more digits followed by a period
    ("200." or "201. "). A period that is directly followed by another digit
    is treated as a decimal point, so text such as "3.5 million" is kept
    intact.

    Args:
        input_file_path (str): Path to the input text file (read as UTF-8)
        output_file_path (str, optional): Path where the cleaned text will be
            saved, one line per row. If None or empty, the cleaned lines are
            returned instead.

    Returns:
        list or None: If no output path is given, a list of cleaned lines.
            Otherwise None after saving to file.
    """
    cleaned_lines = []
    with open(input_file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Turn &nbsp; entities into plain spaces first so they take part
            # in the trimming and collapsing below.
            line = line.replace("&nbsp;", " ").strip()
            # Drop a leading line number; (?!\d) keeps decimals like "3.5".
            cleaned_line = re.sub(r"^\d+\.(?!\d)\s*", "", line)
            # Replace multiple consecutive spaces with a single space
            cleaned_line = re.sub(r" {2,}", " ", cleaned_line)
            # Only add non-empty lines
            if cleaned_line:
                cleaned_lines.append(cleaned_line)

    # If an output file path is provided, save the cleaned lines
    if output_file_path:
        with open(output_file_path, "w", encoding="utf-8") as file:
            file.writelines(f"{line}\n" for line in cleaned_lines)
        print(f"Successfully cleaned text file and saved to: {output_file_path}")
        return None

    # Otherwise, return the cleaned lines
    return cleaned_lines
def extract_vtt_text(vtt_file_path, output_file_path, max_lines=300):
    """
    Extracts text content from a VTT file without timestamps and saves it to text file(s).
    Splits into multiple files if number of lines exceeds max_lines.

    Each caption becomes one line: its internal newlines are joined with
    spaces, &nbsp; entities become spaces and runs of spaces are collapsed.
    Captions that are blank after this are skipped.

    The line count is always embedded in the written file name:
    "<base>_<N>lines<ext>" for a single file, or
    "<base>_<i>of<total>_<N>lines<ext>" for each part when splitting. Split
    parts are written without a trailing newline; a single file ends with one.

    Args:
        vtt_file_path (str): Path to the input VTT file
        output_file_path (str): Path where the text content will be saved.
            If empty or None, nothing is written.
        max_lines (int): Maximum number of lines per output file; must be >= 1

    Returns:
        list or str: A list of written file paths when the output was split,
            the single written file path otherwise, or the list of extracted
            lines when no output path was given.

    Raises:
        ValueError: If max_lines is smaller than 1.
        Exception: If reading the VTT file or writing the output fails; the
            original error is attached as the cause.
    """
    # Reject a non-positive limit up front: 0 would divide by zero below and a
    # negative value would report success without writing any file.
    if max_lines < 1:
        raise ValueError(f"max_lines must be at least 1, got {max_lines}")

    try:
        # Parse the VTT file
        vtt = webvtt.read(vtt_file_path)

        # Extract and process text from each caption
        extracted_lines = []
        for caption in vtt.captions:
            # Merge multi-line captions into a single line
            merged_text = " ".join(caption.text.split("\n"))
            merged_text = merged_text.replace("&nbsp;", " ")
            merged_text = re.sub(r" {2,}", " ", merged_text)
            if merged_text.strip():
                extracted_lines.append(merged_text)

        # If no output path provided, return the lines
        if not output_file_path:
            return extracted_lines

        file_dir = os.path.dirname(output_file_path)
        base_name, file_ext = os.path.splitext(os.path.basename(output_file_path))
        total_lines = len(extracted_lines)

        if total_lines > max_lines:
            # Ceiling division: number of files needed
            num_files = (total_lines + max_lines - 1) // max_lines
            output_files = []
            for i in range(num_files):
                chunk = extracted_lines[i * max_lines:(i + 1) * max_lines]
                current_filename = f"{base_name}_{i + 1}of{num_files}_{len(chunk)}lines{file_ext}"
                current_filepath = os.path.join(file_dir, current_filename)
                with open(current_filepath, "w", encoding="utf-8") as text_file:
                    text_file.write("\n".join(chunk).rstrip())
                output_files.append(current_filepath)
            print(
                f"Successfully split {total_lines} lines from VTT file into {num_files} files"
            )
            return output_files

        # No splitting needed
        full_output_path = os.path.join(
            file_dir, f"{base_name}_{total_lines}lines{file_ext}"
        )
        with open(full_output_path, "w", encoding="utf-8") as text_file:
            text_file.write("\n".join(extracted_lines).rstrip() + "\n")
        print(
            f"Successfully extracted {total_lines} lines from VTT file to: {full_output_path}"
        )
        return full_output_path
    except Exception as e:
        # Keep the original error as the cause so the traceback stays useful.
        raise Exception(f"Error extracting text from VTT: {e}") from e
def replace_vtt_text(vtt_file_path, text_file_path, output_file_path, clean_text=False):
    """
    Writes a copy of a VTT file whose caption texts are taken, in order, from
    the non-empty lines of a text file. Timing information comes from the
    original VTT file, parsed and saved with the webvtt-py library.

    Args:
        vtt_file_path (str): Path to the input VTT file
        text_file_path (str): Path to the text file with replacement content
        output_file_path (str): Path where the modified VTT file will be saved
        clean_text (bool): If True, the text file is passed through
            clean_text_file first; otherwise its lines are only stripped and
            blank ones skipped

    Raises:
        ValueError: If the number of replacement lines differs from the
            number of captions in the VTT file.
    """
    # Collect the replacement lines
    if clean_text:
        new_texts = clean_text_file(text_file_path)
    else:
        new_texts = []
        with open(text_file_path, "r", encoding="utf-8") as source:
            for raw_line in source.readlines():
                stripped = raw_line.strip()
                if stripped:
                    new_texts.append(stripped)

    # Load the captions that will receive the new text
    subtitles = webvtt.read(vtt_file_path)

    # Every caption needs exactly one replacement line
    if len(new_texts) != len(subtitles.captions):
        raise ValueError(
            f"Number of non-empty lines in text file ({len(new_texts)}) "
            f"doesn't match number of captions in VTT file ({len(subtitles.captions)})"
        )

    # Assign the lines to the captions in order
    for position, caption in enumerate(subtitles.captions):
        caption.text = new_texts[position]

    # Write the result
    subtitles.save(output_file_path)
    print(
        f"Successfully created new VTT file with replaced text at: {output_file_path}"
    )
def main():
    """Create the drag-and-drop enabled root window and run the GUI event loop."""
    window = tkdnd.Tk()
    # Building the app attaches all widgets and handlers to the window.
    gui = VTTProcessorApp(window)
    window.mainloop()


# Start the GUI only when the file is executed as a script.
if __name__ == "__main__":
    main()
ENDSCRIPT
# Second half of the installer: makes the generated script executable, creates
# a desktop launcher and optionally starts the application.

# Make script executable
chmod +x "$HOME_DIR/vtt_processor/vtt_processor.py"

# Create desktop shortcut
echo "Creating desktop shortcut..."
# Ask XDG for the desktop folder, since its name is localized on non-English
# systems; fall back to ~/Desktop when xdg-user-dir is unavailable.
DESKTOP_DIR=$(xdg-user-dir DESKTOP 2>/dev/null)
if [ -z "$DESKTOP_DIR" ]; then
    DESKTOP_DIR="$HOME_DIR/Desktop"
fi
# The folder may not exist yet (for example on a fresh or headless account).
mkdir -p "$DESKTOP_DIR"
cat > "$DESKTOP_DIR/VTT-Processor.desktop" << EOF
[Desktop Entry]
Version=1.0
Type=Application
Name=VTT Processor
Comment=Process VTT files
Exec=python3 "$HOME_DIR/vtt_processor/vtt_processor.py"
Icon=text-editor
Terminal=false
Categories=Utility;
EOF
chmod +x "$DESKTOP_DIR/VTT-Processor.desktop"

echo ""
echo "===== Installation Complete! ====="
echo ""
echo "You can now run the VTT Processor in two ways:"
echo "1. Double-click the 'VTT Processor' icon on your desktop"
echo "2. Run this command in terminal: python3 $HOME_DIR/vtt_processor/vtt_processor.py"
echo ""
echo "Would you like to run the application now? (y/n)"
read -r response
# Accept exactly "y" or "yes" in any letter case.
if [[ "$response" =~ ^([yY][eE][sS]|[yY])$ ]]; then
    python3 "$HOME_DIR/vtt_processor/vtt_processor.py"
else
    echo "You can run the application later using one of the methods above."
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment