Skip to content

Instantly share code, notes, and snippets.

@mbutler
Last active June 3, 2025 19:28
Show Gist options
  • Save mbutler/a1a5bbb05222f5858bc01845c6acdeda to your computer and use it in GitHub Desktop.
Save mbutler/a1a5bbb05222f5858bc01845c6acdeda to your computer and use it in GitHub Desktop.
Batch transcribe audio and video files using OpenAI Whisper
import os
import math
import subprocess
import openai
import pysrt
from datetime import timedelta
# --- Configuration ---------------------------------------------------------
# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to live in the source file; the original placeholder stays as the fallback,
# so the script behaves exactly as before when the variable is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "api-key")
INPUT_DIR = "input_media"    # directory scanned for .mp4/.mp3 files
CHUNK_DIR = "audio_chunks"   # directory receiving the per-chunk MP3 files
OUTPUT_DIR = "output"        # directory receiving the merged .srt files
CHUNK_DURATION_SEC = 300     # chunk length handed to ffmpeg, in seconds

# Shared API client used by transcribe_chunk().
client = openai.OpenAI(api_key=API_KEY)

# Make sure the working directories exist before anything is written to them.
os.makedirs(CHUNK_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
def list_media_files(directory):
    """Return the paths of the .mp4/.mp3 files directly inside *directory*.

    The extension match is case-insensitive. Only regular files are
    returned (a sub-directory whose name happens to end in ``.mp3`` is
    skipped), and the names are sorted so the processing order is the same
    on every run rather than following ``os.listdir``'s arbitrary order.

    Args:
        directory: Folder to scan; it is not searched recursively.

    Returns:
        A list of ``os.path.join(directory, name)`` paths, sorted by name.

    Raises:
        OSError: If *directory* cannot be listed (for example, it does not
            exist).
    """
    media_extensions = ('.mp4', '.mp3')
    candidates = (
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(media_extensions)
    )
    return [path for path in candidates if os.path.isfile(path)]
def get_duration_seconds(input_path):
    """Return the duration ffprobe reports for *input_path*, as a float.

    Runs ``ffprobe`` asking only for the ``format=duration`` entry, printed
    as a bare value, and parses that output.

    Args:
        input_path: Path of the media file to probe.

    Returns:
        The reported duration converted with ``float()``.

    Raises:
        subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
        ValueError: If ffprobe's output cannot be parsed as a float.
    """
    probe_args = ["ffprobe", "-v", "error"]
    probe_args += ["-show_entries", "format=duration"]
    # Strip the section wrapper and the key so only the number is printed.
    probe_args += ["-of", "default=noprint_wrappers=1:nokey=1"]
    probe_args.append(input_path)

    raw_output = subprocess.check_output(probe_args)
    duration_text = raw_output.decode().strip()
    return float(duration_text)
def split_media(input_path, output_dir, chunk_duration):
    """Cut *input_path* into consecutive audio-only MP3 chunks with ffmpeg.

    Each chunk is written as ``<basename>_chunk_NNN.mp3`` in *output_dir*,
    re-encoded to mono, 16 kHz, 64 kbit/s with any video stream dropped.

    A failing ffmpeg run is reported on stdout instead of being silently
    ignored. The chunk's path is still included in the result so that the
    position of every path in the list keeps matching its start time
    (``index * chunk_duration``), which ``process_file`` relies on when
    shifting subtitle timestamps.

    Args:
        input_path: Media file to split.
        output_dir: Existing directory that receives the chunk files.
        chunk_duration: Length of each chunk in seconds.

    Returns:
        The list of chunk paths, in chronological order.

    Raises:
        subprocess.CalledProcessError: If probing the duration fails.
    """
    total_duration = get_duration_seconds(input_path)
    total_chunks = math.ceil(total_duration / chunk_duration)
    base = os.path.splitext(os.path.basename(input_path))[0]
    chunk_paths = []
    for i in range(total_chunks):
        start_time = i * chunk_duration
        output_file = os.path.join(output_dir, f"{base}_chunk_{i:03}.mp3")
        cmd = [
            # "-v error" keeps stderr limited to real errors so it can be
            # captured and shown when the run fails.
            "ffmpeg", "-v", "error", "-y", "-i", input_path,
            "-ss", str(start_time),
            "-t", str(chunk_duration),
            "-vn", "-ar", "16000", "-ac", "1", "-b:a", "64k",
            output_file,
        ]
        result = subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            error_lines = result.stderr.decode(errors="replace").strip().splitlines()
            detail = error_lines[-1] if error_lines else "no error output"
            print(
                f" ffmpeg failed on chunk {i + 1}/{total_chunks} "
                f"(exit code {result.returncode}): {detail}"
            )
        # Appended even on failure: the caller maps list position to time offset.
        chunk_paths.append(output_file)
    return chunk_paths
def transcribe_chunk(path):
    """Send the audio file at *path* to the ``whisper-1`` transcription API.

    The file is opened in binary mode for the duration of the request and
    the response is requested in ``srt`` format.

    Args:
        path: Path of the audio chunk to upload.

    Returns:
        Whatever the client's ``audio.transcriptions.create`` call returns
        for the ``srt`` response format.
    """
    request_options = {"model": "whisper-1", "response_format": "srt"}
    with open(path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            file=audio_file, **request_options
        )
    return transcript
def shift_srt(srt_data, shift_seconds):
    """Parse SRT text and offset every subtitle by *shift_seconds*.

    Args:
        srt_data: Subtitle content in SRT format, as a string.
        shift_seconds: Number of seconds to add to each subtitle's timing.

    Returns:
        The parsed ``pysrt`` subtitle collection with the shift applied.
    """
    parsed = pysrt.from_string(srt_data)
    # The file-level shift applies the same offset to each item in turn.
    parsed.shift(seconds=shift_seconds)
    return parsed
def process_file(filepath):
    """Transcribe one media file and save the merged subtitles as an .srt.

    The file is split into chunks of ``CHUNK_DURATION_SEC`` seconds, each
    chunk is transcribed, its subtitles are shifted by the chunk's start
    offset, and everything is written to ``OUTPUT_DIR/<basename>.srt``.

    A chunk that fails to transcribe is reported and skipped so the
    remaining chunks are still processed (deliberate best-effort).

    Args:
        filepath: Path of the media file to transcribe.
    """
    base_name = os.path.splitext(os.path.basename(filepath))[0]
    print(f"Processing: {base_name}")
    chunk_paths = split_media(filepath, CHUNK_DIR, CHUNK_DURATION_SEC)
    all_subs = pysrt.SubRipFile()
    for i, chunk_path in enumerate(chunk_paths):
        print(f" Transcribing chunk {i + 1}/{len(chunk_paths)}")
        try:
            srt_text = transcribe_chunk(chunk_path)
            # Chunk i starts i * CHUNK_DURATION_SEC seconds into the original.
            shifted = shift_srt(srt_text, i * CHUNK_DURATION_SEC)
            all_subs.extend(shifted)
        except Exception as e:
            # Best-effort boundary: report using the same 1-based number as
            # the progress line above, then carry on with the next chunk.
            print(f" Error on chunk {i + 1}: {e}")
    # Every chunk's subtitles are numbered from 1; renumber the merged list
    # so the saved file has unique, sequential indices.
    all_subs.clean_indexes()
    final_srt_path = os.path.join(OUTPUT_DIR, base_name + ".srt")
    all_subs.save(final_srt_path, encoding="utf-8")
    print(f"Saved: {final_srt_path}\n")
def main():
    """Transcribe every media file found in ``INPUT_DIR``, one after another."""
    pending = list_media_files(INPUT_DIR)
    print(f"Found {len(pending)} media files to process\n")
    for media_path in pending:
        process_file(media_path)


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment