Last active
June 3, 2025 19:28
-
-
Save mbutler/a1a5bbb05222f5858bc01845c6acdeda to your computer and use it in GitHub Desktop.
Batch transcribe audio and video files using OpenAI Whisper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import math | |
import subprocess | |
import openai | |
import pysrt | |
from datetime import timedelta | |
# --- Configuration ----------------------------------------------------------
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# live in source control. The literal placeholder is kept as the fallback, so
# the script behaves exactly as before when the variable is unset or empty.
API_KEY = os.environ.get("OPENAI_API_KEY") or "api-key"
INPUT_DIR = "input_media"    # scanned for .mp4 / .mp3 files
CHUNK_DIR = "audio_chunks"   # receives the intermediate MP3 chunks
OUTPUT_DIR = "output"        # receives one .srt file per input file
CHUNK_DURATION_SEC = 300     # length, in seconds, of each transcribed chunk

client = openai.OpenAI(api_key=API_KEY)

# Create the working directories up front. INPUT_DIR is not created here.
os.makedirs(CHUNK_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
def list_media_files(directory):
    """Return the paths of the .mp4/.mp3 files directly inside *directory*.

    The extension match is case-insensitive. Only regular files are returned
    (a sub-directory whose name happens to end in ``.mp4`` is skipped), and
    the result is sorted by file name so batches are processed in a
    reproducible order -- ``os.listdir`` returns entries in arbitrary order.

    Raises whatever ``os.listdir`` raises (e.g. ``FileNotFoundError``) when
    *directory* cannot be read.
    """
    media_extensions = ('.mp4', '.mp3')
    return [
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.lower().endswith(media_extensions)
        and os.path.isfile(os.path.join(directory, name))
    ]
def get_duration_seconds(input_path):
    """Ask ffprobe for the container duration of *input_path*, in seconds.

    Returns the duration as a float. ``subprocess.check_output`` raises
    ``CalledProcessError`` when ffprobe exits non-zero, and ``float`` raises
    ``ValueError`` when the printed value is not a number.
    """
    probe_cmd = ["ffprobe", "-v", "error"]
    probe_cmd += ["-show_entries", "format=duration"]
    # Output formatting flags so that only the bare value is printed.
    probe_cmd += ["-of", "default=noprint_wrappers=1:nokey=1"]
    probe_cmd.append(input_path)

    raw_output = subprocess.check_output(probe_cmd)
    duration_text = raw_output.decode().strip()
    return float(duration_text)
def split_media(input_path, output_dir, chunk_duration):
    """Split *input_path* into consecutive mono 16 kHz MP3 chunks.

    Args:
        input_path: Media file to split.
        output_dir: Directory that receives the chunk files; created if
            missing.
        chunk_duration: Length of each chunk in seconds; must be positive.

    Returns:
        A list with one path per chunk, in playback order. Entry ``i`` always
        corresponds to the chunk starting at ``i * chunk_duration`` seconds,
        even when ffmpeg failed to produce it (the file then does not exist),
        so callers can still derive time offsets from the list index.

    Raises:
        ValueError: If *chunk_duration* is not positive.
        Anything ``get_duration_seconds`` or ``subprocess.run`` raises, e.g.
        when ffprobe/ffmpeg is not installed.
    """
    if chunk_duration <= 0:
        raise ValueError(
            f"chunk_duration must be positive, got {chunk_duration!r}"
        )

    # Do not rely on the caller having created the target directory.
    os.makedirs(output_dir, exist_ok=True)

    total_duration = get_duration_seconds(input_path)
    total_chunks = math.ceil(total_duration / chunk_duration)
    base = os.path.splitext(os.path.basename(input_path))[0]

    chunk_paths = []
    for i in range(total_chunks):
        start_time = i * chunk_duration
        output_file = os.path.join(output_dir, f"{base}_chunk_{i:03}.mp3")
        cmd = [
            "ffmpeg", "-y", "-i", input_path, "-ss", str(start_time),
            "-t", str(chunk_duration),
            # Drop video; 16 kHz, mono, 64 kbit/s audio.
            "-vn", "-ar", "16000", "-ac", "1", "-b:a", "64k",
            output_file,
        ]
        result = subprocess.run(
            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        if result.returncode != 0:
            # A failed run may leave a partial file, or a stale chunk from an
            # earlier run under the same name. Remove it so it is never
            # mistaken for this chunk's audio.
            print(
                f"  Warning: ffmpeg exited with code {result.returncode} "
                f"for chunk {i + 1}/{total_chunks}"
            )
            try:
                os.remove(output_file)
            except OSError:
                pass  # best-effort cleanup; nothing usable to remove
        # Appended even on failure to keep list index == chunk index.
        chunk_paths.append(output_file)
    return chunk_paths
def transcribe_chunk(path):
    """Send the audio file at *path* to the ``whisper-1`` model.

    The file is opened in binary mode and closed once the request returns.
    The API is asked for ``srt`` output and its response is returned as-is.
    """
    request_options = {"model": "whisper-1", "response_format": "srt"}
    with open(path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            file=audio_file, **request_options
        )
    return transcript
def shift_srt(srt_data, shift_seconds):
    """Parse *srt_data* and offset every subtitle by *shift_seconds*.

    Returns the parsed ``pysrt`` subtitle collection with the offset applied.
    """
    parsed = pysrt.from_string(srt_data)
    # SubRipFile.shift applies the same offset to every item it contains.
    parsed.shift(seconds=shift_seconds)
    return parsed
def process_file(filepath):
    """Transcribe one media file and write the merged subtitles to OUTPUT_DIR.

    The file is split into CHUNK_DURATION_SEC-long chunks, each chunk is
    transcribed separately, its timestamps are offset by the chunk's start
    time, and all subtitles are saved together as ``<base name>.srt``.

    A chunk that fails is reported and skipped (best effort), so the output
    may contain gaps; the number of failed chunks is printed before the
    final "Saved" line.
    """
    base_name = os.path.splitext(os.path.basename(filepath))[0]
    print(f"Processing: {base_name}")

    chunk_paths = split_media(filepath, CHUNK_DIR, CHUNK_DURATION_SEC)
    total = len(chunk_paths)
    all_subs = pysrt.SubRipFile()
    failed = 0

    for i, chunk_path in enumerate(chunk_paths):
        print(f"  Transcribing chunk {i + 1}/{total}")
        try:
            srt_text = transcribe_chunk(chunk_path)
            # Chunk i starts i * CHUNK_DURATION_SEC into the original file.
            shifted = shift_srt(srt_text, i * CHUNK_DURATION_SEC)
        except Exception as e:
            failed += 1
            # Same 1-based numbering as the progress line above.
            print(f"  Error on chunk {i + 1}/{total}: {e}")
        else:
            all_subs.extend(shifted)

    # Every chunk's SRT numbers its cues from 1; renumber so the merged file
    # has a single increasing index sequence.
    all_subs.clean_indexes()

    final_srt_path = os.path.join(OUTPUT_DIR, base_name + ".srt")
    all_subs.save(final_srt_path, encoding="utf-8")
    if failed:
        print(f"  Warning: {failed} of {total} chunk(s) failed; subtitles are incomplete")
    print(f"Saved: {final_srt_path}\n")
def main():
    """Transcribe every media file found in INPUT_DIR, one after another."""
    targets = list_media_files(INPUT_DIR)
    print(f"Found {len(targets)} media files to process\n")
    for media_path in targets:
        process_file(media_path)


# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment