Last active
July 31, 2025 11:37
-
-
Save abdusco/f79fdd7ddcf8e0d3f623a15fcf9495d3 to your computer and use it in GitHub Desktop.
Generate SRT subtitles for a video / audio using OpenAI's Whisper model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env -S uv run --script | |
# /// script | |
# dependencies = ["httpx", "srt"] | |
# /// | |
""" | |
# Whisper Subtitles | |
This script generates SRT subtitles for audio or video files using the OpenAI Whisper API. | |
## Features | |
- Extracts audio from video files. | |
- Speeds up the audio before transcription to reduce token usage and cost (transcription still works reliably). | |
- Stretches subtitle timings back to match the original speed. | |
- Saves the result as SRT subtitle files. | |
## Usage | |
- Make sure ffmpeg is installed and [uv is installed](https://docs.astral.sh/uv/getting-started/installation/).
- Set your OpenAI API key in the environment variable `OPENAI_API_KEY`. | |
```bash | |
chmod +x whisper_subtitles.py | |
OPENAI_API_KEY=sk-123 ./whisper_subtitles.py input_file.mp4 -o output.srt | |
``` | |
""" | |
import argparse | |
import logging | |
import os | |
import subprocess | |
import tempfile | |
from pathlib import Path | |
import httpx | |
import srt | |
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s") | |
logger = logging.getLogger(__name__) | |
def check_ffmpeg_installed() -> None:
    """Verify that the ffmpeg binary is available and runnable.

    Raises:
        RuntimeError: if ffmpeg is not on PATH, or exists but fails to run.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    # BUG FIX: the original only caught FileNotFoundError, so a broken ffmpeg
    # install (binary present but `-version` failing) escaped as a raw
    # CalledProcessError. Chain the cause so the underlying failure stays visible.
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        raise RuntimeError("ffmpeg is not installed. Please install it to use this script.") from e
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the subtitle generator.

    Returns:
        argparse.Namespace with attributes: input (Path), output (Path),
        openai_api_key (str), and speed (float, constrained to [1, 10]).
    """
    parser = argparse.ArgumentParser(
        description="Generate SRT subtitles using OpenAI Whisper API.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("input", type=Path, help="Input audio or video file")
    parser.add_argument("-o", "--output", type=Path, required=True, help="Output SRT file path")
    env_key = os.getenv("OPENAI_API_KEY")
    parser.add_argument(
        "--openai-api-key",
        type=str,
        default=env_key,
        # BUG FIX: `required=True` forced the flag even when OPENAI_API_KEY was
        # set, defeating the documented env-var default. Only require the flag
        # when the environment variable is absent.
        required=env_key is None,
        help="OpenAI API key (default: from OPENAI_API_KEY env var)",
    )

    def float_between(lo: float, hi: float):
        """Build an argparse type converter enforcing lo <= value <= hi."""

        def convert(value: str) -> float:
            fvalue = float(value)
            if not (lo <= fvalue <= hi):
                raise argparse.ArgumentTypeError(f"Value must be between {lo} and {hi}")
            # BUG FIX: the original returned the raw string, so a user-supplied
            # --speed reached the rest of the program as `str`, not `float`
            # (breaking timedelta arithmetic in stretch_srt).
            return fvalue

        return convert

    parser.add_argument(
        "--speed",
        type=float_between(1, 10),
        default=2.5,
        help="Audio speed-up factor for transcription to reduce token usage at the cost of accuracy",
    )
    return parser.parse_args()
def extract_audio(input_path: Path, save_path: Path, speed: float) -> Path:
    """Extract a mono 16 kHz AAC track from *input_path*, sped up by *speed*.

    The low bitrate, mono downmix, and tempo change shrink the audio that is
    later uploaded for transcription. Returns *save_path* for convenience.
    """
    ffmpeg_args = ["ffmpeg", "-y", "-i", str(input_path)]
    ffmpeg_args += ["-vn"]                       # drop any video stream
    ffmpeg_args += ["-acodec", "aac"]            # encode audio as AAC
    ffmpeg_args += ["-ar", "16000", "-ac", "1"]  # 16 kHz mono is enough for speech
    ffmpeg_args += ["-b:a", "32k"]               # low bitrate keeps the upload small
    ffmpeg_args += ["-filter:a", f"atempo={speed}"]
    ffmpeg_args.append(str(save_path))
    subprocess.run(ffmpeg_args, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return save_path
def stretch_srt(srt_text: str, speed: float) -> str:
    """Rescale subtitle timestamps by *speed* to undo the transcription speed-up.

    Timings in *srt_text* were produced from audio played at *speed*x, so
    multiplying every start/end timedelta by *speed* maps the cues back onto
    the original timeline.
    """
    rescaled = []
    for cue in srt.parse(srt_text):
        cue.start *= speed
        cue.end *= speed
        rescaled.append(cue)
    return srt.compose(rescaled)
def read_audio_duration(audio_path: Path) -> float:
    """Return the duration of *audio_path* in seconds, as reported by ffprobe."""
    probe = subprocess.run(
        [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            str(audio_path),
        ],
        capture_output=True,
        text=True,
        check=True,
    )
    return float(probe.stdout.strip())
def transcribe_as_srt(audio_path: Path, api_key: str) -> str:
    """Upload *audio_path* to the OpenAI Whisper API and return SRT text.

    Args:
        audio_path: Audio file to transcribe.
        api_key: OpenAI API key, sent as a Bearer token.

    Returns:
        The transcription response body in SRT format.

    Raises:
        httpx.HTTPStatusError: if the API responds with an error status.
    """
    # BUG FIX: the original httpx.Client was never closed, leaking its
    # connection pool; the context manager guarantees cleanup even on failure.
    with httpx.Client(
        base_url="https://api.openai.com/v1/",
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=120,
    ) as client, audio_path.open("rb") as f:
        res = client.post(
            "/audio/transcriptions",
            data={"model": "whisper-1", "response_format": "srt"},
            files={"file": f},
        )
    if res.is_error:
        logger.error(f"Error during transcription: {res.text}")
        res.raise_for_status()
    if ms := res.headers.get("openai-processing-ms"):
        logger.info(f"Transcription done in {ms} ms")
    try:
        # Best-effort cost estimate from Whisper's per-minute pricing;
        # failures here must never break the transcription itself.
        price_per_min = 0.006
        duration = read_audio_duration(audio_path)
        cost = (duration / 60) * price_per_min
        logger.info(f"Transcription cost: ${cost:.4f} for {duration:.2f} seconds")
    except Exception:
        pass
    return res.text
def main() -> None:
    """CLI entry point: extract sped-up audio, transcribe it, write the SRT."""
    args = parse_args()
    check_ffmpeg_installed()
    # BUG FIX: honor the user-supplied -o/--output path. The original replaced
    # the chosen filename with "<input stem>.srt", contradicting the usage docs.
    save_path: Path = args.output.resolve()
    logger.info(f"Extracting audio from {args.input}")
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = Path(tmpdir) / "audio.m4a"
        extract_audio(input_path=args.input, save_path=audio_path, speed=args.speed)
        try:
            logger.info(f"Transcribing {audio_path}")
            # BUG FIX: argparse stores --openai-api-key as `openai_api_key`;
            # the original read `args.api_key`, which raised AttributeError.
            srt_text = transcribe_as_srt(audio_path=audio_path, api_key=args.openai_api_key)
            # Keep the raw (still sped-up) transcript next to the output as .txt.
            save_path.with_name(f"{args.input.stem}.txt").write_text(srt_text)
        except Exception as e:
            logger.error(f"Error during transcription: {e}")
            raise SystemExit(1)
    logger.debug("Stretching SRT timings to 1x")
    srt_stretched = stretch_srt(srt_text=srt_text, speed=args.speed)
    save_path.write_text(srt_stretched)
    logger.info(f"SRT written to {save_path}")
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment