Skip to content

Instantly share code, notes, and snippets.

@gekkedev
Created February 27, 2026 22:57
Show Gist options
  • Select an option

  • Save gekkedev/156d175689bc859b5eec2b71380befe4 to your computer and use it in GitHub Desktop.

Select an option

Save gekkedev/156d175689bc859b5eec2b71380befe4 to your computer and use it in GitHub Desktop.
Offline transcription CLI with .txt/.srt/.json outputs. Works on consumer laptops, not just high-end GPUs.

Offline Transcription CLI (CPU-first, fallback-friendly)

Small CLI wrapper around faster-whisper to transcribe audio/video offline into:

  • *.txt
  • *.srt
  • *.json (with segment + word timestamps and confidence)

Designed for normal machines, with compatibility-first defaults:

  • device=cpu
  • compute_type=int8
  • automatic model fallback: large-v3 -> medium -> small -> base
  • optional fallback from GPU to CPU if GPU fails

Requirements

  • Python 3.9+
  • faster-whisper

Install:

python3 -m pip install --user faster-whisper

Basic usage

python3 run_offline_transcribe.py "my_audio.m4a"

Useful examples

Transcribe multiple files:

python3 run_offline_transcribe.py "a.wav" "b.mp3" --output-dir outputs

Force CPU-safe settings:

python3 run_offline_transcribe.py "call.m4a" --device cpu --compute-type int8

Higher quality (slower):

python3 run_offline_transcribe.py "call.m4a" --language en --beam-size 10 --best-of 10

Bias recognition for non-dictionary terms:

python3 run_offline_transcribe.py "call.m4a" --hotwords "Brand name,odd-spelled w0rd"

Use a smaller model directly:

python3 run_offline_transcribe.py "call.m4a" --model small --no-fallback

Notes

  • First run may download model weights.
  • Output filenames are based on the input filename's stem.
  • JSON stores the actual runtime model/device/compute type used.
MIT License
Copyright (c) 2026 gekkedev
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
#!/usr/bin/env python3
import argparse
import json
import sys
from pathlib import Path
from typing import List, Dict, Any, Tuple
from faster_whisper import WhisperModel
def format_timestamp(seconds: float) -> str:
    """Render a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, ms = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
def write_srt(path: Path, segments: List[Dict[str, Any]]) -> None:
    """Serialize *segments* to *path* in SubRip (.srt) format.

    Each cue is a 1-based index, a "start --> end" timing line, and the
    stripped segment text; cues are separated by a blank line and a
    non-empty file ends with a trailing newline.
    """
    cues: List[str] = []
    for number, seg in enumerate(segments, start=1):
        timing = f'{format_timestamp(seg["start"])} --> {format_timestamp(seg["end"])}'
        cues.append("\n".join((str(number), timing, seg["text"].strip())))
    # An empty segment list produces an empty file, exactly like the original.
    body = "\n\n".join(cues) + "\n" if cues else ""
    path.write_text(body, encoding="utf-8")
def parse_temperature(raw: str) -> List[float]:
    """Parse a comma-separated temperature schedule into a list of floats.

    Blank entries are skipped; raises ValueError when nothing remains.
    """
    values = [float(token) for token in (p.strip() for p in raw.split(",")) if token]
    if not values:
        raise ValueError("Temperature list cannot be empty.")
    return values
def parse_csv(raw: str) -> List[str]:
    """Split a comma-separated string into stripped, non-empty tokens."""
    stripped = (piece.strip() for piece in raw.split(","))
    return [token for token in stripped if token]
def parse_args() -> argparse.Namespace:
    """Build the CLI argument parser and parse sys.argv into a Namespace."""
    parser = argparse.ArgumentParser(
        description="Offline transcription with faster-whisper. Writes .txt, .srt and .json outputs."
    )
    add = parser.add_argument

    # Inputs and output naming.
    add("audio", nargs="+", help="Path(s) to audio/video files to transcribe.")
    add("--output-dir", default=".",
        help="Directory to write output files (default: current directory).")
    add("--output-prefix", default=None,
        help="Custom output base name for single-file input (without extension).")

    # Model selection and fallback behavior.
    add("--model", default="large-v3",
        help="Primary Whisper model name (e.g. large-v3, medium, small).")
    add("--fallback-models", default="medium,small,base",
        help="Comma-separated fallback models if the primary attempt fails (default: medium,small,base).")
    add("--no-fallback", action="store_true",
        help="Disable fallback to smaller models.")

    # Runtime/compute settings (CPU-first defaults).
    add("--device", default="cpu", help="Device for inference (cpu/cuda/auto).")
    add("--compute-type", default="int8",
        help="Compute type for CTranslate2 (e.g. int8, int8_float32, float32, float16).")
    add("--cpu-threads", type=int, default=None,
        help="Limit CPU threads for CTranslate2 (default: library decides).")
    add("--num-workers", type=int, default=1,
        help="Number of workers used by faster-whisper (default: 1, compatibility-friendly).")

    # Decoding options.
    add("--task", default="transcribe", choices=["transcribe", "translate"],
        help="Transcription task.")
    add("--language", default=None,
        help="Language code (e.g. de, en). Omit for auto-detection.")
    add("--beam-size", type=int, default=5, help="Beam size.")
    add("--best-of", type=int, default=5, help="Best-of sampling.")
    add("--patience", type=float, default=1.5, help="Decoding patience.")
    add("--temperature", default="0.0,0.2,0.4",
        help="Comma-separated temperature schedule.")
    add("--hotwords", default=None,
        help="Optional comma-separated hotwords to bias decoding.")
    add("--word-timestamps", action=argparse.BooleanOptionalAction, default=True,
        help="Enable per-word timestamps.")
    add("--vad-filter", action=argparse.BooleanOptionalAction, default=False,
        help="Enable voice activity detection filter.")
    add("--log-progress", action=argparse.BooleanOptionalAction, default=True,
        help="Show decode progress bar.")

    return parser.parse_args()
def build_model_attempts(args: argparse.Namespace) -> List[Tuple[str, str, str]]:
    """Enumerate (model, device, compute_type) combinations to try, in order.

    The primary model comes first; unless --no-fallback was given, each
    fallback model is appended once. When the requested device is not
    "cpu", every model also gets a CPU retry for maximum compatibility.
    """
    models = [args.model]
    if not args.no_fallback:
        candidates = [p.strip() for p in args.fallback_models.split(",") if p.strip()]
        for candidate in candidates:
            if candidate not in models:
                models.append(candidate)

    devices = [args.device]
    if args.device != "cpu":
        # If GPU/auto fails, retry on CPU for maximum compatibility.
        devices.append("cpu")

    return [
        (model_name, device_name, args.compute_type)
        for model_name in models
        for device_name in devices
    ]
def load_model(
    model_name: str,
    device_name: str,
    compute_type: str,
    args: argparse.Namespace,
) -> WhisperModel:
    """Instantiate a WhisperModel with settings derived from the CLI args."""
    options: Dict[str, Any] = dict(
        device=device_name,
        compute_type=compute_type,
        num_workers=args.num_workers,
    )
    # Only forward a thread cap when the user set one explicitly;
    # otherwise let the library pick its own default.
    if args.cpu_threads is not None:
        options["cpu_threads"] = args.cpu_threads
    return WhisperModel(model_name, **options)
def transcribe_one(
    model: WhisperModel,
    audio_path: Path,
    output_base: Path,
    args: argparse.Namespace,
    runtime_model: str,
    runtime_device: str,
    runtime_compute_type: str,
) -> None:
    """Transcribe one file and write <output_base>.txt, .srt and .json.

    Args:
        model: Loaded WhisperModel used for decoding.
        audio_path: Input audio/video file.
        output_base: Directory + stem for the three output files.
        args: Parsed CLI namespace with decoding options.
        runtime_model / runtime_device / runtime_compute_type: The settings
            that actually ran (they can differ from ``args.*`` when a
            fallback attempt succeeded); recorded in the JSON payload.
    """
    # Single source of truth for the decode thresholds: the same named
    # values are passed to model.transcribe() AND recorded in the JSON
    # payload, so the metadata can never drift from the actual decoding
    # configuration (previously these were duplicated literals).
    compression_ratio_threshold = 2.2
    log_prob_threshold = -0.8
    no_speech_threshold = 0.4

    temperature = parse_temperature(args.temperature)
    segments_iter, info = model.transcribe(
        str(audio_path),
        task=args.task,
        language=args.language,
        beam_size=args.beam_size,
        best_of=args.best_of,
        patience=args.patience,
        length_penalty=1.0,
        repetition_penalty=1.02,
        no_repeat_ngram_size=3,
        condition_on_previous_text=True,
        log_progress=args.log_progress,
        compression_ratio_threshold=compression_ratio_threshold,
        log_prob_threshold=log_prob_threshold,
        no_speech_threshold=no_speech_threshold,
        word_timestamps=args.word_timestamps,
        vad_filter=args.vad_filter,
        temperature=temperature,
        hotwords=args.hotwords,
    )

    # Materialize the lazy segment iterator into JSON-friendly dicts.
    segments: List[Dict[str, Any]] = []
    full_text_parts: List[str] = []
    for seg in segments_iter:
        # Per-word entries are only present when word timestamps are enabled.
        words = []
        if seg.words:
            for w in seg.words:
                words.append(
                    {
                        "word": w.word,
                        "start": w.start,
                        "end": w.end,
                        "probability": w.probability,
                    }
                )
        segments.append(
            {
                "id": seg.id,
                "start": seg.start,
                "end": seg.end,
                "text": seg.text,
                "avg_logprob": seg.avg_logprob,
                "compression_ratio": seg.compression_ratio,
                "no_speech_prob": seg.no_speech_prob,
                "words": words,
            }
        )
        full_text_parts.append(seg.text.strip())

    language = info.language
    language_prob = info.language_probability

    txt_path = output_base.parent / f"{output_base.name}.txt"
    srt_path = output_base.parent / f"{output_base.name}.srt"
    json_path = output_base.parent / f"{output_base.name}.json"

    # Plain text drops empty segments; SRT keeps one cue per segment.
    txt_path.write_text("\n".join(part for part in full_text_parts if part), encoding="utf-8")
    write_srt(srt_path, segments)

    payload = {
        "source_audio": str(audio_path),
        "model": runtime_model,
        "device": runtime_device,
        "compute_type": runtime_compute_type,
        "detected_language": language,
        "detected_language_probability": language_prob,
        "duration": info.duration,
        "duration_after_vad": info.duration_after_vad,
        "transcription_options": {
            "task": args.task,
            "language": args.language,
            "beam_size": args.beam_size,
            "best_of": args.best_of,
            "patience": args.patience,
            "word_timestamps": args.word_timestamps,
            "vad_filter": args.vad_filter,
            "hotwords": args.hotwords,
            "temperature": temperature,
            "compression_ratio_threshold": compression_ratio_threshold,
            "log_prob_threshold": log_prob_threshold,
            "no_speech_threshold": no_speech_threshold,
        },
        "segments": segments,
    }
    json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")

    print(
        f"[{audio_path.name}] model={runtime_model}, device={runtime_device}, "
        f"language={language} (p={language_prob:.3f}), duration={info.duration:.1f}s"
    )
    print(f"Wrote: {txt_path}, {srt_path}, {json_path}")
def main() -> None:
    """CLI entry point: validate inputs, then transcribe each file, trying
    each (model, device, compute_type) combination until one succeeds."""
    args = parse_args()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    inputs = [Path(p) for p in args.audio]
    for candidate in inputs:
        if not candidate.exists():
            raise FileNotFoundError(f"Input file not found: {candidate}")
    if args.output_prefix and len(inputs) > 1:
        raise ValueError("--output-prefix can only be used with a single input file.")

    attempts = build_model_attempts(args)
    # Cache loaded models so multiple inputs reuse the same weights.
    loaded: Dict[Tuple[str, str, str], WhisperModel] = {}

    for audio_path in inputs:
        base_name = args.output_prefix if args.output_prefix else audio_path.stem
        output_base = out_dir / base_name
        last_error: Exception | None = None

        for model_name, device_name, compute_type in attempts:
            key = (model_name, device_name, compute_type)
            try:
                if key not in loaded:
                    print(
                        f"Loading model={model_name}, device={device_name}, "
                        f"compute_type={compute_type}..."
                    )
                    loaded[key] = load_model(model_name, device_name, compute_type, args)
                transcribe_one(
                    loaded[key],
                    audio_path,
                    output_base,
                    args,
                    runtime_model=model_name,
                    runtime_device=device_name,
                    runtime_compute_type=compute_type,
                )
                break
            except Exception as exc:
                last_error = exc
                print(
                    f"Attempt failed: model={model_name}, device={device_name}, "
                    f"compute_type={compute_type}. Error: {exc}",
                    file=sys.stderr,
                )
        else:
            # No attempt broke out of the loop: every combination failed.
            raise RuntimeError(
                f"All transcription attempts failed for {audio_path}. "
                f"Last error: {last_error}"
            )
# Run only when executed as a script, so the module stays importable.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment