Faster Whisper transcribe script

@PaperNick · Last active November 18, 2023

This is a helper script that makes transcribing audio with the guillaumekln/faster-whisper model easier.

Installation

  1. Create a venv: `python -m venv venv`
  2. Activate the venv: `source venv/bin/activate`
  3. Install faster-whisper from PyPI: `pip install faster-whisper`
  4. Download the faster-whisper-large-v2 model from Hugging Face (a download sketch follows this list)
  5. Move the Hugging Face large-v2 model contents into a `faster-whisper-large-v2-ct2` folder
  6. Run a transcription job: `python transcribe.py audio.mp3`
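
If you want to script step 4 instead of downloading the model by hand, the files can be fetched with the `huggingface_hub` package. This is a minimal sketch, not part of the original gist: it assumes `pip install huggingface_hub`, and the repo id and target folder are taken from the steps above.

```python
# Optional helper for step 4 (assumption: huggingface_hub is installed).
from huggingface_hub import snapshot_download

# Download the CTranslate2 conversion of Whisper large-v2 into the folder
# name that transcribe.py expects.
snapshot_download(
    repo_id="guillaumekln/faster-whisper-large-v2",
    local_dir="faster-whisper-large-v2-ct2",
)
```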
transcribe.py:

```python
import argparse
from datetime import datetime
from pathlib import Path
from typing import Optional

from faster_whisper import WhisperModel

# model_size = "large-v2"  # Will download faster-whisper-large-v2 from Hugging Face
model_size = "faster-whisper-large-v2-ct2"  # Load the local large-v2 model

# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# or run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# or run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
parser = argparse.ArgumentParser(
    usage="%(prog)s [audio file]",
    description="Transcribe the given audio file using faster-whisper-large-v2 model",
    allow_abbrev=False,
)
parser.add_argument("audio_file", type=str)
parser.add_argument(
    "lang",
    type=str,
    nargs="?",
    default=None,
    help="Specify the audio language. Will auto-detect if not specified",
)
args = parser.parse_args()
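# Illustrative invocations (the language code is any code Whisper accepts):
#   python transcribe.py audio.mp3       # auto-detect the language
#   python transcribe.py audio.mp3 en    # force English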
def srt_format_timestamp(seconds: float) -> str:
    """
    Taken from: https://github.com/openai/whisper/discussions/98#discussioncomment-3726175
    """
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)
    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000
    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000
    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
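# Worked example (added for illustration):
#   srt_format_timestamp(3661.5) -> "01:01:01,500"  (1 h, 1 min, 1.5 s)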
def transcribe_audio(audio_path: str, audio_lang: Optional[str]) -> None:
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    segments, info = model.transcribe(audio_path, language=audio_lang, beam_size=5)

    if audio_lang is None:
        print(f"Detected language '{info.language}' with probability {info.language_probability}")

    now_time = datetime.now().strftime("%Y-%m-%d_%H-%M")
    file_name_no_suffix = Path(audio_path).with_suffix("").name
    srt_file_name = Path(f"{file_name_no_suffix}_{now_time}.srt").name

    transcription = ""
    for segment in segments:
        start_time = srt_format_timestamp(segment.start)
        end_time = srt_format_timestamp(segment.end)
        text_srt = f"{segment.id}\n{start_time} --> {end_time}\n{segment.text.strip()}\n\n"
        text_console = f"[{segment.id} - {start_time} --> {end_time}] {segment.text.strip()}"
        print(text_console)
        transcription += text_srt

    Path(srt_file_name).write_text(transcription)


transcribe_audio(args.audio_file, args.lang)
```
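
Each entry in the generated `.srt` file follows the standard SubRip layout built by `text_srt` above; the timestamps and text below are made up for illustration:

```
1
00:00:00,000 --> 00:00:04,200
First transcribed sentence.

2
00:00:04,200 --> 00:00:09,750
Second transcribed sentence.
```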