|
import argparse |
|
from datetime import datetime, timedelta |
|
from pathlib import Path |
|
from typing import Optional |
|
|
|
from faster_whisper import WhisperModel |
|
|
|
|
|
# Model selection: either a Hugging Face model ID (downloaded on first use)
# or a path to a locally converted CTranslate2 model directory.
# model_size = "large-v2" # Will download faster-whisper-large-v2 from Hugging face

model_size = "faster-whisper-large-v2-ct2" # Load the local large-v2 model



# Alternative device/precision configurations for WhisperModel (the actual
# model is constructed inside transcribe_audio(), currently on CPU/INT8):

# Run on GPU with FP16

# model = WhisperModel(model_size, device="cuda", compute_type="float16")



# or run on GPU with INT8

# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")

# or run on CPU with INT8

# model = WhisperModel(model_size, device="cpu", compute_type="int8")
|
|
|
|
|
# Command-line interface: one required audio file plus an optional language code.
parser = argparse.ArgumentParser(
    # Brackets in a usage string conventionally mean "optional": only the
    # language argument is optional here, the audio file is required.
    usage="%(prog)s audio_file [lang]",
    description="Transcribe the given audio file using faster-whisper-large-v2 model",
    allow_abbrev=False,
)
parser.add_argument(
    "audio_file",
    type=str,
    help="Path to the audio file to transcribe",
)
parser.add_argument(
    "lang",
    type=str,
    nargs="?",
    default=None,
    help="Specify the audio language. Will auto-detect if not specified",
)
# NOTE(review): parse_args() runs at import time; consider moving it under an
# `if __name__ == "__main__":` guard if this module is ever imported.
args = parser.parse_args()
|
|
|
|
|
def srt_format_timestamp(seconds: float) -> str:
    """Convert a time offset in seconds to an SRT timestamp string.

    Adapted from:
    https://github.com/openai/whisper/discussions/98#discussioncomment-3726175

    Args:
        seconds: Non-negative time offset in seconds.

    Returns:
        The offset formatted as ``HH:MM:SS,mmm`` — comma before the
        milliseconds, as the SRT subtitle format requires.

    Raises:
        ValueError: If ``seconds`` is negative.
    """
    # Validate with an explicit exception rather than `assert`, which is
    # silently stripped when Python runs with -O.
    if seconds < 0:
        raise ValueError("non-negative timestamp expected")

    # Work in integer milliseconds to avoid float rounding artifacts.
    milliseconds = round(seconds * 1000.0)
    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    secs, milliseconds = divmod(milliseconds, 1_000)

    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
|
|
|
|
|
def transcribe_audio(audio_path: str, audio_lang: Optional[str]) -> None:
    """Transcribe an audio file and write the result as a timestamped SRT file.

    Each segment is printed to the console as it is produced and collected
    into an SRT file named ``<audio stem>_<YYYY-MM-DD_HH-MM>.srt`` in the
    current working directory.

    Args:
        audio_path: Path to the audio file to transcribe.
        audio_lang: Language code for the audio, or ``None`` to let the
            model auto-detect (the detection result is then printed).
    """
    # CPU with INT8 quantization; see the commented-out alternatives near
    # the top of the file for GPU configurations.
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    segments, info = model.transcribe(audio_path, language=audio_lang, beam_size=5)

    if audio_lang is None:
        print(f"Detected language '{info.language}' with probability {info.language_probability}")

    # Output filename: original stem plus a timestamp, so repeated runs
    # don't overwrite each other.
    now_time = datetime.now().strftime("%Y-%m-%d_%H-%M")
    file_name_no_suffix = Path(audio_path).with_suffix("").name
    srt_file_name = f"{file_name_no_suffix}_{now_time}.srt"

    # Collect entries in a list and join once at the end — repeated `+=`
    # on a string is quadratic for long transcripts.
    srt_entries = []
    for segment in segments:
        start_time = srt_format_timestamp(segment.start)
        end_time = srt_format_timestamp(segment.end)
        text = segment.text.strip()

        print(f"[{segment.id} - {start_time} --> {end_time}] {text}")
        srt_entries.append(f"{segment.id}\n{start_time} --> {end_time}\n{text}\n\n")

    # Explicit UTF-8 so non-ASCII transcriptions are written correctly on
    # every platform (write_text otherwise uses the locale encoding).
    Path(srt_file_name).write_text("".join(srt_entries), encoding="utf-8")
|
|
|
|
|
# Entry point: transcribe the file given on the command line.
# NOTE(review): runs at import time; consider an `if __name__ == "__main__":` guard.
transcribe_audio(args.audio_file, args.lang)