@Phate334
Last active January 22, 2025 17:01
Faster Whisper + Google translate
import argparse
import asyncio
import logging
from pathlib import Path
import ffmpeg
import pysubs2
from faster_whisper import BatchedInferencePipeline, WhisperModel
from faster_whisper.vad import VadOptions
from googletrans import Translator
logging.basicConfig(level=logging.INFO)
# "Systran/faster-whisper-large-v3"
DEFAULT_MOKDEL_NAME = "deepdml/faster-whisper-large-v3-turbo-ct2"
vad_opts = VadOptions(
min_silence_duration_ms=500, # 降低靜音判斷時長到 500ms
speech_pad_ms=200, # 降低 padding 到 200ms
max_speech_duration_s=5, # 限制每段最長 10 秒
onset=0.5, # 語音判定閾值
offset=0.35, # 靜音判定閾值
)
async def translate_subs(subs, target_lang):
    translator = Translator()
    for line in subs:
        translation = await translator.translate(line.text, dest=target_lang)
        line.text = translation.text
    return subs
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Faster Whisper transcription and Google translation"
    )
    parser.add_argument("file", type=str, help="Path to the audio file")
    parser.add_argument(
        "--src-lang", type=str, help="Language of the audio file", default="ja"
    )
    parser.add_argument(
        "--tgt-lang",
        type=str,
        help="Language to translate the transcript to",
        default="zh-tw",
    )
    parser.add_argument(
        "--model",
        type=str,
        help="faster-whisper model name",
        default=DEFAULT_MODEL_NAME,
    )
    parser.add_argument(
        "--keep", action="store_true", help="Keep the intermediate files"
    )
    parser.add_argument(
        "--no-trans", action="store_true", default=False, help="Skip translation step"
    )
    args = parser.parse_args()
    src_lang = args.src_lang
    no_trans = args.no_trans
    target_lang = args.tgt_lang
    model_name = args.model
    whisper_model = WhisperModel(
        model_name, device="cuda", compute_type="int8_float16"
    )
    batched_model = BatchedInferencePipeline(model=whisper_model)
    src_file = Path(args.file)
    src_filename = src_file.stem
    src_wav_file = src_file.with_suffix(".wav")
    src_srt_file = src_file.with_suffix(".srt")
    src_dir = src_file.parent
    # Convert the input to 16 kHz mono WAV if it has not been converted yet
    if not src_wav_file.exists():
        logging.info("%s does not exist. Use ffmpeg to convert it to WAV", src_wav_file)
        ffmpeg.input(str(src_file)).output(
            str(src_wav_file), ar=16000, ac=1, c="pcm_s16le"
        ).run(overwrite_output=True)
    else:
        logging.info("%s already exists. Skipping conversion", src_wav_file)
    logging.info("Transcribing %s to %s", src_file, src_srt_file)
    with open(src_wav_file, "rb") as audio_file:
        segments, info = batched_model.transcribe(
            audio=audio_file,
            language=src_lang,
            condition_on_previous_text=True,
            prefix=f"This is a transcript of the audio file in {src_lang}.",
            vad_filter=True,
            vad_parameters=vad_opts,
        )
        # segments is a lazy generator; consume it inside the with block
        # so the audio file stays open while transcription runs
        results = []
        for s in segments:
            results.append({"start": s.start, "end": s.end, "text": s.text})
    logging.info("Transcription completed")
    subs = pysubs2.load_from_whisper(results)
    subs.save(src_srt_file, encoding="utf-8")
    # Translate the transcript to the target language
    if not no_trans:
        logging.info("Translating %s to %s", src_srt_file, target_lang)
        translated_subs = asyncio.run(translate_subs(subs, target_lang))
        output_path = src_dir.joinpath(f"{src_filename}_{target_lang}.srt")
        translated_subs.save(output_path, encoding="utf-8")
        logging.info("Translated subtitles saved to %s", output_path)
    else:
        logging.info("Skipping translation as --no-trans was specified")
    # Clean up the intermediate files
    if not args.keep:
        logging.info("Cleaning up intermediate files")
        src_wav_file.unlink()
        src_srt_file.unlink()
    else:
        logging.info("Intermediate files kept")
import os
from tqdm import tqdm
from faster_whisper import WhisperModel
from googletrans import Translator
import pysubs2
import torch
whisper_size = "large-v2"
print('Loading model...')
if torch.cuda.is_available():
    whisper_model = WhisperModel(whisper_size, device="cuda", compute_type="float16")
    device = torch.device("cuda")
else:
    whisper_model = WhisperModel(whisper_size, device="cpu", compute_type="int8")
    device = torch.device("cpu")
def transcribe_to_srt(file_name, language, model, condition_on_previous_text=True):
    assert os.path.exists(file_name), f"No {file_name} found in current path."
    file_basename = os.path.splitext(file_name)[0]
    torch.cuda.empty_cache()
    print('Transcribe in progress...')
    segments, info = model.transcribe(
        audio=file_name,
        beam_size=5,
        language=language,
        condition_on_previous_text=condition_on_previous_text,
        vad_filter=True,  # vad_parameters is only applied when VAD filtering is enabled
        vad_parameters=dict(min_silence_duration_ms=1000),
    )
    total_duration = round(info.duration, 2)
    results = []
    # segments is a lazy generator, so transcription happens as it is consumed;
    # track progress against the total audio duration
    with tqdm(total=total_duration, unit=" seconds") as pbar:
        for s in segments:
            results.append({'start': s.start, 'end': s.end, 'text': s.text})
            pbar.update(s.end - s.start)
    print('Transcription done')
    subs = pysubs2.load_from_whisper(results)
    srt_file_path = file_basename + '.srt'
    subs.save(srt_file_path)
    return srt_file_path
def translate_srt(srt_path: str, target_language: str) -> str:
    translator = Translator()
    subs = pysubs2.load(srt_path)
    for line in tqdm(subs):
        translation = translator.translate(line.text, dest=target_language)
        line.text = translation.text
    # name the output "<input>_<lang>.srt" next to the original file
    output_path = srt_path.rsplit('.', 1)[0] + '_' + target_language + '.srt'
    subs.save(output_path, encoding='utf-8')
    return output_path

Install

!pip install faster-whisper
!pip install googletrans==3.1.0a0
!pip install pysubs2
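
Note: the pinned googletrans==3.1.0a0 exposes a synchronous translate(), which is what translate_srt in the second script calls. The first script awaits translator.translate(...), which requires one of the newer async googletrans releases (the 4.x line) rather than this pin.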
Phate334 commented Aug 5, 2023

Usage

from pathlib import Path
import os

storage_source = "/drive/MyDrive/Colab Notebooks/Whisper Youtube/"
for mp4 in Path(storage_source).glob('*.mp4'):
  wav_path = f"{storage_source}{mp4.stem}.wav"
  srt_path = f"{storage_source}{mp4.stem}.srt"
  translated_str_path = f"{storage_source}{mp4.stem}_translated.srt"

  os.system(f'ffmpeg -i "{str(mp4)}" -ar 16000 -ac 1 -c:a pcm_s16le "{wav_path}"')
  transcribe_to_srt(wav_path, "ja", whisper_model, condition_on_previous_text=True)
  translated_str = translate_srt(srt_path, 'zh-TW')
  os.system(f'cp "{translated_str}" "{translated_str_path}"')

  # 刪除 wav 和原始的 srt 檔案
  os.remove(wav_path)
  os.remove(srt_path)
