# transcript video
# Prepping your environment #
# 1. Set up WSL with ffmpeg: https://streaminglearningcenter.com/encoding/running-ffmpeg-on-windows-subsystem-for-linux.html
# 2. Install Miniconda in your WSL environment: https://dev.to/sfpear/miniconda-in-wsl-3642
# 3. Install Python packages:
# pip uninstall ffmpeg
# pip uninstall ffmpeg-python
# pip install ffmpeg-python git+https://github.com/SYSTRAN/faster-whisper
# 4. [IF YOU HAVE A COMPATIBLE NVIDIA GPU] Install all CUDA stuff, which can be painful
# The instructions in this section should work cleanly: https://github.com/SYSTRAN/faster-whisper?tab=readme-ov-file#install-with-pip-linux-only
# But it might be easiest to use the Docker image.
# If you're going the Docker route, here are a few to-dos (with instructions) that seem decent: https://logic2020.com/insight/wsl-docker-gpu-enabled-nvidia/
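# Optional sanity check before committing to the GPU path: faster-whisper runs
# on CTranslate2, which can report how many CUDA devices it sees. A minimal
# sketch, assuming ctranslate2 was pulled in by the faster-whisper install:
# import ctranslate2
# print(ctranslate2.get_cuda_device_count())  # 0 -> stick with the CPU path
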
import time
import math
import ffmpeg
import os
import pickle
from faster_whisper import WhisperModel

def extract_audio(input_video, input_video_name):
    """
    Extract the audio track from a video file into a WAV file
    """
    extracted_audio = f"audio-{input_video_name}.wav"
    stream = ffmpeg.input(input_video)
    stream = ffmpeg.output(stream, extracted_audio)
    ffmpeg.run(stream, overwrite_output=True)
    return extracted_audio

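# Optional: Whisper models resample input to 16 kHz mono internally, so
# downmixing at extraction time keeps the intermediate WAV small. A hedged
# sketch using ffmpeg-python's keyword-to-flag mapping (ac = audio channels,
# ar = sample rate), swapped in for the ffmpeg.output() call above:
# stream = ffmpeg.output(stream, extracted_audio, ac=1, ar=16000)
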
def transcribe(model, audio):
    """
    Run the transcription model and print each segment as it is decoded
    """
    # model = WhisperModel("small")
    segments, info = model.transcribe(audio)
    language = info.language
    print("Transcription language:", language)
    segments = list(segments)
    for segment in segments:
        # print(segment)
        print(
            "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
        )
    return language, segments

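# If you need word-level timing (e.g., highlighting words as they are spoken),
# faster-whisper's transcribe() also accepts word_timestamps=True, and each
# segment then carries a .words list. A minimal sketch:
# segments, info = model.transcribe(audio, word_timestamps=True)
# for segment in segments:
#     for word in segment.words:
#         print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
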
def format_time(seconds):
    """
    Helper for formatting a float number of seconds as a clean
    SRT timestamp (HH:MM:SS,mmm)
    """
    hours = math.floor(seconds / 3600)
    seconds %= 3600
    minutes = math.floor(seconds / 60)
    seconds %= 60
    milliseconds = round((seconds - math.floor(seconds)) * 1000)
    seconds = math.floor(seconds)
    formatted_time = (
        f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
    )
    return formatted_time

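# For example, with the helper as written:
# format_time(3661.437) -> "01:01:01,437"
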
def generate_subtitle_file(language, segments, input_video_name):
    """
    Create an SRT subtitle file from the transcribed segments
    """
    subtitle_file = f"sub-{input_video_name}.{language}.srt"
    text = ""
    for index, segment in enumerate(segments):
        segment_start = format_time(segment.start)
        segment_end = format_time(segment.end)
        text += f"{index + 1}\n"
        text += f"{segment_start} --> {segment_end}\n"
        text += f"{segment.text}\n"
        text += "\n"
    with open(subtitle_file, "w") as f:
        f.write(text)
    return subtitle_file

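# Optional follow-up: hard-code (burn) the subtitles into the video using
# ffmpeg's subtitles filter. A minimal sketch via ffmpeg-python, where the
# output filename is a made-up example and vf maps to ffmpeg's -vf flag:
# video = ffmpeg.input(input_video)
# stream = ffmpeg.output(video, f"subtitled-{input_video_name}.mp4", vf=f"subtitles={subtitle_file}")
# ffmpeg.run(stream, overwrite_output=True)
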
def main():
    """
    Driver
    """
    os.chdir("/mnt/c")
    input_video = "video_file.mp4"
    input_video_name = input_video.replace(".mp4", "")
    extracted_audio = extract_audio(
        input_video=input_video, input_video_name=input_video_name
    )
    # Check that the extraction actually produced a file
    if not os.path.isfile(extracted_audio):
        raise FileNotFoundError("Audio file does not exist")
    # If using CPU
    # NOTE: Set the number of threads appropriately for your machine
    num_threads = 4
    os.environ["OMP_NUM_THREADS"] = f"{num_threads}"
    model = WhisperModel(
        model_size_or_path="small.en",
        device="cpu",
        cpu_threads=num_threads,
        num_workers=3,
    )
    # If using GPU
    # model = WhisperModel(model_size_or_path="small.en", device="cuda")
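    # On GPU, compute_type selects the quantization used by CTranslate2;
    # float16 is a common choice per the faster-whisper README. Sketch:
    # model = WhisperModel(
    #     model_size_or_path="small.en", device="cuda", compute_type="float16"
    # )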
    language, segments = transcribe(model=model, audio=extracted_audio)
    # Generate subtitle file
    subtitle_file = generate_subtitle_file(
        language=language, segments=segments, input_video_name=input_video_name
    )
    # Save the segments into a pickle
    pickle_outfile = f"segments_{input_video_name}.pkl"
    with open(pickle_outfile, "wb") as file:
        pickle.dump(segments, file)
    # Load the pickle back into memory with:
    # segments = pickle.load(open(pickle_outfile, "rb"))
    return 0

if __name__ == "__main__":
    main()