# transcript video
# Prepping your environment #
# 1. Set up WSL with ffmpeg: https://streaminglearningcenter.com/encoding/running-ffmpeg-on-windows-subsystem-for-linux.html
# 2. Install Miniconda in your WSL environment: https://dev.to/sfpear/miniconda-in-wsl-3642
# 3. Install Python packages:
# pip uninstall ffmpeg
# pip uninstall ffmpeg-python
# pip install ffmpeg-python git+https://github.com/SYSTRAN/faster-whisper
# 4. [IF YOU HAVE A COMPATIBLE NVIDIA GPU] Install all CUDA stuff, which can be painful
# The instructions in this section should work cleanly: https://github.com/SYSTRAN/faster-whisper?tab=readme-ov-file#install-with-pip-linux-only
# But it might be easiest to use the Docker image.
# If you're going the Docker route, here are a few to-dos (with instructions) that seem decent: https://logic2020.com/insight/wsl-docker-gpu-enabled-nvidia/
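# Optional sanity check before committing to the GPU path: faster-whisper runs
# on CTranslate2, which can report how many CUDA devices it sees. A minimal
# sketch, assuming ctranslate2 was pulled in by the faster-whisper install:
# import ctranslate2
# print(ctranslate2.get_cuda_device_count())  # 0 -> stick with the CPU path
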
import time
import math
import ffmpeg
import os
import pickle
from faster_whisper import WhisperModel

def extract_audio(input_video, input_video_name):
    """
    Extract the audio track from a video file into a WAV file
    """
    extracted_audio = f"audio-{input_video_name}.wav"
    stream = ffmpeg.input(input_video)
    stream = ffmpeg.output(stream, extracted_audio)
    ffmpeg.run(stream, overwrite_output=True)
    return extracted_audio

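# Optional: Whisper models resample input to 16 kHz mono internally, so
# downmixing at extraction time keeps the intermediate WAV small. A hedged
# sketch using ffmpeg-python's keyword-to-flag mapping (ac = audio channels,
# ar = sample rate), swapped in for the ffmpeg.output() call above:
# stream = ffmpeg.output(stream, extracted_audio, ac=1, ar=16000)
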
def transcribe(model, audio):
    """
    Run the transcription model and print each segment as it is decoded
    """
    # model = WhisperModel("small")
    segments, info = model.transcribe(audio)
    language = info.language
    print("Transcription language:", language)
    segments = list(segments)
    for segment in segments:
        # print(segment)
        print(
            "[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text)
        )
    return language, segments

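# If you need word-level timing (e.g., highlighting words as they are spoken),
# faster-whisper's transcribe() also accepts word_timestamps=True, and each
# segment then carries a .words list. A minimal sketch:
# segments, info = model.transcribe(audio, word_timestamps=True)
# for segment in segments:
#     for word in segment.words:
#         print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
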
def format_time(seconds):
    """
    Helper for formatting a float number of seconds as a clean
    SRT timestamp (HH:MM:SS,mmm)
    """
    hours = math.floor(seconds / 3600)
    seconds %= 3600
    minutes = math.floor(seconds / 60)
    seconds %= 60
    milliseconds = round((seconds - math.floor(seconds)) * 1000)
    seconds = math.floor(seconds)
    formatted_time = (
        f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
    )
    return formatted_time

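# For example, with the helper as written:
# format_time(3661.437) -> "01:01:01,437"
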
def generate_subtitle_file(language, segments, input_video_name):
    """
    Create an SRT subtitle file from the transcribed segments
    """
    subtitle_file = f"sub-{input_video_name}.{language}.srt"
    text = ""
    for index, segment in enumerate(segments):
        segment_start = format_time(segment.start)
        segment_end = format_time(segment.end)
        text += f"{index + 1}\n"
        text += f"{segment_start} --> {segment_end}\n"
        text += f"{segment.text}\n"
        text += "\n"
    with open(subtitle_file, "w") as f:
        f.write(text)
    return subtitle_file

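# Optional follow-up: hard-code (burn) the subtitles into the video using
# ffmpeg's subtitles filter. A minimal sketch via ffmpeg-python, where the
# output filename is a made-up example and vf maps to ffmpeg's -vf flag:
# video = ffmpeg.input(input_video)
# stream = ffmpeg.output(video, f"subtitled-{input_video_name}.mp4", vf=f"subtitles={subtitle_file}")
# ffmpeg.run(stream, overwrite_output=True)
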
def main():
    """
    Driver
    """
    os.chdir("/mnt/c")
    input_video = "video_file.mp4"
    input_video_name = input_video.replace(".mp4", "")
    extracted_audio = extract_audio(
        input_video=input_video, input_video_name=input_video_name
    )
    # Check that the extraction actually produced a file
    if not os.path.isfile(extracted_audio):
        raise FileNotFoundError("Audio file does not exist")
    # If using CPU
    # NOTE: Set the number of threads appropriately for your machine
    num_threads = 4
    os.environ["OMP_NUM_THREADS"] = f"{num_threads}"
    model = WhisperModel(
        model_size_or_path="small.en",
        device="cpu",
        cpu_threads=num_threads,
        num_workers=3,
    )
    # If using GPU
    # model = WhisperModel(model_size_or_path="small.en", device="cuda")
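    # On GPU, compute_type selects the quantization used by CTranslate2;
    # float16 is a common choice per the faster-whisper README. Sketch:
    # model = WhisperModel(
    #     model_size_or_path="small.en", device="cuda", compute_type="float16"
    # )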
    language, segments = transcribe(model=model, audio=extracted_audio)
    # Generate subtitle file
    subtitle_file = generate_subtitle_file(
        language=language, segments=segments, input_video_name=input_video_name
    )
    # Save the segments into a pickle
    pickle_outfile = f"segments_{input_video_name}.pkl"
    with open(pickle_outfile, "wb") as file:
        pickle.dump(segments, file)
    # Load the pickle back into memory with:
    # segments = pickle.load(open(pickle_outfile, "rb"))
    return 0

if __name__ == "__main__":
    main()