Python script for splitting a video based on detected speakers: the script uses frame analysis to identify speakers in a video, comparing each frame against known speaker images and writing a separate video clip for each speaker found. The code relies on similarity scores and MoviePy for video processing, offering a starting point for speaker-based video splitting.
import cv2
import numpy as np
from moviepy.editor import VideoFileClip
from moviepy.video.io.ffmpeg_writer import FFMPEG_VideoWriter as VideoFileClipWriter

NUM_SPEAKERS = 6

# Most recently matched frame per speaker, reused as a freeze-frame filler
# for speakers who are not currently on screen.
latest_speaker_frames = [None for _ in range(NUM_SPEAKERS)]


# TODO find a better way to calculate ssim (template matching of two
# equally sized images is only a rough stand-in for a real SSIM score)
def calculate_ssim(x, y):
    # Convert both images to grayscale; inputs are RGB arrays of the same size
    gray_img1 = cv2.cvtColor(x, cv2.COLOR_RGB2GRAY)
    gray_img2 = cv2.cvtColor(y, cv2.COLOR_RGB2GRAY)
    # Normalized cross-correlation of two same-sized images yields a 1x1 score
    return cv2.matchTemplate(gray_img1, gray_img2, cv2.TM_CCOEFF_NORMED)


def detect_and_write_frame(frame, known_speakers, writers):
    # Score the current frame against every known speaker image
    similarities = [
        calculate_ssim(frame, speaker_image)
        for speaker_image in known_speakers
    ]
    best_match_index = np.argmax([sim.item() for sim in similarities])
    for i, writer in enumerate(writers):
        if i == best_match_index:
            writer.write_frame(frame)
            latest_speaker_frames[best_match_index] = frame
        elif latest_speaker_frames[i] is not None:
            # Repeat the last frame seen for this speaker
            writer.write_frame(latest_speaker_frames[i])
        else:
            # No frame for this speaker yet: write a solid green placeholder
            writer.write_frame(green_frame(frame))


def green_frame(frame):
    result = np.zeros_like(frame)
    result[:, :, 1] = 255
    return result


def split_video(input_file):
    video = VideoFileClip(input_file)
    output_paths = [f"output_speaker_{i + 1}.mp4" for i in range(NUM_SPEAKERS)]
    # MoviePy yields RGB frames while cv2.imread returns BGR, so convert the
    # reference images once up front to keep the comparison consistent.
    # The reference images are assumed to match the video resolution.
    known_speakers = [
        cv2.cvtColor(cv2.imread(f"tales-from-the-loop/speaker{i + 1}.png"), cv2.COLOR_BGR2RGB)
        for i in range(NUM_SPEAKERS)
    ]
    video_writers = [VideoFileClipWriter(output_path, video.size, video.fps) for output_path in output_paths]
    print(f"All frame count {video.reader.nframes}")
    i = 0
    for frame in video.iter_frames():
        detect_and_write_frame(frame, known_speakers, video_writers)
        i += 1
        if i % 100 == 0:
            print(f"Processing frame {i} of {video.reader.nframes}")
    for video_writer in video_writers:
        video_writer.close()


split_video("tales-from-the-loop/source.mp4")
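The TODO in the script notes that the template-matching score is only a stand-in for a real structural similarity measure. A minimal sketch of a drop-in replacement, assuming scikit-image is available (the calculate_ssim name and the same-resolution inputs are carried over from the script above):

import cv2
from skimage.metrics import structural_similarity


def calculate_ssim(x, y):
    # Compare two same-sized RGB arrays with an actual SSIM score.
    gray_img1 = cv2.cvtColor(x, cv2.COLOR_RGB2GRAY)
    gray_img2 = cv2.cvtColor(y, cv2.COLOR_RGB2GRAY)
    # structural_similarity returns a scalar in [-1, 1]; higher means more
    # similar, so the np.argmax selection in detect_and_write_frame is unchanged.
    return structural_similarity(gray_img1, gray_img2)

Because scikit-image returns a NumPy scalar, the existing sim.item() call in detect_and_write_frame still works; SSIM is slower per frame than template matching, so expect longer processing times on full-length videos.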