#!/usr/bin/env python3
# -*- coding: utf-8 -*-
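"""Extract faces and people from a video.

Detects faces with InsightFace and persons with YOLOv8, clusters the face
embeddings with DBSCAN, and writes an annotated preview video plus a JSON
summary and an SRT file of each character's appearance ranges.
"""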
import json
import os
import shutil
import sys

import cv2
import numpy as np
from sklearn.cluster import DBSCAN
from insightface.app import FaceAnalysis
from ultralytics import YOLO

# Run detection on every FRAME_SKIP-th frame; draw bounding boxes when
# RECTANGLE_DRAW == 1 (green for faces, red for persons).
FRAME_SKIP = 24
RECTANGLE_DRAW = 1

def time_to_srt_format(seconds):
    """Convert a time in seconds to the SRT timestamp format HH:MM:SS,mmm."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def save_representative_image(cluster_path, output_path):
    """Save the middle face crop of a cluster (by filename order) as its representative."""
    images = []
    for img_name in sorted(os.listdir(cluster_path)):
        if not img_name.lower().endswith(".png"):
            continue
        img = cv2.imread(os.path.join(cluster_path, img_name))
        if img is not None:
            images.append(img)
    if not images:
        return None
    rep_img = images[len(images) // 2]
    cv2.imwrite(output_path, rep_img)
    return output_path
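# Note: picking the middle crop is a cheap heuristic; selecting the face whose
# embedding is closest to the cluster centroid would likely be more robust.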

def extract_faces_from_video(
    input_video_path,
    output_video_path="faces_preview.mp4",
    faces_dir="faces",
    clustered_dir="faces_clustered",
    characters_dir="characters",
    scene_json_path="scene_summary.json",
    scene_srt_path="scene_summary.srt",
    det_size=(640, 640),
    eps=0.7,  # DBSCAN cosine-distance threshold for merging face embeddings
    min_samples=3,  # minimum face crops required to form a character cluster
):
    os.makedirs(faces_dir, exist_ok=True)
    os.makedirs(clustered_dir, exist_ok=True)
    os.makedirs(characters_dir, exist_ok=True)
    # --- InsightFace face detector / embedder ---
    face_app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
    face_app.prepare(ctx_id=0, det_size=det_size)
    # --- YOLOv8 person detector ---
    yolo_model = YOLO("yolov8n.pt")
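    # yolov8n.pt is the smallest YOLOv8 checkpoint; ultralytics downloads it
    # automatically on first use if it is not found locally.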
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        print("Cannot open video:", input_video_path)
        return
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
print(f"Processing: {input_video_path}")
face_features = []
face_images = []
face_frame_indices = []
character_features = []
character_images = []
character_frame_indices = []
frame_count = 0
face_index = 0
char_index = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1
        # --- frame skip: pass non-sampled frames straight to the output ---
        if (frame_count - 1) % FRAME_SKIP != 0:
            out.write(frame)
            continue
        # --- face detection ---
        faces = face_app.get(frame)
        for face in faces:
            x1, y1, x2, y2 = map(int, face.bbox)
            if RECTANGLE_DRAW == 1:
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Crop with a small padding, clamped to the frame borders.
            pad = 10
            x1p = max(x1 - pad, 0)
            y1p = max(y1 - pad, 0)
            x2p = min(x2 + pad, frame.shape[1])
            y2p = min(y2 + pad, frame.shape[0])
            face_img = frame[y1p:y2p, x1p:x2p]
            face_path = os.path.join(faces_dir, f"face_{face_index:05d}.png")
            cv2.imwrite(face_path, face_img)
            face_images.append(face_img)
            face_features.append(face.normed_embedding)
            face_frame_indices.append(frame_count)
            face_index += 1
        # --- person (character) detection ---
        results = yolo_model.predict(frame, imgsz=640, conf=0.5)
        for res in results:
            for box in res.boxes:
                cls = int(box.cls[0])
                if cls != 0:  # COCO class 0 = person
                    continue
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                if RECTANGLE_DRAW == 1:
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                char_img = frame[y1:y2, x1:x2]
                char_path = os.path.join(characters_dir, f"char_{char_index:05d}.png")
                cv2.imwrite(char_path, char_img)
                character_images.append(char_img)
                character_frame_indices.append(frame_count)
                char_index += 1
        out.write(frame)
        if frame_count % 50 == 0:
            print(f"Processed {frame_count} frames.")
    cap.release()
    out.release()
    print("Preview video saved:", output_video_path)
    if len(face_features) == 0:
        print("No face detected.")
    else:
        # Cluster the normalized embeddings; label -1 marks DBSCAN noise.
        features = np.array(face_features)
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(
            features
        )
        labels = clustering.labels_
        num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        print(f"Clustering complete: {num_clusters} characters detected.")
        scene_summary = {}
        srt_entries = []
        srt_counter = 1
        for cluster_id in range(num_clusters):
            cluster_path = os.path.join(clustered_dir, f"character_{cluster_id+1}")
            os.makedirs(cluster_path, exist_ok=True)
            indices = np.where(labels == cluster_id)[0]
            frame_list = []
            for idx in indices:
                # Face crops were saved in detection order, so the embedding
                # index maps directly to the saved filename.
                src_path = os.path.join(faces_dir, f"face_{idx:05d}.png")
                shutil.copy(
                    src_path, os.path.join(cluster_path, os.path.basename(src_path))
                )
                frame_list.append(face_frame_indices[idx])
            frame_list = sorted(frame_list)
            if not frame_list:
                continue
            # Merge consecutive detections into scene ranges, splitting when
            # the gap between sightings exceeds two seconds of frames.
            ranges = []
            start = frame_list[0]
            prev = start
            max_gap = fps * 2
            for f in frame_list[1:]:
                if f - prev > max_gap:
                    ranges.append((start, prev))
                    start = f
                prev = f
            ranges.append((start, prev))
            time_ranges = [
                {"start_sec": round(s / fps, 2), "end_sec": round(e / fps, 2)}
                for s, e in ranges
            ]
scene_summary[f"character_{cluster_id+1}"] = {
"frames_detected": len(frame_list),
"scene_ranges": time_ranges,
}
for tr in time_ranges:
srt_entries.append(
f"{srt_counter}\n"
f"{time_to_srt_format(tr['start_sec'])} --> {time_to_srt_format(tr['end_sec'])}\n"
f"Character {cluster_id+1} appears\n\n"
)
srt_counter += 1
rep_img_path = os.path.join(cluster_path, "representative.png")
save_representative_image(cluster_path, rep_img_path)
        with open(scene_json_path, "w", encoding="utf-8") as f:
            json.dump(scene_summary, f, indent=2, ensure_ascii=False)
        print("Character appearance summary saved:", scene_json_path)
        with open(scene_srt_path, "w", encoding="utf-8") as f:
            f.writelines(srt_entries)
        print("Character appearance SRT saved:", scene_srt_path)
    print("Completed.")

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: python script.py <input_video.mp4>")
        sys.exit(1)
    video_path = sys.argv[1]
    extract_faces_from_video(video_path)
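# Outputs are written to the working directory: faces_preview.mp4, faces/,
# faces_clustered/, characters/, scene_summary.json, and scene_summary.srt.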