#!/usr/bin/env python3
# -*- coding: utf-8 -*-
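"""Extract faces and people from a video.

Detects faces with InsightFace and persons with YOLOv8, clusters the face
embeddings with DBSCAN, and writes an annotated preview video plus a JSON
summary and an SRT file of each character's appearance ranges.
"""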
import json
import os
import shutil
import sys

import cv2
import numpy as np
from sklearn.cluster import DBSCAN
from insightface.app import FaceAnalysis
from ultralytics import YOLO

# Run detection on every FRAME_SKIP-th frame; draw bounding boxes when
# RECTANGLE_DRAW == 1 (green for faces, red for persons).
FRAME_SKIP = 24
RECTANGLE_DRAW = 1

def time_to_srt_format(seconds):
    """Convert a time in seconds to the SRT timestamp format HH:MM:SS,mmm."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"

def save_representative_image(cluster_path, output_path):
    """Save the middle face crop of a cluster (by filename order) as its representative."""
    images = []
    for img_name in sorted(os.listdir(cluster_path)):
        if not img_name.lower().endswith(".png"):
            continue
        img = cv2.imread(os.path.join(cluster_path, img_name))
        if img is not None:
            images.append(img)
    if not images:
        return None
    rep_img = images[len(images) // 2]
    cv2.imwrite(output_path, rep_img)
    return output_path
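# Note: picking the middle crop is a cheap heuristic; selecting the face whose
# embedding is closest to the cluster centroid would likely be more robust.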

def extract_faces_from_video(
    input_video_path,
    output_video_path="faces_preview.mp4",
    faces_dir="faces",
    clustered_dir="faces_clustered",
    characters_dir="characters",
    scene_json_path="scene_summary.json",
    scene_srt_path="scene_summary.srt",
    det_size=(640, 640),
    eps=0.7,  # DBSCAN cosine-distance threshold for merging face embeddings
    min_samples=3,  # minimum face crops required to form a character cluster
):
    os.makedirs(faces_dir, exist_ok=True)
    os.makedirs(clustered_dir, exist_ok=True)
    os.makedirs(characters_dir, exist_ok=True)
    # --- InsightFace face detector / embedder ---
    face_app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
    face_app.prepare(ctx_id=0, det_size=det_size)
    # --- YOLOv8 person detector ---
    yolo_model = YOLO("yolov8n.pt")
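    # yolov8n.pt is the smallest YOLOv8 checkpoint; ultralytics downloads it
    # automatically on first use if it is not found locally.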
    cap = cv2.VideoCapture(input_video_path)
    if not cap.isOpened():
        print("Cannot open video:", input_video_path)
        return
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
print(f"Processing: {input_video_path}")
face_features = []
face_images = []
face_frame_indices = []
character_features = []
character_images = []
character_frame_indices = []
frame_count = 0
face_index = 0
char_index = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1
        # --- frame skip: pass non-sampled frames straight to the output ---
        if (frame_count - 1) % FRAME_SKIP != 0:
            out.write(frame)
            continue
        # --- face detection ---
        faces = face_app.get(frame)
        for face in faces:
            x1, y1, x2, y2 = map(int, face.bbox)
            if RECTANGLE_DRAW == 1:
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Crop with a small padding, clamped to the frame borders.
            pad = 10
            x1p = max(x1 - pad, 0)
            y1p = max(y1 - pad, 0)
            x2p = min(x2 + pad, frame.shape[1])
            y2p = min(y2 + pad, frame.shape[0])
            face_img = frame[y1p:y2p, x1p:x2p]
            face_path = os.path.join(faces_dir, f"face_{face_index:05d}.png")
            cv2.imwrite(face_path, face_img)
            face_images.append(face_img)
            face_features.append(face.normed_embedding)
            face_frame_indices.append(frame_count)
            face_index += 1
        # --- person (character) detection ---
        results = yolo_model.predict(frame, imgsz=640, conf=0.5)
        for res in results:
            for box in res.boxes:
                cls = int(box.cls[0])
                if cls != 0:  # COCO class 0 = person
                    continue
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                if RECTANGLE_DRAW == 1:
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
                char_img = frame[y1:y2, x1:x2]
                char_path = os.path.join(characters_dir, f"char_{char_index:05d}.png")
                cv2.imwrite(char_path, char_img)
                character_images.append(char_img)
                character_frame_indices.append(frame_count)
                char_index += 1
        out.write(frame)
        if frame_count % 50 == 0:
            print(f"Processed {frame_count} frames.")
    cap.release()
    out.release()
    print("Preview video saved:", output_video_path)
    if len(face_features) == 0:
        print("No face detected.")
    else:
        # Cluster the normalized embeddings; label -1 marks DBSCAN noise.
        features = np.array(face_features)
        clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(
            features
        )
        labels = clustering.labels_
        num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        print(f"Clustering complete: {num_clusters} characters detected.")
        scene_summary = {}
        srt_entries = []
        srt_counter = 1
        for cluster_id in range(num_clusters):
            cluster_path = os.path.join(clustered_dir, f"character_{cluster_id+1}")
            os.makedirs(cluster_path, exist_ok=True)
            indices = np.where(labels == cluster_id)[0]
            frame_list = []
            for idx in indices:
                # Face crops were saved in detection order, so the embedding
                # index maps directly to the saved filename.
                src_path = os.path.join(faces_dir, f"face_{idx:05d}.png")
                shutil.copy(
                    src_path, os.path.join(cluster_path, os.path.basename(src_path))
                )
                frame_list.append(face_frame_indices[idx])
            frame_list = sorted(frame_list)
            if not frame_list:
                continue
            # Merge consecutive detections into scene ranges, splitting when
            # the gap between sightings exceeds two seconds of frames.
            ranges = []
            start = frame_list[0]
            prev = start
            max_gap = fps * 2
            for f in frame_list[1:]:
                if f - prev > max_gap:
                    ranges.append((start, prev))
                    start = f
                prev = f
            ranges.append((start, prev))
            time_ranges = [
                {"start_sec": round(s / fps, 2), "end_sec": round(e / fps, 2)}
                for s, e in ranges
            ]
scene_summary[f"character_{cluster_id+1}"] = {
"frames_detected": len(frame_list),
"scene_ranges": time_ranges,
}
for tr in time_ranges:
srt_entries.append(
f"{srt_counter}\n"
f"{time_to_srt_format(tr['start_sec'])} --> {time_to_srt_format(tr['end_sec'])}\n"
f"Character {cluster_id+1} appears\n\n"
)
srt_counter += 1
rep_img_path = os.path.join(cluster_path, "representative.png")
save_representative_image(cluster_path, rep_img_path)
        with open(scene_json_path, "w", encoding="utf-8") as f:
            json.dump(scene_summary, f, indent=2, ensure_ascii=False)
        print("Character appearance summary saved:", scene_json_path)
        with open(scene_srt_path, "w", encoding="utf-8") as f:
            f.writelines(srt_entries)
        print("Character appearance SRT saved:", scene_srt_path)
    print("Completed.")

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: python script.py <input_video.mp4>")
        sys.exit(1)
    video_path = sys.argv[1]
    extract_faces_from_video(video_path)
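# Outputs are written to the working directory: faces_preview.mp4, faces/,
# faces_clustered/, characters/, scene_summary.json, and scene_summary.srt.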