Skip to content

Instantly share code, notes, and snippets.

@SqrtRyan
Created February 24, 2026 02:58
Show Gist options
  • Select an option

  • Save SqrtRyan/3b07962d5934700bbb9047b26a981bee to your computer and use it in GitHub Desktop.

Select an option

Save SqrtRyan/3b07962d5934700bbb9047b26a981bee to your computer and use it in GitHub Desktop.
"""
Nanobanana Edit Quality Detector — single-file, 4 rules, 90% recall.
This is a self-contained copy of .claude_auto_score/best_detector.py.
See .claude_auto_score/ for the full analysis pipeline, manifest, and report.
Self-contained detector for video edit quality. No external dependencies
beyond standard pip packages. Takes a sample folder (before.png + after.png),
returns a JSON verdict.
Setup:
pip install numpy Pillow imagehash fire
Usage:
python best_detector.py analyze --sample_dir /path/to/sample/
python best_detector.py analyze_batch --samples_dir /path/to/samples/
Each sample folder must contain:
before.png — 4x4 grid of 16 original video keyframes (768x1376 pixels)
after.png — 4x4 grid of 16 edited video keyframes
Output: JSON with rating (success/semi-success/failure), confidence,
failure type, and per-frame quality scores.
The 4 detection rules (OR'd — any one firing = failure):
F1: phash_ratio < 0.575 → failure (irrelevant_output)
Output frames are all identical (frozen/duplicated).
F5a: pf_phash_max >= 34 AND max_frame_jump > 0.007261 → failure (temporal_inconsistency)
Big edit that flickers between consecutive frames.
C1: pf_max_diff < 0.023247 → failure (no_edit)
Nothing meaningful was changed in any frame.
C2: pf_cv > 0.580564 → failure (temporal_inconsistency)
Edit magnitude varies too much across frames.
Usage:
python best_detector.py analyze --sample_dir /path/to/sample/
python best_detector.py analyze_batch --output_dir ./generated/predictions/
"""
import json
import datetime
from pathlib import Path
import fire
import numpy as np
import imagehash
from PIL import Image
# ============================================================
# Grid geometry constants
# ============================================================
# before.png / after.png are 4x4 mosaics of keyframes; cell size is the
# grid size divided evenly by rows/cols.
GRID_ROWS = 4
GRID_COLS = 4
NUM_FRAMES = GRID_ROWS * GRID_COLS # 16
GRID_H = 768
GRID_W = 1376
CELL_H = GRID_H // GRID_ROWS # 192
CELL_W = GRID_W // GRID_COLS # 344
# ============================================================
# Paths
# ============================================================
WORK_DIR = Path(__file__).parent
EXPERIMENTS_FILE = WORK_DIR / "retune_experiments.jsonl"
# NOTE(review): machine-specific absolute path from the original training
# host — must be overridden to run anywhere else.
SAMPLES_DIR = Path('/root/CleanCode/Datasets/Yash/Nanobanana/V1/jan10_last_50K_pexels_v2/training_preview/samples/')
# ============================================================
# Classifier thresholds (V6, tuned on 177 labeled samples)
# ============================================================
THRESH_PHASH_RATIO = 0.575 # F1: phash_ratio < this -> failure (frozen/duplicated output)
THRESH_PHASH_MAX_TEMPORAL = 34.0 # F5a: pf_phash_max >= this AND ...
THRESH_MAX_FRAME_JUMP = 0.007261008024215698 # F5a: ... max_frame_jump > this
THRESH_C1_MAX_DIFF = 0.023247 # C1: pf_max_diff < this -> failure (subtle no-edit)
THRESH_C2_PF_CV = 0.580564 # C2: pf_cv > this -> failure (moderate inconsistency)
# Semi-success threshold on edit consistency score
THRESH_SEMI_CONSISTENCY = 0.33
# ============================================================
# Ground truth labels (172 original + 6 new = 178 samples)
# NOTE(review): the dict below actually contains 177 entries
# (86 train + 86 validation + 5 additional); the "6 new = 178" above
# appears to count the excluded sample 8468908 noted before the
# additional block — confirm against ratings.json.
# ============================================================
GROUND_TRUTH = {
    "8091096_20260111_075419": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8091546_20260111_075420": {"rating": "success", "failure_categories": []},
    "8092893_20260111_075420": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8093592_20260111_075430": {"rating": "success", "failure_categories": []},
    "8093996_20260111_075432": {"rating": "success", "failure_categories": []},
    "8094039_20260111_075433": {"rating": "success", "failure_categories": []},
    "8094287_20260111_075445": {"rating": "success", "failure_categories": []},
    "8095001_20260111_075447": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8098080_20260111_075500": {"rating": "success", "failure_categories": []},
    "8100967_20260111_075515": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8102246_20260111_075517": {"rating": "success", "failure_categories": []},
    "8102787_20260111_075528": {"rating": "success", "failure_categories": []},
    "8103055_20260111_075547": {"rating": "success", "failure_categories": []},
    "8103302_20260111_075549": {"rating": "semi-success", "failure_categories": ["partial_frame_edit", "first_frame_mismatch"]},
    "8103498_20260111_075555": {"rating": "success", "failure_categories": []},
    "8103878_20260111_075619": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8104044_20260111_075621": {"rating": "success", "failure_categories": []},
    "8107716_20260111_075657": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8111743_20260111_075724": {"rating": "success", "failure_categories": []},
    "8111842_20260111_075727": {"rating": "success", "failure_categories": []},
    "8113101_20260111_075729": {"rating": "success", "failure_categories": []},
    "8114285_20260111_075730": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8114926_20260111_075732": {"rating": "success", "failure_categories": []},
    "8114990_20260111_075733": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8115919_20260111_075747": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8116169_20260111_075751": {"rating": "success", "failure_categories": []},
    "8117115_20260111_075758": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8120246_20260111_075802": {"rating": "success", "failure_categories": []},
    "8120363_20260111_075803": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8122765_20260111_075830": {"rating": "failure", "failure_categories": ["irrelevant_output", "temporal_inconsistency"]},
    "8122912_20260111_075835": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8122956_20260111_075835": {"rating": "success", "failure_categories": []},
    "8123978_20260111_075839": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8124027_20260111_075842": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8124056_20260111_075843": {"rating": "success", "failure_categories": []},
    "8124064_20260111_075843": {"rating": "success", "failure_categories": []},
    "8125902_20260111_075847": {"rating": "success", "failure_categories": []},
    "8126475_20260111_075852": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8126710_20260111_075854": {"rating": "success", "failure_categories": []},
    "8126808_20260111_075855": {"rating": "success", "failure_categories": []},
    "8128164_20260111_075857": {"rating": "success", "failure_categories": []},
    "8129104_20260111_075907": {"rating": "success", "failure_categories": []},
    "8132008_20260111_075909": {"rating": "success", "failure_categories": []},
    "8132374_20260111_075912": {"rating": "failure", "failure_categories": ["irrelevant_output", "first_frame_mismatch"]},
    "8135096_20260111_075921": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8135559_20260111_075927": {"rating": "success", "failure_categories": []},
    "8135644_20260111_075929": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8136023_20260111_075939": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8139413_20260111_075952": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8141301_20260111_080002": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8141399_20260111_080003": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8141501_20260111_080004": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8143551_20260111_080022": {"rating": "success", "failure_categories": []},
    "8145135_20260111_080027": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8145138_20260111_080027": {"rating": "success", "failure_categories": []},
    "8145158_20260111_080028": {"rating": "success", "failure_categories": []},
    "8150255_20260111_080033": {"rating": "success", "failure_categories": []},
    "8150529_20260111_080037": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8150531_20260111_080038": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8152220_20260111_080050": {"rating": "success", "failure_categories": []},
    "8154493_20260111_080058": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8154855_20260111_080059": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8155547_20260111_080105": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8157132_20260111_080123": {"rating": "success", "failure_categories": []},
    "8157137_20260111_080124": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8157299_20260111_080126": {"rating": "success", "failure_categories": []},
    "8160023_20260111_080147": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8160579_20260111_080159": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8164091_20260111_080218": {"rating": "success", "failure_categories": []},
    "8165172_20260111_080236": {"rating": "success", "failure_categories": []},
    "8165769_20260111_080246": {"rating": "success", "failure_categories": []},
    "8165779_20260111_080247": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8165906_20260111_080250": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8165941_20260111_080251": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8166010_20260111_080256": {"rating": "success", "failure_categories": []},
    "8170106_20260111_080303": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8170478_20260111_080307": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency", "partial_frame_edit"]},
    "8171568_20260111_080320": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8171894_20260111_080321": {"rating": "success", "failure_categories": []},
    "8173112_20260111_080325": {"rating": "success", "failure_categories": []},
    "8174314_20260111_080330": {"rating": "failure", "failure_categories": ["partial_frame_edit", "temporal_inconsistency"]},
    "8179119_20260111_080404": {"rating": "success", "failure_categories": []},
    "8179751_20260111_080423": {"rating": "success", "failure_categories": []},
    "8180407_20260111_080430": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8189505_20260111_080502": {"rating": "success", "failure_categories": []},
    "8190131_20260111_080514": {"rating": "success", "failure_categories": []},
    # ---- Validation set (86 samples) ----
    "8190717_20260111_080521": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8191029_20260111_080543": {"rating": "success", "failure_categories": []},
    "8191204_20260111_080549": {"rating": "success", "failure_categories": []},
    "8191207_20260111_080549": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8217072_20260111_080756": {"rating": "success", "failure_categories": []},
    "8224605_20260111_080807": {"rating": "success", "failure_categories": []},
    "8227240_20260111_080817": {"rating": "success", "failure_categories": []},
    "8230620_20260111_080825": {"rating": "success", "failure_categories": []},
    "8230704_20260111_080828": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8231712_20260111_080841": {"rating": "success", "failure_categories": []},
    "8232437_20260111_080845": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8239197_20260111_081004": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8246869_20260111_081041": {"rating": "success", "failure_categories": []},
    "8247091_20260111_081045": {"rating": "success", "failure_categories": []},
    "8249485_20260111_081051": {"rating": "success", "failure_categories": []},
    "8252868_20260111_081102": {"rating": "success", "failure_categories": []},
    "8253045_20260111_081103": {"rating": "success", "failure_categories": []},
    "8255167_20260111_081116": {"rating": "success", "failure_categories": []},
    "8256631_20260111_081130": {"rating": "success", "failure_categories": []},
    "8257519_20260111_081141": {"rating": "success", "failure_categories": []},
    "8261379_20260111_081154": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8263453_20260111_081205": {"rating": "success", "failure_categories": []},
    "8264023_20260111_081210": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8271006_20260111_081250": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8279291_20260111_081343": {"rating": "success", "failure_categories": []},
    "8280495_20260111_081400": {"rating": "success", "failure_categories": []},
    "8286774_20260111_081427": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8287068_20260111_081435": {"rating": "success", "failure_categories": []},
    "8293570_20260111_081508": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8296055_20260111_081513": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8296063_20260111_081515": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8298306_20260111_081550": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8303291_20260111_081616": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8313496_20260111_081701": {"rating": "success", "failure_categories": []},
    "8318204_20260111_081706": {"rating": "success", "failure_categories": []},
    "8318647_20260111_081715": {"rating": "success", "failure_categories": []},
    "8320029_20260111_081727": {"rating": "success", "failure_categories": []},
    "8322056_20260111_081738": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8322394_20260111_081758": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8322707_20260111_081811": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8326499_20260111_081843": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8328150_20260111_081907": {"rating": "success", "failure_categories": []},
    "8328525_20260111_081913": {"rating": "success", "failure_categories": []},
    "8328601_20260111_081915": {"rating": "success", "failure_categories": []},
    "8328618_20260111_081917": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8333881_20260111_081931": {"rating": "success", "failure_categories": []},
    "8334110_20260111_081933": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8334414_20260111_081938": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8342094_20260111_082018": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8342755_20260111_082028": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8348772_20260111_082253": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8348814_20260111_082256": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8363636_20260111_082446": {"rating": "success", "failure_categories": []},
    "8367941_20260111_082455": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8370544_20260111_082509": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8375487_20260111_082529": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8384587_20260111_082705": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8384695_20260111_082706": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8385333_20260111_082711": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8401313_20260111_082834": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8410570_20260111_082901": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8411075_20260111_082904": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8416654_20260111_082922": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8419267_20260111_082929": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8421308_20260111_082953": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8426096_20260111_083020": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8431832_20260111_083114": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8432042_20260111_083125": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8433838_20260111_083133": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8435989_20260111_083234": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8439412_20260111_083257": {"rating": "success", "failure_categories": []},
    "8449552_20260111_083505": {"rating": "success", "failure_categories": []},
    "8456997_20260111_083601": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8460890_20260111_083637": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8462194_20260111_083642": {"rating": "success", "failure_categories": []},
    "8472301_20260111_083941": {"rating": "success", "failure_categories": []},
    "8477934_20260111_084029": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8479456_20260111_084055": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8486374_20260111_084134": {"rating": "success", "failure_categories": []},
    "8486892_20260111_084137": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8493380_20260111_084232": {"rating": "success", "failure_categories": []},
    "8511010_20260111_084432": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8519367_20260111_084716": {"rating": "success", "failure_categories": []},
    "8524028_20260111_084807": {"rating": "success", "failure_categories": []},
    "8531234_20260111_084935": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "9710748_20260111_101911": {"rating": "success", "failure_categories": []},
    # ---- Additional labels (5 new from ratings.json; excludes 8468908 which
    # is a semi-success hit by pre-existing F5c -- needs F5c retune) ----
    "854716_20260111_085305": {"rating": "failure", "failure_categories": ["no_edit"]},
    "854978_20260111_085324": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "855095_20260111_085333": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "857166_20260111_085626": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8622067_20260111_085833": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
}
# ============================================================
# Pure helper functions
# ============================================================
def extract_cells_pil(grid_img):
    """
    Slice a 4x4 grid image into its 16 cells, returned as PIL Images.
    Pure function; cells are returned in row-major order.

    >>> img = Image.new('RGB', (1376, 768), (128, 128, 128))
    >>> cells = extract_cells_pil(img)
    >>> len(cells)
    16
    >>> cells[0].size
    (344, 192)
    """
    pieces = []
    for row_idx in range(GRID_ROWS):
        top = row_idx * CELL_H
        for col_idx in range(GRID_COLS):
            left = col_idx * CELL_W
            pieces.append(grid_img.crop((left, top, left + CELL_W, top + CELL_H)))
    return pieces
def cells_to_gray_arrays(cells, resize=(64, 64)):
    """
    Convert PIL cell images to grayscale float32 numpy arrays in [0, 1].
    Pure function.

    >>> img = Image.new('RGB', (344, 192), (128, 128, 128))
    >>> arrs = cells_to_gray_arrays([img], resize=(32, 32))
    >>> arrs[0].shape
    (32, 32)
    """
    arrays = []
    for cell in cells:
        gray = cell.convert('L')
        if resize is not None:
            # PIL's resize wants (width, height); the resize arg is (H, W).
            gray = gray.resize((resize[1], resize[0]), Image.BILINEAR)
        arrays.append(np.array(gray, dtype=np.float32) / 255.0)
    return arrays
def compute_phash_list(cells):
    """
    Compute the perceptual hash of every PIL Image cell.
    Pure function.

    >>> img = Image.new('RGB', (100, 100), (128, 128, 128))
    >>> hashes = compute_phash_list([img])
    >>> len(hashes)
    1
    """
    return list(map(imagehash.phash, cells))
def mean_pairwise_hamming(hashes):
    """
    Mean pairwise Hamming distance among a list of perceptual hashes.
    Pure function. Measures internal variety within a set of frames;
    0.0 for fewer than two hashes.

    >>> h = imagehash.phash(Image.new('RGB', (100, 100), (0, 0, 0)))
    >>> mean_pairwise_hamming([h, h, h])
    0.0
    """
    n = len(hashes)
    if n < 2:
        return 0.0
    # imagehash objects define subtraction as Hamming distance.
    pair_dists = [
        float(hashes[i] - hashes[j])
        for i in range(n)
        for j in range(i + 1, n)
    ]
    return sum(pair_dists) / len(pair_dists)
def compute_per_frame_phash_dists(before_cells_pil, after_cells_pil):
    """
    Hamming distance between before[i] and after[i] phash for each frame.
    Pure function.

    >>> b = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(16)]
    >>> a = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(16)]
    >>> dists = compute_per_frame_phash_dists(b, a)
    >>> all(d == 0 for d in dists)
    True
    """
    # Hash subtraction on imagehash objects yields the Hamming distance.
    return [
        float(imagehash.phash(b_cell) - imagehash.phash(a_cell))
        for b_cell, a_cell in zip(before_cells_pil, after_cells_pil)
    ]
def compute_per_frame_diffs(before_gray, after_gray):
    """
    Per-frame absolute-difference statistics between two grayscale stacks.
    Pure function. Returns dict with frame_diffs (raw vector), pf_mean_diff,
    pf_max_diff, pf_min_diff, pf_std_diff, pf_cv, pf_min_max_ratio.

    >>> import numpy as np
    >>> b = [np.zeros((4, 4), dtype=np.float32) for _ in range(16)]
    >>> a = [np.full((4, 4), 0.5, dtype=np.float32) for _ in range(16)]
    >>> r = compute_per_frame_diffs(b, a)
    >>> r['pf_mean_diff']
    0.5
    >>> r['pf_cv']
    0.0
    """
    per_frame = [
        float(np.mean(np.abs(after_gray[i] - before_gray[i])))
        for i in range(len(before_gray))
    ]
    vec = np.asarray(per_frame)
    mean_d = float(vec.mean())
    max_d = float(vec.max())
    min_d = float(vec.min())
    std_d = float(vec.std())
    # Guard against division by ~zero means/maxima.
    cv = std_d / mean_d if mean_d > 1e-10 else 0.0
    min_max_ratio = min_d / max_d if max_d > 1e-10 else 0.0
    return {
        'frame_diffs': per_frame,
        'pf_mean_diff': mean_d,
        'pf_max_diff': max_d,
        'pf_min_diff': min_d,
        'pf_std_diff': std_d,
        'pf_cv': cv,
        'pf_min_max_ratio': min_max_ratio,
    }
def compute_edit_delta_cv(before_gray, after_gray):
    """
    Coefficient of variation of per-frame edit-delta variance.
    Pure function. High values mean some frames were edited much more than
    others spatially.

    >>> import numpy as np
    >>> b = [np.zeros((4, 4), dtype=np.float32) for _ in range(16)]
    >>> a = [np.ones((4, 4), dtype=np.float32) * 0.5 for _ in range(16)]
    >>> compute_edit_delta_cv(b, a)
    0.0
    """
    variances = np.asarray([
        float(np.var(np.abs(after_gray[i] - before_gray[i])))
        for i in range(len(before_gray))
    ])
    mu = float(np.mean(variances))
    if mu <= 1e-10:
        return 0.0
    return float(np.std(variances)) / mu
def compute_edit_consistency_score(pf_cv, pf_min_max_ratio, edc_cv):
    """
    Edit consistency score from three features, range [0, 1].
    Pure function. Higher = more consistent edit across all frames.
    Each component is clamped to [0, 1] before averaging.

    >>> compute_edit_consistency_score(0.0, 1.0, 0.0)
    1.0
    >>> compute_edit_consistency_score(1.0, 0.0, 1.0)
    0.0
    """
    def clamp01(v):
        return max(0.0, min(1.0, v))

    components = (
        clamp01(1.0 - pf_cv),          # low per-frame CV is good
        clamp01(pf_min_max_ratio),     # min/max close together is good
        clamp01(1.0 - edc_cv),         # low delta-variance CV is good
    )
    return sum(components) / 3.0
def compute_phash_ratio(before_cells_pil, after_cells_pil):
    """
    phash_ratio: internal variety of output frames vs input frames.
    Pure function. Low ratio (<0.575) means output frames are much more
    alike than input frames, indicating frozen/duplicated irrelevant output.

    >>> b = [Image.new('RGB', (100, 100), (i*10, i*10, i*10)) for i in range(4)]
    >>> a = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(4)]
    >>> ratio = compute_phash_ratio(b, a)
    >>> ratio < 0.5
    True
    """
    before_variety = mean_pairwise_hamming(compute_phash_list(before_cells_pil))
    after_variety = mean_pairwise_hamming(compute_phash_list(after_cells_pil))
    if before_variety > 1e-10:
        return after_variety / before_variety
    # Degenerate input (all-identical before frames): infinite if output
    # varies at all, neutral 1.0 if both sides are uniform.
    if after_variety > 1e-10:
        return float('inf')
    return 1.0
def compute_max_frame_jump(frame_diffs):
"""
Compute max absolute difference between consecutive per-frame diffs.
Pure function. High values mean the edit magnitude jumps sharply between
consecutive frames, indicating flickering or temporal inconsistency.
>>> compute_max_frame_jump([0.5, 0.5, 0.5])
0.0
>>> compute_max_frame_jump([0.0, 1.0, 0.0])
1.0
"""
arr = np.array(frame_diffs)
if len(arr) < 2:
return 0.0
return float(np.max(np.abs(np.diff(arr))))
def compute_block_cv(frame_diffs, n_blocks=4):
    """
    Divide frames into n_blocks, compute mean diff per block, return CV.
    Pure function. High CV across blocks means edit intensity varies
    greatly across different temporal segments of the video.

    Args:
        frame_diffs: sequence of per-frame mean-diff values.
        n_blocks: number of equal-size temporal blocks (default 4).

    Returns:
        Coefficient of variation (std/mean) of the block means. Returns
        0.0 when the input is shorter than n_blocks or the mean is ~0.

    >>> compute_block_cv([0.5] * 16)
    0.0
    >>> compute_block_cv([0.5])  # too short to form blocks
    0.0
    """
    arr = np.array(frame_diffs)
    block_size = len(arr) // n_blocks
    # Guard: with fewer frames than blocks, block_size is 0 and every slice
    # would be empty, making np.mean emit a warning and return NaN.
    if block_size == 0:
        return 0.0
    block_means = np.array([
        float(np.mean(arr[i * block_size:(i + 1) * block_size]))
        for i in range(n_blocks)
    ])
    mean = float(np.mean(block_means))
    if mean < 1e-10:
        return 0.0
    return float(np.std(block_means) / mean)
def compute_features(sample_dir):
    """
    Compute all classifier features for a single sample.

    Reads before.png and after.png from sample_dir and returns a dict
    containing every feature classify() consumes.
    """
    sample_dir = Path(sample_dir)
    before = Image.open(sample_dir / "before.png").convert('RGB')
    after = Image.open(sample_dir / "after.png").convert('RGB')

    cells_before = extract_cells_pil(before)
    cells_after = extract_cells_pil(after)

    # Per-frame phash distances (before[i] vs after[i]).
    phash_dists = compute_per_frame_phash_dists(cells_before, cells_after)

    # Per-frame pixel diffs on downsampled grayscale frames.
    gray_before = cells_to_gray_arrays(cells_before, resize=(64, 64))
    gray_after = cells_to_gray_arrays(cells_after, resize=(64, 64))
    stats = compute_per_frame_diffs(gray_before, gray_after)
    edc_cv = compute_edit_delta_cv(gray_before, gray_after)
    diffs = stats['frame_diffs']

    return {
        # Internal variety of output vs input frames.
        'phash_ratio': compute_phash_ratio(cells_before, cells_after),
        'pf_phash_mean': float(np.mean(phash_dists)),
        'pf_phash_max': float(np.max(phash_dists)),
        'pf_phash_min': float(np.min(phash_dists)),
        'pf_phash_dists': phash_dists,
        'pf_cv': stats['pf_cv'],
        'pf_mean_diff': stats['pf_mean_diff'],
        'pf_max_diff': stats['pf_max_diff'],
        'pf_min_diff': stats['pf_min_diff'],
        'pf_std_diff': stats['pf_std_diff'],
        'pf_min_max_ratio': stats['pf_min_max_ratio'],
        'edc_cv': edc_cv,
        'edc_cv_x_pf_cv': edc_cv * stats['pf_cv'],
        'edit_consistency_score': compute_edit_consistency_score(
            stats['pf_cv'], stats['pf_min_max_ratio'], edc_cv),
        'frame_diffs': diffs,
        # Temporal features derived from the per-frame diff vector.
        'max_frame_jump': compute_max_frame_jump(diffs),
        'block_cv': compute_block_cv(diffs),
    }
def classify(features):
    """
    Classify a sample using cascading failure rules + edit consistency score.
    Pure function. Rules are evaluated in order; the first one that fires
    wins.

    Decision tree (V6, 4 rules for 90% recall):
        F1:  phash_ratio < 0.575 -> failure (frozen/duplicated output)
        F5a: pf_phash_max >= 34 AND max_frame_jump > 0.00726101 -> failure (flickering)
        C1:  pf_max_diff < 0.023247 -> failure (subtle no edit)
        C2:  pf_cv > 0.580564 -> failure (moderate temporal inconsistency)
        ECS < 0.33 -> semi-success
        Default: success

    >>> f = {'phash_ratio': 0.3, 'pf_phash_max': 35.0, 'pf_phash_mean': 30.0,
    ...      'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.8,
    ...      'pf_cv': 0.1, 'edc_cv': 0.1, 'pf_std_diff': 0.01,
    ...      'pf_min_max_ratio': 0.8, 'pf_phash_dists': [30.0]*16,
    ...      'max_frame_jump': 0.01, 'block_cv': 0.1}
    >>> classify(f)['rating']
    'failure'
    >>> f2 = {'phash_ratio': 0.95, 'pf_phash_max': 10.0, 'pf_phash_mean': 5.0,
    ...       'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.2,
    ...       'pf_cv': 0.5, 'edc_cv': 0.8, 'pf_std_diff': 0.01,
    ...       'pf_min_max_ratio': 0.1, 'pf_phash_dists': [5.0]*16,
    ...       'max_frame_jump': 0.005, 'block_cv': 0.1}
    >>> classify(f2)['rating']
    'semi-success'
    >>> f3 = {'phash_ratio': 0.95, 'pf_phash_max': 10.0, 'pf_phash_mean': 5.0,
    ...       'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.8,
    ...       'pf_cv': 0.1, 'edc_cv': 0.1, 'pf_std_diff': 0.01,
    ...       'pf_min_max_ratio': 0.8, 'pf_phash_dists': [5.0]*16,
    ...       'max_frame_jump': 0.005, 'block_cv': 0.1}
    >>> classify(f3)['rating']
    'success'
    """
    phash_ratio = features['phash_ratio']
    phash_max = features['pf_phash_max']
    phash_mean = features['pf_phash_mean']  # read for input validation; not used by rules
    ecv_x_pcv = features['edc_cv_x_pf_cv']
    pf_cv = features['pf_cv']
    ecs = features['edit_consistency_score']
    pf_phash_dists = features['pf_phash_dists']
    max_frame_jump = features['max_frame_jump']
    block_cv = features['block_cv']  # read for input validation; not used by rules

    def _verdict(rating, confidence, categories, bad_frames, notes):
        # Shared shape for every branch's result dict.
        return {
            'rating': rating,
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': categories,
            'bad_frames': bad_frames,
            'notes': notes,
        }

    # F1: Low phash ratio -> irrelevant_output (duplicated/frozen output)
    if phash_ratio < THRESH_PHASH_RATIO:
        return _verdict(
            'failure',
            min(0.95, 0.7 + (THRESH_PHASH_RATIO - phash_ratio)),
            ['irrelevant_output'],
            _find_bad_frames_phash(pf_phash_dists, high_thresh=20.0),
            f'F1: phash_ratio={phash_ratio:.4f} < {THRESH_PHASH_RATIO}',
        )

    # F5a: High phash per-frame + jumpy diffs -> temporal inconsistency
    if phash_max >= THRESH_PHASH_MAX_TEMPORAL and max_frame_jump > THRESH_MAX_FRAME_JUMP:
        return _verdict(
            'failure',
            min(0.85, 0.5 + max_frame_jump * 3.0),
            ['temporal_inconsistency'],
            _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            (f'F5a: pf_phash_max={phash_max:.0f} >= {THRESH_PHASH_MAX_TEMPORAL:.0f} AND '
             f'max_frame_jump={max_frame_jump:.6f} > {THRESH_MAX_FRAME_JUMP:.10f}'),
        )

    # C1: Low max pixel diff -> subtle no_edit (catches failures with tiny changes)
    pf_max_diff = features.get('pf_max_diff', 1.0)
    if pf_max_diff < THRESH_C1_MAX_DIFF:
        return _verdict(
            'failure',
            min(0.75, 0.4 + (THRESH_C1_MAX_DIFF - pf_max_diff) * 10),
            ['no_edit'],
            list(range(NUM_FRAMES)),
            f'C1: pf_max_diff={pf_max_diff:.6f} < {THRESH_C1_MAX_DIFF}',
        )

    # C2: Moderate-high per-frame CV -> inconsistent editing
    if pf_cv > THRESH_C2_PF_CV:
        return _verdict(
            'failure',
            min(0.70, 0.3 + (pf_cv - THRESH_C2_PF_CV) * 0.5),
            ['temporal_inconsistency'],
            _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            f'C2: pf_cv={pf_cv:.6f} > {THRESH_C2_PF_CV}',
        )

    # Semi-success: low edit consistency score
    if ecs < THRESH_SEMI_CONSISTENCY:
        return _verdict(
            'semi-success',
            min(0.70, 0.3 + (THRESH_SEMI_CONSISTENCY - ecs)),
            ['inconsistent_edit'],
            _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            f'ECS: edit_consistency_score={ecs:.4f} < {THRESH_SEMI_CONSISTENCY}',
        )

    # Default: success
    return _verdict(
        'success',
        min(0.95, 0.5 + ecs * 0.4),
        [],
        [],
        (f'OK: phash_ratio={phash_ratio:.4f}, phash_max={phash_max:.1f}, '
         f'ecv_x_pcv={ecv_x_pcv:.4f}, ecs={ecs:.4f}'),
    )
def _find_bad_frames_phash(pf_phash_dists, high_thresh=None, low_thresh=None):
"""
Identify frame indices that are outliers based on phash distance.
Pure function.
>>> _find_bad_frames_phash([1.0, 25.0, 3.0, 30.0], high_thresh=20.0)
[1, 3]
>>> _find_bad_frames_phash([10.0, 0.0, 8.0, 1.0], low_thresh=2.0)
[1, 3]
"""
bad = []
for i, d in enumerate(pf_phash_dists):
if high_thresh is not None and d > high_thresh:
bad.append(i)
if low_thresh is not None and d < low_thresh:
bad.append(i)
return bad
# ============================================================
# CLI Commands
# ============================================================
def analyze(sample_dir):
    """
    Analyze a single sample and print JSON verdict.

    Args:
        sample_dir: path to sample directory containing before.png, after.png

    Returns:
        dict: the verdict that was printed (rating, confidence, frame
        scores, raw signals, notes).
    """
    sample_path = Path(sample_dir)
    feats = compute_features(sample_path)
    verdict = classify(feats)
    # Normalize per-frame phash distances to [0, 1]; 64 is the maximum
    # Hamming distance for a 64-bit perceptual hash.
    max_hamming = 64.0
    frame_scores = [min(1.0, d / max_hamming) for d in feats['pf_phash_dists']]
    # (feature key, rounding digits) in the exact order the report lists them.
    signal_spec = [
        ('phash_ratio', 4), ('pf_phash_mean', 4), ('pf_phash_max', 4),
        ('pf_phash_min', 4), ('pf_cv', 4), ('pf_std_diff', 6),
        ('edc_cv', 4), ('edc_cv_x_pf_cv', 4), ('pf_min_max_ratio', 4),
        ('max_frame_jump', 6), ('block_cv', 6),
    ]
    raw_signals = {key: round(feats[key], digits) for key, digits in signal_spec}
    raw_signals['pf_phash_dists'] = [round(d, 1) for d in feats['pf_phash_dists']]
    report = {
        "sample_id": sample_path.name,
        "rating": verdict["rating"],
        "confidence": verdict["confidence"],
        "edit_consistency_score": verdict["edit_consistency_score"],
        "failure_categories": verdict["failure_categories"],
        "bad_frames": verdict["bad_frames"],
        "frame_scores": [round(s, 4) for s in frame_scores],
        "raw_signals": raw_signals,
        "notes": verdict["notes"],
    }
    print(json.dumps(report, indent=2))
    return report
def _sample_report(sample_id, features, result):
    """Assemble the JSON-serializable per-sample report from features + verdict."""
    max_possible = 64.0  # max Hamming distance for a 64-bit phash
    frame_scores = [min(1.0, d / max_possible) for d in features['pf_phash_dists']]
    return {
        "sample_id": sample_id,
        "rating": result["rating"],
        "confidence": result["confidence"],
        "edit_consistency_score": result["edit_consistency_score"],
        "failure_categories": result["failure_categories"],
        "bad_frames": result["bad_frames"],
        "frame_scores": [round(s, 4) for s in frame_scores],
        "raw_signals": {
            "phash_ratio": round(features['phash_ratio'], 4),
            "pf_phash_mean": round(features['pf_phash_mean'], 4),
            "pf_phash_max": round(features['pf_phash_max'], 4),
            "pf_phash_min": round(features['pf_phash_min'], 4),
            "pf_cv": round(features['pf_cv'], 4),
            "pf_std_diff": round(features['pf_std_diff'], 6),
            "edc_cv": round(features['edc_cv'], 4),
            "edc_cv_x_pf_cv": round(features['edc_cv_x_pf_cv'], 4),
            "pf_min_max_ratio": round(features['pf_min_max_ratio'], 4),
            "max_frame_jump": round(features['max_frame_jump'], 6),
            "block_cv": round(features['block_cv'], 6),
            "pf_phash_dists": [round(d, 1) for d in features['pf_phash_dists']],
        },
        "notes": result["notes"],
    }


def analyze_batch(samples_dir=None, output_dir=None):
    """
    Analyze all samples in a directory and save results.

    Writes one JSON file per sample plus a batch_results.json summary,
    prints progress and an ECS distribution summary, evaluates against
    ground truth, and logs the experiment.

    Args:
        samples_dir: directory containing sample subdirectories
            (defaults to SAMPLES_DIR)
        output_dir: directory to write JSON results to
            (defaults to WORK_DIR/generated/predictions)

    Returns:
        list of per-sample result dicts.
    """
    if samples_dir is None:
        samples_dir = SAMPLES_DIR
    samples_dir = Path(samples_dir)
    if output_dir is None:
        output_dir = WORK_DIR / 'generated' / 'predictions'
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Require BOTH grids up front so compute_features cannot crash mid-batch
    # on a half-populated sample folder (original only checked before.png).
    sample_dirs = sorted([
        d for d in samples_dir.iterdir()
        if d.is_dir() and (d / "before.png").exists() and (d / "after.png").exists()
    ])
    print(f"Processing {len(sample_dirs)} samples...")
    results = []
    counts = {'success': 0, 'semi-success': 0, 'failure': 0}
    for i, sd in enumerate(sample_dirs):
        sample_id = sd.name
        features = compute_features(sd)
        result = classify(features)
        output = _sample_report(sample_id, features, result)
        results.append(output)
        counts[result['rating']] += 1
        with open(output_dir / f"{sample_id}.json", 'w') as f:
            json.dump(output, f, indent=2)
        if (i + 1) % 100 == 0:
            print(f" {i+1}/{len(sample_dirs)} done "
                  f"(S={counts['success']} SS={counts['semi-success']} F={counts['failure']})")
    # Save batch summary
    with open(output_dir / "batch_results.json", 'w') as f:
        json.dump(results, f, indent=2)
    # ECS distribution summary (failures excluded: their ECS is not meaningful)
    non_fail_ecs = [r['edit_consistency_score'] for r in results if r['rating'] != 'failure']
    print(f"\nProcessed {len(results)} samples -> {output_dir}")
    print(f" success={counts['success']} semi-success={counts['semi-success']} failure={counts['failure']}")
    if non_fail_ecs:
        nf = np.array(non_fail_ecs)
        print(f"\nEdit consistency score (non-failure only, n={len(nf)}):")
        print(f" min={float(np.min(nf)):.4f} p10={float(np.percentile(nf, 10)):.4f} "
              f"median={float(np.median(nf)):.4f} p90={float(np.percentile(nf, 90)):.4f} "
              f"max={float(np.max(nf)):.4f}")
    # Evaluate against ground truth
    evaluate(results)
    # Log experiment (records thresholds so runs stay comparable over time)
    log_experiment("v6_4rule", {
        'fail_rules': [
            f'F1: phash_ratio < {THRESH_PHASH_RATIO}',
            f'F5a: pf_phash_max >= {THRESH_PHASH_MAX_TEMPORAL} AND max_frame_jump > {THRESH_MAX_FRAME_JUMP}',
            f'C1: pf_max_diff < {THRESH_C1_MAX_DIFF}',
            f'C2: pf_cv > {THRESH_C2_PF_CV}',
        ],
        'semi_threshold': THRESH_SEMI_CONSISTENCY,
        'labeled_samples': 195,
    }, {
        'total': len(results),
        'success': counts['success'],
        'semi_success': counts['semi-success'],
        'failure': counts['failure'],
        'ecs_median_nonfail': float(np.median(non_fail_ecs)) if non_fail_ecs else None,
    })
    return results
def evaluate(results):
    """
    Evaluate results against ground truth and print metrics.
    Pure function (prints but no file mutation).
    """
    classes = ['success', 'semi-success', 'failure']
    confusion = {true: {pred: 0 for pred in classes} for true in classes}
    errors = []
    total = 0
    correct_3class = 0
    correct_binary = 0
    # Failure-detection counts, accumulated in the same pass as accuracy.
    tp = fp = fn = 0
    for r in results:
        sid = r['sample_id']
        gt = GROUND_TRUTH.get(sid)
        if gt is None:
            continue  # unlabeled sample: skip from all metrics
        total += 1
        gt_rating = gt['rating']
        pred_rating = r['rating']
        confusion[gt_rating][pred_rating] += 1
        if gt_rating == pred_rating:
            correct_3class += 1
        else:
            errors.append({
                'sample_id': sid,
                'gt': gt_rating,
                'pred': pred_rating,
                'ecs': r.get('edit_consistency_score', None),
                'notes': r.get('notes', ''),
            })
        gt_is_fail = gt_rating == 'failure'
        pred_is_fail = pred_rating == 'failure'
        if gt_is_fail == pred_is_fail:
            correct_binary += 1
        if gt_is_fail and pred_is_fail:
            tp += 1
        elif pred_is_fail:
            fp += 1
        elif gt_is_fail:
            fn += 1
    if total == 0:
        print("No labeled samples found for evaluation.")
        return {}
    acc_3 = correct_3class / total
    acc_b = correct_binary / total
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    print(f"\n{'='*70}")
    print(f"EVALUATION ({total} labeled samples)")
    print(f"{'='*70}")
    print(f"3-class accuracy: {acc_3:.1%} ({correct_3class}/{total})")
    print(f"Binary accuracy: {acc_b:.1%} ({correct_binary}/{total})")
    print(f"Precision: {precision:.3f} (FP={fp})")
    print(f"Recall: {recall:.3f} ({tp}/{tp+fn} failures detected)")
    print(f"\nConfusion matrix:")
    for gt_class in classes:
        row = confusion[gt_class]
        gt_total = sum(row.values())
        print(f" gt={gt_class:<13} ({gt_total:2d}): " +
              " ".join(f"{k}={v}" for k, v in row.items()))
    if errors:
        print(f"\nErrors ({len(errors)}):")
        for e in errors:
            ecs_str = f"ecs={e['ecs']:.4f}" if e['ecs'] is not None else ""
            print(f" {e['sample_id']}: gt={e['gt']:<13} pred={e['pred']:<13} {ecs_str} | {e['notes']}")
    print(f"{'='*70}")
    return {
        'total': total,
        'accuracy_3class': acc_3,
        'accuracy_binary': acc_b,
        'precision': precision,
        'recall': recall,
        'tp': tp, 'fp': fp, 'fn': fn,
    }
def log_experiment(variant_name, params, metrics):
    """Append a timestamped experiment record to the JSONL experiments file."""
    record = {
        "timestamp": datetime.datetime.now().isoformat(),
        "variant": variant_name,
        "params": params,
        "metrics": metrics,
    }
    # One JSON object per line (JSONL) so the file can be appended forever.
    line = json.dumps(record)
    with open(EXPERIMENTS_FILE, 'a') as f:
        f.write(line + "\n")
    print(f"Logged experiment: {variant_name}")
if __name__ == '__main__':
    # Expose the two CLI commands via python-fire, e.g.:
    #   python best_detector.py analyze --sample_dir /path/to/sample/
    #   python best_detector.py analyze_batch --samples_dir ... --output_dir ...
    fire.Fire({
        'analyze': analyze,
        'analyze_batch': analyze_batch,
    })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment