SqrtRyan · February 24, 2026 02:58
diff --git a/gistfile1.txt b/gistfile1.txt
 """
 Nanobanana Edit Quality Detector — single-file, 4 rules, 90% recall.

 This is a self-contained copy of .claude_auto_score/best_detector.py.
 See .claude_auto_score/ for the full analysis pipeline, manifest, and report.

 Self-contained detector for video edit quality. No external dependencies
 beyond standard pip packages. Takes a sample folder (before.png + after.png),
 returns a JSON verdict.

 Setup:
    pip install numpy Pillow imagehash fire

 Usage:
    python best_detector.py analyze --sample_dir /path/to/sample/
    python best_detector.py analyze_batch --samples_dir /path/to/samples/

 Each sample folder must contain:
    before.png  — 4x4 grid of 16 original video keyframes (1376x768 pixels, width x height)
    after.png   — 4x4 grid of 16 edited video keyframes

 Output: JSON with rating (success/semi-success/failure), confidence,
 failure type, and per-frame quality scores.

 The 4 detection rules (OR'd — any one firing = failure):

  F1:  phash_ratio < 0.575 → failure (irrelevant_output)
       Output frames are far more alike than the input frames (frozen/duplicated).

  F5a: pf_phash_max >= 34 AND max_frame_jump > 0.007261 → failure (temporal_inconsistency)
       Big edit that flickers between consecutive frames.

  C1:  pf_max_diff < 0.023247 → failure (no_edit)
       Nothing meaningful was changed in any frame.

  C2:  pf_cv > 0.580564 → failure (temporal_inconsistency)
       Edit magnitude varies too much across frames.

 Usage:
    python best_detector.py analyze --sample_dir /path/to/sample/
    python best_detector.py analyze_batch --output_dir ./generated/predictions/
 """

 import json
 import datetime
 from pathlib import Path

 import fire
 import numpy as np
 import imagehash
 from PIL import Image


 # ============================================================
 # Grid geometry constants
 # ============================================================
 # before.png / after.png are each a GRID_ROWS x GRID_COLS sheet of keyframes;
 # the cell size is the grid size divided by the number of rows / columns.
 GRID_ROWS = 4
 GRID_COLS = 4
 NUM_FRAMES = GRID_ROWS * GRID_COLS  # 16
 GRID_H = 768   # grid image height in pixels
 GRID_W = 1376  # grid image width in pixels
 CELL_H = GRID_H // GRID_ROWS  # 192
 CELL_W = GRID_W // GRID_COLS   # 344

 # ============================================================
 # Paths
 # ============================================================
 WORK_DIR = Path(__file__).parent  # directory containing this script
 EXPERIMENTS_FILE = WORK_DIR / "retune_experiments.jsonl"
 # NOTE(review): machine-specific absolute path; presumably the default samples
 # root for batch runs -- confirm against the callers before relying on it.
 SAMPLES_DIR = Path('/root/CleanCode/Datasets/Yash/Nanobanana/V1/jan10_last_50K_pexels_v2/training_preview/samples/')

 # ============================================================
 # Classifier thresholds (V6, tuned on 177 labeled samples)
 # ============================================================
 # The failure rules are OR'd: the first one that fires decides the verdict.
 THRESH_PHASH_RATIO = 0.575         # F1: phash_ratio < this -> failure (frozen/duplicated output)
 THRESH_PHASH_MAX_TEMPORAL = 34.0   # F5a: pf_phash_max >= this AND ...
 THRESH_MAX_FRAME_JUMP = 0.007261008024215698  # F5a: ... max_frame_jump > this
 THRESH_C1_MAX_DIFF = 0.023247      # C1: pf_max_diff < this -> failure (subtle no-edit)
 THRESH_C2_PF_CV = 0.580564         # C2: pf_cv > this -> failure (moderate inconsistency)

 # Semi-success threshold on edit consistency score:
 # a score below this yields semi-success when no failure rule fired.
 THRESH_SEMI_CONSISTENCY = 0.33

 # ============================================================
 # Ground truth labels (172 original + 5 new = 177 samples)
 # ============================================================
 GROUND_TRUTH = {
    "8091096_20260111_075419": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8091546_20260111_075420": {"rating": "success", "failure_categories": []},
    "8092893_20260111_075420": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8093592_20260111_075430": {"rating": "success", "failure_categories": []},
    "8093996_20260111_075432": {"rating": "success", "failure_categories": []},
    "8094039_20260111_075433": {"rating": "success", "failure_categories": []},
    "8094287_20260111_075445": {"rating": "success", "failure_categories": []},
    "8095001_20260111_075447": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8098080_20260111_075500": {"rating": "success", "failure_categories": []},
    "8100967_20260111_075515": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8102246_20260111_075517": {"rating": "success", "failure_categories": []},
    "8102787_20260111_075528": {"rating": "success", "failure_categories": []},
    "8103055_20260111_075547": {"rating": "success", "failure_categories": []},
    "8103302_20260111_075549": {"rating": "semi-success", "failure_categories": ["partial_frame_edit", "first_frame_mismatch"]},
    "8103498_20260111_075555": {"rating": "success", "failure_categories": []},
    "8103878_20260111_075619": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8104044_20260111_075621": {"rating": "success", "failure_categories": []},
    "8107716_20260111_075657": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8111743_20260111_075724": {"rating": "success", "failure_categories": []},
    "8111842_20260111_075727": {"rating": "success", "failure_categories": []},
    "8113101_20260111_075729": {"rating": "success", "failure_categories": []},
    "8114285_20260111_075730": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8114926_20260111_075732": {"rating": "success", "failure_categories": []},
    "8114990_20260111_075733": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8115919_20260111_075747": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8116169_20260111_075751": {"rating": "success", "failure_categories": []},
    "8117115_20260111_075758": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8120246_20260111_075802": {"rating": "success", "failure_categories": []},
    "8120363_20260111_075803": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8122765_20260111_075830": {"rating": "failure", "failure_categories": ["irrelevant_output", "temporal_inconsistency"]},
    "8122912_20260111_075835": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8122956_20260111_075835": {"rating": "success", "failure_categories": []},
    "8123978_20260111_075839": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8124027_20260111_075842": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8124056_20260111_075843": {"rating": "success", "failure_categories": []},
    "8124064_20260111_075843": {"rating": "success", "failure_categories": []},
    "8125902_20260111_075847": {"rating": "success", "failure_categories": []},
    "8126475_20260111_075852": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8126710_20260111_075854": {"rating": "success", "failure_categories": []},
    "8126808_20260111_075855": {"rating": "success", "failure_categories": []},
    "8128164_20260111_075857": {"rating": "success", "failure_categories": []},
    "8129104_20260111_075907": {"rating": "success", "failure_categories": []},
    "8132008_20260111_075909": {"rating": "success", "failure_categories": []},
    "8132374_20260111_075912": {"rating": "failure", "failure_categories": ["irrelevant_output", "first_frame_mismatch"]},
    "8135096_20260111_075921": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8135559_20260111_075927": {"rating": "success", "failure_categories": []},
    "8135644_20260111_075929": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8136023_20260111_075939": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8139413_20260111_075952": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8141301_20260111_080002": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8141399_20260111_080003": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8141501_20260111_080004": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8143551_20260111_080022": {"rating": "success", "failure_categories": []},
    "8145135_20260111_080027": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8145138_20260111_080027": {"rating": "success", "failure_categories": []},
    "8145158_20260111_080028": {"rating": "success", "failure_categories": []},
    "8150255_20260111_080033": {"rating": "success", "failure_categories": []},
    "8150529_20260111_080037": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8150531_20260111_080038": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8152220_20260111_080050": {"rating": "success", "failure_categories": []},
    "8154493_20260111_080058": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8154855_20260111_080059": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8155547_20260111_080105": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8157132_20260111_080123": {"rating": "success", "failure_categories": []},
    "8157137_20260111_080124": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8157299_20260111_080126": {"rating": "success", "failure_categories": []},
    "8160023_20260111_080147": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8160579_20260111_080159": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8164091_20260111_080218": {"rating": "success", "failure_categories": []},
    "8165172_20260111_080236": {"rating": "success", "failure_categories": []},
    "8165769_20260111_080246": {"rating": "success", "failure_categories": []},
    "8165779_20260111_080247": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8165906_20260111_080250": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8165941_20260111_080251": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8166010_20260111_080256": {"rating": "success", "failure_categories": []},
    "8170106_20260111_080303": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8170478_20260111_080307": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency", "partial_frame_edit"]},
    "8171568_20260111_080320": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8171894_20260111_080321": {"rating": "success", "failure_categories": []},
    "8173112_20260111_080325": {"rating": "success", "failure_categories": []},
    "8174314_20260111_080330": {"rating": "failure", "failure_categories": ["partial_frame_edit", "temporal_inconsistency"]},
    "8179119_20260111_080404": {"rating": "success", "failure_categories": []},
    "8179751_20260111_080423": {"rating": "success", "failure_categories": []},
    "8180407_20260111_080430": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]},
    "8189505_20260111_080502": {"rating": "success", "failure_categories": []},
    "8190131_20260111_080514": {"rating": "success", "failure_categories": []},
    # ---- Validation set (86 samples) ----
    "8190717_20260111_080521": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8191029_20260111_080543": {"rating": "success", "failure_categories": []},
    "8191204_20260111_080549": {"rating": "success", "failure_categories": []},
    "8191207_20260111_080549": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8217072_20260111_080756": {"rating": "success", "failure_categories": []},
    "8224605_20260111_080807": {"rating": "success", "failure_categories": []},
    "8227240_20260111_080817": {"rating": "success", "failure_categories": []},
    "8230620_20260111_080825": {"rating": "success", "failure_categories": []},
    "8230704_20260111_080828": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8231712_20260111_080841": {"rating": "success", "failure_categories": []},
    "8232437_20260111_080845": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8239197_20260111_081004": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8246869_20260111_081041": {"rating": "success", "failure_categories": []},
    "8247091_20260111_081045": {"rating": "success", "failure_categories": []},
    "8249485_20260111_081051": {"rating": "success", "failure_categories": []},
    "8252868_20260111_081102": {"rating": "success", "failure_categories": []},
    "8253045_20260111_081103": {"rating": "success", "failure_categories": []},
    "8255167_20260111_081116": {"rating": "success", "failure_categories": []},
    "8256631_20260111_081130": {"rating": "success", "failure_categories": []},
    "8257519_20260111_081141": {"rating": "success", "failure_categories": []},
    "8261379_20260111_081154": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8263453_20260111_081205": {"rating": "success", "failure_categories": []},
    "8264023_20260111_081210": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8271006_20260111_081250": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8279291_20260111_081343": {"rating": "success", "failure_categories": []},
    "8280495_20260111_081400": {"rating": "success", "failure_categories": []},
    "8286774_20260111_081427": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8287068_20260111_081435": {"rating": "success", "failure_categories": []},
    "8293570_20260111_081508": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8296055_20260111_081513": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8296063_20260111_081515": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8298306_20260111_081550": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8303291_20260111_081616": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8313496_20260111_081701": {"rating": "success", "failure_categories": []},
    "8318204_20260111_081706": {"rating": "success", "failure_categories": []},
    "8318647_20260111_081715": {"rating": "success", "failure_categories": []},
    "8320029_20260111_081727": {"rating": "success", "failure_categories": []},
    "8322056_20260111_081738": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8322394_20260111_081758": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8322707_20260111_081811": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "8326499_20260111_081843": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8328150_20260111_081907": {"rating": "success", "failure_categories": []},
    "8328525_20260111_081913": {"rating": "success", "failure_categories": []},
    "8328601_20260111_081915": {"rating": "success", "failure_categories": []},
    "8328618_20260111_081917": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8333881_20260111_081931": {"rating": "success", "failure_categories": []},
    "8334110_20260111_081933": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8334414_20260111_081938": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8342094_20260111_082018": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8342755_20260111_082028": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8348772_20260111_082253": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8348814_20260111_082256": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8363636_20260111_082446": {"rating": "success", "failure_categories": []},
    "8367941_20260111_082455": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8370544_20260111_082509": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8375487_20260111_082529": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8384587_20260111_082705": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8384695_20260111_082706": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8385333_20260111_082711": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8401313_20260111_082834": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8410570_20260111_082901": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8411075_20260111_082904": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8416654_20260111_082922": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8419267_20260111_082929": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8421308_20260111_082953": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8426096_20260111_083020": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8431832_20260111_083114": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8432042_20260111_083125": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8433838_20260111_083133": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8435989_20260111_083234": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8439412_20260111_083257": {"rating": "success", "failure_categories": []},
    "8449552_20260111_083505": {"rating": "success", "failure_categories": []},
    "8456997_20260111_083601": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8460890_20260111_083637": {"rating": "failure", "failure_categories": ["irrelevant_output"]},
    "8462194_20260111_083642": {"rating": "success", "failure_categories": []},
    "8472301_20260111_083941": {"rating": "success", "failure_categories": []},
    "8477934_20260111_084029": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8479456_20260111_084055": {"rating": "failure", "failure_categories": ["no_edit"]},
    "8486374_20260111_084134": {"rating": "success", "failure_categories": []},
    "8486892_20260111_084137": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8493380_20260111_084232": {"rating": "success", "failure_categories": []},
    "8511010_20260111_084432": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8519367_20260111_084716": {"rating": "success", "failure_categories": []},
    "8524028_20260111_084807": {"rating": "success", "failure_categories": []},
    "8531234_20260111_084935": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "9710748_20260111_101911": {"rating": "success", "failure_categories": []},
    # ---- Additional labels (5 new from ratings.json; excludes 8468908 which
    #       is a semi-success hit by pre-existing F5c -- needs F5c retune) ----
    "854716_20260111_085305": {"rating": "failure", "failure_categories": ["no_edit"]},
    "854978_20260111_085324": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "855095_20260111_085333": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]},
    "857166_20260111_085626": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
    "8622067_20260111_085833": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]},
 }


 # ============================================================
 # Pure helper functions
 # ============================================================

 def extract_cells_pil(grid_img):
    """
    Slice a grid image into its GRID_ROWS x GRID_COLS cells (PIL Images).

    Pure function. Cells come back in row-major order: left to right
    within a row, rows from top to bottom. Every crop box is
    CELL_W x CELL_H pixels, anchored at the grid's top-left corner.

    >>> img = Image.new('RGB', (1376, 768), (128, 128, 128))
    >>> cells = extract_cells_pil(img)
    >>> len(cells)
    16
    >>> cells[0].size
    (344, 192)
    """
    cells = []
    for index in range(GRID_ROWS * GRID_COLS):
        # A flat frame index maps to (row, col) in row-major order.
        row, col = divmod(index, GRID_COLS)
        left, top = col * CELL_W, row * CELL_H
        cells.append(grid_img.crop((left, top, left + CELL_W, top + CELL_H)))
    return cells


 def cells_to_gray_arrays(cells, resize=(64, 64)):
    """
    Turn PIL cell images into grayscale float32 arrays scaled to [0, 1].

    Pure function. `resize` is indexed as (rows, cols) of the resulting
    array; pass None to keep every cell at its native size.

    >>> img = Image.new('RGB', (344, 192), (128, 128, 128))
    >>> arrs = cells_to_gray_arrays([img], resize=(32, 32))
    >>> arrs[0].shape
    (32, 32)
    """
    def _to_array(cell):
        mono = cell.convert('L')
        if resize is not None:
            # PIL's resize takes (width, height), hence the swapped order.
            target = (resize[1], resize[0])
            mono = mono.resize(target, Image.BILINEAR)
        return np.array(mono, dtype=np.float32) / 255.0

    return [_to_array(cell) for cell in cells]


 def compute_phash_list(cells):
    """
    Return the perceptual hash (imagehash.phash) of every PIL Image cell.

    Pure function. The output list has one hash per input cell, in the
    same order.

    >>> img = Image.new('RGB', (100, 100), (128, 128, 128))
    >>> hashes = compute_phash_list([img])
    >>> len(hashes)
    1
    """
    hashes = []
    for cell in cells:
        hashes.append(imagehash.phash(cell))
    return hashes


 def mean_pairwise_hamming(hashes):
    """
    Average the Hamming distance over every unordered pair of hashes.

    Pure function. Measures internal variety within a set of frames.
    Returns 0.0 when fewer than two hashes are given (no pairs exist).

    >>> h = imagehash.phash(Image.new('RGB', (100, 100), (0, 0, 0)))
    >>> mean_pairwise_hamming([h, h, h])
    0.0
    """
    size = len(hashes)
    if size < 2:
        return 0.0

    distance_sum = 0.0
    for first, left in enumerate(hashes):
        for second in range(first + 1, size):
            # Subtracting two hashes yields their Hamming distance.
            distance_sum += float(left - hashes[second])

    # Number of unordered pairs among `size` items.
    pair_count = size * (size - 1) // 2
    return distance_sum / pair_count


 def compute_per_frame_phash_dists(before_cells_pil, after_cells_pil):
    """
    Return, per frame, the phash Hamming distance between before[i] and after[i].

    Pure function. Frames are paired with zip, so the result has as many
    entries as the shorter of the two inputs.

    >>> b = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(16)]
    >>> a = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(16)]
    >>> dists = compute_per_frame_phash_dists(b, a)
    >>> all(d == 0 for d in dists)
    True
    """
    return [
        float(imagehash.phash(original) - imagehash.phash(edited))
        for original, edited in zip(before_cells_pil, after_cells_pil)
    ]


 def compute_per_frame_diffs(before_gray, after_gray):
    """
    Summarise the mean absolute pixel difference of every frame pair.

    Pure function. For each index i the mean of |after[i] - before[i]| is
    taken; the returned dict holds that raw vector ('frame_diffs') plus its
    mean, max, min and std ('pf_mean_diff', 'pf_max_diff', 'pf_min_diff',
    'pf_std_diff'), the coefficient of variation std/mean ('pf_cv') and
    min/max ('pf_min_max_ratio'). Both ratios fall back to 0.0 when their
    denominator is not above 1e-10.

    >>> import numpy as np
    >>> b = [np.zeros((4, 4), dtype=np.float32) for _ in range(16)]
    >>> a = [np.full((4, 4), 0.5, dtype=np.float32) for _ in range(16)]
    >>> r = compute_per_frame_diffs(b, a)
    >>> r['pf_mean_diff']
    0.5
    >>> r['pf_cv']
    0.0
    """
    frame_diffs = [
        float(np.mean(np.abs(after_gray[idx] - before)))
        for idx, before in enumerate(before_gray)
    ]

    diffs = np.array(frame_diffs)
    stats = {
        'frame_diffs': frame_diffs,
        'pf_mean_diff': float(np.mean(diffs)),
        'pf_max_diff': float(np.max(diffs)),
        'pf_min_diff': float(np.min(diffs)),
        'pf_std_diff': float(np.std(diffs)),
    }

    # Guard both ratios against a (near-)zero denominator.
    if stats['pf_mean_diff'] > 1e-10:
        stats['pf_cv'] = stats['pf_std_diff'] / stats['pf_mean_diff']
    else:
        stats['pf_cv'] = 0.0

    if stats['pf_max_diff'] > 1e-10:
        stats['pf_min_max_ratio'] = stats['pf_min_diff'] / stats['pf_max_diff']
    else:
        stats['pf_min_max_ratio'] = 0.0

    return stats


 def compute_edit_delta_cv(before_gray, after_gray):
    """
    Return the coefficient of variation of the per-frame edit-delta variance.

    Pure function. For every frame the variance of |after[i] - before[i]|
    is taken; the result is std/mean of those variances, or 0.0 when their
    mean is not above 1e-10. High values mean some frames were edited much
    more than others spatially.

    >>> import numpy as np
    >>> b = [np.zeros((4, 4), dtype=np.float32) for _ in range(16)]
    >>> a = [np.ones((4, 4), dtype=np.float32) * 0.5 for _ in range(16)]
    >>> compute_edit_delta_cv(b, a)
    0.0
    """
    variances = np.array([
        float(np.var(np.abs(after_gray[idx] - before)))
        for idx, before in enumerate(before_gray)
    ])
    average = float(np.mean(variances))
    spread = float(np.std(variances))
    if average > 1e-10:
        return spread / average
    return 0.0


 def compute_edit_consistency_score(pf_cv, pf_min_max_ratio, edc_cv):
    """
    Combine three consistency features into a single score in [0, 1].

    Pure function. The score is the plain average of 1 - pf_cv,
    pf_min_max_ratio and 1 - edc_cv, each clamped to [0, 1] first.
    Higher = more consistent edit across all frames.

    >>> compute_edit_consistency_score(0.0, 1.0, 0.0)
    1.0
    >>> compute_edit_consistency_score(1.0, 0.0, 1.0)
    0.0
    """
    def _unit(value):
        # Clamp to the closed unit interval.
        return max(0.0, min(1.0, value))

    from_pf_cv = _unit(1.0 - pf_cv)
    from_ratio = _unit(pf_min_max_ratio)
    from_edc_cv = _unit(1.0 - edc_cv)
    return (from_pf_cv + from_ratio + from_edc_cv) / 3.0


 def compute_phash_ratio(before_cells_pil, after_cells_pil):
    """
    Return phash_ratio: internal variety of the output frames vs the input frames.

    Pure function. Variety is the mean pairwise phash Hamming distance
    within each set. Low ratio (<0.575) means output frames are much more
    alike than input frames, indicating frozen/duplicated irrelevant output.
    When the input frames have no variety the ratio is inf if the output
    has some variety, else 1.0.

    >>> b = [Image.new('RGB', (100, 100), (i*10, i*10, i*10)) for i in range(4)]
    >>> a = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(4)]
    >>> ratio = compute_phash_ratio(b, a)
    >>> ratio < 0.5
    True
    """
    hashes_in = compute_phash_list(before_cells_pil)
    hashes_out = compute_phash_list(after_cells_pil)
    variety_in = mean_pairwise_hamming(hashes_in)
    variety_out = mean_pairwise_hamming(hashes_out)

    if variety_in > 1e-10:
        return variety_out / variety_in
    # Input frames are (near-)identical, so a true ratio is undefined.
    if variety_out > 1e-10:
        return float('inf')
    return 1.0


 def compute_max_frame_jump(frame_diffs):
    """
    Return the largest absolute change between consecutive per-frame diffs.

    Pure function. High values mean the edit magnitude jumps sharply between
    consecutive frames, indicating flickering or temporal inconsistency.
    Returns 0.0 when fewer than two values are given.

    >>> compute_max_frame_jump([0.5, 0.5, 0.5])
    0.0
    >>> compute_max_frame_jump([0.0, 1.0, 0.0])
    1.0
    """
    series = np.array(frame_diffs)
    if len(series) >= 2:
        steps = np.diff(series)
        return float(np.abs(steps).max())
    return 0.0


 def compute_block_cv(frame_diffs, n_blocks=4):
    """
    Split frames into n_blocks consecutive blocks and return the CV of the block means.

    Pure function. High CV across blocks means edit intensity varies
    greatly across different temporal segments of the video.

    Each block holds len(frame_diffs) // n_blocks frames; trailing frames
    that do not fill a whole block are ignored. Returns 0.0 when there are
    fewer frames than blocks (including empty input), or when the mean of
    the block means is below 1e-10.

    >>> compute_block_cv([0.5] * 16)
    0.0
    >>> compute_block_cv([])
    0.0
    """
    arr = np.array(frame_diffs)
    block_size = len(arr) // n_blocks
    if block_size < 1:
        # Too few frames to fill a single block: averaging empty slices
        # would produce NaN (plus RuntimeWarnings), so report no variation.
        return 0.0

    block_means = np.array([
        float(np.mean(arr[i * block_size:(i + 1) * block_size]))
        for i in range(n_blocks)
    ])
    mean = float(np.mean(block_means))
    if mean < 1e-10:
        return 0.0
    return float(np.std(block_means) / mean)


 def compute_features(sample_dir):
    """
    Compute all classifier features for a single sample.

    Reads before.png and after.png from sample_dir, splits each grid into
    cells, and derives the phash-based and pixel-diff-based features.

    Args:
        sample_dir: path (str or Path) of a folder containing before.png
            and after.png.

    Returns:
        dict with all features needed for classify(): 'phash_ratio',
        'pf_phash_mean' / 'pf_phash_max' / 'pf_phash_min' /
        'pf_phash_dists', 'pf_cv', 'pf_mean_diff', 'pf_max_diff',
        'pf_min_diff', 'pf_std_diff', 'pf_min_max_ratio', 'edc_cv',
        'edc_cv_x_pf_cv', 'edit_consistency_score', 'frame_diffs',
        'max_frame_jump' and 'block_cv'.

    Raises:
        Whatever PIL's Image.open raises when a file is missing or unreadable.
    """
    sample_dir = Path(sample_dir)
    # convert() returns a fully loaded image that no longer needs the file,
    # so each handle is closed as soon as its block ends instead of being
    # left to the garbage collector.
    with Image.open(sample_dir / "before.png") as raw_before:
        before_img = raw_before.convert('RGB')
    with Image.open(sample_dir / "after.png") as raw_after:
        after_img = raw_after.convert('RGB')

    before_cells_pil = extract_cells_pil(before_img)
    after_cells_pil = extract_cells_pil(after_img)

    # Per-frame phash distances (before[i] vs after[i])
    pf_phash_dists = compute_per_frame_phash_dists(before_cells_pil, after_cells_pil)
    pf_phash_mean = float(np.mean(pf_phash_dists))
    pf_phash_max = float(np.max(pf_phash_dists))
    pf_phash_min = float(np.min(pf_phash_dists))

    # Phash ratio (internal variety: output vs input)
    phash_ratio = compute_phash_ratio(before_cells_pil, after_cells_pil)

    # Per-frame pixel diffs on downscaled grayscale cells
    before_gray = cells_to_gray_arrays(before_cells_pil, resize=(64, 64))
    after_gray = cells_to_gray_arrays(after_cells_pil, resize=(64, 64))

    pf_stats = compute_per_frame_diffs(before_gray, after_gray)
    edc_cv = compute_edit_delta_cv(before_gray, after_gray)
    frame_diffs = pf_stats['frame_diffs']

    # Temporal features from the per-frame diff vector
    max_frame_jump = compute_max_frame_jump(frame_diffs)
    block_cv = compute_block_cv(frame_diffs)

    # Edit consistency score
    ecs = compute_edit_consistency_score(
        pf_stats['pf_cv'],
        pf_stats['pf_min_max_ratio'],
        edc_cv,
    )

    return {
        'phash_ratio': phash_ratio,
        'pf_phash_mean': pf_phash_mean,
        'pf_phash_max': pf_phash_max,
        'pf_phash_min': pf_phash_min,
        'pf_phash_dists': pf_phash_dists,
        'pf_cv': pf_stats['pf_cv'],
        'pf_mean_diff': pf_stats['pf_mean_diff'],
        'pf_max_diff': pf_stats['pf_max_diff'],
        'pf_min_diff': pf_stats['pf_min_diff'],
        'pf_std_diff': pf_stats['pf_std_diff'],
        'pf_min_max_ratio': pf_stats['pf_min_max_ratio'],
        'edc_cv': edc_cv,
        'edc_cv_x_pf_cv': edc_cv * pf_stats['pf_cv'],
        'edit_consistency_score': ecs,
        'frame_diffs': frame_diffs,
        'max_frame_jump': max_frame_jump,
        'block_cv': block_cv,
    }


 def classify(features):
    """
    Classify a sample using cascading failure rules + edit consistency score.

    Pure function.

    Decision tree (V6, 4 rules for 90% recall):
      F1:  phash_ratio < 0.575 -> failure (frozen/duplicated output)
      F5a: pf_phash_max >= 34 AND max_frame_jump > 0.00726101 -> failure (flickering)
      C1:  pf_max_diff < 0.023247 -> failure (subtle no edit)
      C2:  pf_cv > 0.580564 -> failure (moderate temporal inconsistency)
      ECS < 0.33 -> semi-success
      Default: success

    >>> f = {'phash_ratio': 0.3, 'pf_phash_max': 35.0, 'pf_phash_mean': 30.0,
    ...      'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.8,
    ...      'pf_cv': 0.1, 'edc_cv': 0.1, 'pf_std_diff': 0.01,
    ...      'pf_min_max_ratio': 0.8, 'pf_phash_dists': [30.0]*16,
    ...      'max_frame_jump': 0.01, 'block_cv': 0.1}
    >>> classify(f)['rating']
    'failure'
    >>> f2 = {'phash_ratio': 0.95, 'pf_phash_max': 10.0, 'pf_phash_mean': 5.0,
    ...       'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.2,
    ...       'pf_cv': 0.5, 'edc_cv': 0.8, 'pf_std_diff': 0.01,
    ...       'pf_min_max_ratio': 0.1, 'pf_phash_dists': [5.0]*16,
    ...       'max_frame_jump': 0.005, 'block_cv': 0.1}
    >>> classify(f2)['rating']
    'semi-success'
    >>> f3 = {'phash_ratio': 0.95, 'pf_phash_max': 10.0, 'pf_phash_mean': 5.0,
    ...       'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.8,
    ...       'pf_cv': 0.1, 'edc_cv': 0.1, 'pf_std_diff': 0.01,
    ...       'pf_min_max_ratio': 0.8, 'pf_phash_dists': [5.0]*16,
    ...       'max_frame_jump': 0.005, 'block_cv': 0.1}
    >>> classify(f3)['rating']
    'success'
    """
    phash_ratio = features['phash_ratio']
    phash_max = features['pf_phash_max']
    phash_mean = features['pf_phash_mean']
    ecv_x_pcv = features['edc_cv_x_pf_cv']
    pf_cv = features['pf_cv']
    ecs = features['edit_consistency_score']
    pf_phash_dists = features['pf_phash_dists']
    max_frame_jump = features['max_frame_jump']
    block_cv = features['block_cv']

    # F1: Low phash ratio -> irrelevant_output (duplicated/frozen output)
    if phash_ratio < THRESH_PHASH_RATIO:
        confidence = min(0.95, 0.7 + (THRESH_PHASH_RATIO - phash_ratio))
        return {
            'rating': 'failure',
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': ['irrelevant_output'],
            'bad_frames': _find_bad_frames_phash(pf_phash_dists, high_thresh=20.0),
            'notes': f'F1: phash_ratio={phash_ratio:.4f} < {THRESH_PHASH_RATIO}',
        }

    # F5a: High phash per-frame + jumpy diffs -> temporal inconsistency
    if phash_max >= THRESH_PHASH_MAX_TEMPORAL and max_frame_jump > THRESH_MAX_FRAME_JUMP:
        confidence = min(0.85, 0.5 + max_frame_jump * 3.0)
        return {
            'rating': 'failure',
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': ['temporal_inconsistency'],
            'bad_frames': _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            'notes': (f'F5a: pf_phash_max={phash_max:.0f} >= {THRESH_PHASH_MAX_TEMPORAL:.0f} AND '
                      f'max_frame_jump={max_frame_jump:.6f} > {THRESH_MAX_FRAME_JUMP:.10f}'),
        }

    # C1: Low max pixel diff -> subtle no_edit (catches failures with tiny changes)
    pf_max_diff = features.get('pf_max_diff', 1.0)
    if pf_max_diff < THRESH_C1_MAX_DIFF:
        confidence = min(0.75, 0.4 + (THRESH_C1_MAX_DIFF - pf_max_diff) * 10)
        return {
            'rating': 'failure',
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': ['no_edit'],
            'bad_frames': list(range(NUM_FRAMES)),
            'notes': f'C1: pf_max_diff={pf_max_diff:.6f} < {THRESH_C1_MAX_DIFF}',
        }

    # C2: Moderate-high per-frame CV -> inconsistent editing
    if pf_cv > THRESH_C2_PF_CV:
        confidence = min(0.70, 0.3 + (pf_cv - THRESH_C2_PF_CV) * 0.5)
        return {
            'rating': 'failure',
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': ['temporal_inconsistency'],
            'bad_frames': _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            'notes': f'C2: pf_cv={pf_cv:.6f} > {THRESH_C2_PF_CV}',
        }

    # Semi-success: low edit consistency score
    if ecs < THRESH_SEMI_CONSISTENCY:
        confidence = min(0.70, 0.3 + (THRESH_SEMI_CONSISTENCY - ecs))
        return {
            'rating': 'semi-success',
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': ['inconsistent_edit'],
            'bad_frames': _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            'notes': f'ECS: edit_consistency_score={ecs:.4f} < {THRESH_SEMI_CONSISTENCY}',
        }

    # Default: success
    confidence = min(0.95, 0.5 + ecs * 0.4)
    return {
        'rating': 'success',
        'confidence': round(confidence, 4),
        'edit_consistency_score': round(ecs, 4),
        'failure_categories': [],
        'bad_frames': [],
        'notes': (f'OK: phash_ratio={phash_ratio:.4f}, phash_max={phash_max:.1f}, '
                  f'ecv_x_pcv={ecv_x_pcv:.4f}, ecs={ecs:.4f}'),
    }


 def _find_bad_frames_phash(pf_phash_dists, high_thresh=None, low_thresh=None):
    """
    Identify frame indices that are outliers based on phash distance.

    A frame is flagged when its distance is strictly greater than
    ``high_thresh`` or strictly less than ``low_thresh``. A threshold left as
    ``None`` is not applied. Each index is reported at most once, in ascending
    order, even when both thresholds are given and a single distance violates
    both of them (possible when ``low_thresh > high_thresh``).

    Pure function.

    Args:
        pf_phash_dists: iterable of per-frame distances, indexed by frame.
        high_thresh: flag frames whose distance is above this value (optional).
        low_thresh: flag frames whose distance is below this value (optional).

    Returns:
        List of flagged frame indices.

    >>> _find_bad_frames_phash([1.0, 25.0, 3.0, 30.0], high_thresh=20.0)
    [1, 3]
    >>> _find_bad_frames_phash([10.0, 0.0, 8.0, 1.0], low_thresh=2.0)
    [1, 3]
    >>> _find_bad_frames_phash([3.0], high_thresh=1.0, low_thresh=5.0)
    [0]
    >>> _find_bad_frames_phash([5.0, 9.0])
    []
    """
    def is_outlier(dist):
        too_high = high_thresh is not None and dist > high_thresh
        too_low = low_thresh is not None and dist < low_thresh
        # A single `or` keeps a frame that trips both thresholds from being
        # listed twice.
        return too_high or too_low

    return [idx for idx, dist in enumerate(pf_phash_dists) if is_outlier(dist)]


 # ============================================================
 # CLI Commands
 # ============================================================

 def analyze(sample_dir):
    """
    Run the detector on one sample folder and print its verdict as JSON.

    Features come from compute_features(), the verdict from classify(); both
    are merged into a single report dict which is printed (indent=2) and
    returned.

    Args:
        sample_dir: path to sample directory containing before.png, after.png

    Returns:
        The report dict that was printed.
    """
    sample_path = Path(sample_dir)
    feats = compute_features(sample_path)
    verdict = classify(feats)

    # Per-frame score: phash distance divided by 64.0 and capped at 1.0.
    capped = [min(1.0, dist / 64.0) for dist in feats['pf_phash_dists']]

    report = {"sample_id": sample_path.name}
    for field in ("rating", "confidence", "edit_consistency_score",
                  "failure_categories", "bad_frames"):
        report[field] = verdict[field]
    report["frame_scores"] = [round(score, 4) for score in capped]

    # (feature name, decimals), listed in the order they appear in the output.
    signal_decimals = (
        ('phash_ratio', 4),
        ('pf_phash_mean', 4),
        ('pf_phash_max', 4),
        ('pf_phash_min', 4),
        ('pf_cv', 4),
        ('pf_std_diff', 6),
        ('edc_cv', 4),
        ('edc_cv_x_pf_cv', 4),
        ('pf_min_max_ratio', 4),
        ('max_frame_jump', 6),
        ('block_cv', 6),
    )
    signals = {name: round(feats[name], places) for name, places in signal_decimals}
    signals["pf_phash_dists"] = [round(dist, 1) for dist in feats['pf_phash_dists']]
    report["raw_signals"] = signals

    report["notes"] = verdict["notes"]

    print(json.dumps(report, indent=2))
    return report


 def analyze_batch(samples_dir=None, output_dir=None):
    """
    Analyze all samples in a directory and save results.

    Every immediate subdirectory of ``samples_dir`` that contains both
    before.png and after.png is run through compute_features() and
    classify(). One ``<sample_id>.json`` is written per sample, followed by a
    combined ``batch_results.json``. A summary is printed, the results are
    passed to evaluate(), and the run is appended to the experiment log via
    log_experiment().

    Subdirectories without before.png are ignored. Subdirectories that have
    before.png but no after.png are skipped with a printed notice instead of
    being handed to compute_features().

    Args:
        samples_dir: directory containing sample subdirectories
            (defaults to SAMPLES_DIR)
        output_dir: directory to write JSON results to (defaults to
            WORK_DIR / 'generated' / 'predictions'); created if missing

    Returns:
        List of per-sample result dicts, in sorted sample-directory order.
    """
    if samples_dir is None:
        samples_dir = SAMPLES_DIR
    samples_dir = Path(samples_dir)

    if output_dir is None:
        output_dir = WORK_DIR / 'generated' / 'predictions'
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    candidates = sorted(
        d for d in samples_dir.iterdir()
        if d.is_dir() and (d / "before.png").exists()
    )
    # A sample needs both grids. NOTE(review): assumes compute_features()
    # cannot work without after.png -- confirm against its implementation.
    sample_dirs = [d for d in candidates if (d / "after.png").exists()]
    skipped = len(candidates) - len(sample_dirs)
    if skipped:
        print(f"Skipping {skipped} sample(s) with before.png but no after.png")

    print(f"Processing {len(sample_dirs)} samples...")

    results = []
    counts = {'success': 0, 'semi-success': 0, 'failure': 0}

    # Divisor used to scale phash distances into frame scores (capped at 1.0).
    max_possible = 64.0

    for done, sd in enumerate(sample_dirs, start=1):
        sample_id = sd.name
        features = compute_features(sd)
        result = classify(features)

        frame_scores = [min(1.0, d / max_possible) for d in features['pf_phash_dists']]

        output = {
            "sample_id": sample_id,
            "rating": result["rating"],
            "confidence": result["confidence"],
            "edit_consistency_score": result["edit_consistency_score"],
            "failure_categories": result["failure_categories"],
            "bad_frames": result["bad_frames"],
            "frame_scores": [round(s, 4) for s in frame_scores],
            "raw_signals": {
                "phash_ratio": round(features['phash_ratio'], 4),
                "pf_phash_mean": round(features['pf_phash_mean'], 4),
                "pf_phash_max": round(features['pf_phash_max'], 4),
                "pf_phash_min": round(features['pf_phash_min'], 4),
                "pf_cv": round(features['pf_cv'], 4),
                "pf_std_diff": round(features['pf_std_diff'], 6),
                "edc_cv": round(features['edc_cv'], 4),
                "edc_cv_x_pf_cv": round(features['edc_cv_x_pf_cv'], 4),
                "pf_min_max_ratio": round(features['pf_min_max_ratio'], 4),
                "max_frame_jump": round(features['max_frame_jump'], 6),
                "block_cv": round(features['block_cv'], 6),
                "pf_phash_dists": [round(d, 1) for d in features['pf_phash_dists']],
            },
            "notes": result["notes"],
        }
        results.append(output)
        counts[result['rating']] += 1

        with open(output_dir / f"{sample_id}.json", 'w') as f:
            json.dump(output, f, indent=2)

        # Progress line every 100 samples.
        if done % 100 == 0:
            print(f"  {done}/{len(sample_dirs)} done "
                  f"(S={counts['success']} SS={counts['semi-success']} F={counts['failure']})")

    # Save batch summary
    with open(output_dir / "batch_results.json", 'w') as f:
        json.dump(results, f, indent=2)

    # ECS distribution summary (failures excluded)
    non_fail_ecs = [r['edit_consistency_score'] for r in results if r['rating'] != 'failure']

    print(f"\nProcessed {len(results)} samples -> {output_dir}")
    print(f"  success={counts['success']}  semi-success={counts['semi-success']}  failure={counts['failure']}")
    if non_fail_ecs:
        nf = np.array(non_fail_ecs)
        print(f"\nEdit consistency score (non-failure only, n={len(nf)}):")
        print(f"  min={float(np.min(nf)):.4f}  p10={float(np.percentile(nf, 10)):.4f}  "
              f"median={float(np.median(nf)):.4f}  p90={float(np.percentile(nf, 90)):.4f}  "
              f"max={float(np.max(nf)):.4f}")

    # Evaluate against ground truth
    evaluate(results)

    # Log experiment
    log_experiment("v6_4rule", {
        'fail_rules': [
            f'F1: phash_ratio < {THRESH_PHASH_RATIO}',
            f'F5a: pf_phash_max >= {THRESH_PHASH_MAX_TEMPORAL} AND max_frame_jump > {THRESH_MAX_FRAME_JUMP}',
            f'C1: pf_max_diff < {THRESH_C1_MAX_DIFF}',
            f'C2: pf_cv > {THRESH_C2_PF_CV}',
        ],
        'semi_threshold': THRESH_SEMI_CONSISTENCY,
        # NOTE(review): hard-coded count; presumably the size of the labeled
        # set -- confirm it still matches GROUND_TRUTH.
        'labeled_samples': 195,
    }, {
        'total': len(results),
        'success': counts['success'],
        'semi_success': counts['semi-success'],
        'failure': counts['failure'],
        'ecs_median_nonfail': float(np.median(non_fail_ecs)) if non_fail_ecs else None,
    })

    return results


 def evaluate(results):
    """
    Score predictions against GROUND_TRUTH and print a report.

    Only results whose sample_id is present in GROUND_TRUTH are counted.
    Reports 3-class accuracy, binary (failure vs. non-failure) accuracy,
    precision and recall for the failure class, a confusion matrix, and the
    list of misclassified samples.

    Pure function (prints but no file mutation).

    Args:
        results: list of result dicts carrying at least 'sample_id' and
            'rating'.

    Returns:
        Dict of metrics, or an empty dict when no result is labeled.
    """
    labels = ['success', 'semi-success', 'failure']
    confusion = {actual: dict.fromkeys(labels, 0) for actual in labels}
    mistakes = []
    n_labeled = n_exact = n_binary = 0
    tp = fp = fn = 0

    for res in results:
        sid = res['sample_id']
        if sid not in GROUND_TRUTH:
            continue

        actual = GROUND_TRUTH[sid]['rating']
        predicted = res['rating']
        n_labeled += 1
        confusion[actual][predicted] += 1

        if actual == predicted:
            n_exact += 1
        else:
            mistakes.append({
                'sample_id': sid,
                'gt': actual,
                'pred': predicted,
                'ecs': res.get('edit_consistency_score', None),
                'notes': res.get('notes', ''),
            })

        # Binary view: failure vs. everything else.
        actual_fail = actual == 'failure'
        predicted_fail = predicted == 'failure'
        if actual_fail == predicted_fail:
            n_binary += 1
        if predicted_fail and actual_fail:
            tp += 1
        elif predicted_fail:
            fp += 1
        elif actual_fail:
            fn += 1

    if n_labeled == 0:
        print("No labeled samples found for evaluation.")
        return {}

    acc_3 = n_exact / n_labeled
    acc_b = n_binary / n_labeled
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    rule = '=' * 70
    print(f"\n{rule}")
    print(f"EVALUATION ({n_labeled} labeled samples)")
    print(rule)
    print(f"3-class accuracy:  {acc_3:.1%} ({n_exact}/{n_labeled})")
    print(f"Binary accuracy:   {acc_b:.1%} ({n_binary}/{n_labeled})")
    print(f"Precision:         {precision:.3f} (FP={fp})")
    print(f"Recall:            {recall:.3f} ({tp}/{tp+fn} failures detected)")
    print("\nConfusion matrix:")
    for actual, row in confusion.items():
        row_total = sum(row.values())
        cells = "  ".join(f"{pred}={count}" for pred, count in row.items())
        print(f"  gt={actual:<13} ({row_total:2d}): " + cells)

    if mistakes:
        print(f"\nErrors ({len(mistakes)}):")
        for miss in mistakes:
            if miss['ecs'] is not None:
                ecs_str = f"ecs={miss['ecs']:.4f}"
            else:
                ecs_str = ""
            print(f"  {miss['sample_id']}: gt={miss['gt']:<13} pred={miss['pred']:<13} {ecs_str} | {miss['notes']}")

    print(rule)

    return {
        'total': n_labeled,
        'accuracy_3class': acc_3,
        'accuracy_binary': acc_b,
        'precision': precision,
        'recall': recall,
        'tp': tp, 'fp': fp, 'fn': fn,
    }


 def log_experiment(variant_name, params, metrics):
    """
    Append one experiment record to EXPERIMENTS_FILE as a single JSON line.

    The record holds the current local time (ISO format), the variant name,
    and the given params and metrics. A confirmation line is printed after
    the write.
    """
    record = dict(
        timestamp=datetime.datetime.now().isoformat(),
        variant=variant_name,
        params=params,
        metrics=metrics,
    )
    with open(EXPERIMENTS_FILE, 'a') as log_file:
        log_file.write(f"{json.dumps(record)}\n")
    print(f"Logged experiment: {variant_name}")


 if __name__ == '__main__':
    # Expose the two entry points as command-line subcommands.
    cli_commands = dict(
        analyze=analyze,
        analyze_batch=analyze_batch,
    )
    fire.Fire(cli_commands)
No results found