Created
February 24, 2026 02:58
-
-
Save SqrtRyan/3b07962d5934700bbb9047b26a981bee to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Nanobanana Edit Quality Detector — single-file, 4 rules, 90% recall. | |
| This is a self-contained copy of .claude_auto_score/best_detector.py. | |
| See .claude_auto_score/ for the full analysis pipeline, manifest, and report. | |
| Self-contained detector for video edit quality. No external dependencies | |
| beyond standard pip packages. Takes a sample folder (before.png + after.png), | |
| returns a JSON verdict. | |
| Setup: | |
| pip install numpy Pillow imagehash fire | |
| Usage: | |
| python best_detector.py analyze --sample_dir /path/to/sample/ | |
| python best_detector.py analyze_batch --samples_dir /path/to/samples/ | |
| Each sample folder must contain: | |
| before.png — 4x4 grid of 16 original video keyframes (1376x768 pixels, width x height) | |
| after.png — 4x4 grid of 16 edited video keyframes | |
| Output: JSON with rating (success/semi-success/failure), confidence, | |
| failure type, and per-frame quality scores. | |
| The 4 detection rules (OR'd — any one firing = failure): | |
| F1: phash_ratio < 0.575 → failure (irrelevant_output) | |
| Output frames are all identical (frozen/duplicated). | |
| F5a: pf_phash_max >= 34 AND max_frame_jump > 0.007261 → failure (temporal_inconsistency) | |
| Big edit that flickers between consecutive frames. | |
| C1: pf_max_diff < 0.023247 → failure (no_edit) | |
| Nothing meaningful was changed in any frame. | |
| C2: pf_cv > 0.580564 → failure (temporal_inconsistency) | |
| Edit magnitude varies too much across frames. | |
| Usage: | |
| python best_detector.py analyze --sample_dir /path/to/sample/ | |
| python best_detector.py analyze_batch --output_dir ./generated/predictions/ | |
| """ | |
| import json | |
| import datetime | |
| from pathlib import Path | |
| import fire | |
| import numpy as np | |
| import imagehash | |
| from PIL import Image | |
| # ============================================================ | |
| # Grid geometry constants | |
| # ============================================================ | |
# The sheet is a fixed rows x columns table of keyframes.
GRID_ROWS, GRID_COLS = 4, 4
NUM_FRAMES = GRID_ROWS * GRID_COLS  # 16 frames per sheet

# Expected pixel size of the whole sheet (height, width).
GRID_H, GRID_W = 768, 1376

# Size of a single cell, obtained by integer division of the sheet size.
CELL_H = GRID_H // GRID_ROWS  # 192
CELL_W = GRID_W // GRID_COLS  # 344
| # ============================================================ | |
| # Paths | |
| # ============================================================ | |
# Directory that contains this script; sibling artifacts are located relative to it.
WORK_DIR = Path(__file__).parent
EXPERIMENTS_FILE = WORK_DIR.joinpath("retune_experiments.jsonl")
# Hard-coded absolute directory holding the sample folders.
SAMPLES_DIR = Path('/root/CleanCode/Datasets/Yash/Nanobanana/V1/jan10_last_50K_pexels_v2/training_preview/samples/')
| # ============================================================ | |
| # Classifier thresholds (V6, tuned on 177 labeled samples) | |
| # ============================================================ | |
# F1: phash_ratio below this value -> failure (frozen/duplicated output).
THRESH_PHASH_RATIO = 0.575

# F5a needs BOTH conditions: pf_phash_max at or above the first value
# AND max_frame_jump strictly above the second.
THRESH_PHASH_MAX_TEMPORAL = 34.0
THRESH_MAX_FRAME_JUMP = 0.007261008024215698

# C1: pf_max_diff below this value -> failure (subtle no-edit).
THRESH_C1_MAX_DIFF = 0.023247

# C2: pf_cv above this value -> failure (moderate inconsistency).
THRESH_C2_PF_CV = 0.580564

# Edit consistency scores below this value are rated semi-success.
THRESH_SEMI_CONSISTENCY = 0.33
| # ============================================================ | |
| # Ground truth labels (172 original + 5 new = 177 samples) | |
| # ============================================================ | |
| GROUND_TRUTH = { | |
| "8091096_20260111_075419": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8091546_20260111_075420": {"rating": "success", "failure_categories": []}, | |
| "8092893_20260111_075420": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]}, | |
| "8093592_20260111_075430": {"rating": "success", "failure_categories": []}, | |
| "8093996_20260111_075432": {"rating": "success", "failure_categories": []}, | |
| "8094039_20260111_075433": {"rating": "success", "failure_categories": []}, | |
| "8094287_20260111_075445": {"rating": "success", "failure_categories": []}, | |
| "8095001_20260111_075447": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8098080_20260111_075500": {"rating": "success", "failure_categories": []}, | |
| "8100967_20260111_075515": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8102246_20260111_075517": {"rating": "success", "failure_categories": []}, | |
| "8102787_20260111_075528": {"rating": "success", "failure_categories": []}, | |
| "8103055_20260111_075547": {"rating": "success", "failure_categories": []}, | |
| "8103302_20260111_075549": {"rating": "semi-success", "failure_categories": ["partial_frame_edit", "first_frame_mismatch"]}, | |
| "8103498_20260111_075555": {"rating": "success", "failure_categories": []}, | |
| "8103878_20260111_075619": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8104044_20260111_075621": {"rating": "success", "failure_categories": []}, | |
| "8107716_20260111_075657": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8111743_20260111_075724": {"rating": "success", "failure_categories": []}, | |
| "8111842_20260111_075727": {"rating": "success", "failure_categories": []}, | |
| "8113101_20260111_075729": {"rating": "success", "failure_categories": []}, | |
| "8114285_20260111_075730": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8114926_20260111_075732": {"rating": "success", "failure_categories": []}, | |
| "8114990_20260111_075733": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8115919_20260111_075747": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8116169_20260111_075751": {"rating": "success", "failure_categories": []}, | |
| "8117115_20260111_075758": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8120246_20260111_075802": {"rating": "success", "failure_categories": []}, | |
| "8120363_20260111_075803": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]}, | |
| "8122765_20260111_075830": {"rating": "failure", "failure_categories": ["irrelevant_output", "temporal_inconsistency"]}, | |
| "8122912_20260111_075835": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8122956_20260111_075835": {"rating": "success", "failure_categories": []}, | |
| "8123978_20260111_075839": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8124027_20260111_075842": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8124056_20260111_075843": {"rating": "success", "failure_categories": []}, | |
| "8124064_20260111_075843": {"rating": "success", "failure_categories": []}, | |
| "8125902_20260111_075847": {"rating": "success", "failure_categories": []}, | |
| "8126475_20260111_075852": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8126710_20260111_075854": {"rating": "success", "failure_categories": []}, | |
| "8126808_20260111_075855": {"rating": "success", "failure_categories": []}, | |
| "8128164_20260111_075857": {"rating": "success", "failure_categories": []}, | |
| "8129104_20260111_075907": {"rating": "success", "failure_categories": []}, | |
| "8132008_20260111_075909": {"rating": "success", "failure_categories": []}, | |
| "8132374_20260111_075912": {"rating": "failure", "failure_categories": ["irrelevant_output", "first_frame_mismatch"]}, | |
| "8135096_20260111_075921": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8135559_20260111_075927": {"rating": "success", "failure_categories": []}, | |
| "8135644_20260111_075929": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8136023_20260111_075939": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8139413_20260111_075952": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]}, | |
| "8141301_20260111_080002": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8141399_20260111_080003": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8141501_20260111_080004": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8143551_20260111_080022": {"rating": "success", "failure_categories": []}, | |
| "8145135_20260111_080027": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8145138_20260111_080027": {"rating": "success", "failure_categories": []}, | |
| "8145158_20260111_080028": {"rating": "success", "failure_categories": []}, | |
| "8150255_20260111_080033": {"rating": "success", "failure_categories": []}, | |
| "8150529_20260111_080037": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8150531_20260111_080038": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8152220_20260111_080050": {"rating": "success", "failure_categories": []}, | |
| "8154493_20260111_080058": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8154855_20260111_080059": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8155547_20260111_080105": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8157132_20260111_080123": {"rating": "success", "failure_categories": []}, | |
| "8157137_20260111_080124": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8157299_20260111_080126": {"rating": "success", "failure_categories": []}, | |
| "8160023_20260111_080147": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8160579_20260111_080159": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8164091_20260111_080218": {"rating": "success", "failure_categories": []}, | |
| "8165172_20260111_080236": {"rating": "success", "failure_categories": []}, | |
| "8165769_20260111_080246": {"rating": "success", "failure_categories": []}, | |
| "8165779_20260111_080247": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8165906_20260111_080250": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8165941_20260111_080251": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8166010_20260111_080256": {"rating": "success", "failure_categories": []}, | |
| "8170106_20260111_080303": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8170478_20260111_080307": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency", "partial_frame_edit"]}, | |
| "8171568_20260111_080320": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8171894_20260111_080321": {"rating": "success", "failure_categories": []}, | |
| "8173112_20260111_080325": {"rating": "success", "failure_categories": []}, | |
| "8174314_20260111_080330": {"rating": "failure", "failure_categories": ["partial_frame_edit", "temporal_inconsistency"]}, | |
| "8179119_20260111_080404": {"rating": "success", "failure_categories": []}, | |
| "8179751_20260111_080423": {"rating": "success", "failure_categories": []}, | |
| "8180407_20260111_080430": {"rating": "semi-success", "failure_categories": ["temporal_inconsistency"]}, | |
| "8189505_20260111_080502": {"rating": "success", "failure_categories": []}, | |
| "8190131_20260111_080514": {"rating": "success", "failure_categories": []}, | |
| # ---- Validation set (86 samples) ---- | |
| "8190717_20260111_080521": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8191029_20260111_080543": {"rating": "success", "failure_categories": []}, | |
| "8191204_20260111_080549": {"rating": "success", "failure_categories": []}, | |
| "8191207_20260111_080549": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8217072_20260111_080756": {"rating": "success", "failure_categories": []}, | |
| "8224605_20260111_080807": {"rating": "success", "failure_categories": []}, | |
| "8227240_20260111_080817": {"rating": "success", "failure_categories": []}, | |
| "8230620_20260111_080825": {"rating": "success", "failure_categories": []}, | |
| "8230704_20260111_080828": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8231712_20260111_080841": {"rating": "success", "failure_categories": []}, | |
| "8232437_20260111_080845": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8239197_20260111_081004": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8246869_20260111_081041": {"rating": "success", "failure_categories": []}, | |
| "8247091_20260111_081045": {"rating": "success", "failure_categories": []}, | |
| "8249485_20260111_081051": {"rating": "success", "failure_categories": []}, | |
| "8252868_20260111_081102": {"rating": "success", "failure_categories": []}, | |
| "8253045_20260111_081103": {"rating": "success", "failure_categories": []}, | |
| "8255167_20260111_081116": {"rating": "success", "failure_categories": []}, | |
| "8256631_20260111_081130": {"rating": "success", "failure_categories": []}, | |
| "8257519_20260111_081141": {"rating": "success", "failure_categories": []}, | |
| "8261379_20260111_081154": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8263453_20260111_081205": {"rating": "success", "failure_categories": []}, | |
| "8264023_20260111_081210": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8271006_20260111_081250": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8279291_20260111_081343": {"rating": "success", "failure_categories": []}, | |
| "8280495_20260111_081400": {"rating": "success", "failure_categories": []}, | |
| "8286774_20260111_081427": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8287068_20260111_081435": {"rating": "success", "failure_categories": []}, | |
| "8293570_20260111_081508": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8296055_20260111_081513": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8296063_20260111_081515": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8298306_20260111_081550": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8303291_20260111_081616": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8313496_20260111_081701": {"rating": "success", "failure_categories": []}, | |
| "8318204_20260111_081706": {"rating": "success", "failure_categories": []}, | |
| "8318647_20260111_081715": {"rating": "success", "failure_categories": []}, | |
| "8320029_20260111_081727": {"rating": "success", "failure_categories": []}, | |
| "8322056_20260111_081738": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8322394_20260111_081758": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8322707_20260111_081811": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "8326499_20260111_081843": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8328150_20260111_081907": {"rating": "success", "failure_categories": []}, | |
| "8328525_20260111_081913": {"rating": "success", "failure_categories": []}, | |
| "8328601_20260111_081915": {"rating": "success", "failure_categories": []}, | |
| "8328618_20260111_081917": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8333881_20260111_081931": {"rating": "success", "failure_categories": []}, | |
| "8334110_20260111_081933": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8334414_20260111_081938": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8342094_20260111_082018": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8342755_20260111_082028": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8348772_20260111_082253": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8348814_20260111_082256": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8363636_20260111_082446": {"rating": "success", "failure_categories": []}, | |
| "8367941_20260111_082455": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8370544_20260111_082509": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8375487_20260111_082529": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8384587_20260111_082705": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8384695_20260111_082706": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8385333_20260111_082711": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8401313_20260111_082834": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8410570_20260111_082901": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8411075_20260111_082904": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8416654_20260111_082922": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8419267_20260111_082929": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8421308_20260111_082953": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8426096_20260111_083020": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8431832_20260111_083114": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8432042_20260111_083125": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8433838_20260111_083133": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8435989_20260111_083234": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8439412_20260111_083257": {"rating": "success", "failure_categories": []}, | |
| "8449552_20260111_083505": {"rating": "success", "failure_categories": []}, | |
| "8456997_20260111_083601": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8460890_20260111_083637": {"rating": "failure", "failure_categories": ["irrelevant_output"]}, | |
| "8462194_20260111_083642": {"rating": "success", "failure_categories": []}, | |
| "8472301_20260111_083941": {"rating": "success", "failure_categories": []}, | |
| "8477934_20260111_084029": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8479456_20260111_084055": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "8486374_20260111_084134": {"rating": "success", "failure_categories": []}, | |
| "8486892_20260111_084137": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8493380_20260111_084232": {"rating": "success", "failure_categories": []}, | |
| "8511010_20260111_084432": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8519367_20260111_084716": {"rating": "success", "failure_categories": []}, | |
| "8524028_20260111_084807": {"rating": "success", "failure_categories": []}, | |
| "8531234_20260111_084935": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "9710748_20260111_101911": {"rating": "success", "failure_categories": []}, | |
| # ---- Additional labels (5 new from ratings.json; excludes 8468908 which | |
| # is a semi-success hit by pre-existing F5c -- needs F5c retune) ---- | |
| "854716_20260111_085305": {"rating": "failure", "failure_categories": ["no_edit"]}, | |
| "854978_20260111_085324": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "855095_20260111_085333": {"rating": "semi-success", "failure_categories": ["partial_frame_edit"]}, | |
| "857166_20260111_085626": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| "8622067_20260111_085833": {"rating": "failure", "failure_categories": ["temporal_inconsistency"]}, | |
| } | |
| # ============================================================ | |
| # Pure helper functions | |
| # ============================================================ | |
def extract_cells_pil(grid_img):
    """
    Split a grid sheet into its individual cells, in row-major order.

    Pure function. Cells are cropped top row first, left to right within a
    row, each as a CELL_W x CELL_H box taken from the module-level geometry
    constants (the actual size of ``grid_img`` is not consulted).

    Args:
        grid_img: PIL Image containing the GRID_ROWS x GRID_COLS sheet.

    Returns:
        List of GRID_ROWS * GRID_COLS cropped PIL Images.

    >>> img = Image.new('RGB', (1376, 768), (128, 128, 128))
    >>> cells = extract_cells_pil(img)
    >>> len(cells)
    16
    >>> cells[0].size
    (344, 192)
    """
    # Top-left corner of every cell, rows outermost so output is row-major.
    corners = (
        (col * CELL_W, row * CELL_H)
        for row in range(GRID_ROWS)
        for col in range(GRID_COLS)
    )
    return [
        grid_img.crop((left, top, left + CELL_W, top + CELL_H))
        for left, top in corners
    ]
def cells_to_gray_arrays(cells, resize=(64, 64)):
    """
    Turn PIL cell images into grayscale float32 arrays scaled to [0, 1].

    Pure function.

    Args:
        cells: Iterable of PIL Images.
        resize: ``(height, width)`` to resample each cell to with bilinear
            filtering, or ``None`` to keep the native cell size.

    Returns:
        List with one 2-D float32 numpy array per input cell.

    >>> img = Image.new('RGB', (344, 192), (128, 128, 128))
    >>> arrs = cells_to_gray_arrays([img], resize=(32, 32))
    >>> arrs[0].shape
    (32, 32)
    """
    def _as_unit_gray(cell):
        luma = cell.convert('L')
        if resize is not None:
            # PIL wants (width, height) while `resize` is (height, width).
            luma = luma.resize((resize[1], resize[0]), Image.BILINEAR)
        return np.array(luma, dtype=np.float32) / 255.0

    return [_as_unit_gray(cell) for cell in cells]
def compute_phash_list(cells):
    """
    Return the perceptual hash of every cell, in input order.

    Pure function.

    Args:
        cells: Iterable of PIL Images.

    Returns:
        List of ``imagehash.phash`` results, one per cell.

    >>> img = Image.new('RGB', (100, 100), (128, 128, 128))
    >>> hashes = compute_phash_list([img])
    >>> len(hashes)
    1
    """
    hashes = []
    for cell in cells:
        hashes.append(imagehash.phash(cell))
    return hashes
def mean_pairwise_hamming(hashes):
    """
    Average the Hamming distance over every unordered pair of hashes.

    Pure function. The result measures how varied a set of frames is; a
    list with fewer than two hashes has no pairs and yields 0.0.

    Args:
        hashes: Sequence of perceptual hashes (subtracting two of them
            gives their distance).

    Returns:
        Mean pairwise distance as a float.

    >>> h = imagehash.phash(Image.new('RGB', (100, 100), (0, 0, 0)))
    >>> mean_pairwise_hamming([h, h, h])
    0.0
    """
    if len(hashes) < 2:
        return 0.0

    distance_sum = 0.0
    pair_count = 0
    # Pair each hash only with the ones after it so no pair is counted twice.
    for idx, first in enumerate(hashes):
        for second in hashes[idx + 1:]:
            distance_sum += float(first - second)
            pair_count += 1
    return distance_sum / pair_count
def compute_per_frame_phash_dists(before_cells_pil, after_cells_pil):
    """
    Measure, frame by frame, how far the edited cell's phash is from the original's.

    Pure function. Frames are paired positionally; pairing stops at the
    shorter of the two inputs.

    Args:
        before_cells_pil: PIL Images of the original frames.
        after_cells_pil: PIL Images of the edited frames.

    Returns:
        List of float Hamming distances, one per frame pair.

    >>> b = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(16)]
    >>> a = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(16)]
    >>> dists = compute_per_frame_phash_dists(b, a)
    >>> all(d == 0 for d in dists)
    True
    """
    return [
        float(imagehash.phash(original) - imagehash.phash(edited))
        for original, edited in zip(before_cells_pil, after_cells_pil)
    ]
def compute_per_frame_diffs(before_gray, after_gray):
    """
    Summarise the mean absolute pixel change of every frame.

    Pure function. Each frame is reduced to the mean of |after - before|;
    the resulting vector is then described by its mean, max, min, standard
    deviation, coefficient of variation and min/max ratio.

    Args:
        before_gray: Sequence of grayscale arrays for the original frames.
        after_gray: Matching sequence for the edited frames.

    Returns:
        Dict with keys frame_diffs (raw per-frame vector), pf_mean_diff,
        pf_max_diff, pf_min_diff, pf_std_diff, pf_cv, pf_min_max_ratio.

    >>> import numpy as np
    >>> b = [np.zeros((4, 4), dtype=np.float32) for _ in range(16)]
    >>> a = [np.full((4, 4), 0.5, dtype=np.float32) for _ in range(16)]
    >>> r = compute_per_frame_diffs(b, a)
    >>> r['pf_mean_diff']
    0.5
    >>> r['pf_cv']
    0.0
    """
    frame_diffs = [
        float(np.mean(np.abs(after_gray[idx] - original)))
        for idx, original in enumerate(before_gray)
    ]
    diffs = np.array(frame_diffs)

    stats = {
        'frame_diffs': frame_diffs,
        'pf_mean_diff': float(diffs.mean()),
        'pf_max_diff': float(diffs.max()),
        'pf_min_diff': float(diffs.min()),
        'pf_std_diff': float(diffs.std()),
    }

    # Both ratios fall back to 0.0 when their denominator is effectively zero.
    if stats['pf_mean_diff'] > 1e-10:
        stats['pf_cv'] = stats['pf_std_diff'] / stats['pf_mean_diff']
    else:
        stats['pf_cv'] = 0.0

    if stats['pf_max_diff'] > 1e-10:
        stats['pf_min_max_ratio'] = stats['pf_min_diff'] / stats['pf_max_diff']
    else:
        stats['pf_min_max_ratio'] = 0.0

    return stats
def compute_edit_delta_cv(before_gray, after_gray):
    """
    Coefficient of variation of the per-frame variance of the edit delta.

    Pure function. For every frame the variance of |after - before| over
    its pixels is taken; the return value is std/mean of those variances.
    High values mean some frames were edited much more unevenly than others.
    Returns 0.0 when the mean variance is effectively zero.

    Args:
        before_gray: Sequence of grayscale arrays for the original frames.
        after_gray: Matching sequence for the edited frames.

    Returns:
        The coefficient of variation as a float.

    >>> import numpy as np
    >>> b = [np.zeros((4, 4), dtype=np.float32) for _ in range(16)]
    >>> a = [np.ones((4, 4), dtype=np.float32) * 0.5 for _ in range(16)]
    >>> compute_edit_delta_cv(b, a)
    0.0
    """
    variances = np.array([
        float(np.var(np.abs(after_gray[idx] - original)))
        for idx, original in enumerate(before_gray)
    ])
    centre = float(variances.mean())
    spread = float(variances.std())
    if centre > 1e-10:
        return spread / centre
    return 0.0
def compute_edit_consistency_score(pf_cv, pf_min_max_ratio, edc_cv):
    """
    Combine three consistency features into a single score in [0, 1].

    Pure function. Higher means the edit is more uniform across frames.
    The two CV features count against the score (1 - value), the min/max
    ratio counts for it; each term is clamped to [0, 1] before the three
    are averaged.

    Args:
        pf_cv: Coefficient of variation of per-frame mean diffs.
        pf_min_max_ratio: Ratio of smallest to largest per-frame diff.
        edc_cv: Coefficient of variation of per-frame delta variance.

    Returns:
        Mean of the three clamped terms.

    >>> compute_edit_consistency_score(0.0, 1.0, 0.0)
    1.0
    >>> compute_edit_consistency_score(1.0, 0.0, 1.0)
    0.0
    """
    def _clamp_unit(value):
        return max(0.0, min(1.0, value))

    uniform_magnitude = _clamp_unit(1.0 - pf_cv)
    floor_to_peak = _clamp_unit(pf_min_max_ratio)
    uniform_spread = _clamp_unit(1.0 - edc_cv)
    return (uniform_magnitude + floor_to_peak + uniform_spread) / 3.0
def compute_phash_ratio(before_cells_pil, after_cells_pil):
    """
    Ratio of the output frames' internal phash variety to the input frames'.

    Pure function. A low ratio (<0.575) means the output frames resemble
    each other far more than the input frames do, which indicates
    frozen/duplicated irrelevant output.

    Args:
        before_cells_pil: PIL Images of the original frames.
        after_cells_pil: PIL Images of the edited frames.

    Returns:
        after_variety / before_variety; ``inf`` when only the input has no
        variety; 1.0 when neither side has any.

    >>> b = [Image.new('RGB', (100, 100), (i*10, i*10, i*10)) for i in range(4)]
    >>> a = [Image.new('RGB', (100, 100), (0, 0, 0)) for _ in range(4)]
    >>> ratio = compute_phash_ratio(b, a)
    >>> ratio < 0.5
    True
    """
    input_hashes = compute_phash_list(before_cells_pil)
    output_hashes = compute_phash_list(after_cells_pil)
    input_variety = mean_pairwise_hamming(input_hashes)
    output_variety = mean_pairwise_hamming(output_hashes)

    if input_variety > 1e-10:
        return output_variety / input_variety
    # Input frames are all alike: any output variety is an unbounded increase.
    if output_variety > 1e-10:
        return float('inf')
    return 1.0
def compute_max_frame_jump(frame_diffs):
    """
    Largest absolute change between consecutive entries of the per-frame diffs.

    Pure function. A high value means the edit magnitude jumps sharply from
    one frame to the next, indicating flickering or temporal inconsistency.
    Fewer than two entries give 0.0.

    Args:
        frame_diffs: Sequence of per-frame diff magnitudes.

    Returns:
        The maximum step size as a float.

    >>> compute_max_frame_jump([0.5, 0.5, 0.5])
    0.0
    >>> compute_max_frame_jump([0.0, 1.0, 0.0])
    1.0
    """
    series = np.array(frame_diffs)
    if len(series) >= 2:
        steps = np.diff(series)
        return float(np.abs(steps).max())
    return 0.0
def compute_block_cv(frame_diffs, n_blocks=4):
    """
    Split the frames into n_blocks equal runs and return the CV of the block means.

    Pure function. High CV across blocks means edit intensity varies
    greatly across different temporal segments of the video.

    Each block holds ``len(frame_diffs) // n_blocks`` consecutive frames;
    when the frame count is not a multiple of ``n_blocks`` the trailing
    remainder frames are not part of any block.

    Args:
        frame_diffs: Sequence of per-frame diff magnitudes.
        n_blocks: Number of temporal blocks (must be >= 1).

    Returns:
        std/mean of the block means. 0.0 when there are fewer frames than
        blocks (no block can be formed) or when the mean of the block means
        is effectively zero.

    Raises:
        ValueError: If ``n_blocks`` is smaller than 1.

    >>> compute_block_cv([0.5] * 16)
    0.0
    >>> compute_block_cv([0.1, 0.2])
    0.0
    """
    if n_blocks < 1:
        raise ValueError(f"n_blocks must be >= 1, got {n_blocks}")

    arr = np.array(frame_diffs)
    block_size = len(arr) // n_blocks
    if block_size == 0:
        # Too few frames to fill even one frame per block; averaging empty
        # slices would produce NaN, so report "no variation" instead.
        return 0.0

    block_means = np.array([
        float(np.mean(arr[start:start + block_size]))
        for start in range(0, block_size * n_blocks, block_size)
    ])
    mean = float(np.mean(block_means))
    if mean < 1e-10:
        return 0.0
    return float(np.std(block_means) / mean)
def compute_features(sample_dir):
    """
    Compute all classifier features for a single sample.

    Reads before.png and after.png from sample_dir, splits both sheets into
    cells and derives phash-based and pixel-diff-based statistics.

    Args:
        sample_dir: Path (str or Path) of the folder holding the two PNGs.

    Returns:
        Dict with all features needed for classify(): phash_ratio,
        pf_phash_mean/max/min, pf_phash_dists, pf_cv, pf_mean_diff,
        pf_max_diff, pf_min_diff, pf_std_diff, pf_min_max_ratio, edc_cv,
        edc_cv_x_pf_cv, edit_consistency_score, frame_diffs,
        max_frame_jump and block_cv.

    Raises:
        Whatever Pillow raises when either image is missing or unreadable.
    """
    sample_dir = Path(sample_dir)

    # Open inside a context manager so the file handles are released
    # deterministically; convert() returns an independent in-memory copy.
    with Image.open(sample_dir / "before.png") as raw_before:
        before_img = raw_before.convert('RGB')
    with Image.open(sample_dir / "after.png") as raw_after:
        after_img = raw_after.convert('RGB')

    before_cells_pil = extract_cells_pil(before_img)
    after_cells_pil = extract_cells_pil(after_img)

    # Per-frame phash distances (before[i] vs after[i])
    pf_phash_dists = compute_per_frame_phash_dists(before_cells_pil, after_cells_pil)
    pf_phash_mean = float(np.mean(pf_phash_dists))
    pf_phash_max = float(np.max(pf_phash_dists))
    pf_phash_min = float(np.min(pf_phash_dists))

    # Phash ratio (internal variety: output vs input)
    phash_ratio = compute_phash_ratio(before_cells_pil, after_cells_pil)

    # Per-frame pixel diffs on 64x64 grayscale versions of every cell
    before_gray = cells_to_gray_arrays(before_cells_pil, resize=(64, 64))
    after_gray = cells_to_gray_arrays(after_cells_pil, resize=(64, 64))
    pf_stats = compute_per_frame_diffs(before_gray, after_gray)
    edc_cv = compute_edit_delta_cv(before_gray, after_gray)
    frame_diffs = pf_stats['frame_diffs']

    # Temporal features from the per-frame diff vector
    max_frame_jump = compute_max_frame_jump(frame_diffs)
    block_cv = compute_block_cv(frame_diffs)

    # Edit consistency score
    ecs = compute_edit_consistency_score(
        pf_stats['pf_cv'],
        pf_stats['pf_min_max_ratio'],
        edc_cv,
    )

    return {
        'phash_ratio': phash_ratio,
        'pf_phash_mean': pf_phash_mean,
        'pf_phash_max': pf_phash_max,
        'pf_phash_min': pf_phash_min,
        'pf_phash_dists': pf_phash_dists,
        'pf_cv': pf_stats['pf_cv'],
        'pf_mean_diff': pf_stats['pf_mean_diff'],
        'pf_max_diff': pf_stats['pf_max_diff'],
        'pf_min_diff': pf_stats['pf_min_diff'],
        'pf_std_diff': pf_stats['pf_std_diff'],
        'pf_min_max_ratio': pf_stats['pf_min_max_ratio'],
        'edc_cv': edc_cv,
        'edc_cv_x_pf_cv': edc_cv * pf_stats['pf_cv'],
        'edit_consistency_score': ecs,
        'frame_diffs': frame_diffs,
        'max_frame_jump': max_frame_jump,
        'block_cv': block_cv,
    }
def classify(features):
    """
    Classify a sample using cascading failure rules + edit consistency score.
    Pure function.

    The rules are evaluated in order; the first one that fires decides the verdict:
        F1:  phash_ratio < THRESH_PHASH_RATIO -> failure (irrelevant_output)
        F5a: pf_phash_max >= THRESH_PHASH_MAX_TEMPORAL AND
             max_frame_jump > THRESH_MAX_FRAME_JUMP -> failure (temporal_inconsistency)
        C1:  pf_max_diff < THRESH_C1_MAX_DIFF -> failure (no_edit)
        C2:  pf_cv > THRESH_C2_PF_CV -> failure (temporal_inconsistency)
        ECS: edit_consistency_score < THRESH_SEMI_CONSISTENCY -> semi-success
        Default: success

    Args:
        features: dict of feature values. Required keys: 'phash_ratio',
            'pf_phash_max', 'edc_cv_x_pf_cv', 'pf_cv', 'edit_consistency_score',
            'pf_phash_dists', 'max_frame_jump'. 'pf_max_diff' is optional and
            defaults to 1.0 when absent. Extra keys are ignored.

    Returns:
        dict with keys 'rating' ('success' / 'semi-success' / 'failure'),
        'confidence' and 'edit_consistency_score' (both rounded to 4 decimals),
        'failure_categories' (list of str), 'bad_frames' (list of frame indices)
        and 'notes' (human-readable description of the rule that fired).

    Raises:
        KeyError: if a required feature key is missing.

    >>> f = {'phash_ratio': 0.3, 'pf_phash_max': 35.0, 'pf_phash_mean': 30.0,
    ...      'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.8,
    ...      'pf_cv': 0.1, 'edc_cv': 0.1, 'pf_std_diff': 0.01,
    ...      'pf_min_max_ratio': 0.8, 'pf_phash_dists': [30.0]*16,
    ...      'max_frame_jump': 0.01, 'block_cv': 0.1}
    >>> classify(f)['rating']
    'failure'
    >>> f2 = {'phash_ratio': 0.95, 'pf_phash_max': 10.0, 'pf_phash_mean': 5.0,
    ...       'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.2,
    ...       'pf_cv': 0.5, 'edc_cv': 0.8, 'pf_std_diff': 0.01,
    ...       'pf_min_max_ratio': 0.1, 'pf_phash_dists': [5.0]*16,
    ...       'max_frame_jump': 0.005, 'block_cv': 0.1}
    >>> classify(f2)['rating']
    'semi-success'
    >>> f3 = {'phash_ratio': 0.95, 'pf_phash_max': 10.0, 'pf_phash_mean': 5.0,
    ...       'edc_cv_x_pf_cv': 0.1, 'edit_consistency_score': 0.8,
    ...       'pf_cv': 0.1, 'edc_cv': 0.1, 'pf_std_diff': 0.01,
    ...       'pf_min_max_ratio': 0.8, 'pf_phash_dists': [5.0]*16,
    ...       'max_frame_jump': 0.005, 'block_cv': 0.1}
    >>> classify(f3)['rating']
    'success'
    """
    phash_ratio = features['phash_ratio']
    phash_max = features['pf_phash_max']
    ecv_x_pcv = features['edc_cv_x_pf_cv']
    pf_cv = features['pf_cv']
    ecs = features['edit_consistency_score']
    pf_phash_dists = features['pf_phash_dists']
    max_frame_jump = features['max_frame_jump']
    # Optional feature: with the 1.0 fallback a missing value is treated as a
    # large per-frame difference rather than raising.
    pf_max_diff = features.get('pf_max_diff', 1.0)

    def verdict(rating, confidence, categories, bad_frames, notes):
        # Every branch returns the same six-key record; build it in one place
        # so the key set and the rounding cannot drift between rules.
        return {
            'rating': rating,
            'confidence': round(confidence, 4),
            'edit_consistency_score': round(ecs, 4),
            'failure_categories': categories,
            'bad_frames': bad_frames,
            'notes': notes,
        }

    # F1: Low phash ratio -> irrelevant_output (duplicated/frozen output)
    if phash_ratio < THRESH_PHASH_RATIO:
        return verdict(
            'failure',
            min(0.95, 0.7 + (THRESH_PHASH_RATIO - phash_ratio)),
            ['irrelevant_output'],
            _find_bad_frames_phash(pf_phash_dists, high_thresh=20.0),
            f'F1: phash_ratio={phash_ratio:.4f} < {THRESH_PHASH_RATIO}',
        )

    # F5a: High phash per-frame + jumpy diffs -> temporal inconsistency
    if phash_max >= THRESH_PHASH_MAX_TEMPORAL and max_frame_jump > THRESH_MAX_FRAME_JUMP:
        return verdict(
            'failure',
            min(0.85, 0.5 + max_frame_jump * 3.0),
            ['temporal_inconsistency'],
            _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            (f'F5a: pf_phash_max={phash_max:.0f} >= {THRESH_PHASH_MAX_TEMPORAL:.0f} AND '
             f'max_frame_jump={max_frame_jump:.6f} > {THRESH_MAX_FRAME_JUMP:.10f}'),
        )

    # C1: Low max pixel diff -> subtle no_edit (catches failures with tiny changes)
    if pf_max_diff < THRESH_C1_MAX_DIFF:
        return verdict(
            'failure',
            min(0.75, 0.4 + (THRESH_C1_MAX_DIFF - pf_max_diff) * 10),
            ['no_edit'],
            # Nothing changed anywhere, so every frame is reported as bad.
            list(range(NUM_FRAMES)),
            f'C1: pf_max_diff={pf_max_diff:.6f} < {THRESH_C1_MAX_DIFF}',
        )

    # C2: Moderate-high per-frame CV -> inconsistent editing
    if pf_cv > THRESH_C2_PF_CV:
        return verdict(
            'failure',
            min(0.70, 0.3 + (pf_cv - THRESH_C2_PF_CV) * 0.5),
            ['temporal_inconsistency'],
            _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            f'C2: pf_cv={pf_cv:.6f} > {THRESH_C2_PF_CV}',
        )

    # Semi-success: low edit consistency score
    if ecs < THRESH_SEMI_CONSISTENCY:
        return verdict(
            'semi-success',
            min(0.70, 0.3 + (THRESH_SEMI_CONSISTENCY - ecs)),
            ['inconsistent_edit'],
            _find_bad_frames_phash(pf_phash_dists, low_thresh=2.0),
            f'ECS: edit_consistency_score={ecs:.4f} < {THRESH_SEMI_CONSISTENCY}',
        )

    # Default: success
    return verdict(
        'success',
        min(0.95, 0.5 + ecs * 0.4),
        [],
        [],
        (f'OK: phash_ratio={phash_ratio:.4f}, phash_max={phash_max:.1f}, '
         f'ecv_x_pcv={ecv_x_pcv:.4f}, ecs={ecs:.4f}'),
    )
def _find_bad_frames_phash(pf_phash_dists, high_thresh=None, low_thresh=None):
    """
    Identify frame indices that are outliers based on phash distance.
    Pure function.

    Args:
        pf_phash_dists: sequence of per-frame phash distances.
        high_thresh: if given, a frame is an outlier when its distance is
            strictly greater than this value.
        low_thresh: if given, a frame is an outlier when its distance is
            strictly less than this value.

    Returns:
        Ascending list of outlier frame indices. Each index appears at most
        once, even when a distance violates both thresholds. With neither
        threshold set the result is empty.

    >>> _find_bad_frames_phash([1.0, 25.0, 3.0, 30.0], high_thresh=20.0)
    [1, 3]
    >>> _find_bad_frames_phash([10.0, 0.0, 8.0, 1.0], low_thresh=2.0)
    [1, 3]
    >>> _find_bad_frames_phash([5.0], high_thresh=2.0, low_thresh=10.0)
    [0]
    """
    # The two checks are OR-ed in a single condition so that a frame matching
    # both thresholds is reported once rather than twice.
    return [
        i for i, d in enumerate(pf_phash_dists)
        if (high_thresh is not None and d > high_thresh)
        or (low_thresh is not None and d < low_thresh)
    ]
| # ============================================================ | |
| # CLI Commands | |
| # ============================================================ | |
def _build_output(sample_id, features, result):
    """
    Assemble the JSON-serializable verdict record for one sample.
    Pure function.

    Args:
        sample_id: identifier stored under "sample_id".
        features: feature dict; must contain every key listed under
            "raw_signals" below.
        result: verdict dict with 'rating', 'confidence',
            'edit_consistency_score', 'failure_categories', 'bad_frames'
            and 'notes'.

    Returns:
        dict combining the verdict, per-frame scores and rounded raw signals.
    """
    # Per-frame score = phash distance scaled by 64 and capped at 1.0.
    # NOTE(review): 64 presumably is the largest possible distance of a
    # 64-bit perceptual hash — confirm against the hashing code.
    max_possible = 64.0
    dists = features['pf_phash_dists']
    return {
        "sample_id": sample_id,
        "rating": result["rating"],
        "confidence": result["confidence"],
        "edit_consistency_score": result["edit_consistency_score"],
        "failure_categories": result["failure_categories"],
        "bad_frames": result["bad_frames"],
        "frame_scores": [round(min(1.0, d / max_possible), 4) for d in dists],
        "raw_signals": {
            "phash_ratio": round(features['phash_ratio'], 4),
            "pf_phash_mean": round(features['pf_phash_mean'], 4),
            "pf_phash_max": round(features['pf_phash_max'], 4),
            "pf_phash_min": round(features['pf_phash_min'], 4),
            "pf_cv": round(features['pf_cv'], 4),
            "pf_std_diff": round(features['pf_std_diff'], 6),
            "edc_cv": round(features['edc_cv'], 4),
            "edc_cv_x_pf_cv": round(features['edc_cv_x_pf_cv'], 4),
            "pf_min_max_ratio": round(features['pf_min_max_ratio'], 4),
            "max_frame_jump": round(features['max_frame_jump'], 6),
            "block_cv": round(features['block_cv'], 6),
            "pf_phash_dists": [round(d, 1) for d in dists],
        },
        "notes": result["notes"],
    }


def analyze(sample_dir):
    """
    Analyze a single sample and print JSON verdict.

    Args:
        sample_dir: path to sample directory containing before.png, after.png

    Returns:
        The verdict record (dict) that was printed; the directory name is
        used as the sample id.
    """
    sample_dir = Path(sample_dir)
    features = compute_features(sample_dir)
    output = _build_output(sample_dir.name, features, classify(features))
    print(json.dumps(output, indent=2))
    return output


def analyze_batch(samples_dir=None, output_dir=None):
    """
    Analyze all samples in a directory and save results.

    For every sample one `<sample_id>.json` file is written to output_dir,
    followed by a combined `batch_results.json`. Afterwards a summary is
    printed, the results are evaluated against ground truth and the run is
    appended to the experiment log.

    Args:
        samples_dir: directory containing sample subdirectories
            (defaults to SAMPLES_DIR)
        output_dir: directory to write JSON results to
            (defaults to WORK_DIR / 'generated' / 'predictions'); created
            if it does not exist

    Returns:
        List of per-sample verdict records, in sorted directory order.
    """
    if samples_dir is None:
        samples_dir = SAMPLES_DIR
    samples_dir = Path(samples_dir)
    if output_dir is None:
        output_dir = WORK_DIR / 'generated' / 'predictions'
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): only before.png is checked here; a folder that lacks
    # after.png presumably fails inside compute_features — confirm.
    sample_dirs = sorted(
        d for d in samples_dir.iterdir()
        if d.is_dir() and (d / "before.png").exists()
    )
    print(f"Processing {len(sample_dirs)} samples...")

    results = []
    counts = {'success': 0, 'semi-success': 0, 'failure': 0}
    for done, sd in enumerate(sample_dirs, start=1):
        features = compute_features(sd)
        result = classify(features)
        output = _build_output(sd.name, features, result)
        results.append(output)
        counts[result['rating']] += 1
        with open(output_dir / f"{sd.name}.json", 'w') as f:
            json.dump(output, f, indent=2)
        # Progress line every 100 samples.
        if done % 100 == 0:
            print(f" {done}/{len(sample_dirs)} done "
                  f"(S={counts['success']} SS={counts['semi-success']} F={counts['failure']})")

    # Save batch summary
    with open(output_dir / "batch_results.json", 'w') as f:
        json.dump(results, f, indent=2)

    # ECS distribution summary
    non_fail_ecs = [r['edit_consistency_score'] for r in results if r['rating'] != 'failure']
    print(f"\nProcessed {len(results)} samples -> {output_dir}")
    print(f" success={counts['success']} semi-success={counts['semi-success']} failure={counts['failure']}")
    if non_fail_ecs:
        nf = np.array(non_fail_ecs)
        print(f"\nEdit consistency score (non-failure only, n={len(nf)}):")
        print(f" min={float(np.min(nf)):.4f} p10={float(np.percentile(nf, 10)):.4f} "
              f"median={float(np.median(nf)):.4f} p90={float(np.percentile(nf, 90)):.4f} "
              f"max={float(np.max(nf)):.4f}")

    # Evaluate against ground truth
    evaluate(results)

    # Log experiment
    log_experiment("v6_4rule", {
        'fail_rules': [
            f'F1: phash_ratio < {THRESH_PHASH_RATIO}',
            f'F5a: pf_phash_max >= {THRESH_PHASH_MAX_TEMPORAL} AND max_frame_jump > {THRESH_MAX_FRAME_JUMP}',
            f'C1: pf_max_diff < {THRESH_C1_MAX_DIFF}',
            f'C2: pf_cv > {THRESH_C2_PF_CV}',
        ],
        'semi_threshold': THRESH_SEMI_CONSISTENCY,
        # NOTE(review): hard-coded count; presumably the size of the labeled
        # set at the time the thresholds were tuned — confirm it is still current.
        'labeled_samples': 195,
    }, {
        'total': len(results),
        'success': counts['success'],
        'semi_success': counts['semi-success'],
        'failure': counts['failure'],
        'ecs_median_nonfail': float(np.median(non_fail_ecs)) if non_fail_ecs else None,
    })
    return results
def evaluate(results):
    """
    Evaluate results against ground truth and print metrics.
    Prints to stdout but does not write any files.

    Args:
        results: iterable of per-sample dicts with at least 'sample_id' and
            'rating'. 'edit_consistency_score' and 'notes' are shown in the
            error listing when present. Samples whose id is not in
            GROUND_TRUTH are ignored. The iterable is consumed in a single
            pass.

    Returns:
        {} when no sample has a ground-truth label; otherwise a dict with
        'total', 'accuracy_3class', 'accuracy_binary', 'precision', 'recall',
        'tp', 'fp' and 'fn'. For the binary metrics the positive class is
        rating == 'failure'.

    Raises:
        KeyError: if a predicted or ground-truth rating is not one of
            'success', 'semi-success', 'failure'.
    """
    classes = ['success', 'semi-success', 'failure']
    # confusion[ground_truth_rating][predicted_rating] -> count
    confusion = {true: {pred: 0 for pred in classes} for true in classes}
    errors = []
    for r in results:
        sid = r['sample_id']
        if sid not in GROUND_TRUTH:
            continue
        gt_rating = GROUND_TRUTH[sid]['rating']
        pred_rating = r['rating']
        confusion[gt_rating][pred_rating] += 1
        if gt_rating != pred_rating:
            errors.append({
                'sample_id': sid,
                'gt': gt_rating,
                'pred': pred_rating,
                'ecs': r.get('edit_consistency_score', None),
                'notes': r.get('notes', ''),
            })

    total = sum(sum(row.values()) for row in confusion.values())
    if total == 0:
        print("No labeled samples found for evaluation.")
        return {}

    # Every metric is derived from the confusion matrix, so the figures are
    # consistent with each other and `results` is never re-scanned.
    correct_3class = sum(confusion[c][c] for c in classes)
    tp = confusion['failure']['failure']
    fn = sum(confusion['failure'].values()) - tp
    fp = sum(confusion[c]['failure'] for c in classes if c != 'failure')
    # Binary task: failure vs. non-failure; everything that is neither a
    # false positive nor a false negative is a binary hit.
    correct_binary = total - fp - fn

    acc_3 = correct_3class / total
    acc_b = correct_binary / total
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    print(f"\n{'='*70}")
    print(f"EVALUATION ({total} labeled samples)")
    print(f"{'='*70}")
    print(f"3-class accuracy: {acc_3:.1%} ({correct_3class}/{total})")
    print(f"Binary accuracy: {acc_b:.1%} ({correct_binary}/{total})")
    print(f"Precision: {precision:.3f} (FP={fp})")
    print(f"Recall: {recall:.3f} ({tp}/{tp+fn} failures detected)")
    print(f"\nConfusion matrix:")
    for gt_class in classes:
        row = confusion[gt_class]
        gt_total = sum(row.values())
        print(f" gt={gt_class:<13} ({gt_total:2d}): " +
              " ".join(f"{k}={v}" for k, v in row.items()))
    if errors:
        print(f"\nErrors ({len(errors)}):")
        for e in errors:
            ecs_str = f"ecs={e['ecs']:.4f}" if e['ecs'] is not None else ""
            print(f" {e['sample_id']}: gt={e['gt']:<13} pred={e['pred']:<13} {ecs_str} | {e['notes']}")
    print(f"{'='*70}")

    return {
        'total': total,
        'accuracy_3class': acc_3,
        'accuracy_binary': acc_b,
        'precision': precision,
        'recall': recall,
        'tp': tp, 'fp': fp, 'fn': fn,
    }
def log_experiment(variant_name, params, metrics):
    """
    Append one experiment record to EXPERIMENTS_FILE as a single JSON line.

    Args:
        variant_name: label stored under "variant" and echoed to stdout.
        params: JSON-serializable value stored under "params".
        metrics: JSON-serializable value stored under "metrics".
    """
    record = dict(
        # Local wall-clock time in ISO-8601 form.
        timestamp=datetime.datetime.now().isoformat(),
        variant=variant_name,
        params=params,
        metrics=metrics,
    )
    # Append mode: earlier experiment entries are kept.
    with open(EXPERIMENTS_FILE, 'a') as log_file:
        log_file.write(f"{json.dumps(record)}\n")
    print(f"Logged experiment: {variant_name}")
if __name__ == '__main__':
    # Command-line entry point: each key becomes a sub-command, e.g.
    #   python best_detector.py analyze --sample_dir /path/to/sample/
    cli_commands = dict(analyze=analyze, analyze_batch=analyze_batch)
    fire.Fire(cli_commands)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment