SqrtRyan · February 24, 2026 06:57
diff --git a/gistfile1.txt b/gistfile1.txt
 """
 Nanobanana edit quality filter — minimal, self-contained.

    pip install numpy pillow imagehash fire

    >>> from filter_sample import filter_pair
    >>> filter_pair('before.png', 'after.png')
    True   # True = keep (good edit), False = reject (bad edit)

 Derived from .claude_auto_score/best_detector.py.

 Ryan Burgert 2026
 """

 import numpy as np
 from PIL import Image
 import imagehash
 from itertools import combinations


 def _load_grid_cells(path, rows, cols):
    """Load a keyframe grid image and return its cells in row-major order.

    The grid is measured from this image's own size, so a before/after pair
    saved at different resolutions is still split cell-for-cell.

    Raises:
        ValueError: if the image is too small to give every cell at least
            one pixel in each dimension.
    """
    # Image.open is lazy; convert() reads the pixels into a new image, so the
    # file handle can be released as soon as the with-block exits.
    with Image.open(path) as handle:
        grid = handle.convert("RGB")

    width, height = grid.size
    cell_w, cell_h = width // cols, height // rows
    if cell_w < 1 or cell_h < 1:
        raise ValueError(
            f"{path!r} is {width}x{height} px, too small for a "
            f"{rows}-row by {cols}-column grid"
        )

    return [
        grid.crop((c * cell_w, r * cell_h, (c + 1) * cell_w, (r + 1) * cell_h))
        for r in range(rows)
        for c in range(cols)
    ]


 def _mean_pairwise_distance(hashes):
    """Return the mean phash distance over every unordered pair of hashes."""
    return float(np.mean([x - y for x, y in combinations(hashes, 2)]))


 def _gray_thumbnail(cell):
    """Return a cell as a 64x64 grayscale float32 array scaled to [0, 1]."""
    small = cell.convert("L").resize((64, 64), Image.BILINEAR)
    return np.array(small, dtype=np.float32) / 255


 def filter_pair(
    path_before,
    path_after,
    *,
    rows=4,
    cols=4,
    detect_no_edit=True,
    frozen_thresh=0.575,
    flicker_phash_thresh=34,
    flicker_jump_thresh=0.007261,
    no_edit_thresh=0.023247,
    inconsistency_thresh=0.580564,
 ):
    """
    Return True if the edit is good (keep), False if bad (reject).

    Loads two keyframe grid images, splits each into rows x cols cells,
    computes features, applies failure rules. Any rule firing = reject.
    Each image is split according to its own size, so the two grids may have
    different resolutions as long as they share the same rows x cols layout.

    Performance on 222 labeled samples:
      detect_no_edit=True:  91.4% recall, 68% precision (stops more failures but also stops more good ones)
      detect_no_edit=False: 84.9% recall, 81% precision (fewer FPs but misses subtle no-edits)

    The no-edit rule (C1) catches samples where nothing meaningful was changed,
    but it also flags ~20 legitimate subtle edits (relighting, film grain, small
    object additions) as failures. Set detect_no_edit=False to disable this rule
    if you'd rather keep subtle edits at the cost of missing some no-edit failures.

    Glossary:
        CV = Coefficient of Variation (std / mean)
        FP = False Positive
        MAE = Mean Absolute Error
        pf = per-frame (variable prefix)
        phash = Perceptual Image Hash
        pw = pairwise (variable prefix)
        no-edits = A failure mode where the output looks too similar to the input

    Args:
        path_before: path to original keyframe grid image
        path_after: path to edited keyframe grid image
        rows: number of rows in the grid
        cols: number of columns in the grid
        detect_no_edit: if True, reject samples with very small pixel changes (C1 rule).
            True = higher recall (91.4%), more false positives (~40).
            False = higher precision (81%), misses subtle no-edits.
        frozen_thresh: phash_ratio below this = frozen/duplicated output
        flicker_phash_thresh: per-frame phash distance at or above this (AND jump) = flickering
        flicker_jump_thresh: max consecutive diff jump above this (AND phash) = flickering
        no_edit_thresh: max pixel diff below this = nothing was edited
        inconsistency_thresh: per-frame diff CV above this = inconsistent edit

    Returns:
        True to keep the sample, False to reject it.

    Raises:
        ValueError: if rows or cols is below 1, if the grid has fewer than two
            cells (the frame-to-frame rules need at least two frames), or if
            an image is too small for the requested grid.

    >>> # filter_pair('before.png', 'after.png') -> bool
    """
    # Fail early with a clear message instead of an opaque NumPy/max() error
    # from the empty pair list or empty consecutive-difference array.
    if rows < 1 or cols < 1 or rows * cols < 2:
        raise ValueError(
            f"need a grid of at least 2 cells, got rows={rows}, cols={cols}"
        )

    cells_b = _load_grid_cells(path_before, rows, cols)
    cells_a = _load_grid_cells(path_after, rows, cols)

    # Perceptual hashes
    hashes_b = [imagehash.phash(cell, hash_size=8) for cell in cells_b]
    hashes_a = [imagehash.phash(cell, hash_size=8) for cell in cells_a]

    # phash_ratio: how similar after-frames are to each other vs before-frames.
    # When the before-frames are all hash-identical (pw_b == 0) the ratio is
    # pinned to 0.0, which makes the F1 rule below reject the sample.
    pw_b = _mean_pairwise_distance(hashes_b)
    pw_a = _mean_pairwise_distance(hashes_a)
    phash_ratio = pw_a / pw_b if pw_b > 0 else 0.0

    # Per-frame phash distance (before[i] vs after[i])
    pf_phash_max = max(hb - ha for hb, ha in zip(hashes_b, hashes_a))

    # Per-frame pixel MAE (grayscale, 64x64, float)
    diffs = [
        float(np.mean(np.abs(_gray_thumbnail(cell_a) - _gray_thumbnail(cell_b))))
        for cell_b, cell_a in zip(cells_b, cells_a)
    ]

    pf_max_diff = max(diffs)
    # The epsilon keeps the CV finite when every frame is pixel-identical.
    pf_cv = float(np.std(diffs) / (np.mean(diffs) + 1e-10))
    max_jump = float(np.max(np.abs(np.diff(diffs))))

    # F1: Frozen/duplicated output
    if phash_ratio < frozen_thresh:
        return False

    # F5a: Flickering edit
    if pf_phash_max >= flicker_phash_thresh and max_jump > flicker_jump_thresh:
        return False

    # C1: No meaningful edit (optional — high FP rate on subtle edits)
    if detect_no_edit and pf_max_diff < no_edit_thresh:
        return False

    # C2: Inconsistent edit magnitude
    if pf_cv > inconsistency_thresh:
        return False

    return True


 if __name__ == "__main__":
    # Command-line entry point: python-fire builds the CLI from filter_pair's
    # signature and prints the boolean verdict it returns.
    from fire import Fire

    Fire(filter_pair)
	"""
	Nanobanana edit quality filter — minimal, self-contained.

	pip install numpy pillow imagehash fire

	>>> from filter_sample import filter_pair
	>>> filter_pair('before.png', 'after.png')
	True # True = keep (good edit), False = reject (bad edit)

	Derived from .claude_auto_score/best_detector.py.

	Ryan Burgert 2026
	"""

	import numpy as np
	from PIL import Image
	import imagehash
	from itertools import combinations


	def filter_pair(
		path_before,
		path_after,
		*,
		rows=4,
		cols=4,
		detect_no_edit=True,
		frozen_thresh=0.575,
		flicker_phash_thresh=34,
		flicker_jump_thresh=0.007261,
		no_edit_thresh=0.023247,
		inconsistency_thresh=0.580564,
	):
		"""
		Return True if the edit is good (keep), False if bad (reject).

		Loads two keyframe grid images, splits each into rows x cols cells,
		computes features, applies failure rules. Any rule firing = reject.
		Each image is split according to its own size, so the two grids may
		differ in resolution as long as they share the same rows x cols layout.

		Performance on 222 labeled samples:
			detect_no_edit=True: 91.4% recall, 68% precision (stops more failures but also stops more good ones)
			detect_no_edit=False: 84.9% recall, 81% precision (fewer FPs but misses subtle no-edits)

		The no-edit rule (C1) catches samples where nothing meaningful was changed,
		but it also flags ~20 legitimate subtle edits (relighting, film grain, small
		object additions) as failures. Set detect_no_edit=False to disable this rule
		if you'd rather keep subtle edits at the cost of missing some no-edit failures.

		Glossary:
			CV = Coefficient of Variation (std / mean)
			FP = False Positive
			MAE = Mean Absolute Error
			pf = per-frame (variable prefix)
			phash = Perceptual Image Hash
			pw = pairwise (variable prefix)
			no-edits = A failure mode where the output looks too similar to the input

		Args:
			path_before: path to original keyframe grid image
			path_after: path to edited keyframe grid image
			rows: number of rows in the grid
			cols: number of columns in the grid
			detect_no_edit: if True, reject samples with very small pixel changes (C1 rule).
				True = higher recall (91.4%), more false positives (~40).
				False = higher precision (81%), misses subtle no-edits.
			frozen_thresh: phash_ratio below this = frozen/duplicated output
			flicker_phash_thresh: per-frame phash distance at or above this (AND jump) = flickering
			flicker_jump_thresh: max consecutive diff jump above this (AND phash) = flickering
			no_edit_thresh: max pixel diff below this = nothing was edited
			inconsistency_thresh: per-frame diff CV above this = inconsistent edit

		Returns:
			True to keep the sample, False to reject it.

		Raises:
			ValueError: if rows or cols is below 1, if the grid has fewer than
				two cells (the frame-to-frame rules need at least two frames),
				or if an image is too small for the requested grid.

		>>> # filter_pair('before.png', 'after.png') -> bool
		"""
		n = rows * cols
		# Fail early with a clear message instead of an opaque NumPy/max() error
		# from the empty pair list or empty consecutive-difference array.
		if rows < 1 or cols < 1 or n < 2:
			raise ValueError(
				f"need a grid of at least 2 cells, got rows={rows}, cols={cols}"
			)

		def split(path):
			# Image.open is lazy; convert() reads the pixels into a new image,
			# so the file handle is released when the with-block exits.
			with Image.open(path) as handle:
				grid = handle.convert("RGB")
			# Measure the grid from this image, not from its partner, so a
			# resolution mismatch cannot push crop boxes out of bounds.
			w, h = grid.size
			cell_w, cell_h = w // cols, h // rows
			if cell_w < 1 or cell_h < 1:
				raise ValueError(
					f"{path!r} is {w}x{h} px, too small for a "
					f"{rows}-row by {cols}-column grid"
				)
			cells = []
			for r in range(rows):
				for c in range(cols):
					box = (c * cell_w, r * cell_h, (c + 1) * cell_w, (r + 1) * cell_h)
					cells.append(grid.crop(box))
			return cells

		def gray64(cell):
			# 64x64 grayscale float32 thumbnail scaled to [0, 1].
			small = cell.convert("L").resize((64, 64), Image.BILINEAR)
			return np.array(small, dtype=np.float32) / 255

		cells_b = split(path_before)
		cells_a = split(path_after)

		# Perceptual hashes
		hashes_b = [imagehash.phash(cell, hash_size=8) for cell in cells_b]
		hashes_a = [imagehash.phash(cell, hash_size=8) for cell in cells_a]

		# phash_ratio: how similar after-frames are to each other vs before-frames.
		# When the before-frames are all hash-identical (pw_b == 0) the ratio is
		# pinned to 0.0, which makes the F1 rule below reject the sample.
		pairs = list(combinations(range(n), 2))
		pw_b = np.mean([hashes_b[i] - hashes_b[j] for i, j in pairs])
		pw_a = np.mean([hashes_a[i] - hashes_a[j] for i, j in pairs])
		phash_ratio = pw_a / pw_b if pw_b > 0 else 0.0

		# Per-frame phash distance (before[i] vs after[i])
		pf_phash_max = max(hb - ha for hb, ha in zip(hashes_b, hashes_a))

		# Per-frame pixel MAE (grayscale, 64x64, float)
		diffs = []
		for cell_b, cell_a in zip(cells_b, cells_a):
			diffs.append(float(np.mean(np.abs(gray64(cell_a) - gray64(cell_b)))))

		pf_max_diff = max(diffs)
		# The epsilon keeps the CV finite when every frame is pixel-identical.
		pf_cv = float(np.std(diffs) / (np.mean(diffs) + 1e-10))
		max_jump = float(np.max(np.abs(np.diff(diffs))))

		# F1: Frozen/duplicated output
		if phash_ratio < frozen_thresh:
			return False

		# F5a: Flickering edit
		if pf_phash_max >= flicker_phash_thresh and max_jump > flicker_jump_thresh:
			return False

		# C1: No meaningful edit (optional — high FP rate on subtle edits)
		if detect_no_edit and pf_max_diff < no_edit_thresh:
			return False

		# C2: Inconsistent edit magnitude
		if pf_cv > inconsistency_thresh:
			return False

		return True


	if __name__ == "__main__":
		# Command-line entry point: python-fire builds the CLI from
		# filter_pair's signature and prints the boolean verdict it returns.
		from fire import Fire

		Fire(filter_pair)
No results found