# Source: GitHub Gist aribornstein/63d4304310fe726c7bc5f0b70be325dc
# Author: @aribornstein, created August 17, 2025 09:41
# radiology_eval_trial_code.py
# Reproducible trial comparing independent vs combined LLM-as-judge evaluation.
import numpy as np
import matplotlib.pyplot as plt
from math import log2
# --- Simulation setup --------------------------------------------------------
# Fixed seed so that every run reproduces the same random draws.
rng = np.random.default_rng(20250817)
N = 200_000  # number of simulated rows
C = 4        # number of criteria (columns) per row
criteria = ["Accuracy", "Completeness", "Clarity", "Appropriateness"]

# Per-criterion probability that the ground truth is True ("criterion met").
p_true = np.array([0.80, 0.60, 0.85, 0.70])
# Ground truth: X[i, j] is True when row i meets criterion j.
X = rng.random((N, C)) < p_true

# Independent judge: each verdict is flipped on its own with probability e.
e = 0.10
flips_ind = rng.random((N, C)) < e
Y_ind = X ^ flips_ind

# Combined judge: criteria are judged in column order. A verdict is wrong if
# either a base error fires (probability e) or, when an earlier criterion of
# the same row was already judged wrongly, a cascade draw fires with
# probability e + (1 - e) * rho.
rho = 0.70
Y_comb = X.copy()
made_error = np.zeros(N, dtype=bool)  # True once any earlier column was wrong
for j in range(C):
    # Draw order (base first, then cascade) is kept fixed so the seeded
    # stream yields the same results on every run.
    base_err = rng.random(N) < e
    cascade_err = (rng.random(N) < (e + (1 - e) * rho)) & made_error
    err = base_err | cascade_err
    Y_comb[:, j] = X[:, j] ^ err
    made_error |= err
def mi_binary(x, y):
    """Return the empirical mutual information, in bits, of two binary arrays.

    Args:
        x: Array-like of truthy/falsy values (bool or 0/1).
        y: Array-like of truthy/falsy values, element-wise paired with ``x``.

    Returns:
        The sum over the four (x, y) outcomes of
        ``p(x, y) * log2(p(x, y) / (p(x) * p(y)))``, where probabilities are
        the observed frequencies. Outcomes with zero probability contribute 0.
    """
    # Coerce to bool so that `~` is a logical NOT; on integer 0/1 arrays it
    # would be a bitwise NOT (-1 / -2) and corrupt the joint frequencies.
    x = np.asarray(x, dtype=bool)
    y = np.asarray(y, dtype=bool)

    # Empirical joint distribution over the four outcomes.
    p11 = np.mean(x & y)
    p10 = np.mean(x & ~y)
    p01 = np.mean(~x & y)
    p00 = np.mean(~x & ~y)

    # Marginals derived from the joint cells.
    px1 = p11 + p10
    px0 = p01 + p00
    py1 = p11 + p01
    py0 = p10 + p00

    def term(pxy, px, py):
        # A zero-probability cell contributes nothing (0 * log 0 := 0).
        if pxy == 0 or px == 0 or py == 0:
            return 0.0
        return pxy * (log2(pxy) - log2(px) - log2(py))

    return (term(p11, px1, py1) + term(p10, px1, py0)
            + term(p01, px0, py1) + term(p00, px0, py0))
# Number of criteria marked True per row, for the ground truth and each judge.
tot_true = X.sum(axis=1)
tot_ind = Y_ind.sum(axis=1)
tot_comb = Y_comb.sum(axis=1)

# "Pass-all" flag: every one of the C criteria is marked True. Compared
# against C (not a literal 4) so changing the number of criteria stays correct.
Pass_true = (tot_true == C)
Pass_ind = (tot_ind == C)
Pass_comb = (tot_comb == C)

# Mean, sample variance (ddof=1) and pass-all rate of the per-row totals.
mean_true, var_true, pass_true = float(tot_true.mean()), float(tot_true.var(ddof=1)), float(Pass_true.mean())
mean_ind, var_ind, pass_ind = float(tot_ind.mean()), float(tot_ind.var(ddof=1)), float(Pass_ind.mean())
mean_comb, var_comb, pass_comb = float(tot_comb.mean()), float(tot_comb.var(ddof=1)), float(Pass_comb.mean())
def confusion(y_true, y_pred):
    """Return confusion-matrix rates for a binary prediction.

    Args:
        y_true: Array-like of truthy/falsy reference labels.
        y_pred: Array-like of truthy/falsy predictions, paired with ``y_true``.

    Returns:
        A dict of Python floats with keys ``TP``, ``TN``, ``FP``, ``FN``
        (each a fraction of all samples) and ``FPR``, ``FNR`` (``FP`` divided
        by the negative prevalence and ``FN`` divided by the positive
        prevalence). A rate is ``nan`` when its denominator is not positive.
    """
    # Coerce to bool so that `~` is a logical NOT even for 0/1 integer input.
    y_true = np.asarray(y_true, dtype=bool)
    y_pred = np.asarray(y_pred, dtype=bool)

    tp = np.mean(y_true & y_pred)
    tn = np.mean(~y_true & ~y_pred)
    fp = np.mean(~y_true & y_pred)
    fn = np.mean(y_true & ~y_pred)

    prev_pos = np.mean(y_true)
    prev_neg = 1 - prev_pos
    # Guard the divisions: with no negatives (or no positives) the rate is
    # undefined, reported as nan rather than raising or returning inf.
    FPR = fp / prev_neg if prev_neg > 0 else float('nan')
    FNR = fn / prev_pos if prev_pos > 0 else float('nan')
    return dict(TP=float(tp), TN=float(tn), FP=float(fp), FN=float(fn),
                FPR=float(FPR), FNR=float(FNR))
cm_ind = confusion(Pass_true, Pass_ind)
cm_comb = confusion(Pass_true, Pass_comb)
mi_ind = [float(mi_binary(X[:,j], Y_ind[:,j])) for j in range(C)]
mi_comb = [float(mi_binary(X[:,j], Y_comb[:,j])) for j in range(C)]
mi_pass_ind = float(mi_binary(Pass_true, Pass_ind))
mi_pass_comb = float(mi_binary(Pass_true, Pass_comb))
print("Means/Vars/Pass-rate")
print(dict(True=dict(mean=mean_true, var=var_true, pass_rate=pass_true),
Independent=dict(mean=mean_ind, var=var_ind, pass_rate=pass_ind),
Combined=dict(mean=mean_comb, var=var_comb, pass_rate=pass_comb)))
print("Confusion (pass-all)")
print(dict(Independent=cm_ind, Combined=cm_comb))
print("MI per criterion (bits) and for pass-all")
for name, a, b in zip(criteria, mi_ind, mi_comb):
print(f"{name}: MI_ind={a:.6f}, MI_comb={b:.6f}")
print(f"All-criteria Pass: MI_ind={mi_pass_ind:.6f}, MI_comb={mi_pass_comb:.6f}")
# PMF plot: grouped bars of the distribution of "total criteria met" under
# each judge, with a percentage label above every bar.
bins = np.arange(0, C + 1)  # possible totals 0..C
pmf_ind = np.array([(tot_ind == k).mean() for k in bins])
pmf_comb = np.array([(tot_comb == k).mean() for k in bins])
x = bins  # bar-group centres coincide with the totals
w = 0.35  # width of one bar; the two bars of a group sit at x -/+ w/2
plt.figure(figsize=(8, 5))
plt.bar(x - w/2, pmf_ind, width=w, label="Independent (binary)")
plt.bar(x + w/2, pmf_comb, width=w, label="Combined (correlated)")
for i in range(C + 1):
    # Small vertical offset keeps the label from touching the bar top.
    plt.text(x[i] - w/2, pmf_ind[i] + 0.002, f"{pmf_ind[i]*100:.1f}%", ha="center", va="bottom", fontsize=9)
    plt.text(x[i] + w/2, pmf_comb[i] + 0.002, f"{pmf_comb[i]*100:.1f}%", ha="center", va="bottom", fontsize=9)
plt.xticks(x, [str(k) for k in bins])
# Label follows C so it stays right if the number of criteria changes
# (renders as "out of 4" for the current C = 4).
plt.xlabel(f"Total criteria met (out of {C})")
plt.ylabel("Probability")
plt.title("Probability of Outcomes: Independent vs Combined Evaluation")
plt.legend()
plt.tight_layout()
plt.savefig("pmf_independent_vs_combined_bars_final.png", dpi=160)
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")