aribornstein · August 17, 2025 09:41
diff --git a/radiology_eval_trial_code.py b/radiology_eval_trial_code.py
 # radiology_eval_trial_code.py
 # Reproducible trial comparing independent vs combined LLM-as-judge evaluation.

 import numpy as np
 import matplotlib.pyplot as plt
 from math import log2

 # Fixed seed so every run draws the same sequence of random numbers.
 rng = np.random.default_rng(20250817)

 # N simulated cases, each judged on C binary criteria.
 N = 200_000
 C = 4
 criteria = ["Accuracy","Completeness","Clarity","Appropriateness"]
 # Per-criterion probability that the ground-truth label is True.
 p_true = np.array([0.80, 0.60, 0.85, 0.70])

 # Ground truth: boolean (N, C) matrix; entry (i, j) is True when a uniform draw falls below p_true[j].
 X = rng.random((N, C)) < p_true

 # Independent judge: each label is flipped on its own with probability e.
 e = 0.10
 flips_ind = rng.random((N, C)) < e
 Y_ind = X ^ flips_ind  # XOR inverts exactly the entries marked in flips_ind

 # Combined judge: criteria are handled in column order, and a row that already
 # had an error on an earlier criterion gets an additional chance of erring.
 rho = 0.70
 Y_comb = X.copy()
 made_error = np.zeros(N, dtype=bool)  # per row: was any earlier criterion misjudged?
 for j in range(C):
    # Baseline error, drawn at the same rate e as the independent judge.
    base_err = rng.random(N) < e
    # Extra error with probability e + (1 - e) * rho, kept only for rows flagged in made_error.
    # For j == 0 the mask is all False, but the random draw is still consumed.
    # NOTE(review): this is OR-ed with base_err below, so flagged rows err more often
    # than e + (1 - e) * rho -- confirm that is the intended model.
    cascade_err = (rng.random(N) < (e + (1 - e) * rho)) & made_error
    err = base_err | cascade_err
    Y_comb[:, j] = X[:, j] ^ err
    made_error |= err  # carry the error flag forward to later criteria

 def mi_binary(x, y):
    """Return the empirical mutual information, in bits, of two binary samples.

    Parameters
    ----------
    x, y : array-like
        Paired binary observations of equal length. Values are coerced to
        boolean, so boolean arrays, 0/1 integer arrays and plain lists are
        all accepted.

    Returns
    -------
    float
        Sum over the four (x, y) cells of p(x, y) * log2(p(x, y) / (p(x) * p(y))),
        with empty cells contributing 0.
    """
    # Coerce first: on 0/1 integer arrays `~` is a bitwise NOT producing -1/-2,
    # which turns the cell frequencies below into negative numbers.
    x = np.asarray(x, dtype=bool)
    y = np.asarray(y, dtype=bool)

    # Joint cell frequencies.
    p11 = np.mean(x & y)
    p10 = np.mean(x & (~y))
    p01 = np.mean((~x) & y)
    p00 = np.mean((~x) & (~y))
    # Marginals derived from the joint cells.
    px1 = p11 + p10
    px0 = p01 + p00
    py1 = p11 + p01
    py0 = p10 + p00

    def term(pxy, px, py):
        # A cell with zero mass contributes nothing (0 * log 0 is taken as 0);
        # log2 is the module-level import from math.
        return 0.0 if pxy == 0 or px == 0 or py == 0 else pxy * (log2(pxy) - log2(px) - log2(py))

    return term(p11, px1, py1) + term(p10, px1, py0) + term(p01, px0, py1) + term(p00, px0, py0)

 # Number of criteria met per case: ground truth, independent judge, combined judge.
 tot_true = X.sum(axis=1)
 tot_ind  = Y_ind.sum(axis=1)
 tot_comb = Y_comb.sum(axis=1)

 # A case passes only when all C criteria are met. Comparing against C (rather
 # than a literal 4) keeps the threshold in step with the number of criteria.
 Pass_true = (tot_true == C)
 Pass_ind  = (tot_ind  == C)
 Pass_comb = (tot_comb == C)

 # Mean, sample variance (ddof=1) and pass rate of the per-case totals,
 # reusing the pass masks instead of recomputing the comparison.
 mean_true, var_true, pass_true = float(tot_true.mean()), float(tot_true.var(ddof=1)), float(Pass_true.mean())
 mean_ind, var_ind, pass_ind = float(tot_ind.mean()), float(tot_ind.var(ddof=1)), float(Pass_ind.mean())
 mean_comb, var_comb, pass_comb = float(tot_comb.mean()), float(tot_comb.var(ddof=1)), float(Pass_comb.mean())

 def confusion(y_true, y_pred):
    """Return confusion-matrix fractions and error rates for binary labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        Paired reference and predicted labels of equal length. Values are
        coerced to boolean, so 0/1 integer arrays and lists are accepted.

    Returns
    -------
    dict
        TP, TN, FP, FN as fractions of all samples, plus
        FPR = FP / (fraction of reference negatives) and
        FNR = FN / (fraction of reference positives). A rate is NaN when its
        denominator is not positive.
    """
    # Coerce first: `~` on integer labels is a bitwise NOT (-1/-2), which
    # would corrupt the TN/FP/FN fractions.
    y_true = np.asarray(y_true, dtype=bool)
    y_pred = np.asarray(y_pred, dtype=bool)

    tp = np.mean(y_true & y_pred)
    tn = np.mean((~y_true) & (~y_pred))
    fp = np.mean((~y_true) & y_pred)
    fn = np.mean(y_true & (~y_pred))
    prev_pos = np.mean(y_true)
    prev_neg = 1 - prev_pos
    # Guard the divisions so a one-class reference yields NaN instead of a warning/inf.
    FPR = fp / prev_neg if prev_neg > 0 else float('nan')
    FNR = fn / prev_pos if prev_pos > 0 else float('nan')
    return dict(TP=float(tp), TN=float(tn), FP=float(fp), FN=float(fn), FPR=float(FPR), FNR=float(FNR))

 # Score each judge's pass-all decision against the ground-truth decision.
 cm_ind, cm_comb = (confusion(Pass_true, judged) for judged in (Pass_ind, Pass_comb))

 # Mutual information between truth and judgement: one value per criterion
 # column, then one for the pass-all decision.
 mi_ind = [float(mi_binary(truth_col, judged_col)) for truth_col, judged_col in zip(X.T, Y_ind.T)]
 mi_comb = [float(mi_binary(truth_col, judged_col)) for truth_col, judged_col in zip(X.T, Y_comb.T)]
 mi_pass_ind, mi_pass_comb = (float(mi_binary(Pass_true, judged)) for judged in (Pass_ind, Pass_comb))

 # Summary statistics of the per-case totals, keyed by label source.
 summary = {
     "True": {"mean": mean_true, "var": var_true, "pass_rate": pass_true},
     "Independent": {"mean": mean_ind, "var": var_ind, "pass_rate": pass_ind},
     "Combined": {"mean": mean_comb, "var": var_comb, "pass_rate": pass_comb},
 }
 print("Means/Vars/Pass-rate")
 print(summary)

 print("Confusion (pass-all)")
 print({"Independent": cm_ind, "Combined": cm_comb})

 print("MI per criterion (bits) and for pass-all")
 for name, a, b in zip(criteria, mi_ind, mi_comb):
    print(f"{name}: MI_ind={a:.6f}, MI_comb={b:.6f}")
 print(f"All-criteria Pass: MI_ind={mi_pass_ind:.6f}, MI_comb={mi_pass_comb:.6f}")

 # PMF plot: empirical distribution of "criteria met" under each judge,
 # drawn as side-by-side bars and written to a PNG.
 bins = np.arange(0, C + 1)
 pmf_ind = np.array([np.mean(tot_ind == k) for k in bins])
 pmf_comb = np.array([np.mean(tot_comb == k) for k in bins])

 x = np.arange(C + 1)
 w = 0.35
 half = w / 2  # each bar is shifted half a width off the tick centre

 plt.figure(figsize=(8,5))
 plt.bar(x - half, pmf_ind, width=w, label="Independent (binary)")
 plt.bar(x + half, pmf_comb, width=w, label="Combined (correlated)")

 # Percentage label just above every bar (independent first, then combined).
 label_style = dict(ha="center", va="bottom", fontsize=9)
 for i in range(C + 1):
    for offset, pmf in ((-half, pmf_ind), (half, pmf_comb)):
        plt.text(x[i] + offset, pmf[i] + 0.002, f"{pmf[i]*100:.1f}%", **label_style)

 plt.xticks(x, list(map(str, bins)))
 plt.xlabel("Total criteria met (out of 4)")
 plt.ylabel("Probability")
 plt.title("Probability of Outcomes: Independent vs Combined Evaluation")
 plt.legend()
 plt.tight_layout()
 plt.savefig("pmf_independent_vs_combined_bars_final.png", dpi=160)
	# radiology_eval_trial_code.py
	# Reproducible trial comparing independent vs combined LLM-as-judge evaluation.

	import numpy as np
	import matplotlib.pyplot as plt
	from math import log2

	# Fixed seed so every run draws the same sequence of random numbers.
	rng = np.random.default_rng(20250817)

	# N simulated cases, each judged on C binary criteria.
	N = 200_000
	C = 4
	criteria = ["Accuracy","Completeness","Clarity","Appropriateness"]
	# Per-criterion probability that the ground-truth label is True.
	p_true = np.array([0.80, 0.60, 0.85, 0.70])

	# Ground truth: boolean (N, C) matrix; entry (i, j) is True when a uniform draw falls below p_true[j].
	X = rng.random((N, C)) < p_true

	# Independent judge: each label is flipped on its own with probability e.
	e = 0.10
	flips_ind = rng.random((N, C)) < e
	Y_ind = X ^ flips_ind

	# Combined judge: criteria are handled in column order, and a row that already
	# had an error on an earlier criterion gets an additional chance of erring.
	rho = 0.70
	Y_comb = X.copy()
	made_error = np.zeros(N, dtype=bool)  # per row: was any earlier criterion misjudged?
	for j in range(C):
	    # Loop body re-indented and the markup-escaped `\|` restored to `|`.
	    base_err = rng.random(N) < e
	    # Extra error, kept only for rows flagged in made_error (all False when j == 0).
	    cascade_err = (rng.random(N) < (e + (1 - e) * rho)) & made_error
	    err = base_err | cascade_err
	    Y_comb[:, j] = X[:, j] ^ err
	    made_error |= err  # carry the error flag forward to later criteria

	def mi_binary(x, y):
	    """Return the empirical mutual information, in bits, of two binary samples.

	    Parameters
	    ----------
	    x, y : array-like
	        Paired binary observations of equal length. Values are coerced to
	        boolean, so boolean arrays, 0/1 integer arrays and plain lists are
	        all accepted.

	    Returns
	    -------
	    float
	        Sum over the four (x, y) cells of p(x, y) * log2(p(x, y) / (p(x) * p(y))),
	        with empty cells contributing 0.
	    """
	    # Coerce first: on 0/1 integer arrays `~` is a bitwise NOT producing -1/-2,
	    # which turns the cell frequencies below into negative numbers.
	    x = np.asarray(x, dtype=bool)
	    y = np.asarray(y, dtype=bool)

	    # Joint cell frequencies.
	    p11 = np.mean(x & y)
	    p10 = np.mean(x & (~y))
	    p01 = np.mean((~x) & y)
	    p00 = np.mean((~x) & (~y))
	    # Marginals derived from the joint cells.
	    px1 = p11 + p10
	    px0 = p01 + p00
	    py1 = p11 + p01
	    py0 = p10 + p00

	    def term(pxy, px, py):
	        # A cell with zero mass contributes nothing (0 * log 0 is taken as 0);
	        # log2 is the module-level import from math.
	        return 0.0 if pxy == 0 or px == 0 or py == 0 else pxy * (log2(pxy) - log2(px) - log2(py))

	    return term(p11, px1, py1) + term(p10, px1, py0) + term(p01, px0, py1) + term(p00, px0, py0)

	# Number of criteria met per case: ground truth, independent judge, combined judge.
	tot_true = X.sum(axis=1)
	tot_ind = Y_ind.sum(axis=1)
	tot_comb = Y_comb.sum(axis=1)

	# A case passes only when all C criteria are met. Comparing against C (rather
	# than a literal 4) keeps the threshold in step with the number of criteria.
	Pass_true = (tot_true == C)
	Pass_ind = (tot_ind == C)
	Pass_comb = (tot_comb == C)

	# Mean, sample variance (ddof=1) and pass rate of the per-case totals,
	# reusing the pass masks instead of recomputing the comparison.
	mean_true, var_true, pass_true = float(tot_true.mean()), float(tot_true.var(ddof=1)), float(Pass_true.mean())
	mean_ind, var_ind, pass_ind = float(tot_ind.mean()), float(tot_ind.var(ddof=1)), float(Pass_ind.mean())
	mean_comb, var_comb, pass_comb = float(tot_comb.mean()), float(tot_comb.var(ddof=1)), float(Pass_comb.mean())

	def confusion(y_true, y_pred):
	    """Return confusion-matrix fractions and error rates for binary labels.

	    Parameters
	    ----------
	    y_true, y_pred : array-like
	        Paired reference and predicted labels of equal length. Values are
	        coerced to boolean, so 0/1 integer arrays and lists are accepted.

	    Returns
	    -------
	    dict
	        TP, TN, FP, FN as fractions of all samples, plus
	        FPR = FP / (fraction of reference negatives) and
	        FNR = FN / (fraction of reference positives). A rate is NaN when its
	        denominator is not positive.
	    """
	    # Coerce first: `~` on integer labels is a bitwise NOT (-1/-2), which
	    # would corrupt the TN/FP/FN fractions.
	    y_true = np.asarray(y_true, dtype=bool)
	    y_pred = np.asarray(y_pred, dtype=bool)

	    tp = np.mean(y_true & y_pred)
	    tn = np.mean((~y_true) & (~y_pred))
	    fp = np.mean((~y_true) & y_pred)
	    fn = np.mean(y_true & (~y_pred))
	    prev_pos = np.mean(y_true)
	    prev_neg = 1 - prev_pos
	    # Guard the divisions so a one-class reference yields NaN instead of a warning/inf.
	    FPR = fp / prev_neg if prev_neg > 0 else float('nan')
	    FNR = fn / prev_pos if prev_pos > 0 else float('nan')
	    return dict(TP=float(tp), TN=float(tn), FP=float(fp), FN=float(fn), FPR=float(FPR), FNR=float(FNR))

	cm_ind = confusion(Pass_true, Pass_ind)
	cm_comb = confusion(Pass_true, Pass_comb)

	mi_ind = [float(mi_binary(X[:,j], Y_ind[:,j])) for j in range(C)]
	mi_comb = [float(mi_binary(X[:,j], Y_comb[:,j])) for j in range(C)]
	mi_pass_ind = float(mi_binary(Pass_true, Pass_ind))
	mi_pass_comb = float(mi_binary(Pass_true, Pass_comb))

	print("Means/Vars/Pass-rate")
	print(dict(True=dict(mean=mean_true, var=var_true, pass_rate=pass_true),
	Independent=dict(mean=mean_ind, var=var_ind, pass_rate=pass_ind),
	Combined=dict(mean=mean_comb, var=var_comb, pass_rate=pass_comb)))

	print("Confusion (pass-all)")
	print(dict(Independent=cm_ind, Combined=cm_comb))

	print("MI per criterion (bits) and for pass-all")
	for name, a, b in zip(criteria, mi_ind, mi_comb):
	print(f"{name}: MI_ind={a:.6f}, MI_comb={b:.6f}")
	print(f"All-criteria Pass: MI_ind={mi_pass_ind:.6f}, MI_comb={mi_pass_comb:.6f}")

	# PMF plot: empirical distribution of "criteria met" under each judge,
	# drawn as side-by-side bars and written to a PNG.
	bins = np.arange(0, C+1)
	pmf_ind = np.array([(tot_ind == k).mean() for k in bins])
	pmf_comb = np.array([(tot_comb == k).mean() for k in bins])

	x = np.arange(C+1); w = 0.35
	plt.figure(figsize=(8,5))
	plt.bar(x - w/2, pmf_ind, width=w, label="Independent (binary)")
	plt.bar(x + w/2, pmf_comb, width=w, label="Combined (correlated)")
	for i in range(C+1):
	    # Loop body re-indented: percentage label just above each bar.
	    plt.text(x[i] - w/2, pmf_ind[i] + 0.002, f"{pmf_ind[i]*100:.1f}%", ha="center", va="bottom", fontsize=9)
	    plt.text(x[i] + w/2, pmf_comb[i] + 0.002, f"{pmf_comb[i]*100:.1f}%", ha="center", va="bottom", fontsize=9)
	plt.xticks(x, [str(k) for k in bins])
	plt.xlabel("Total criteria met (out of 4)")
	plt.ylabel("Probability")
	plt.title("Probability of Outcomes: Independent vs Combined Evaluation")
	plt.legend()
	plt.tight_layout()
	plt.savefig("pmf_independent_vs_combined_bars_final.png", dpi=160)
No results found