Skip to content

Instantly share code, notes, and snippets.

@dcondrey
Created April 14, 2026 10:20
Show Gist options
  • Select an option

  • Save dcondrey/00f7df9c7fbe299f35de3d8fd2bea9dc to your computer and use it in GitHub Desktop.

Select an option

Save dcondrey/00f7df9c7fbe299f35de3d8fd2bea9dc to your computer and use it in GitHub Desktop.
#!/bin/bash
# PoSME benchmark driver: prints host characteristics, bootstraps a Rust
# toolchain, then writes, builds, and runs the embedded single-file benchmark.
set -e
# --- Host characterization (best-effort: each probe may fail silently) ---
lscpu 2>/dev/null | grep -E 'Model name|L3' || true
echo Arch: $(uname -m)
echo Cores: $(nproc)
free -h | grep Mem || true
nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo GPU: none
# --- Toolchain bootstrap (Debian/Ubuntu; runs as root per /root path below) ---
apt-get update -qq >/dev/null 2>&1
apt-get install -y -qq gcc >/dev/null 2>&1
curl -sSf https://sh.rustup.rs | sh -s -- -y --quiet >/dev/null 2>&1
. /root/.cargo/env
# Write the benchmark source via a quoted heredoc (no shell expansion inside).
cat > /tmp/posme-bench.rs << 'RSEOF'
// PoSME Reference Benchmark
// =========================
// Proof of Sequential Memory Execution -- single-file benchmark.
//
// Validates the latency-bound ASIC resistance claim by measuring
// the per-step cost breakdown (hash vs memory) across arena sizes.
//
// BUILD: rustc -O posme-bench.rs -o posme-bench
// RUN: ./posme-bench
//
// No external dependencies. Uses a built-in minimal hash (SipHash
// via std) for portability. Production implementations should use
// BLAKE3; the latency-bound property is independent of hash choice
// as long as hash time << memory read time.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::time::Instant;
/// Security parameter: width in bytes of every hash digest and block half.
const LAMBDA: usize = 32;

/// One 64-byte arena block: a data half plus a causal-state half.
#[derive(Clone, Copy)]
struct Block {
    data: [u8; LAMBDA],
    causal: [u8; LAMBDA],
}

impl Block {
    /// An all-zero block; the arena starts zeroed before `initialize` fills it.
    fn zeroed() -> Self {
        Block {
            data: [0; LAMBDA],
            causal: [0; LAMBDA],
        }
    }
}
/// Hash a sequence of byte slices into a 32-byte digest.
///
/// Portability shim: std's `DefaultHasher` (SipHash) yields only 8 bytes,
/// so the 32-byte output is produced by hash-chaining the 64-bit state:
/// chunk i holds H^i(v) in little-endian. NOT cryptographically strong --
/// the benchmark only needs a fast deterministic mixer; production code
/// should use BLAKE3 (see file header).
fn fast_hash(inputs: &[&[u8]]) -> [u8; LAMBDA] {
    let mut h = DefaultHasher::new();
    for inp in inputs {
        inp.hash(&mut h);
    }
    let mut out = [0u8; LAMBDA];
    // Fill 32 bytes by re-hashing the 64-bit value for each 8-byte chunk.
    // This loop replaces four copy-pasted h2/h3/h4 expansion blocks and
    // produces byte-for-byte identical output.
    let mut v = h.finish();
    for chunk in out.chunks_exact_mut(8) {
        chunk.copy_from_slice(&v.to_le_bytes());
        let mut next = DefaultHasher::new();
        v.hash(&mut next);
        v = next.finish();
    }
    out
}
/// I2OSP-style encoding: a `u32` as 4 big-endian bytes.
fn i2osp(x: u32) -> [u8; 4] {
    u32::to_be_bytes(x)
}
/// Derive an arena index in `[0, n)` from the current cursor and the
/// per-step read counter `j` (domain-separated with the "addr" tag).
fn addr_from(cursor: &[u8; LAMBDA], j: u32, n: usize) -> usize {
    let digest = fast_hash(&[b"addr", cursor, &j.to_be_bytes()]);
    let word = u32::from_be_bytes([digest[0], digest[1], digest[2], digest[3]]);
    word as usize % n
}
/// Deterministically fill the arena from `seed`.
///
/// Block 0 is seeded directly; every later block mixes in its immediate
/// predecessor (`i - 1`) and the half-index block (`i / 2`), so the fill
/// itself proceeds sequentially.
fn initialize(blocks: &mut [Block], seed: &[u8]) {
    let n = blocks.len();
    blocks[0].data = fast_hash(&[b"init", seed, &i2osp(0)]);
    blocks[0].causal = fast_hash(&[b"causal", seed, &i2osp(0)]);
    for i in 1..n {
        let idx = i2osp(i as u32);
        let prev_data = blocks[i - 1].data;
        let skip_data = blocks[i / 2].data;
        blocks[i].data = fast_hash(&[b"init", seed, &idx, &prev_data, &skip_data]);
        blocks[i].causal = fast_hash(&[b"causal", seed, &idx]);
    }
}
/// One PoSME step: `d` data-dependent reads, one arena write, and a
/// transcript update.
///
/// * `t_prev` - previous transcript value T_{t-1}
/// * `t`      - 1-based step index (mixed into the causal write)
/// * `d`      - pointer-chase reads per step
///
/// Returns the new transcript value. Mutates exactly one arena block.
#[inline(never)]
fn posme_step(blocks: &mut [Block], t_prev: &[u8; LAMBDA], t: u32, d: usize) -> [u8; LAMBDA] {
    let n = blocks.len();
    let mut cursor = *t_prev;
    // THE CRITICAL LOOP: pointer-chase with data-dependent addressing.
    // cursor = H(cursor || block.data || block.causal)
    // Each iteration MUST wait for the memory read before computing
    // the next address. This is where latency-hardness lives.
    for j in 0..d {
        let a = addr_from(&cursor, j as u32, n);
        let val = blocks[a]; // Block is Copy: a full 64-byte read from the arena
        cursor = fast_hash(&[&cursor, &val.data, &val.causal]);
    }
    // Symbiotic write: fold the final cursor back into one block so the
    // arena evolves with the computation (address uses counter value d,
    // distinct from the read counters 0..d-1).
    let w = addr_from(&cursor, d as u32, n);
    let old = blocks[w];
    blocks[w].data = fast_hash(&[&old.data, &cursor, &old.causal]);
    blocks[w].causal = fast_hash(&[&old.causal, &cursor, &i2osp(t)]);
    // Transcript chain: T_t = H(T_{t-1} || t || cursor)
    fast_hash(&[t_prev, &i2osp(t), &cursor])
}
/// Render the first 8 bytes of a digest as lowercase hex (display only).
fn hex8(b: &[u8; LAMBDA]) -> String {
    let mut s = String::with_capacity(16);
    for byte in &b[..8] {
        s.push_str(&format!("{:02x}", byte));
    }
    s
}
/// Human-readable binary size: GiB with one decimal place, MiB/KiB rounded
/// to whole units. Anything below 1 MiB is reported in KiB.
fn format_size(bytes: usize) -> String {
    const KIB: f64 = 1024.0;
    let b = bytes as f64;
    if bytes >= 1 << 30 {
        format!("{:.1} GiB", b / (KIB * KIB * KIB))
    } else if bytes >= 1 << 20 {
        format!("{:.0} MiB", b / (KIB * KIB))
    } else {
        format!("{:.0} KiB", b / KIB)
    }
}
/// Run one benchmark configuration and print a per-step cost summary.
///
/// * `n` - arena size in blocks (64 bytes each)
/// * `k` - number of PoSME steps to execute
/// * `d` - reads per step
/// * `label` - human-readable configuration name for the report
fn run_benchmark(n: usize, k: u32, d: usize, label: &str) {
    let arena_bytes = n * 2 * LAMBDA; // each Block is 2 * LAMBDA = 64 bytes
    let rho = k as f64 / n as f64; // steps-to-arena ratio ("rho")
    println!(" {:.<44} {} arena, {} steps, rho={:.1}",
    format!("{} ", label), format_size(arena_bytes), k, rho);
    let mut blocks = vec![Block::zeroed(); n];
    initialize(&mut blocks, b"posme-bench-v1");
    let mut transcript = fast_hash(&[b"transcript", b"posme-bench-v1"]);
    // Time only the stepping loop; arena initialization is excluded.
    let start = Instant::now();
    for t in 1..=k {
        transcript = posme_step(&mut blocks, &transcript, t, d);
    }
    let elapsed = start.elapsed();
    let total_s = elapsed.as_secs_f64();
    let avg_ns = total_s / k as f64 * 1e9;
    // Crude model: d chase hashes + 3 bookkeeping hashes per step, at an
    // assumed ~5 ns each; used only to estimate the hash/memory split.
    let hash_est_ns = (d as f64 + 3.0) * 5.0; // ~5ns per hash (SipHash)
    let _mem_ns = avg_ns - hash_est_ns; // residual attributed to memory; computed but not printed
    let hash_pct = hash_est_ns / avg_ns * 100.0;
    println!(" Total time: {:.3}s", total_s);
    println!(" Avg step: {:.0} ns", avg_ns);
    println!(" Hash fraction: {:.1}%", hash_pct);
    println!(" T_K: {}...", hex8(&transcript)); // transcript tail doubles as a determinism check
    println!(" TMTO (alpha=0): {:.1}x", 1.0 + (2.0 * rho + 1.0));
    println!();
}
/// Entry point: prints the banner, runs the cache-scaling sweep (Phase 1),
/// the production-size configuration (Phase 2), then an interpretation guide.
fn main() {
    let d: usize = 8; // reads per step, fixed across all configurations
    println!();
    println!(" ╔══════════════════════════════════════════════════════════╗");
    println!(" ║ PoSME Reference Benchmark v1.0 ║");
    println!(" ║ Proof of Sequential Memory Execution ║");
    println!(" ║ draft-condrey-cfrg-posme ║");
    println!(" ╠══════════════════════════════════════════════════════════╣");
    println!(" ║ Reads/step (d): 8 ║");
    println!(" ║ Hash: SipHash (std, portable) ║");
    println!(" ║ Block size: 64 bytes (32 data + 32 causal) ║");
    println!(" ╚══════════════════════════════════════════════════════════╝");
    println!();
    println!(" ── Phase 1: Cache Scaling ──────────────────────────────────");
    println!(" Measures per-step cost across arena sizes to demonstrate");
    println!(" the transition from cache-bound to memory-bound behavior.");
    println!();
    // Small arenas (fit in cache) -- short runs. Arena bytes = n * 64,
    // so the labels below follow from the block counts.
    run_benchmark(1 << 14, 1 << 14, d, "L2-resident (1 MiB)");
    run_benchmark(1 << 16, 1 << 16, d, "L3-resident (4 MiB)");
    run_benchmark(1 << 18, 1 << 18, d, "L3-edge (16 MiB)");
    run_benchmark(1 << 20, 1 << 20, d, "DRAM-spill (64 MiB)");
    println!(" ── Phase 2: Production Parameters ─────────────────────────");
    println!(" 1 GiB arena, rho=1 (K=N). This is the minimum recommended");
    println!(" configuration for TMTO resistance.");
    println!();
    let n_prod = 1 << 24; // 16M blocks = 1 GiB
    let k_prod = n_prod as u32; // rho = 1
    run_benchmark(n_prod, k_prod, d, "Production (1 GiB, rho=1)");
    println!(" ── Interpretation Guide ──────────────────────────────────");
    println!();
    println!(" MACHINE-DEPENDENT (will vary by hardware):");
    println!(" - Absolute step time (ns)");
    println!(" - Total execution time (s)");
    println!();
    println!(" MACHINE-INDEPENDENT (should hold on any hardware):");
    println!(" - Hash fraction < 10% at 1 GiB arena");
    println!(" - Step time INCREASES with arena size");
    println!(" - TMTO penalty matches formula: 1 + (1-a)(2p+1)");
    println!();
    println!(" WHY THIS MATTERS:");
    println!(" If hash fraction is small, an ASIC that computes");
    println!(" hashes 100x faster gains almost nothing. The");
    println!(" bottleneck is memory latency, which is bounded by");
    println!(" physics (~2x DDR5 vs HBM3, ~35ns vs ~20ns).");
    println!();
    println!(" For full details: draft-condrey-cfrg-posme");
    println!(" https://lf-decentralized-trust-labs.github.io/proof-of-effort/");
    println!();
}
RSEOF
# Build with optimizations -- required: a debug build would distort the
# hash-vs-memory split the benchmark is trying to measure.
rustc -O /tmp/posme-bench.rs -o /tmp/posme-bench
/tmp/posme-bench
echo POSME_DONE
# NOTE(review): purpose of the sleep is not stated -- presumably keeps the
# sandbox/container alive so output can be collected; confirm before removing.
sleep 60
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment