Skip to content

Instantly share code, notes, and snippets.

@itslukej
Created December 18, 2025 08:51
Show Gist options
  • Select an option

  • Save itslukej/32a73eeb06583b5bcca9690126b4f800 to your computer and use it in GitHub Desktop.

Select an option

Save itslukej/32a73eeb06583b5bcca9690126b4f800 to your computer and use it in GitHub Desktop.
lrclib.net PoW (proof-of-work) CUDA kernel
use cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig};
use cudarc::nvrtc::compile_ptx;
use data_encoding::HEXUPPER;
use ring::digest::{Context, SHA256};
use anyhow::{Result, anyhow};
use std::panic;
/// CUDA C source for the proof-of-work search, compiled at runtime by
/// `compile_ptx` inside `solve_challenge_gpu`.
///
/// The entry point is `solve_nonce_kernel`. Each thread:
///
/// 1. derives its nonce as `start_nonce + blockIdx.x * blockDim.x + threadIdx.x`,
/// 2. writes that nonce in decimal and appends it to `prefix`,
/// 3. hashes the combined bytes with the device-side SHA-256 helpers,
/// 4. compares the first `target_len` digest bytes with `target`, most
///    significant byte first, and when `digest <= target` stores the nonce in
///    `result` and sets `found`.
///
/// A thread returns immediately if `found` is already set when it starts.
///
/// The kernel performs no bounds checks, so the caller must guarantee:
///
/// * `prefix_len` plus the number of decimal digits of the nonce is at most
///   63: `sha256_update` copies the whole message and then the `0x80` padding
///   byte into a 64-byte local buffer. (The outer `input` staging buffer is
///   256 bytes, so the SHA buffer is the tighter limit.)
/// * `target_len` is at most 32: `verify_nonce_gpu` indexes a 32-byte digest.
///
/// NOTE(review): when several threads of one launch find a valid nonce, each
/// of them overwrites `result`; which value survives looks unspecified —
/// confirm that callers accept any valid nonce.
const CUDA_KERNEL: &str = r#"
// SHA256 constants
__constant__ unsigned int k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
__device__ unsigned int rotr(unsigned int x, unsigned int n) {
return (x >> n) | (x << (32 - n));
}
__device__ void sha256_transform(unsigned int* state, const unsigned char* data) {
unsigned int w[64];
unsigned int a, b, c, d, e, f, g, h;
unsigned int t1, t2;
// Copy chunk into first 16 words of message schedule
for (int i = 0; i < 16; i++) {
w[i] = (data[i*4] << 24) | (data[i*4+1] << 16) | (data[i*4+2] << 8) | data[i*4+3];
}
// Extend the first 16 words into the remaining 48 words
for (int i = 16; i < 64; i++) {
unsigned int s0 = rotr(w[i-15], 7) ^ rotr(w[i-15], 18) ^ (w[i-15] >> 3);
unsigned int s1 = rotr(w[i-2], 17) ^ rotr(w[i-2], 19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
// Initialize working variables
a = state[0]; b = state[1]; c = state[2]; d = state[3];
e = state[4]; f = state[5]; g = state[6]; h = state[7];
// Main loop
for (int i = 0; i < 64; i++) {
unsigned int S1 = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);
unsigned int ch = (e & f) ^ (~e & g);
t1 = h + S1 + ch + k[i] + w[i];
unsigned int S0 = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);
unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
t2 = S0 + maj;
h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2;
}
// Add the compressed chunk to the current hash value
state[0] += a; state[1] += b; state[2] += c; state[3] += d;
state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}
__device__ void sha256_init(unsigned int* state) {
state[0] = 0x6a09e667; state[1] = 0xbb67ae85; state[2] = 0x3c6ef372; state[3] = 0xa54ff53a;
state[4] = 0x510e527f; state[5] = 0x9b05688c; state[6] = 0x1f83d9ab; state[7] = 0x5be0cd19;
}
__device__ void sha256_update(unsigned int* state, const unsigned char* data, unsigned int len) {
unsigned char buffer[64];
unsigned int buffer_len = 0;
// Copy data to buffer and pad
for (unsigned int i = 0; i < len; i++) {
buffer[buffer_len++] = data[i];
}
// Padding
buffer[buffer_len++] = 0x80;
// If we don't have enough space for length, process this block and start new one
while (buffer_len % 64 != 56) {
if (buffer_len >= 64) {
sha256_transform(state, buffer);
buffer_len = 0;
} else {
buffer[buffer_len++] = 0x00;
}
}
// Append length in bits as 64-bit big-endian
unsigned long long bit_len = len * 8;
for (int i = 7; i >= 0; i--) {
buffer[56 + (7 - i)] = (bit_len >> (i * 8)) & 0xff;
}
sha256_transform(state, buffer);
}
__device__ bool verify_nonce_gpu(const unsigned int* hash, const unsigned char* target, int len) {
unsigned char hash_bytes[32];
// Convert hash to bytes (big-endian)
for (int i = 0; i < 8; i++) {
hash_bytes[i*4] = (hash[i] >> 24) & 0xff;
hash_bytes[i*4+1] = (hash[i] >> 16) & 0xff;
hash_bytes[i*4+2] = (hash[i] >> 8) & 0xff;
hash_bytes[i*4+3] = hash[i] & 0xff;
}
// Compare with target
for (int i = 0; i < len; i++) {
if (hash_bytes[i] > target[i]) {
return false;
} else if (hash_bytes[i] < target[i]) {
return true;
}
}
return true;
}
extern "C" __global__ void solve_nonce_kernel(
const char* prefix,
int prefix_len,
const unsigned char* target,
int target_len,
unsigned long long start_nonce,
unsigned long long* result,
bool* found
) {
unsigned long long nonce = start_nonce + blockIdx.x * blockDim.x + threadIdx.x;
if (*found) return;
// Convert nonce to string
char nonce_str[32];
int nonce_len = 0;
unsigned long long temp = nonce;
if (temp == 0) {
nonce_str[0] = '0';
nonce_len = 1;
} else {
while (temp > 0) {
nonce_str[nonce_len++] = '0' + (temp % 10);
temp /= 10;
}
// Reverse the string
for (int i = 0; i < nonce_len / 2; i++) {
char tmp = nonce_str[i];
nonce_str[i] = nonce_str[nonce_len - 1 - i];
nonce_str[nonce_len - 1 - i] = tmp;
}
}
// Create input string: prefix + nonce
char input[256];
int input_len = 0;
for (int i = 0; i < prefix_len; i++) {
input[input_len++] = prefix[i];
}
for (int i = 0; i < nonce_len; i++) {
input[input_len++] = nonce_str[i];
}
// Compute SHA256
unsigned int hash_state[8];
sha256_init(hash_state);
sha256_update(hash_state, (unsigned char*)input, input_len);
// Check if this nonce satisfies the target
if (verify_nonce_gpu(hash_state, target, target_len)) {
atomicExch((unsigned long long*)result, nonce);
*found = true;
}
}
"#;
/// Returns `true` when `result` is less than or equal to `target`.
///
/// Both slices are read as big-endian unsigned integers of the same byte
/// length, so the comparison is a plain lexicographic one: the first differing
/// byte decides, and identical inputs satisfy `<=`.
///
/// Every byte takes part in the comparison, including the last one, which
/// matches the byte-by-byte check done by `verify_nonce_gpu` in the CUDA
/// kernel.
///
/// # Arguments
///
/// * `result` - Candidate hash bytes
/// * `target` - Difficulty target bytes
///
/// # Returns
///
/// `false` if the lengths differ or `result > target`; `true` otherwise
/// (two empty slices compare as equal).
fn verify_nonce(result: &[u8], target: &[u8]) -> bool {
    // Slice ordering is lexicographic, which for equal-length byte strings is
    // exactly big-endian integer ordering.
    result.len() == target.len() && result <= target
}
/// CPU-based proof-of-work solver.
///
/// Tries the nonces 0, 1, 2, ... in order and returns the first one for which
/// SHA256(prefix + nonce) <= target, where the nonce is appended in decimal.
/// This is the original algorithm and serves as a fallback when GPU is unavailable.
///
/// # Arguments
///
/// * `prefix` - String prefix to prepend to nonce
/// * `target_hex` - Hex-encoded target difficulty (hash must be <= this value);
///   must be uppercase hex describing exactly 32 bytes
///
/// # Returns
///
/// The found nonce as a decimal string
///
/// # Panics
///
/// Panics if `target_hex` is not valid uppercase hexadecimal, or if it does
/// not decode to exactly 32 bytes. A target of any other length can never be
/// matched by a SHA-256 digest, so the search would otherwise run forever.
pub fn solve_challenge_cpu(prefix: &str, target_hex: &str) -> String {
    const SHA256_DIGEST_LEN: usize = 32;

    let target = HEXUPPER
        .decode(target_hex.as_bytes())
        .expect("target_hex must be valid uppercase hexadecimal");
    assert_eq!(
        target.len(),
        SHA256_DIGEST_LEN,
        "target must decode to exactly {} bytes (the SHA-256 digest length)",
        SHA256_DIGEST_LEN
    );

    // Explicitly u64: an untyped `0` would be inferred as i32 and overflow
    // after 2^31 attempts.
    let mut nonce: u64 = 0;
    loop {
        let mut context = Context::new(&SHA256);
        // Feeding the two parts separately hashes the same bytes as hashing
        // their concatenation.
        context.update(prefix.as_bytes());
        context.update(nonce.to_string().as_bytes());
        let hashed = context.finish().as_ref().to_vec();

        if verify_nonce(&hashed, &target) {
            return nonce.to_string();
        }
        nonce += 1;
    }
}
/// GPU-accelerated proof-of-work solver using CUDA.
///
/// Launches thousands of parallel threads to search for a nonce such that
/// SHA256(prefix + nonce) <= target. Can achieve 100x-1000x speedup over CPU.
///
/// Nonces are searched in consecutive windows of `threads_per_block * blocks`
/// values starting at 0; one kernel launch covers one window.
///
/// # Arguments
///
/// * `prefix` - String prefix to prepend to nonce
/// * `target_hex` - Hex-encoded target difficulty (hash must be <= this value);
///   must be uppercase hex describing at most 32 bytes
///
/// # Returns
///
/// * `Ok(String)` - The found nonce as a string
/// * `Err(anyhow::Error)` - GPU initialization or kernel execution error
///
/// # Errors
///
/// Besides CUDA driver/NVRTC failures, an error is returned when:
///
/// * `target_hex` is not valid uppercase hexadecimal or decodes to more than
///   32 bytes,
/// * `prefix` plus the longest nonce of the next launch would exceed the 63
///   bytes that the kernel's SHA-256 staging buffer can hold,
/// * the whole `u64` nonce space has been searched without success.
///
/// # Requirements
///
/// * NVIDIA GPU with CUDA compute capability 3.5+
/// * CUDA toolkit installed and in PATH
/// * Sufficient GPU memory (typically works with 2GB+)
pub fn solve_challenge_gpu(prefix: &str, target_hex: &str) -> Result<String> {
    // Limits imposed by fixed-size, unchecked buffers inside `CUDA_KERNEL`:
    // the message and its 0x80 padding byte share a 64-byte buffer, and the
    // digest being compared is 32 bytes long.
    const MAX_KERNEL_INPUT_LEN: usize = 63;
    const SHA256_DIGEST_LEN: usize = 32;

    // Validate the inputs before touching the GPU so bad arguments fail fast.
    let target = HEXUPPER
        .decode(target_hex.as_bytes())
        .map_err(|e| anyhow!("invalid target hex: {}", e))?;
    if target.len() > SHA256_DIGEST_LEN {
        return Err(anyhow!(
            "target is {} bytes long but a SHA-256 digest has only {}",
            target.len(),
            SHA256_DIGEST_LEN
        ));
    }
    if prefix.len() > MAX_KERNEL_INPUT_LEN {
        return Err(anyhow!(
            "prefix is {} bytes long; the GPU kernel can hash at most {} bytes of prefix + nonce",
            prefix.len(),
            MAX_KERNEL_INPUT_LEN
        ));
    }
    // Both lengths were bounded above, so these casts cannot truncate.
    let prefix_len = prefix.len() as i32;
    let target_len = target.len() as i32;

    // Safely try to initialize CUDA device, catching panics from missing libraries
    let device = panic::catch_unwind(|| CudaDevice::new(0)).map_err(|_| {
        anyhow!("CUDA initialization failed - likely missing CUDA libraries or no compatible GPU found")
    })??;

    // Compile the CUDA kernel - also catch panics here
    let ptx = panic::catch_unwind(|| compile_ptx(CUDA_KERNEL)).map_err(|_| {
        anyhow!("CUDA kernel compilation failed - NVRTC libraries not available")
    })??;
    device.load_ptx(ptx, "solve_nonce", &["solve_nonce_kernel"])?;

    // GPU memory allocations
    let prefix_gpu = device.htod_copy(prefix.as_bytes().to_vec())?;
    let target_gpu = device.htod_copy(target)?;
    let result_gpu = device.alloc_zeros::<u64>(1)?;
    let mut found_gpu = device.alloc_zeros::<bool>(1)?;

    let threads_per_block: u32 = 256;
    let blocks: u32 = 65536; // Start with many blocks
    let nonces_per_launch = u64::from(threads_per_block) * u64::from(blocks);

    let mut start_nonce = 0u64;
    loop {
        // The largest nonce of this launch has the most digits; make sure even
        // that thread's input fits the kernel's staging buffer.
        let last_nonce = start_nonce.saturating_add(nonces_per_launch - 1);
        let longest_input = prefix.len() + last_nonce.to_string().len();
        if longest_input > MAX_KERNEL_INPUT_LEN {
            return Err(anyhow!(
                "prefix + nonce would be {} bytes; the GPU kernel can hash at most {} (searched {} nonces)",
                longest_input,
                MAX_KERNEL_INPUT_LEN,
                start_nonce
            ));
        }

        let cfg = LaunchConfig {
            grid_dim: (blocks, 1, 1),
            block_dim: (threads_per_block, 1, 1),
            shared_mem_bytes: 0,
        };

        // Reset found flag
        device.htod_copy_into(vec![false], &mut found_gpu)?;

        let kernel = device
            .get_func("solve_nonce", "solve_nonce_kernel")
            .ok_or_else(|| anyhow!("CUDA function `solve_nonce_kernel` not found after loading PTX"))?;

        // SAFETY: the argument tuple mirrors the kernel signature
        // (const char*, int, const unsigned char*, int, unsigned long long,
        // unsigned long long*, bool*); `prefix_len`/`target_len` are the exact
        // lengths of the uploaded buffers and were checked against the
        // kernel's fixed-size buffers above; all device buffers stay alive
        // until the copy-back below.
        unsafe {
            kernel.launch(
                cfg,
                (
                    &prefix_gpu,
                    prefix_len,
                    &target_gpu,
                    target_len,
                    start_nonce,
                    &result_gpu,
                    &found_gpu,
                ),
            )?;
        }

        // Check if solution was found
        let found: Vec<bool> = device.dtoh_sync_copy(&found_gpu)?;
        if found[0] {
            let result: Vec<u64> = device.dtoh_sync_copy(&result_gpu)?;
            return Ok(result[0].to_string());
        }

        start_nonce = start_nonce
            .checked_add(nonces_per_launch)
            .ok_or_else(|| anyhow!("exhausted the u64 nonce space without finding a solution"))?;

        // Print progress occasionally
        if start_nonce % (nonces_per_launch * 100) == 0 {
            println!("Searched {} nonces...", start_nonce);
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_verify_nonce() {
    // Candidate is smaller than the target in the second byte: accepted.
    let below: Vec<u8> = vec![0x00, 0x00, 0xFF, 0xFF];
    let limit: Vec<u8> = vec![0x00, 0x01, 0x00, 0x00];
    assert!(verify_nonce(&below, &limit));

    // Candidate is larger than the target in the second byte: rejected.
    let above: Vec<u8> = vec![0x00, 0x02, 0x00, 0x00];
    let limit: Vec<u8> = vec![0x00, 0x01, 0xFF, 0xFF];
    assert!(!verify_nonce(&above, &limit));
}
#[test]
fn test_cpu_solver() {
    let prefix = "test";
    let target_hex = "0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF";

    let nonce = solve_challenge_cpu(prefix, target_hex);

    // Re-hash prefix + nonce independently and check it against the target.
    let message = [prefix, nonce.as_str()].concat();
    let mut hasher = Context::new(&SHA256);
    hasher.update(message.as_bytes());
    let digest = hasher.finish().as_ref().to_vec();

    let expected = HEXUPPER.decode(target_hex.as_bytes()).unwrap();
    assert!(verify_nonce(&digest, &expected));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment