Created
December 18, 2025 08:51
-
-
Save itslukej/32a73eeb06583b5bcca9690126b4f800 to your computer and use it in GitHub Desktop.
lrclib.net pow cuda kernel
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use cudarc::driver::{CudaDevice, LaunchAsync, LaunchConfig}; | |
| use cudarc::nvrtc::compile_ptx; | |
| use data_encoding::HEXUPPER; | |
| use ring::digest::{Context, SHA256}; | |
| use anyhow::{Result, anyhow}; | |
| use std::panic; | |
/// CUDA C source for the proof-of-work search kernel, compiled at runtime with NVRTC.
///
/// The entry point is `solve_nonce_kernel`. Each thread derives one nonce from
/// `start_nonce` plus its global thread index, renders it as decimal ASCII,
/// appends it to `prefix`, hashes the result with SHA-256 and compares the
/// digest byte-by-byte (big-endian) against `target`. A thread whose digest is
/// `<= target` stores its nonce in `result` and sets `found`.
///
/// Limits enforced inside the kernel:
///
/// * `prefix + nonce` must be at most `MAX_INPUT_LEN` (63) bytes, because the
///   padding routine works in a single 64-byte scratch block. Threads whose
///   message would be longer return without hashing.
/// * At most the first 32 bytes of `target` are compared, since a SHA-256
///   digest is 32 bytes long.
const CUDA_KERNEL: &str = r#"
// Longest message (prefix + decimal nonce) the hashing code below can handle.
// sha256_update stages the message in one 64-byte block, so 63 bytes is the
// most it can hold together with the mandatory 0x80 terminator.
#define MAX_INPUT_LEN 63

// SHA-256 digest size in bytes.
#define SHA256_DIGEST_LEN 32

// SHA256 constants
__constant__ unsigned int k[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

// Rotate a 32-bit word right by n bits (callers only pass 0 < n < 32).
__device__ unsigned int rotr(unsigned int x, unsigned int n) {
    return (x >> n) | (x << (32 - n));
}

// Run the SHA-256 compression function over one 64-byte block, updating state in place.
__device__ void sha256_transform(unsigned int* state, const unsigned char* data) {
    unsigned int w[64];
    unsigned int a, b, c, d, e, f, g, h;
    unsigned int t1, t2;

    // Copy chunk into first 16 words of message schedule (big-endian).
    // Each byte is widened to unsigned int before shifting so that a byte
    // >= 0x80 is never shifted into the sign bit of a promoted int.
    for (int i = 0; i < 16; i++) {
        w[i] = ((unsigned int)data[i*4] << 24)
             | ((unsigned int)data[i*4+1] << 16)
             | ((unsigned int)data[i*4+2] << 8)
             |  (unsigned int)data[i*4+3];
    }

    // Extend the first 16 words into the remaining 48 words
    for (int i = 16; i < 64; i++) {
        unsigned int s0 = rotr(w[i-15], 7) ^ rotr(w[i-15], 18) ^ (w[i-15] >> 3);
        unsigned int s1 = rotr(w[i-2], 17) ^ rotr(w[i-2], 19) ^ (w[i-2] >> 10);
        w[i] = w[i-16] + s0 + w[i-7] + s1;
    }

    // Initialize working variables
    a = state[0]; b = state[1]; c = state[2]; d = state[3];
    e = state[4]; f = state[5]; g = state[6]; h = state[7];

    // Main loop
    for (int i = 0; i < 64; i++) {
        unsigned int S1 = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);
        unsigned int ch = (e & f) ^ (~e & g);
        t1 = h + S1 + ch + k[i] + w[i];
        unsigned int S0 = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);
        unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
        t2 = S0 + maj;
        h = g; g = f; f = e; e = d + t1; d = c; c = b; b = a; a = t1 + t2;
    }

    // Add the compressed chunk to the current hash value
    state[0] += a; state[1] += b; state[2] += c; state[3] += d;
    state[4] += e; state[5] += f; state[6] += g; state[7] += h;
}

// Load the SHA-256 initial hash values into state.
__device__ void sha256_init(unsigned int* state) {
    state[0] = 0x6a09e667; state[1] = 0xbb67ae85; state[2] = 0x3c6ef372; state[3] = 0xa54ff53a;
    state[4] = 0x510e527f; state[5] = 0x9b05688c; state[6] = 0x1f83d9ab; state[7] = 0x5be0cd19;
}

// Hash a complete message of len bytes: pad it, append the bit length and
// compress it into state.
//
// PRECONDITION: len <= MAX_INPUT_LEN. The message is staged in a single
// 64-byte block, so a longer message would write past the end of buffer.
// solve_nonce_kernel checks this before calling.
__device__ void sha256_update(unsigned int* state, const unsigned char* data, unsigned int len) {
    unsigned char buffer[64];
    unsigned int buffer_len = 0;

    // Copy data to buffer
    for (unsigned int i = 0; i < len; i++) {
        buffer[buffer_len++] = data[i];
    }

    // Padding starts with a single 0x80 byte
    buffer[buffer_len++] = 0x80;

    // Zero-fill up to offset 56. If the terminator landed past offset 56 there
    // is no room left for the length field, so the block is filled, compressed
    // and a fresh all-zero block is started.
    while (buffer_len % 64 != 56) {
        if (buffer_len >= 64) {
            sha256_transform(state, buffer);
            buffer_len = 0;
        } else {
            buffer[buffer_len++] = 0x00;
        }
    }

    // Append length in bits as 64-bit big-endian
    unsigned long long bit_len = (unsigned long long)len * 8ULL;
    for (int i = 7; i >= 0; i--) {
        buffer[56 + (7 - i)] = (unsigned char)((bit_len >> (i * 8)) & 0xff);
    }
    sha256_transform(state, buffer);
}

// Return true when the digest held in hash (eight big-endian words) is <= target,
// comparing the first len bytes as a big-endian number.
__device__ bool verify_nonce_gpu(const unsigned int* hash, const unsigned char* target, int len) {
    unsigned char hash_bytes[SHA256_DIGEST_LEN];

    // Never compare more bytes than the digest contains.
    if (len > SHA256_DIGEST_LEN) {
        len = SHA256_DIGEST_LEN;
    }

    // Convert hash to bytes (big-endian)
    for (int i = 0; i < 8; i++) {
        hash_bytes[i*4] = (hash[i] >> 24) & 0xff;
        hash_bytes[i*4+1] = (hash[i] >> 16) & 0xff;
        hash_bytes[i*4+2] = (hash[i] >> 8) & 0xff;
        hash_bytes[i*4+3] = hash[i] & 0xff;
    }

    // Compare with target; the first differing byte decides.
    for (int i = 0; i < len; i++) {
        if (hash_bytes[i] > target[i]) {
            return false;
        } else if (hash_bytes[i] < target[i]) {
            return true;
        }
    }
    return true;
}

extern "C" __global__ void solve_nonce_kernel(
    const char* prefix,
    int prefix_len,
    const unsigned char* target,
    int target_len,
    unsigned long long start_nonce,
    unsigned long long* result,
    bool* found
) {
    // Widen before multiplying so the thread offset is computed in 64 bits.
    unsigned long long nonce =
        start_nonce + (unsigned long long)blockIdx.x * blockDim.x + threadIdx.x;
    if (*found) return;

    // Convert nonce to string
    char nonce_str[32];
    int nonce_len = 0;
    unsigned long long temp = nonce;
    if (temp == 0) {
        nonce_str[0] = '0';
        nonce_len = 1;
    } else {
        // Digits come out least-significant first...
        while (temp > 0) {
            nonce_str[nonce_len++] = '0' + (temp % 10);
            temp /= 10;
        }
        // ...so reverse the string
        for (int i = 0; i < nonce_len / 2; i++) {
            char tmp = nonce_str[i];
            nonce_str[i] = nonce_str[nonce_len - 1 - i];
            nonce_str[nonce_len - 1 - i] = tmp;
        }
    }

    // Refuse messages that do not fit the fixed-size buffers below instead of
    // writing out of bounds.
    if (prefix_len < 0 || prefix_len > MAX_INPUT_LEN - nonce_len) return;

    // Create input string: prefix + nonce
    char input[MAX_INPUT_LEN];
    int input_len = 0;
    for (int i = 0; i < prefix_len; i++) {
        input[input_len++] = prefix[i];
    }
    for (int i = 0; i < nonce_len; i++) {
        input[input_len++] = nonce_str[i];
    }

    // Compute SHA256
    unsigned int hash_state[8];
    sha256_init(hash_state);
    sha256_update(hash_state, (unsigned char*)input, input_len);

    // Check if this nonce satisfies the target
    if (verify_nonce_gpu(hash_state, target, target_len)) {
        atomicExch((unsigned long long*)result, nonce);
        *found = true;
    }
}
"#;
/// Checks whether a hash satisfies a proof-of-work target.
///
/// Both slices are interpreted as big-endian unsigned integers of the same
/// width. Returns `true` when `result <= target`, and `false` when
/// `result > target` or when the two slices differ in length.
///
/// Every byte takes part in the comparison, including the last one, and empty
/// inputs are handled without panicking (two empty slices compare as equal).
fn verify_nonce(result: &[u8], target: &[u8]) -> bool {
    // For equal-length byte strings, lexicographic order is exactly big-endian
    // numeric order, so the slice comparison does the whole job.
    result.len() == target.len() && result <= target
}
| /// CPU-based proof-of-work solver. | |
| /// | |
| /// Searches for a nonce such that SHA256(prefix + nonce) <= target. | |
| /// This is the original algorithm and serves as a fallback when GPU is unavailable. | |
| /// | |
| /// # Arguments | |
| /// | |
| /// * `prefix` - String prefix to prepend to nonce | |
| /// * `target_hex` - Hex-encoded target difficulty (hash must be <= this value) | |
| /// | |
| /// # Returns | |
| /// | |
| /// The found nonce as a string | |
| pub fn solve_challenge_cpu(prefix: &str, target_hex: &str) -> String { | |
| let mut nonce = 0; | |
| let mut hashed; | |
| let target = HEXUPPER.decode(target_hex.as_bytes()).unwrap(); | |
| loop { | |
| let mut context = Context::new(&SHA256); | |
| let input = format!("{}{}", prefix, nonce); | |
| context.update(input.as_bytes()); | |
| hashed = context.finish().as_ref().to_vec(); | |
| let result = verify_nonce(&hashed, &target); | |
| if result { | |
| break; | |
| } else { | |
| nonce += 1; | |
| } | |
| } | |
| nonce.to_string() | |
| } | |
| /// GPU-accelerated proof-of-work solver using CUDA. | |
| /// | |
| /// Launches thousands of parallel threads to search for a nonce such that | |
| /// SHA256(prefix + nonce) <= target. Can achieve 100x-1000x speedup over CPU. | |
| /// | |
| /// # Arguments | |
| /// | |
| /// * `prefix` - String prefix to prepend to nonce | |
| /// * `target_hex` - Hex-encoded target difficulty (hash must be <= this value) | |
| /// | |
| /// # Returns | |
| /// | |
| /// * `Ok(String)` - The found nonce as a string | |
| /// * `Err(anyhow::Error)` - GPU initialization or kernel execution error | |
| /// | |
| /// # Requirements | |
| /// | |
| /// * NVIDIA GPU with CUDA compute capability 3.5+ | |
| /// * CUDA toolkit installed and in PATH | |
| /// * Sufficient GPU memory (typically works with 2GB+) | |
| pub fn solve_challenge_gpu(prefix: &str, target_hex: &str) -> Result<String> { | |
| // Safely try to initialize CUDA device, catching panics from missing libraries | |
| let device = panic::catch_unwind(|| { | |
| CudaDevice::new(0) | |
| }).map_err(|_| { | |
| anyhow!("CUDA initialization failed - likely missing CUDA libraries or no compatible GPU found") | |
| })??; | |
| // Compile the CUDA kernel - also catch panics here | |
| let ptx = panic::catch_unwind(|| { | |
| compile_ptx(CUDA_KERNEL) | |
| }).map_err(|_| { | |
| anyhow!("CUDA kernel compilation failed - NVRTC libraries not available") | |
| })??; | |
| device.load_ptx(ptx, "solve_nonce", &["solve_nonce_kernel"])?; | |
| let target = HEXUPPER.decode(target_hex.as_bytes()).unwrap(); | |
| // GPU memory allocations | |
| let prefix_gpu = device.htod_copy(prefix.as_bytes().to_vec())?; | |
| let target_gpu = device.htod_copy(target.clone())?; | |
| let result_gpu = device.alloc_zeros::<u64>(1)?; | |
| let mut found_gpu = device.alloc_zeros::<bool>(1)?; | |
| let threads_per_block = 256; | |
| let blocks = 65536; // Start with many blocks | |
| let nonces_per_launch = threads_per_block * blocks; | |
| let mut start_nonce = 0u64; | |
| loop { | |
| // Launch kernel | |
| let cfg = LaunchConfig { | |
| grid_dim: (blocks, 1, 1), | |
| block_dim: (threads_per_block, 1, 1), | |
| shared_mem_bytes: 0, | |
| }; | |
| // Reset found flag | |
| device.htod_copy_into(vec![false], &mut found_gpu)?; | |
| unsafe { | |
| let kernel = device.get_func("solve_nonce", "solve_nonce_kernel").unwrap(); | |
| kernel.launch( | |
| cfg, | |
| ( | |
| &prefix_gpu, | |
| prefix.len() as i32, | |
| &target_gpu, | |
| target.len() as i32, | |
| start_nonce, | |
| &result_gpu, | |
| &found_gpu, | |
| ), | |
| )?; | |
| } | |
| // Check if solution was found | |
| let found: Vec<bool> = device.dtoh_sync_copy(&found_gpu)?; | |
| if found[0] { | |
| let result: Vec<u64> = device.dtoh_sync_copy(&result_gpu)?; | |
| return Ok(result[0].to_string()); | |
| } | |
| start_nonce += nonces_per_launch as u64; | |
| // Print progress occasionally | |
| if start_nonce % (nonces_per_launch as u64 * 100) == 0 { | |
| println!("Searched {} nonces...", start_nonce); | |
| } | |
| } | |
| } | |
| #[cfg(test)] | |
| mod tests { | |
| use super::*; | |
#[test]
fn test_verify_nonce() {
    // (hash, target, expected verdict): one hash below its target and one above.
    let cases: Vec<(Vec<u8>, Vec<u8>, bool)> = vec![
        (
            vec![0x00, 0x00, 0xFF, 0xFF],
            vec![0x00, 0x01, 0x00, 0x00],
            true,
        ),
        (
            vec![0x00, 0x02, 0x00, 0x00],
            vec![0x00, 0x01, 0xFF, 0xFF],
            false,
        ),
    ];

    for (hash, limit, expected) in cases.iter() {
        assert_eq!(verify_nonce(hash, limit), *expected);
    }
}
#[test]
fn test_cpu_solver() {
    let prefix = "test";
    let target_hex = "0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF";

    let solved = solve_challenge_cpu(prefix, target_hex);

    // Re-derive the digest independently of the solver and confirm that the
    // returned nonce really meets the target.
    let mut hasher = Context::new(&SHA256);
    hasher.update(format!("{}{}", prefix, solved).as_bytes());
    let digest = hasher.finish().as_ref().to_vec();

    let decoded_target = HEXUPPER.decode(target_hex.as_bytes()).unwrap();
    assert!(verify_nonce(&digest, &decoded_target));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment