Last active
February 2, 2023 13:40
-
-
Save moyix/f78e0b0d5724a1bf02e1a035e8bec136 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* License: | |
* Public domain. | |
*/ | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include <errno.h> | |
// CUDA cruft | |
#define cudaCheckErrors(msg) \ | |
do { \ | |
cudaError_t __err = cudaGetLastError(); \ | |
if (__err != cudaSuccess) { \ | |
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ | |
msg, cudaGetErrorString(__err), \ | |
__FILE__, __LINE__); \ | |
fprintf(stderr, "*** FAILED - ABORTING\n"); \ | |
exit(1); \ | |
} \ | |
} while (0) | |
// Number of threads is N*M = 65535 | |
// Threads per block | |
#define N 128 | |
// Number of blocks | |
#define M 512 | |
// This is 26^11 / 65536. Assumes that we will be running 65536 | |
// simultaneous threads an we want to partition our search space between | |
// them evenly. | |
#define STRIDE 56005012314UL | |
#define MAX_RESULTS 1073741824UL | |
#define MAX_CHARS 11 | |
__device__ static uint32_t obfuscateWord(unsigned char *e, int len) { | |
uint32_t t = 352960512; | |
for (int i = 0; i < len; i++) { | |
t = (t*33) ^ e[i]; | |
} | |
return t; | |
} | |
__global__ void check(uint32_t goal, volatile int *index) { | |
int bindex = blockIdx.x * blockDim.x + threadIdx.x; | |
int64_t start = bindex*STRIDE; | |
int64_t num = 0; | |
// tmp for holding the digits of the start value | |
unsigned char start_buf[MAX_CHARS]; | |
unsigned char candidate_buf[MAX_CHARS+1]; | |
candidate_buf[MAX_CHARS] = '\0'; | |
memset(candidate_buf, 'a', MAX_CHARS); | |
// Convert our start index to base 26. Don't know how many digits we | |
// will have so store it backward in start_buf and then reverse it | |
// into the end of candidate_buf | |
int digits = 0; | |
while (start > 0) { | |
int rem = start % 26; | |
unsigned char v = 'a' + rem; | |
start_buf[digits++] = v; | |
start = (start - rem) / 26; | |
} | |
for (int i = 0; i < digits; i++) { | |
candidate_buf[MAX_CHARS-i-1] = start_buf[i]; | |
} | |
//printf("Thread %d working on chunk starting at %s\n", | |
// bindex, candidate_buf); | |
while (num < STRIDE+26UL) { | |
// Trick from Paul Khuong: once we compute hash of N-1 characters | |
// we can get the last one by just xoring in the last character. | |
uint32_t prefix = obfuscateWord(candidate_buf, MAX_CHARS-1)*33; | |
candidate_buf[MAX_CHARS-1] = '\0'; | |
//printf("Prefix: %s Hash %08x\n", candidate_buf, prefix); | |
for (unsigned char c = 'a'; c <= 'z'; c++) { | |
if ((prefix^c) == goal) { | |
candidate_buf[MAX_CHARS-1] = c; | |
printf("%s\n", candidate_buf); | |
int old = atomicAdd((int *)index, 1); | |
//memcpy(resbuf+(old*MAX_CHARS), candidate_buf, MAX_CHARS); | |
} | |
} | |
num += 26; | |
// Now just incremement starting from the secont to last char | |
bool finished = true; | |
for (int i = MAX_CHARS-2; i >= 0; i--) { | |
// Increment | |
candidate_buf[i]++; | |
// Handle carry | |
if (candidate_buf[i] > 'z') { | |
candidate_buf[i] = 'a'; | |
} | |
else { | |
finished = false; | |
break; | |
} | |
} | |
// If finished is still set then the carry propagated through | |
// the entire array, so we must be done. | |
if (finished) break; | |
} | |
} | |
int main(int argc, char **argv) { | |
//unsigned char *host_results; | |
//unsigned char *device_results; | |
if (argc < 2) { | |
fprintf(stderr, "usage: %s <hash>\n", argv[0]); | |
return 1; | |
} | |
uint64_t goal = strtoul(argv[1], NULL, 0); | |
if (errno != 0) { | |
perror("strtoul"); | |
return 1; | |
} | |
if (goal > 0xffffffff) { | |
fprintf(stderr, "This doesn't look like a Copilot hash... too big!\n"); | |
return 1; | |
} | |
uint32_t goal32 = (uint32_t)goal; | |
// This sets up a pinned chunk of memory that is visible on both | |
// the device and host. We will use it to report progress from | |
// the device which tells us when we need to collect results on | |
// the host. | |
volatile int *d_data, *h_data; | |
cudaSetDeviceFlags(cudaDeviceMapHost); | |
cudaCheckErrors("cudaSetDeviceFlags error"); | |
cudaHostAlloc((void **)&h_data, sizeof(int), cudaHostAllocMapped); | |
cudaCheckErrors("cudaHostAlloc error"); | |
cudaHostGetDevicePointer((int **)&d_data, (int *)h_data, 0); | |
cudaCheckErrors("cudaHostGetDevicePointer error"); | |
*h_data = 0; | |
#if 0 | |
// Alloc mem for results device-side and host-side | |
cudaMallocHost((void **)&host_results, MAX_RESULTS*MAX_CHARS); // pinned | |
cudaMalloc((void **)&device_results, MAX_RESULTS*MAX_CHARS); | |
cudaCheckErrors("cudaMalloc device_results fail"); | |
#endif | |
// We create a second CUDA stream here so that we can do memory | |
// copies from device to host in parallel with the kernel execution | |
cudaStream_t stream0; | |
cudaStream_t stream1; | |
cudaStreamCreate(&stream0); | |
cudaCheckErrors("cudaStreamCreate"); | |
cudaStreamCreate(&stream1); | |
cudaCheckErrors("cudaStreamCreate"); | |
// Kernel launch! We use an event so we can tell when the kernel | |
// has finished. | |
cudaEvent_t start, stop; | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
cudaEventRecord(start,stream0); | |
check<<<M, N, 0, stream0>>>(goal32, d_data); | |
cudaEventRecord(stop,stream0); | |
// Monitor for results | |
int value = 0; | |
while (true) { | |
sleep(5); | |
int value1 = *h_data; | |
if (value1 > value) { | |
fprintf(stderr, "solutions found = %d\n", value1); | |
#if 0 | |
cudaMemcpyAsync(host_results+(value*MAX_CHARS), | |
device_results+(value*MAX_CHARS), | |
(value1-value)*MAX_CHARS, | |
cudaMemcpyDeviceToHost, | |
stream1); | |
cudaStreamSynchronize(stream1); | |
for (int i = value; i < value1; i++) { | |
printf("%.*s\n", MAX_CHARS, host_results+(i*MAX_CHARS)); | |
} | |
#endif | |
value = value1; | |
} | |
if (cudaEventQuery(stop) == cudaSuccess) break; | |
} | |
float et; | |
cudaEventElapsedTime(&et, start, stop); | |
double keyspace = pow(26,MAX_CHARS); | |
fprintf(stderr, "Searched keyspace of size %lu in %f ms (%f H/s)\n", | |
(uint64_t)keyspace, et, (double)keyspace / (et / 1000)); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Faster:
Also you can also do an early exit starting at
MAX_CHARS-4
which should be much faster (or at least use less power):[edit: fixed
earlyExit()
missing a parenthesis and forgot to give ittestHash
]