Skip to content

Instantly share code, notes, and snippets.

@tehnerd
Created August 17, 2018 04:11
Show Gist options
  • Save tehnerd/adfd2a5b5a2241a7a5380a2b35647e1a to your computer and use it in GitHub Desktop.
Save tehnerd/adfd2a5b5a2241a7a5380a2b35647e1a to your computer and use it in GitHub Desktop.
siphash cuda
to run:
compile: nvcc -o ./siphash_col ./siphash_collision.cu -O3 -std=c++11
run:
time ./siphash_col
...
hash : 12260507120865799508 collided hash 15883040871881799508
mod_hash: 1799508 mod_coll_hash: 1799508
index 3799705340
hash : 12260507120865799508 collided hash 4453885830073799508
mod_hash: 1799508 mod_coll_hash: 1799508
index 1566152445
hash : 12260507120865799508 collided hash 8030928623037799508
mod_hash: 1799508 mod_coll_hash: 1799508
num of collisions 689
num of collisions 0
real 0m15.143s
user 0m1.640s
sys 0m3.012s
or under proffile:
nvprof ./siphash_col
...
hash : 12260507120865799508 collided hash 4453885830073799508
mod_hash: 1799508 mod_coll_hash: 1799508
index 1566152445
hash : 12260507120865799508 collided hash 8030928623037799508
mod_hash: 1799508 mod_coll_hash: 1799508
num of collisions 689
num of collisions 0
==21339== Profiling application: ./siphash_col
==21339== Profiling result:
Time(%) Time Calls Avg Min Max Name
97.22% 14.3180s 9 1.59089s 1.22229s 2.65384s calc_hash(unsigned long, unsigned long, unsigned long, unsigned long*)
2.60% 383.39ms 10 38.339ms 13.796ms 51.558ms [CUDA memcpy DtoH]
0.18% 26.358ms 1 26.358ms 26.358ms 26.358ms [CUDA memcpy HtoD]
==21339== API calls:
Time(%) Time Calls Avg Min Max Name
98.55% 14.7354s 11 1.33958s 14.500ms 2.69470s cudaMemcpy
1.42% 211.93ms 1 211.93ms 211.93ms 211.93ms cudaMalloc
0.03% 4.2726ms 1 4.2726ms 4.2726ms 4.2726ms cudaFree
0.01% 999.52us 9 111.06us 97.550us 120.94us cudaLaunch
0.00% 106.87us 91 1.1740us 625ns 23.489us cuDeviceGetAttribute
0.00% 40.619us 36 1.1280us 572ns 4.6880us cudaSetupArgument
0.00% 29.898us 9 3.3220us 2.8130us 4.0110us cudaConfigureCall
0.00% 6.3540us 1 6.3540us 6.3540us 6.3540us cuDeviceTotalMem
0.00% 5.9380us 3 1.9790us 1.0410us 3.3860us cuDeviceGetCount
0.00% 3.6460us 3 1.2150us 938ns 1.4580us cuDeviceGet
0.00% 1.8230us 1 1.8230us 1.8230us 1.8230us cuDeviceGetName
#include <iostream>
#include <cstdint>
#include <vector>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* default: SipHash-2-4 */
#define cROUNDS 2
#define dROUNDS 4
#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))
#define U32TO8_LE(p, v) \
(p)[0] = (uint8_t)((v)); \
(p)[1] = (uint8_t)((v) >> 8); \
(p)[2] = (uint8_t)((v) >> 16); \
(p)[3] = (uint8_t)((v) >> 24);
#define U64TO8_LE(p, v) \
U32TO8_LE((p), (uint32_t)((v))); \
U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
#define U8TO64_LE(p) \
(((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) | \
((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) | \
((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) | \
((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))
#define SIPROUND \
do { \
v0 += v1; \
v1 = ROTL(v1, 13); \
v1 ^= v0; \
v0 = ROTL(v0, 32); \
v2 += v3; \
v3 = ROTL(v3, 16); \
v3 ^= v2; \
v0 += v3; \
v3 = ROTL(v3, 21); \
v3 ^= v0; \
v2 += v1; \
v1 = ROTL(v1, 17); \
v1 ^= v2; \
v2 = ROTL(v2, 32); \
} while (0)
#ifdef DEBUG
#define TRACE \
do { \
printf("(%3d) v0 %08x %08x\n", (int)inlen, \
(uint32_t)(v0 >> 32), (uint32_t)v0); \
printf("(%3d) v1 %08x %08x\n", (int)inlen, \
(uint32_t)(v1 >> 32), (uint32_t)v1); \
printf("(%3d) v2 %08x %08x\n", (int)inlen, \
(uint32_t)(v2 >> 32), (uint32_t)v2); \
printf("(%3d) v3 %08x %08x\n", (int)inlen, \
(uint32_t)(v3 >> 32), (uint32_t)v3); \
} while (0)
#else
#define TRACE
#endif
__host__ __device__ int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k)
{
/* "somepseudorandomlygeneratedbytes" */
uint64_t v0 = 0x736f6d6570736575ULL;
uint64_t v1 = 0x646f72616e646f6dULL;
uint64_t v2 = 0x6c7967656e657261ULL;
uint64_t v3 = 0x7465646279746573ULL;
uint64_t b;
uint64_t k0 = U8TO64_LE(k);
uint64_t k1 = U8TO64_LE(k + 8);
uint64_t m;
int i;
const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
const int left = inlen & 7;
b = ((uint64_t)inlen) << 56;
v3 ^= k1;
v2 ^= k0;
v1 ^= k1;
v0 ^= k0;
#ifdef DOUBLE
v1 ^= 0xee;
#endif
for (; in != end; in += 8) {
m = U8TO64_LE(in);
v3 ^= m;
TRACE;
for (i = 0; i < cROUNDS; ++i)
SIPROUND;
v0 ^= m;
}
switch (left) {
case 7:
b |= ((uint64_t)in[6]) << 48;
case 6:
b |= ((uint64_t)in[5]) << 40;
case 5:
b |= ((uint64_t)in[4]) << 32;
case 4:
b |= ((uint64_t)in[3]) << 24;
case 3:
b |= ((uint64_t)in[2]) << 16;
case 2:
b |= ((uint64_t)in[1]) << 8;
case 1:
b |= ((uint64_t)in[0]);
break;
case 0:
break;
}
v3 ^= b;
TRACE;
for (i = 0; i < cROUNDS; ++i)
SIPROUND;
v0 ^= b;
#ifndef DOUBLE
v2 ^= 0xff;
#else
v2 ^= 0xee;
#endif
TRACE;
for (i = 0; i < dROUNDS; ++i)
SIPROUND;
b = v0 ^ v1 ^ v2 ^ v3;
U64TO8_LE(out, b);
#ifdef DOUBLE
v1 ^= 0xdd;
TRACE;
for (i = 0; i < dROUNDS; ++i)
SIPROUND;
b = v0 ^ v1 ^ v2 ^ v3;
U64TO8_LE(out + 8, b);
#endif
return 0;
}
namespace {
constexpr int kTSize = 4000000; // size of hash table ; this is m in hash%m
}
__global__ void calc_hash(
uint64_t hash, uint64_t base, uint64_t max, uint64_t* bvec) {
uint64_t shash;
uint8_t key[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
int idx = threadIdx.x;
uint64_t i = base + idx;
int incr = 1024;
while (i < max) {
shash = 0;
siphash((uint8_t*)&shash, (uint8_t*)&i, sizeof(i), (uint8_t*)&key);
if (shash%kTSize == hash) {
bvec[idx] = i;
}
i += incr;
}
}
int main() {
std::vector<uint64_t> bvec(kTSize);
for (int i = 0; i < kTSize; i++) {
bvec[i] = 0;
}
uint64_t hash = 0;
uint64_t hash1 = 0;
uint64_t t = 2307;
uint64_t t2 = 464602104;
uint64_t* bvec_c;
uint8_t key[16] = {0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
// add_vec(a.data(), b.data(), c.data(), kSize);
cudaMalloc((void**)&bvec_c, kTSize * sizeof(uint64_t));
cudaMemcpy(bvec_c, bvec.data(),
kTSize * sizeof(uint64_t), cudaMemcpyHostToDevice);
siphash((uint8_t*)&hash, (uint8_t*)&t, sizeof(t), (uint8_t*)&key);
siphash((uint8_t*)&hash1, (uint8_t*)&t, sizeof(t), (uint8_t*)&key);
std::cout << "hash is " << hash%kTSize << " hash1 " << hash1%kTSize << std::endl;
hash1 %= kTSize;
uint64_t base = 0;
uint64_t increment = 500000000;
uint64_t max = 0;
uint64_t ml = 4294967296;
while(max < ml) {
/* my jetson tx1 was crashing if it were doing more then ~500mil iterations
at once. hence this stupid hack w/ increment */
max += increment;
calc_hash<<<1, 1024>>>(hash1, base, max, bvec_c);
base += increment;
/* again hack becuase of tx1 crashes. cudaMemcpy force host to blockwait till
all previously started kernels would be complited on gpu*/
cudaMemcpy(bvec.data(), bvec_c,
kTSize * sizeof(uint64_t), cudaMemcpyDeviceToHost);
}
cudaMemcpy(bvec.data(), bvec_c,
kTSize * sizeof(uint64_t), cudaMemcpyDeviceToHost);
cudaFree(bvec_c);
std::cout << "done\n";
int ctr = 0;
for (int i = 0; i < bvec.size(); i++) {
if (bvec[i] > 0) {
ctr++;
uint64_t nhash;
uint64_t idx = bvec[i];
std::cout << "index " << idx << std::endl;
siphash((uint8_t*)&nhash, (uint8_t*)&idx, sizeof(idx), (uint8_t*)&key);
std::cout << "hash : " << hash << " collided hash " << nhash << std::endl;
std::cout << "mod_hash: " << hash % kTSize << " mod_coll_hash: " << nhash%kTSize
<< std::endl;
}
}
std::cout << "num of collisions " << ctr << std::endl;
int cntr = 0;
uint64_t shash;
/* find collisions using cpu
for (uint64_t i = 0; i < 4294967296; i++) {
shash = 0;
siphash((uint8_t*)&shash, (uint8_t*)&i, sizeof(i), (uint8_t*)&key);
if (shash%kTSize == hash1) {
cntr++;
}
}
*/
std::cout << "num of collisions " << cntr << std::endl;
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment