tehnerd · August 17, 2018 04:11
diff --git a/gistfile1.txt b/gistfile1.txt
 to run:

 compile:  nvcc -o ./siphash_col ./siphash_collision.cu -O3 -std=c++11
 run:
 time ./siphash_col
 ...
 hash : 12260507120865799508 collided hash 15883040871881799508
 mod_hash: 1799508 mod_coll_hash: 1799508
 index 3799705340
 hash : 12260507120865799508 collided hash 4453885830073799508
 mod_hash: 1799508 mod_coll_hash: 1799508
 index 1566152445
 hash : 12260507120865799508 collided hash 8030928623037799508
 mod_hash: 1799508 mod_coll_hash: 1799508
 num of collisions 689
 num of collisions 0

 real    0m15.143s
 user    0m1.640s
 sys     0m3.012s



 or under proffile:
 nvprof ./siphash_col
 
 ...
 hash : 12260507120865799508 collided hash 4453885830073799508
 mod_hash: 1799508 mod_coll_hash: 1799508
 index 1566152445
 hash : 12260507120865799508 collided hash 8030928623037799508
 mod_hash: 1799508 mod_coll_hash: 1799508
 num of collisions 689
 num of collisions 0
 ==21339== Profiling application: ./siphash_col
 ==21339== Profiling result:
 Time(%)      Time     Calls       Avg       Min       Max  Name
 97.22%  14.3180s         9  1.59089s  1.22229s  2.65384s  calc_hash(unsigned long, unsigned long, unsigned long, unsigned long*)
  2.60%  383.39ms        10  38.339ms  13.796ms  51.558ms  [CUDA memcpy DtoH]
  0.18%  26.358ms         1  26.358ms  26.358ms  26.358ms  [CUDA memcpy HtoD]

 ==21339== API calls:
 Time(%)      Time     Calls       Avg       Min       Max  Name
 98.55%  14.7354s        11  1.33958s  14.500ms  2.69470s  cudaMemcpy
  1.42%  211.93ms         1  211.93ms  211.93ms  211.93ms  cudaMalloc
  0.03%  4.2726ms         1  4.2726ms  4.2726ms  4.2726ms  cudaFree
  0.01%  999.52us         9  111.06us  97.550us  120.94us  cudaLaunch
  0.00%  106.87us        91  1.1740us     625ns  23.489us  cuDeviceGetAttribute
  0.00%  40.619us        36  1.1280us     572ns  4.6880us  cudaSetupArgument
  0.00%  29.898us         9  3.3220us  2.8130us  4.0110us  cudaConfigureCall
  0.00%  6.3540us         1  6.3540us  6.3540us  6.3540us  cuDeviceTotalMem
  0.00%  5.9380us         3  1.9790us  1.0410us  3.3860us  cuDeviceGetCount
  0.00%  3.6460us         3  1.2150us     938ns  1.4580us  cuDeviceGet
  0.00%  1.8230us         1  1.8230us  1.8230us  1.8230us  cuDeviceGetName
  
  
diff --git a/siphash_collision.cu b/siphash_collision.cu
 #include <iostream>
 #include <cstdint>
 #include <vector>

 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>


 /* default: SipHash-2-4 */
 #define cROUNDS 2
 #define dROUNDS 4

 #define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

 #define U32TO8_LE(p, v)                                                        \
 	(p)[0] = (uint8_t)((v));                                               \
 	(p)[1] = (uint8_t)((v) >> 8);                                          \
 	(p)[2] = (uint8_t)((v) >> 16);                                         \
 	(p)[3] = (uint8_t)((v) >> 24);

 #define U64TO8_LE(p, v)                                                        \
 	U32TO8_LE((p), (uint32_t)((v)));                                       \
 	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));

 #define U8TO64_LE(p)                                                           \
 	(((uint64_t)((p)[0])) | ((uint64_t)((p)[1]) << 8) |                    \
 	 ((uint64_t)((p)[2]) << 16) | ((uint64_t)((p)[3]) << 24) |             \
 	 ((uint64_t)((p)[4]) << 32) | ((uint64_t)((p)[5]) << 40) |             \
 	 ((uint64_t)((p)[6]) << 48) | ((uint64_t)((p)[7]) << 56))

 #define SIPROUND                                                               \
 	do {                                                                   \
 		v0 += v1;                                                      \
 		v1 = ROTL(v1, 13);                                             \
 		v1 ^= v0;                                                      \
 		v0 = ROTL(v0, 32);                                             \
 		v2 += v3;                                                      \
 		v3 = ROTL(v3, 16);                                             \
 		v3 ^= v2;                                                      \
 		v0 += v3;                                                      \
 		v3 = ROTL(v3, 21);                                             \
 		v3 ^= v0;                                                      \
 		v2 += v1;                                                      \
 		v1 = ROTL(v1, 17);                                             \
 		v1 ^= v2;                                                      \
 		v2 = ROTL(v2, 32);                                             \
 	} while (0)

 #ifdef DEBUG
 #define TRACE                                                                  \
 	do {                                                                   \
 		printf("(%3d) v0 %08x %08x\n", (int)inlen,                     \
 		       (uint32_t)(v0 >> 32), (uint32_t)v0);                    \
 		printf("(%3d) v1 %08x %08x\n", (int)inlen,                     \
 		       (uint32_t)(v1 >> 32), (uint32_t)v1);                    \
 		printf("(%3d) v2 %08x %08x\n", (int)inlen,                     \
 		       (uint32_t)(v2 >> 32), (uint32_t)v2);                    \
 		printf("(%3d) v3 %08x %08x\n", (int)inlen,                     \
 		       (uint32_t)(v3 >> 32), (uint32_t)v3);                    \
 	} while (0)
 #else
 #define TRACE
 #endif

 __host__ __device__ int siphash(uint8_t *out, const uint8_t *in, uint64_t inlen, const uint8_t *k)
 {
 	/* "somepseudorandomlygeneratedbytes" */
 	uint64_t v0 = 0x736f6d6570736575ULL;
 	uint64_t v1 = 0x646f72616e646f6dULL;
 	uint64_t v2 = 0x6c7967656e657261ULL;
 	uint64_t v3 = 0x7465646279746573ULL;
 	uint64_t b;
 	uint64_t k0 = U8TO64_LE(k);
 	uint64_t k1 = U8TO64_LE(k + 8);
 	uint64_t m;
 	int i;
 	const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
 	const int left = inlen & 7;
 	b = ((uint64_t)inlen) << 56;
 	v3 ^= k1;
 	v2 ^= k0;
 	v1 ^= k1;
 	v0 ^= k0;
 #ifdef DOUBLE
 	v1 ^= 0xee;
 #endif

 	for (; in != end; in += 8) {
 		m = U8TO64_LE(in);
 		v3 ^= m;

 		TRACE;
 		for (i = 0; i < cROUNDS; ++i)
 			SIPROUND;

 		v0 ^= m;
 	}

 	switch (left) {
 	case 7:
 		b |= ((uint64_t)in[6]) << 48;
 	case 6:
 		b |= ((uint64_t)in[5]) << 40;
 	case 5:
 		b |= ((uint64_t)in[4]) << 32;
 	case 4:
 		b |= ((uint64_t)in[3]) << 24;
 	case 3:
 		b |= ((uint64_t)in[2]) << 16;
 	case 2:
 		b |= ((uint64_t)in[1]) << 8;
 	case 1:
 		b |= ((uint64_t)in[0]);
 		break;
 	case 0:
 		break;
 	}

 	v3 ^= b;
 	TRACE;
 	for (i = 0; i < cROUNDS; ++i)
 		SIPROUND;

 	v0 ^= b;

 #ifndef DOUBLE
 	v2 ^= 0xff;
 #else
 	v2 ^= 0xee;
 #endif


 	TRACE;
 	for (i = 0; i < dROUNDS; ++i)
 		SIPROUND;

 	b = v0 ^ v1 ^ v2 ^ v3;

 	U64TO8_LE(out, b);

 #ifdef DOUBLE
 	v1 ^= 0xdd;

 	TRACE;
 	for (i = 0; i < dROUNDS; ++i)
 		SIPROUND;

 	b = v0 ^ v1 ^ v2 ^ v3;
 	U64TO8_LE(out + 8, b);
 #endif

 	return 0;
 }

 namespace {
  constexpr int kTSize = 4000000; // size of hash table ; this is m in hash%m
 }

 __global__ void calc_hash(
        uint64_t hash, uint64_t base, uint64_t max, uint64_t* bvec) {
    uint64_t shash;
    uint8_t key[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
    int idx = threadIdx.x;
    uint64_t i = base + idx;
    int incr = 1024;
    while (i < max) {
        shash = 0;
        siphash((uint8_t*)&shash, (uint8_t*)&i, sizeof(i), (uint8_t*)&key);
        if (shash%kTSize == hash) {
            bvec[idx] = i;
        } 
        i += incr;
    }
 }

 int main() {
  std::vector<uint64_t> bvec(kTSize);
  for (int i = 0; i < kTSize; i++) {
    bvec[i] = 0;
  }
  uint64_t hash = 0;
  uint64_t hash1 = 0;
  uint64_t t = 2307;
  uint64_t t2 = 464602104;
  uint64_t* bvec_c;
  uint8_t key[16] = {0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
  // add_vec(a.data(), b.data(), c.data(), kSize);
  cudaMalloc((void**)&bvec_c, kTSize * sizeof(uint64_t));
  cudaMemcpy(bvec_c, bvec.data(),
     kTSize * sizeof(uint64_t), cudaMemcpyHostToDevice);
  siphash((uint8_t*)&hash, (uint8_t*)&t, sizeof(t), (uint8_t*)&key);
  siphash((uint8_t*)&hash1, (uint8_t*)&t, sizeof(t), (uint8_t*)&key);
  std::cout << "hash is " << hash%kTSize << " hash1 " << hash1%kTSize << std::endl;
  hash1 %= kTSize;
  uint64_t base = 0;
  uint64_t increment = 500000000;
  uint64_t max = 0;
  uint64_t ml = 4294967296;
  while(max < ml) {
      /* my jetson tx1 was crashing if it were doing more then ~500mil iterations
       at once. hence this stupid hack w/ increment */
      max += increment;
      calc_hash<<<1, 1024>>>(hash1, base, max, bvec_c);
      base += increment;
      /* again hack becuase of tx1 crashes. cudaMemcpy force host to blockwait till
        all previously started kernels would be complited on gpu*/
      cudaMemcpy(bvec.data(), bvec_c,
        kTSize * sizeof(uint64_t), cudaMemcpyDeviceToHost);

  }
  cudaMemcpy(bvec.data(), bvec_c,
     kTSize * sizeof(uint64_t), cudaMemcpyDeviceToHost);
  cudaFree(bvec_c);
  std::cout << "done\n";
  int ctr = 0;
  for (int i = 0; i < bvec.size(); i++) {
    if (bvec[i] > 0) {
      ctr++;
      uint64_t nhash;
      uint64_t idx = bvec[i];
      std::cout << "index " << idx << std::endl;
      siphash((uint8_t*)&nhash, (uint8_t*)&idx, sizeof(idx), (uint8_t*)&key);
      std::cout << "hash : " << hash << " collided hash " << nhash << std::endl;
      std::cout << "mod_hash: " << hash % kTSize << " mod_coll_hash: " << nhash%kTSize
                << std::endl;
    }
  }
  std::cout << "num of collisions " << ctr << std::endl;
  int cntr = 0;
  uint64_t shash;
  /* find collisions using cpu
  for (uint64_t i = 0; i < 4294967296; i++) {
      shash = 0;
      siphash((uint8_t*)&shash, (uint8_t*)&i, sizeof(i), (uint8_t*)&key);
      if (shash%kTSize == hash1) {
        cntr++;
      }
  }
  */
  std::cout << "num of collisions " << cntr << std::endl;
  return 0;
 }
	to run:

	compile: nvcc -o ./siphash_col ./siphash_collision.cu -O3 -std=c++11
	run:
	time ./siphash_col
	...
	hash : 12260507120865799508 collided hash 15883040871881799508
	mod_hash: 1799508 mod_coll_hash: 1799508
	index 3799705340
	hash : 12260507120865799508 collided hash 4453885830073799508
	mod_hash: 1799508 mod_coll_hash: 1799508
	index 1566152445
	hash : 12260507120865799508 collided hash 8030928623037799508
	mod_hash: 1799508 mod_coll_hash: 1799508
	num of collisions 689
	num of collisions 0

	real 0m15.143s
	user 0m1.640s
	sys 0m3.012s



	or under proffile:
	nvprof ./siphash_col

	...
	hash : 12260507120865799508 collided hash 4453885830073799508
	mod_hash: 1799508 mod_coll_hash: 1799508
	index 1566152445
	hash : 12260507120865799508 collided hash 8030928623037799508
	mod_hash: 1799508 mod_coll_hash: 1799508
	num of collisions 689
	num of collisions 0
	==21339== Profiling application: ./siphash_col
	==21339== Profiling result:
	Time(%) Time Calls Avg Min Max Name
	97.22% 14.3180s 9 1.59089s 1.22229s 2.65384s calc_hash(unsigned long, unsigned long, unsigned long, unsigned long*)
	2.60% 383.39ms 10 38.339ms 13.796ms 51.558ms [CUDA memcpy DtoH]
	0.18% 26.358ms 1 26.358ms 26.358ms 26.358ms [CUDA memcpy HtoD]

	==21339== API calls:
	Time(%) Time Calls Avg Min Max Name
	98.55% 14.7354s 11 1.33958s 14.500ms 2.69470s cudaMemcpy
	1.42% 211.93ms 1 211.93ms 211.93ms 211.93ms cudaMalloc
	0.03% 4.2726ms 1 4.2726ms 4.2726ms 4.2726ms cudaFree
	0.01% 999.52us 9 111.06us 97.550us 120.94us cudaLaunch
	0.00% 106.87us 91 1.1740us 625ns 23.489us cuDeviceGetAttribute
	0.00% 40.619us 36 1.1280us 572ns 4.6880us cudaSetupArgument
	0.00% 29.898us 9 3.3220us 2.8130us 4.0110us cudaConfigureCall
	0.00% 6.3540us 1 6.3540us 6.3540us 6.3540us cuDeviceTotalMem
	0.00% 5.9380us 3 1.9790us 1.0410us 3.3860us cuDeviceGetCount
	0.00% 3.6460us 3 1.2150us 938ns 1.4580us cuDeviceGet
	0.00% 1.8230us 1 1.8230us 1.8230us 1.8230us cuDeviceGetName
	#include <iostream>
	#include <cstdint>
	#include <vector>

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>


	/* default: SipHash-2-4 */
	#define cROUNDS 2
	#define dROUNDS 4

	#define ROTL(x, b) (uint64_t)(((x) << (b)) \| ((x) >> (64 - (b))))

	#define U32TO8_LE(p, v) \
	(p)[0] = (uint8_t)((v)); \
	(p)[1] = (uint8_t)((v) >> 8); \
	(p)[2] = (uint8_t)((v) >> 16); \
	(p)[3] = (uint8_t)((v) >> 24);

	#define U64TO8_LE(p, v) \
	U32TO8_LE((p), (uint32_t)((v))); \
	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));

	#define U8TO64_LE(p) \
	(((uint64_t)((p)[0])) \| ((uint64_t)((p)[1]) << 8) \| \
	((uint64_t)((p)[2]) << 16) \| ((uint64_t)((p)[3]) << 24) \| \
	((uint64_t)((p)[4]) << 32) \| ((uint64_t)((p)[5]) << 40) \| \
	((uint64_t)((p)[6]) << 48) \| ((uint64_t)((p)[7]) << 56))

	#define SIPROUND \
	do { \
	v0 += v1; \
	v1 = ROTL(v1, 13); \
	v1 ^= v0; \
	v0 = ROTL(v0, 32); \
	v2 += v3; \
	v3 = ROTL(v3, 16); \
	v3 ^= v2; \
	v0 += v3; \
	v3 = ROTL(v3, 21); \
	v3 ^= v0; \
	v2 += v1; \
	v1 = ROTL(v1, 17); \
	v1 ^= v2; \
	v2 = ROTL(v2, 32); \
	} while (0)

	#ifdef DEBUG
	#define TRACE \
	do { \
	printf("(%3d) v0 %08x %08x\n", (int)inlen, \
	(uint32_t)(v0 >> 32), (uint32_t)v0); \
	printf("(%3d) v1 %08x %08x\n", (int)inlen, \
	(uint32_t)(v1 >> 32), (uint32_t)v1); \
	printf("(%3d) v2 %08x %08x\n", (int)inlen, \
	(uint32_t)(v2 >> 32), (uint32_t)v2); \
	printf("(%3d) v3 %08x %08x\n", (int)inlen, \
	(uint32_t)(v3 >> 32), (uint32_t)v3); \
	} while (0)
	#else
	#define TRACE
	#endif

	__host__ __device__ int siphash(uint8_t out, const uint8_t in, uint64_t inlen, const uint8_t *k)
	{
	/* "somepseudorandomlygeneratedbytes" */
	uint64_t v0 = 0x736f6d6570736575ULL;
	uint64_t v1 = 0x646f72616e646f6dULL;
	uint64_t v2 = 0x6c7967656e657261ULL;
	uint64_t v3 = 0x7465646279746573ULL;
	uint64_t b;
	uint64_t k0 = U8TO64_LE(k);
	uint64_t k1 = U8TO64_LE(k + 8);
	uint64_t m;
	int i;
	const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
	const int left = inlen & 7;
	b = ((uint64_t)inlen) << 56;
	v3 ^= k1;
	v2 ^= k0;
	v1 ^= k1;
	v0 ^= k0;
	#ifdef DOUBLE
	v1 ^= 0xee;
	#endif

	for (; in != end; in += 8) {
	m = U8TO64_LE(in);
	v3 ^= m;

	TRACE;
	for (i = 0; i < cROUNDS; ++i)
	SIPROUND;

	v0 ^= m;
	}

	switch (left) {
	case 7:
	b \|= ((uint64_t)in[6]) << 48;
	case 6:
	b \|= ((uint64_t)in[5]) << 40;
	case 5:
	b \|= ((uint64_t)in[4]) << 32;
	case 4:
	b \|= ((uint64_t)in[3]) << 24;
	case 3:
	b \|= ((uint64_t)in[2]) << 16;
	case 2:
	b \|= ((uint64_t)in[1]) << 8;
	case 1:
	b \|= ((uint64_t)in[0]);
	break;
	case 0:
	break;
	}

	v3 ^= b;
	TRACE;
	for (i = 0; i < cROUNDS; ++i)
	SIPROUND;

	v0 ^= b;

	#ifndef DOUBLE
	v2 ^= 0xff;
	#else
	v2 ^= 0xee;
	#endif


	TRACE;
	for (i = 0; i < dROUNDS; ++i)
	SIPROUND;

	b = v0 ^ v1 ^ v2 ^ v3;

	U64TO8_LE(out, b);

	#ifdef DOUBLE
	v1 ^= 0xdd;

	TRACE;
	for (i = 0; i < dROUNDS; ++i)
	SIPROUND;

	b = v0 ^ v1 ^ v2 ^ v3;
	U64TO8_LE(out + 8, b);
	#endif

	return 0;
	}

	namespace {
	constexpr int kTSize = 4000000; // size of hash table ; this is m in hash%m
	}

	__global__ void calc_hash(
	uint64_t hash, uint64_t base, uint64_t max, uint64_t* bvec) {
	uint64_t shash;
	uint8_t key[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	int idx = threadIdx.x;
	uint64_t i = base + idx;
	int incr = 1024;
	while (i < max) {
	shash = 0;
	siphash((uint8_t)&shash, (uint8_t)&i, sizeof(i), (uint8_t*)&key);
	if (shash%kTSize == hash) {
	bvec[idx] = i;
	}
	i += incr;
	}
	}

	int main() {
	std::vector<uint64_t> bvec(kTSize);
	for (int i = 0; i < kTSize; i++) {
	bvec[i] = 0;
	}
	uint64_t hash = 0;
	uint64_t hash1 = 0;
	uint64_t t = 2307;
	uint64_t t2 = 464602104;
	uint64_t* bvec_c;
	uint8_t key[16] = {0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
	// add_vec(a.data(), b.data(), c.data(), kSize);
	cudaMalloc((void*)&bvec_c, kTSize sizeof(uint64_t));
	cudaMemcpy(bvec_c, bvec.data(),
	kTSize * sizeof(uint64_t), cudaMemcpyHostToDevice);
	siphash((uint8_t)&hash, (uint8_t)&t, sizeof(t), (uint8_t*)&key);
	siphash((uint8_t)&hash1, (uint8_t)&t, sizeof(t), (uint8_t*)&key);
	std::cout << "hash is " << hash%kTSize << " hash1 " << hash1%kTSize << std::endl;
	hash1 %= kTSize;
	uint64_t base = 0;
	uint64_t increment = 500000000;
	uint64_t max = 0;
	uint64_t ml = 4294967296;
	while(max < ml) {
	/* my jetson tx1 was crashing if it were doing more then ~500mil iterations
	at once. hence this stupid hack w/ increment */
	max += increment;
	calc_hash<<<1, 1024>>>(hash1, base, max, bvec_c);
	base += increment;
	/* again hack becuase of tx1 crashes. cudaMemcpy force host to blockwait till
	all previously started kernels would be complited on gpu*/
	cudaMemcpy(bvec.data(), bvec_c,
	kTSize * sizeof(uint64_t), cudaMemcpyDeviceToHost);

	}
	cudaMemcpy(bvec.data(), bvec_c,
	kTSize * sizeof(uint64_t), cudaMemcpyDeviceToHost);
	cudaFree(bvec_c);
	std::cout << "done\n";
	int ctr = 0;
	for (int i = 0; i < bvec.size(); i++) {
	if (bvec[i] > 0) {
	ctr++;
	uint64_t nhash;
	uint64_t idx = bvec[i];
	std::cout << "index " << idx << std::endl;
	siphash((uint8_t)&nhash, (uint8_t)&idx, sizeof(idx), (uint8_t*)&key);
	std::cout << "hash : " << hash << " collided hash " << nhash << std::endl;
	std::cout << "mod_hash: " << hash % kTSize << " mod_coll_hash: " << nhash%kTSize
	<< std::endl;
	}
	}
	std::cout << "num of collisions " << ctr << std::endl;
	int cntr = 0;
	uint64_t shash;
	/* find collisions using cpu
	for (uint64_t i = 0; i < 4294967296; i++) {
	shash = 0;
	siphash((uint8_t)&shash, (uint8_t)&i, sizeof(i), (uint8_t*)&key);
	if (shash%kTSize == hash1) {
	cntr++;
	}
	}
	*/
	std::cout << "num of collisions " << cntr << std::endl;
	return 0;
	}