jweinst1 · August 5, 2025 23:38
diff --git a/density.cpp b/density.cpp
 #include <stdint.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <time.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <stdbool.h>

 #include <unordered_map>
 #include <vector>

 static std::unordered_map<uint64_t, size_t> GLOBAL_HASH_COUNT;

 static void addHash(uint64_t h) {
 	auto found = GLOBAL_HASH_COUNT.find(h);
 	if (found != GLOBAL_HASH_COUNT.end()) {
 		found->second += 1;
 	} else {
 		GLOBAL_HASH_COUNT.insert(std::make_pair(h, 1));
 	}
 }

 static size_t printColls() {
 	size_t total = 0;
 	for (const auto& elem: GLOBAL_HASH_COUNT) {
 		if (elem.second > 1) {
 			printf("COLL: %llu = %zu\n", elem.first, elem.second);
 			total += (elem.second - 1);
 		}
 	}
 	printf("TOTAL COLL: %zu\n", total);
 	return total;
 }


 // Finalization mix - force all bits of a hash block to avalanche
 static uint64_t fmix64(uint64_t k) {
    k ^= k >> 33;
    k *= 0xff51afd7ed558ccdULL;
    k ^= k >> 33;
    k *= 0xc4ceb9fe1a85ec53ULL;
    k ^= k >> 33;
    return k;
 }

 uint64_t murmur3_64(const void *key, size_t len, uint32_t seed) {
    const uint8_t *data = (const uint8_t *)key;
    const int nblocks = len / 16;

    uint64_t h1 = seed;
    uint64_t h2 = seed;

    const uint64_t c1 = 0x87c37b91114253d5ULL;
    const uint64_t c2 = 0x4cf5ad432745937fULL;

    //----------
    // body

    const uint64_t *blocks = (const uint64_t *)(data);
    for (int i = 0; i < nblocks; i++) {
        uint64_t k1 = blocks[i * 2 + 0];
        uint64_t k2 = blocks[i * 2 + 1];

        k1 *= c1; k1 = (k1 << 31) | (k1 >> (64 - 31)); k1 *= c2; h1 ^= k1;
        h1 = (h1 << 27) | (h1 >> (64 - 27)); h1 += h2; h1 = h1 * 5 + 0x52dce729;

        k2 *= c2; k2 = (k2 << 33) | (k2 >> (64 - 33)); k2 *= c1; h2 ^= k2;
        h2 = (h2 << 31) | (h2 >> (64 - 31)); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
    }

    //----------
    // tail

    const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
    uint64_t k1 = 0;
    uint64_t k2 = 0;

    switch (len & 15) {
        case 15: k2 ^= ((uint64_t)tail[14]) << 48;
        case 14: k2 ^= ((uint64_t)tail[13]) << 40;
        case 13: k2 ^= ((uint64_t)tail[12]) << 32;
        case 12: k2 ^= ((uint64_t)tail[11]) << 24;
        case 11: k2 ^= ((uint64_t)tail[10]) << 16;
        case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
        case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
                 k2 *= c2; k2 = (k2 << 33) | (k2 >> (64 - 33)); k2 *= c1; h2 ^= k2;
        case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
        case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
        case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
        case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
        case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
        case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
        case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
        case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
                 k1 *= c1; k1 = (k1 << 31) | (k1 >> (64 - 31)); k1 *= c2; h1 ^= k1;
    }

    //----------
    // finalization

    h1 ^= len;
    h2 ^= len;

    h1 += h2;
    h2 += h1;

    h1 = fmix64(h1);
    h2 = fmix64(h2);

    h1 += h2;

    // Return 64-bit hash (can return h2 or h1 depending on needs)
    return h1;
 }

 void create_rand_hashes(std::vector<uint64_t>& hashes, size_t count, size_t* modder) {
    static const size_t input_size = 10;
    char input_string[input_size + 1] = {'\0'};

    for (size_t i = 0; i < count; ++i) {
        for (int j = 0; j < input_size; ++j)
        {
            input_string[j] = rand() % 253;
        }
        if (modder != NULL) {
            hashes.push_back(murmur3_64(input_string, input_size, input_size) % *modder);
        } else {
            hashes.push_back(murmur3_64(input_string, input_size, input_size));
        }
    }
 }

 struct HashBlock {
    size_t hash_count;
    std::vector<uint64_t> hashes;

    HashBlock() {
    	hash_count = 0;
    }

    explicit HashBlock(size_t count, size_t mod = 0): hash_count(count) {
    	create_rand_hashes(hashes, hash_count, mod == 0 ? NULL : &mod);
    	for (const auto& h : hashes) {
    		addHash(h);
    	}
    }
 };

 int main(int argc, char const *argv[])
 {
 	 srand(time(NULL));
 	std::vector<HashBlock> blocks;
 	for (int i = 0; i < 100; ++i)
 	{
 		HashBlock h(500, 1024 * 1024 * 10);
 		blocks.push_back(h);
 	}
 	printColls();
 	return 0;
 }
diff --git a/hash_density_research.c b/hash_density_research.c
 #include <stdint.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <time.h>
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
 #include <stdbool.h>

 // Finalization mix - force all bits of a hash block to avalanche
 static uint64_t fmix64(uint64_t k) {
    k ^= k >> 33;
    k *= 0xff51afd7ed558ccdULL;
    k ^= k >> 33;
    k *= 0xc4ceb9fe1a85ec53ULL;
    k ^= k >> 33;
    return k;
 }

 uint64_t murmur3_64(const void *key, size_t len, uint32_t seed) {
    const uint8_t *data = (const uint8_t *)key;
    const int nblocks = len / 16;

    uint64_t h1 = seed;
    uint64_t h2 = seed;

    const uint64_t c1 = 0x87c37b91114253d5ULL;
    const uint64_t c2 = 0x4cf5ad432745937fULL;

    //----------
    // body

    const uint64_t *blocks = (const uint64_t *)(data);
    for (int i = 0; i < nblocks; i++) {
        uint64_t k1 = blocks[i * 2 + 0];
        uint64_t k2 = blocks[i * 2 + 1];

        k1 *= c1; k1 = (k1 << 31) | (k1 >> (64 - 31)); k1 *= c2; h1 ^= k1;
        h1 = (h1 << 27) | (h1 >> (64 - 27)); h1 += h2; h1 = h1 * 5 + 0x52dce729;

        k2 *= c2; k2 = (k2 << 33) | (k2 >> (64 - 33)); k2 *= c1; h2 ^= k2;
        h2 = (h2 << 31) | (h2 >> (64 - 31)); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
    }

    //----------
    // tail

    const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
    uint64_t k1 = 0;
    uint64_t k2 = 0;

    switch (len & 15) {
        case 15: k2 ^= ((uint64_t)tail[14]) << 48;
        case 14: k2 ^= ((uint64_t)tail[13]) << 40;
        case 13: k2 ^= ((uint64_t)tail[12]) << 32;
        case 12: k2 ^= ((uint64_t)tail[11]) << 24;
        case 11: k2 ^= ((uint64_t)tail[10]) << 16;
        case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
        case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
                 k2 *= c2; k2 = (k2 << 33) | (k2 >> (64 - 33)); k2 *= c1; h2 ^= k2;
        case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
        case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
        case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
        case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
        case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
        case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
        case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
        case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
                 k1 *= c1; k1 = (k1 << 31) | (k1 >> (64 - 31)); k1 *= c2; h1 ^= k1;
    }

    //----------
    // finalization

    h1 ^= len;
    h2 ^= len;

    h1 += h2;
    h2 += h1;

    h1 = fmix64(h1);
    h2 = fmix64(h2);

    h1 += h2;

    // Return 64-bit hash (can return h2 or h1 depending on needs)
    return h1;
 }


 struct number_node {
    uint64_t hash;
    uint64_t block;
 };

 struct number_part {
    size_t size;
 };

 uint64_t* create_rand_hashes(size_t count) {
    uint64_t* hashes = malloc(count * sizeof(uint64_t));
    static const size_t input_size = 10;
    char input_string[input_size + 1] = {'\0'};

    for (size_t i = 0; i < count; ++i) {
        for (int j = 0; j < input_size; ++j)
        {
            input_string[j] = rand() % 253;
        }
        hashes[i] = murmur3_64(input_string, input_size, input_size) % 9999999999;
    }
    return hashes;
 }


 bool find_min_max(const uint64_t *array, size_t size, uint64_t *min_out, uint64_t *max_out) {
    if (size == 0 || array == NULL || min_out == NULL || max_out == NULL) {
        return false; // Invalid input
    }

    uint64_t min = UINT64_MAX;
    uint64_t max = 0;

    for (size_t i = 0; i < size; ++i) {
        if (array[i] < min) min = array[i];
        if (array[i] > max) max = array[i];
    }

    *min_out = min;
    *max_out = max;

    return true;
 }

 struct num_range {
    uint64_t min;
    uint64_t max;
 };

 static inline uint64_t min_u64(uint64_t a, uint64_t b) {
    return a < b ? a : b;
 }

 static inline uint64_t max_u64(uint64_t a, uint64_t b) {
    return a > b ? a : b;
 }

 uint64_t line_overlap_amount(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) {
    // Normalize segments
    if (a1 > a2) {
        uint64_t tmp = a1; a1 = a2; a2 = tmp;
    }
    if (b1 > b2) {
        uint64_t tmp = b1; b1 = b2; b2 = tmp;
    }

    uint64_t start = max_u64(a1, b1);
    uint64_t end = min_u64(a2, b2);

    if (start >= end) {
        return 0; // No overlap
    }

    return end - start;
 }

 int main(int argc, char const *argv[])
 {
    srand(time(NULL));
    static const size_t repeats = 20;
    struct num_range* rngs = malloc(repeats * sizeof(struct num_range));
    for (int k = 0; k < repeats; ++k)
    {
        uint64_t* hs = create_rand_hashes(50);
        uint64_t mini = 0;
        uint64_t maxi = 0;
        find_min_max(hs, 50, &mini, &maxi);
        printf("MIN: %llu , MAX: %llu RNG: %llu FACT: %llu\n", mini, maxi, maxi - mini, (maxi - mini) / 64000);
        rngs[k].min = mini;
        rngs[k].max = maxi;
        free(hs);
    }

    for (int i = 0; i < repeats; ++i)
    {
        for (int j = i + 1; j < repeats; ++j)
        {
            printf("Pos1: %d Pos2: %d, Overlap: %llu\n", i, j, line_overlap_amount(rngs[i].min, rngs[i].max, rngs[j].min, rngs[j].max));
        }
    }

    free(rngs);
    return 0;
 }
	#include <stdint.h>
	#include <stddef.h>
	#include <stdio.h>
	#include <time.h>
	#include <stdlib.h>
	#include <string.h>
	#include <limits.h>
	#include <stdbool.h>

	#include <unordered_map>
	#include <vector>

	static std::unordered_map<uint64_t, size_t> GLOBAL_HASH_COUNT;

	static void addHash(uint64_t h) {
	auto found = GLOBAL_HASH_COUNT.find(h);
	if (found != GLOBAL_HASH_COUNT.end()) {
	found->second += 1;
	} else {
	GLOBAL_HASH_COUNT.insert(std::make_pair(h, 1));
	}
	}

	static size_t printColls() {
	size_t total = 0;
	for (const auto& elem: GLOBAL_HASH_COUNT) {
	if (elem.second > 1) {
	printf("COLL: %llu = %zu\n", elem.first, elem.second);
	total += (elem.second - 1);
	}
	}
	printf("TOTAL COLL: %zu\n", total);
	return total;
	}


	// Finalization mix - force all bits of a hash block to avalanche
	static uint64_t fmix64(uint64_t k) {
	k ^= k >> 33;
	k *= 0xff51afd7ed558ccdULL;
	k ^= k >> 33;
	k *= 0xc4ceb9fe1a85ec53ULL;
	k ^= k >> 33;
	return k;
	}

	uint64_t murmur3_64(const void *key, size_t len, uint32_t seed) {
	const uint8_t data = (const uint8_t )key;
	const int nblocks = len / 16;

	uint64_t h1 = seed;
	uint64_t h2 = seed;

	const uint64_t c1 = 0x87c37b91114253d5ULL;
	const uint64_t c2 = 0x4cf5ad432745937fULL;

	//----------
	// body

	const uint64_t blocks = (const uint64_t )(data);
	for (int i = 0; i < nblocks; i++) {
	uint64_t k1 = blocks[i * 2 + 0];
	uint64_t k2 = blocks[i * 2 + 1];

	k1 = c1; k1 = (k1 << 31) \| (k1 >> (64 - 31)); k1 = c2; h1 ^= k1;
	h1 = (h1 << 27) \| (h1 >> (64 - 27)); h1 += h2; h1 = h1 * 5 + 0x52dce729;

	k2 = c2; k2 = (k2 << 33) \| (k2 >> (64 - 33)); k2 = c1; h2 ^= k2;
	h2 = (h2 << 31) \| (h2 >> (64 - 31)); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
	}

	//----------
	// tail

	const uint8_t tail = (const uint8_t )(data + nblocks * 16);
	uint64_t k1 = 0;
	uint64_t k2 = 0;

	switch (len & 15) {
	case 15: k2 ^= ((uint64_t)tail[14]) << 48;
	case 14: k2 ^= ((uint64_t)tail[13]) << 40;
	case 13: k2 ^= ((uint64_t)tail[12]) << 32;
	case 12: k2 ^= ((uint64_t)tail[11]) << 24;
	case 11: k2 ^= ((uint64_t)tail[10]) << 16;
	case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
	case 9: k2 ^= ((uint64_t)tail[ 8]) << 0;
	k2 = c2; k2 = (k2 << 33) \| (k2 >> (64 - 33)); k2 = c1; h2 ^= k2;
	case 8: k1 ^= ((uint64_t)tail[ 7]) << 56;
	case 7: k1 ^= ((uint64_t)tail[ 6]) << 48;
	case 6: k1 ^= ((uint64_t)tail[ 5]) << 40;
	case 5: k1 ^= ((uint64_t)tail[ 4]) << 32;
	case 4: k1 ^= ((uint64_t)tail[ 3]) << 24;
	case 3: k1 ^= ((uint64_t)tail[ 2]) << 16;
	case 2: k1 ^= ((uint64_t)tail[ 1]) << 8;
	case 1: k1 ^= ((uint64_t)tail[ 0]) << 0;
	k1 = c1; k1 = (k1 << 31) \| (k1 >> (64 - 31)); k1 = c2; h1 ^= k1;
	}

	//----------
	// finalization

	h1 ^= len;
	h2 ^= len;

	h1 += h2;
	h2 += h1;

	h1 = fmix64(h1);
	h2 = fmix64(h2);

	h1 += h2;

	// Return 64-bit hash (can return h2 or h1 depending on needs)
	return h1;
	}

	void create_rand_hashes(std::vector<uint64_t>& hashes, size_t count, size_t* modder) {
	static const size_t input_size = 10;
	char input_string[input_size + 1] = {'\0'};

	for (size_t i = 0; i < count; ++i) {
	for (int j = 0; j < input_size; ++j)
	{
	input_string[j] = rand() % 253;
	}
	if (modder != NULL) {
	hashes.push_back(murmur3_64(input_string, input_size, input_size) % *modder);
	} else {
	hashes.push_back(murmur3_64(input_string, input_size, input_size));
	}
	}
	}

	struct HashBlock {
	size_t hash_count;
	std::vector<uint64_t> hashes;

	HashBlock() {
	hash_count = 0;
	}

	explicit HashBlock(size_t count, size_t mod = 0): hash_count(count) {
	create_rand_hashes(hashes, hash_count, mod == 0 ? NULL : &mod);
	for (const auto& h : hashes) {
	addHash(h);
	}
	}
	};

	int main(int argc, char const *argv[])
	{
	srand(time(NULL));
	std::vector<HashBlock> blocks;
	for (int i = 0; i < 100; ++i)
	{
	HashBlock h(500, 1024 * 1024 * 10);
	blocks.push_back(h);
	}
	printColls();
	return 0;
	}