jweinst1 · August 31, 2025 23:26
diff --git a/generate_all_neighbors.cpp b/generate_all_neighbors.cpp
 #include <array>
 #include <cstdint>
 #include <cstddef>

 // ---------- ConstexprBitset ----------
 template<std::size_t N>
 struct ConstexprBitset {
    static constexpr std::size_t numBlocks = (N + 63) / 64;
    std::array<uint64_t, numBlocks> blocks{};

    constexpr void set(std::size_t idx) {
        blocks[idx / 64] |= (1ULL << (idx & 63));
    }

    template<typename T>
    constexpr std::size_t getSetIndexes(T* out) const {
        std::size_t count = 0;
        for (std::size_t block = 0; block < numBlocks; block++) {
            uint64_t bits = blocks[block];
            while (bits) {
                int idx = __builtin_ctzll(bits);
                out[count++] = static_cast<T>(block * 64 + idx);
                bits &= bits - 1;
            }
        }
        return count;
    }
 };

 // ---------- Product Metafunction ----------
 template<std::size_t... Ns>
 struct product;

 template<>
 struct product<> {
    static constexpr std::size_t value = 1;
 };

 template<std::size_t N, std::size_t... Ns>
 struct product<N, Ns...> {
    static constexpr std::size_t value = N * product<Ns...>::value;
 };

 // ---------- Generate Neighbors ----------
 template<std::size_t NROWS, std::size_t... COUNTS>
 constexpr auto generate_neighbors(
    const std::array<std::array<uint8_t, 256>, NROWS>& candidates,
    const std::array<std::size_t, NROWS>& counts)
 {
    constexpr std::size_t total_size = product<COUNTS...>::value;

    std::array<std::array<uint8_t, NROWS>, total_size> result{};

    std::array<std::size_t, NROWS> idx{};
    std::size_t outIdx = 0;
    bool done = false;

    while (!done) {
        // capture permutation
        for (std::size_t j = 0; j < NROWS; j++) {
            result[outIdx][j] = candidates[j][idx[j]];
        }
        ++outIdx;

        // advance counters
        for (std::size_t pos = NROWS; pos-- > 0;) {
            if (++idx[pos] < counts[pos]) {
                for (std::size_t j = pos + 1; j < NROWS; j++) idx[j] = 0;
                goto continue_outer;
            }
        }
        done = true;
        continue_outer:;
    }

    return result;
 }
diff --git a/hamming_iterator_array.cpp b/hamming_iterator_array.cpp
 #include <array>
 #include <cstdint>
 #include <cstddef>
 #include <iostream>

 // ========= ConstexprBitset =========
 template<std::size_t N>
 struct ConstexprBitset {
    static constexpr std::size_t numBlocks = (N + 63) / 64;
    std::array<uint64_t, numBlocks> blocks{};

    constexpr void set(std::size_t idx) {
        blocks[idx / 64] |= (1ULL << (idx & 63));
    }

    constexpr bool empty() const {
        for (auto b : blocks) if (b) return false;
        return true;
    }

    template<typename T>
    constexpr std::size_t getSetIndexes(T* out) const {
        std::size_t count = 0;
        for (std::size_t block = 0; block < numBlocks; block++) {
            uint64_t bits = blocks[block];
            while (bits) {
                int idx = __builtin_ctzll(bits); // constexpr_ctzll fallback if needed
                out[count++] = static_cast<T>(block * 64 + idx);
                bits &= bits - 1;
            }
        }
        return count;
    }
 };

 // ========= PermutationIterator (compile-time sized) =========
 template<std::size_t NROWS>
 struct PermutationIterator {
    static constexpr std::size_t MAX_CANDIDATES = 256;

    // rows[i][hamming-distance] = candidate bitset
    const std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& rows;

    // storage
    std::array<std::array<uint8_t, MAX_CANDIDATES>, NROWS> candidates{};
    std::array<std::size_t, NROWS> candCounts{};
    std::array<std::size_t, NROWS> indices{};
    std::array<uint8_t, NROWS> current{};
    bool done = false;

    constexpr PermutationIterator(const std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& r)
        : rows(r)
    {
        for (std::size_t i = 0; i < NROWS; i++) {
            std::size_t total = 0;
            for (auto& bs : rows[i]) {
                total += bs.getSetIndexes(candidates[i].data() + total);
            }
            candCounts[i] = total;
            if (total == 0) { done = true; return; }
            current[i] = candidates[i][0];
        }
    }

    constexpr bool next() {
        if (done) return false;
        for (std::size_t pos = NROWS; pos-- > 0;) {
            if (++indices[pos] < candCounts[pos]) {
                current[pos] = candidates[pos][indices[pos]];
                for (std::size_t j = pos + 1; j < NROWS; j++) {
                    indices[j] = 0;
                    current[j] = candidates[j][0];
                }
                return true;
            }
        }
        done = true;
        return false;
    }

    constexpr const std::array<uint8_t, NROWS>& value() const { return current; }
 };
diff --git a/hamming_iterator_vec.cpp b/hamming_iterator_vec.cpp
 #include <array>
 #include <cstdint>
 #include <cstddef>
 #include <iostream>
 #include <vector>

 // ========= ConstexprBitset with array-style getSetIndexes =========
 template<std::size_t N>
 struct ConstexprBitset {
    static constexpr std::size_t numBlocks = (N + 63) / 64;
    std::array<uint64_t, numBlocks> blocks{};

    constexpr void set(std::size_t idx) {
        blocks[idx / 64] |= (1ULL << (idx & 63));
    }

    constexpr bool empty() const {
        for (auto b : blocks) if (b) return false;
        return true;
    }

    template<typename T>
    constexpr std::size_t getSetIndexes(T* out) const {
        std::size_t count = 0;
        for (std::size_t block = 0; block < numBlocks; block++) {
            uint64_t bits = blocks[block];
            while (bits) {
                int idx = __builtin_ctzll(bits); // portable constexpr_ctzll if needed
                out[count++] = static_cast<T>(block * 64 + idx);
                bits &= bits - 1;
            }
        }
        return count;
    }
 };

 // ========= PermutationIterator with flat arrays =========
 struct PermutationIterator {
    static constexpr std::size_t MAX_CANDIDATES = 256;

    const std::vector<std::array<ConstexprBitset<256>, 8>>& rows;
    std::vector<std::array<uint8_t, MAX_CANDIDATES>> candidates; // row -> array of candidates
    std::vector<std::size_t> candCounts;                         // row -> how many valid
    std::vector<std::size_t> indices;                            // row -> current position
    std::vector<uint8_t> current;                                // assembled vector
    bool done = false;

    PermutationIterator(const std::vector<std::array<ConstexprBitset<256>, 8>>& r)
        : rows(r), candidates(r.size()), candCounts(r.size()), indices(r.size(), 0), current(r.size())
    {
        // Fill flat arrays
        for (std::size_t i = 0; i < rows.size(); i++) {
            auto& row = rows[i];
            std::size_t total = 0;
            for (auto& bs : row) {
                total += bs.getSetIndexes(candidates[i].data() + total);
            }
            candCounts[i] = total;
            if (total == 0) { done = true; return; }
            current[i] = candidates[i][0];
        }
    }

    bool next() {
        if (done) return false;

        // Odometer increment
        for (std::size_t pos = rows.size(); pos-- > 0;) {
            if (++indices[pos] < candCounts[pos]) {
                current[pos] = candidates[pos][indices[pos]];
                // reset lower positions
                for (std::size_t j = pos + 1; j < rows.size(); j++) {
                    indices[j] = 0;
                    current[j] = candidates[j][0];
                }
                return true;
            }
        }
        done = true;
        return false;
    }

    const std::vector<uint8_t>& value() const { return current; }
 };
diff --git a/hamming_lookup_table.cpp b/hamming_lookup_table.cpp
 #include <array>
 #include <cstdint>
 #include <cstddef>
 #include <iostream>
 #include <bitset>
 #include <utility>
 #include <chrono>
 #include <cstdlib>
 #include <random>
 #include <limits>

 static float randomFloatInRange(float min, float max) {
    return min + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / (max - min)));
 }

 // Generate a random std::array<float, N> within [minVal, maxVal]
 template <std::size_t N>
 std::array<float, N> make_random_array(float minVal = 0.0f, float maxVal = 1.0f) {
    static thread_local std::mt19937 rng(std::random_device{}()); 
    std::uniform_real_distribution<float> dist(minVal, maxVal);

    std::array<float, N> arr{};
    for (auto &val : arr) {
        val = dist(rng);
    }
    return arr;
 }



 constexpr int constexpr_ctzll(unsigned long long x) {
 #if defined(__GNUC__) || defined(__clang__)
    if (x == 0) return 64; // define ctz(0) = 64 (all bits zero)
    return __builtin_ctzll(x);
 #elif defined(_MSC_VER)
    unsigned long idx = 0;
    if (_BitScanForward64(&idx, x))
        return static_cast<int>(idx);
    return 64; // if x == 0
 #else
    // Portable fallback
    if (x == 0) return 64;
    int c = 0;
    while ((x & 1ULL) == 0) {
        x >>= 1;
        ++c;
    }
    return c;
 #endif
 }

 template<std::size_t N>
 struct ConstexprBitset {
    static constexpr std::size_t numBlocks = (N + 63) / 64;
    std::array<uint64_t, numBlocks> blocks{};

    // -------------------
    // Set & Test
    // -------------------
    constexpr void set(std::size_t idx) {
        blocks[idx / 64] |= (1ULL << (idx & 63));
    }

    constexpr bool test(std::size_t idx) const {
        return (blocks[idx / 64] >> (idx & 63)) & 1ULL;
    }

    // -------------------
    // Bitwise operators
    // -------------------
    constexpr auto operator|(ConstexprBitset const& other) const {
        ConstexprBitset res;
        for (std::size_t i = 0; i < numBlocks; i++)
            res.blocks[i] = blocks[i] | other.blocks[i];
        return res;
    }

    constexpr auto operator&(ConstexprBitset const& other) const {
        ConstexprBitset res;
        for (std::size_t i = 0; i < numBlocks; i++)
            res.blocks[i] = blocks[i] & other.blocks[i];
        return res;
    }

    constexpr auto operator&=(ConstexprBitset const& other) {
        for (std::size_t i = 0; i < numBlocks; i++)
            blocks[i] &= other.blocks[i];
    }

    constexpr auto operator^(ConstexprBitset const& other) const {
        ConstexprBitset res;
        for (std::size_t i = 0; i < numBlocks; i++)
            res.blocks[i] = blocks[i] ^ other.blocks[i];
        return res;
    }

    constexpr explicit operator bool() const noexcept {
        for (std::size_t i = 0; i < numBlocks; i++) {
            if (blocks[i] != 0)
                return true;
        }
        return false;
    }

    static constexpr ConstexprBitset<256> make_from_byte(uint8_t b) {
        ConstexprBitset<256> bs;
        bs.set(b);   // set exactly that index
        return bs;
    }

    constexpr bool isAllUnset() const noexcept {
        return isAllUnsetImpl(std::make_index_sequence<numBlocks>{});
    }

    template<typename T>
    constexpr std::size_t getSetIndexes(T* out) const {
        std::size_t count = 0;
        for (std::size_t block = 0; block < numBlocks; block++) {
            uint64_t bits = blocks[block];
            while (bits) {
                int idx = constexpr_ctzll(bits);
                out[count++] = static_cast<T>(block * 64 + idx);
                bits &= bits - 1; // clear lowest set bit
            }
        }
        return count;
    }

    template <size_t... Is>
    constexpr bool isAllUnsetImpl(std::index_sequence<Is...>) const noexcept {
        return ((blocks[Is] == 0ull) && ...);  // fold expression
    }
 };

 template<uint8_t V>
 constexpr ConstexprBitset<256> make_single_bitset() {
    ConstexprBitset<256> bs{};
    bs.set(V);
    return bs;
 }

 template<std::size_t... Is>
 constexpr auto make_identity_table(std::index_sequence<Is...>) {
    return std::array<ConstexprBitset<256>, 256>{
        make_single_bitset<static_cast<uint8_t>(Is)>()...
    };
 }

 // Public inline constexpr lookup table
 inline constexpr auto identity_bitset_table =
    make_identity_table(std::make_index_sequence<256>{});

 template<uint8_t X>
 constexpr const auto& get_identity_bitset_cmpl() {
    return identity_bitset_table[X];
 }

 constexpr const ConstexprBitset<256>& get_identity_bitset(uint8_t x) {
    return identity_bitset_table[x];
 }

 template<size_t NROWS>
 constexpr void byte_row_to_bit_sets(const std::array<uint8_t, NROWS>& input,
                                    std::array<ConstexprBitset<256>, NROWS>& out)
 {

    for (size_t i = 0; i < NROWS; ++i)
    {
        auto const& bs = get_identity_bitset(input[i]);
        out[i] = bs;
    }
 }

 template<size_t entropyDepth, size_t NROWS>
 struct EntropySet {
    using innerSetType = std::array<std::array<ConstexprBitset<256>, NROWS>, 256>;
    static_assert(entropyDepth > 0 && entropyDepth < 257, "Entropy must be within byte range");
    std::array<innerSetType, entropyDepth> entropySlots;

    constexpr void insert(const std::array<uint8_t, NROWS>& vec) {

    }
 };

 // ========= PermutationIterator (compile-time sized) =========
 template<std::size_t NROWS>
 struct PermutationIterator {
    static constexpr std::size_t MAX_CANDIDATES = 256;

    // rows[i][hamming-distance] = candidate bitset
    const std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& rows;

    // storage
    std::array<std::array<uint8_t, MAX_CANDIDATES>, NROWS> candidates{};
    std::array<std::size_t, NROWS> candCounts{};
    std::array<std::size_t, NROWS> indices{};
    std::array<uint8_t, NROWS> current{};
    bool done = false;

    constexpr PermutationIterator(const std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& r)
        : rows(r)
    {
        for (std::size_t i = 0; i < NROWS; i++) {
            std::size_t total = 0;
            for (auto& bs : rows[i]) {
                total += bs.getSetIndexes(candidates[i].data() + total);
            }
            candCounts[i] = total;
            if (total == 0) { done = true; return; }
            current[i] = candidates[i][0];
        }
    }

    constexpr bool next() {
        if (done) return false;
        for (std::size_t pos = NROWS; pos-- > 0;) {
            if (++indices[pos] < candCounts[pos]) {
                current[pos] = candidates[pos][indices[pos]];
                for (std::size_t j = pos + 1; j < NROWS; j++) {
                    indices[j] = 0;
                    current[j] = candidates[j][0];
                }
                return true;
            }
        }
        done = true;
        return false;
    }

    constexpr const std::array<uint8_t, NROWS>& value() const { return current; }
 };



 // ---------- helpers (all constexpr) ----------

 constexpr std::size_t binomial(unsigned n, unsigned k) {
    if (k > n) return 0;
    std::size_t res = 1;
    for (unsigned i = 1; i <= k; i++) {
        res = res * (n - (k - i)) / i;
    }
    return res;
 }

 // quantizers

 // Dequantize 32-bit unsigned int back to float in [0,1]
 constexpr float dequantize_float_32(uint32_t q) {
    constexpr float levels = 4294967295.0f; // 2^32 - 1
    // Use midpoint reconstruction
    return (static_cast<float>(q) + 0.5f) / levels;
 }

 // Impl with loop unrolling
 template <size_t N, size_t... Is>
 constexpr auto dequantize_unrolled_impl_32(const std::array<uint32_t, N>& input,
                                           std::index_sequence<Is...>) {
    std::array<float, N> out{};
    ((out[Is] = dequantize_float_32(input[Is])), ...);
    return out;
 }

 // Public wrapper
 template <size_t N>
 constexpr auto dequantize_unrolled32(const std::array<uint32_t, N>& input) {
    return dequantize_unrolled_impl_32<N>(input, std::make_index_sequence<N>{});
 }

 // Quantize float to 32-bit unsigned integer
 constexpr uint32_t quantize_float_32(float val) {
    constexpr float levels = 4294967295.0f; // 2^32 - 1, but in float
    uint64_t q = static_cast<uint64_t>(val * levels);
    return static_cast<uint32_t>(q);
 }

 // Implementation using index_sequence for loop unrolling
 template <size_t N, size_t... Is>
 constexpr auto quantize_unrolled_impl_32(const std::array<float, N>& input,
                                         std::index_sequence<Is...>) {
    std::array<uint32_t, N> out{};
    ((out[Is] = quantize_float_32(input[Is])), ...);
    return out;
 }

 // Public wrapper
 template <size_t N>
 constexpr auto quantize_unrolled32(const std::array<float, N>& input) {
    return quantize_unrolled_impl_32<N>(input, std::make_index_sequence<N>{});
 }

 template<int B>
 constexpr uint16_t quantize_float16(float val) {
    constexpr int levels = 1 << B;
    uint16_t q = static_cast<uint16_t>(val * levels);
    if (q >= levels) q = levels - 1;
    return q;
 }

 template <size_t N, int B, size_t... Is>
 constexpr auto quantize_unrolled_impl16(const std::array<float, N>& input,
                                        std::index_sequence<Is...>)
 {
    std::array<uint16_t, N> out{};
    ((out[Is] = quantize_float16<B>(input[Is])), ...);
    return out;
 }

 template <size_t N, int B>
 constexpr auto quantize_unrolled16(const std::array<float, N>& input)
 {
    return quantize_unrolled_impl16<N, B>(input, std::make_index_sequence<N>{});
 }

 template<int B>
 constexpr float dequantize_float16(uint16_t q) {
    constexpr int levels = 1 << B;
    // Reconstruct float to midpoint of the quantization bin
    return (static_cast<float>(q) + 0.5f) / static_cast<float>(levels);
 }

 template <size_t N, int B, size_t... Is>
 constexpr auto dequantize_unrolled_impl16(const std::array<uint16_t, N>& input,
                                          std::index_sequence<Is...>)
 {
    std::array<float, N> out{};
    ((out[Is] = dequantize_float16<B>(input[Is])), ...);
    return out;
 }

 template <size_t N, int B>
 constexpr auto dequantize_unrolled16(const std::array<uint16_t, N>& input)
 {
    return dequantize_unrolled_impl16<N, B>(input, std::make_index_sequence<N>{});
 }

 template<int B>
 constexpr float dequantize_float(uint8_t q) {
    constexpr int levels = 1 << B;
    // Map quantized int back to float in [0,1)
    return (static_cast<float>(q) + 0.5f) / static_cast<float>(levels);
 }

 template <size_t N, int B, size_t... Is>
 constexpr auto dequantize_unrolled_impl(const std::array<uint8_t, (N * B + 7) / 8>& input,
                                        std::index_sequence<Is...>)
 {
    std::array<float, N> out{};

    // One expression per dimension
    ((out[Is] = dequantize_float<B>(
        (input[(Is * B) >> 3] >> ((Is * B) & 7)) & ((1u << B) - 1)
    )), ...);

    return out;
 }

 template <size_t N, int B>
 constexpr auto dequantize_unrolled(const std::array<uint8_t, (N * B + 7) / 8>& input)
 {
    return dequantize_unrolled_impl<N, B>(input, std::make_index_sequence<N>{});
 }

 template<int B>
 constexpr uint8_t quantize_float(float val) {
    constexpr int levels = 1 << B;
    uint8_t q = static_cast<uint8_t>(val * levels);
    if (q >= levels) q = levels - 1;
    return q;
 }

 template <size_t N, int B, size_t... Is>
 constexpr auto quantize_unrolled_impl(const std::array<float, N>& input,
                                      std::index_sequence<Is...>)
 {
    constexpr size_t bitsPerByte = 8;
    constexpr size_t outSize = (N * B + 7) / bitsPerByte;
    std::array<uint8_t, outSize> out{};

    // unrolled expression: one line per dimension
    ((out[(Is * B) >> 3] |= quantize_float<B>(input[Is]) << ((Is * B) & 7)), ...);

    return out;
 }

 template <size_t N, int B>
 constexpr auto quantize_unrolled(const std::array<float, N>& input)
 {
    return quantize_unrolled_impl<N, B>(input, std::make_index_sequence<N>{});
 }

 // simple constexpr popcount (no loop, uses builtin if available)
 constexpr int popcount8(uint8_t x) {
 #if defined(__clang__) || defined(__GNUC__)
    return __builtin_popcount(x);
 #else
    int cnt = 0;
    for (; x; x &= (x - 1)) ++cnt;
    return cnt;
 #endif
 }

 // Generate masks with popcount == N
 template<int N>
 constexpr auto generate_masks() {
    constexpr std::size_t Count = binomial(8, N);
    std::array<uint8_t, Count> masks{};
    std::size_t idx = 0;
    for (int u = 0; u < 256; ++u) {
        if (popcount8(static_cast<uint8_t>(u)) == N) {
            masks[idx++] = static_cast<uint8_t>(u);
        }
    }
    return masks;
 }

 // Compute neighbors for a fixed value
 template<int N, uint8_t V>
 constexpr auto hamming_neighbors() {
    constexpr auto masks = generate_masks<N>();
    constexpr std::size_t Count = binomial(8, N);

    std::array<uint8_t, Count> res{};
    for (std::size_t i = 0; i < Count; i++) {
        res[i] = static_cast<uint8_t>(V ^ masks[i]);
    }
    return res; // returns exactly Count neighbors
 }

 template<int N, uint8_t V>
 constexpr auto hamming_neighbor_set() {
    constexpr auto masks = generate_masks<N>();
    constexpr std::size_t Count = binomial(8, N);

    ConstexprBitset<256> res{};
    for (std::size_t i = 0; i < Count; i++) {
        res.set(static_cast<uint8_t>(V ^ masks[i]));
    }
    return res;
 }

 template<int N, std::size_t... Is>
 constexpr auto make_table_of_sets(std::index_sequence<Is...>) {
    return std::array{ hamming_neighbor_set<N, static_cast<uint8_t>(Is)>()... };
 }

 template<int N>
 constexpr auto precomputed_neighbor_sets() {
    return make_table_of_sets<N>(std::make_index_sequence<256>{});
 }

 template<int N>
 inline constexpr auto hamming_set = precomputed_neighbor_sets<N>();

 // Full table: 256 entries of arrays of size C(8,N)
 template<int N, std::size_t... Is>
 constexpr auto make_table(std::index_sequence<Is...>) {
    return std::array{ hamming_neighbors<N, static_cast<uint8_t>(Is)>()... };
 }

 template<int N>
 constexpr auto precomputed_neighbors_table() {
    return make_table<N>(std::make_index_sequence<256>{});
 }

 template<int N>
 inline constexpr auto hamming_table = precomputed_neighbors_table<N>();

 /**
 * A lazy retrieval where only the bit sets are got, but the bit placements are not
 * extracted yet.
 * */
 constexpr void get_all_neighbor_sets(uint8_t byte, const ConstexprBitset<256>& set,
                                       std::array<ConstexprBitset<256>, 8>& out) {
    constexpr auto& table1 = hamming_set<1>;
    constexpr auto& table2 = hamming_set<2>;
    constexpr auto& table3 = hamming_set<3>;
    constexpr auto& table4 = hamming_set<4>;
    constexpr auto& table5 = hamming_set<5>;
    constexpr auto& table6 = hamming_set<6>;
    constexpr auto& table7 = hamming_set<7>;

    out[0] = set & get_identity_bitset(byte);
    out[1] = set & table1[byte];
    out[2] = set & table2[byte];
    out[3] = set & table3[byte];
    out[4] = set & table4[byte];
    out[5] = set & table5[byte];
    out[6] = set & table6[byte];
    out[7] = set & table7[byte];
 }

 constexpr void get_neighbor_set_1(uint8_t byte, const ConstexprBitset<256>& set,
                                       std::array<ConstexprBitset<256>, 8>& out) {
    constexpr auto& table1 = hamming_set<1>;

    out[0] = set & get_identity_bitset(byte);
    out[1] = set & table1[byte];
 }

 /**
 * Index sequencer to unroll the loop
 * */
 template<size_t NROWS, size_t... Is>
 constexpr void get_all_neighbor_sets_arr_impl(
    const std::array<uint8_t, NROWS>& inputVector,
    const std::array<ConstexprBitset<256>, NROWS>& inputSets,
    std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& out,
    std::index_sequence<Is...>)
 {
    // Fold expression over comma operator — each index expands into its call
    (get_all_neighbor_sets(inputVector[Is], inputSets[Is], out[Is]), ...);
 }

 template<size_t NROWS, size_t... Is>
 constexpr void get_neighbor_sets_1_arr_impl(
    const std::array<uint8_t, NROWS>& inputVector,
    const std::array<ConstexprBitset<256>, NROWS>& inputSets,
    std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& out,
    std::index_sequence<Is...>)
 {
    // Fold expression over comma operator — each index expands into its call
    (get_neighbor_set_1(inputVector[Is], inputSets[Is], out[Is]), ...);
 }

 /**
 * Takes an input vector, as the query, and generates an array of array of bitsets,
 * the returned value has indices which represent the hamming distance and sets of bytes as vectors
 * from the query vector.
 * */
 template<size_t NROWS>
 constexpr void get_all_neighbor_sets_arr(
    const std::array<uint8_t, NROWS>& inputVector,
    const std::array<ConstexprBitset<256>, NROWS>& inputSets,
    std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& out)
 {
    get_all_neighbor_sets_arr_impl(inputVector, inputSets, out,
                                   std::make_index_sequence<NROWS>{});
 }

 template<size_t NROWS>
 constexpr void get_neighbor_sets_1_arr(
    const std::array<uint8_t, NROWS>& inputVector,
    const std::array<ConstexprBitset<256>, NROWS>& inputSets,
    std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& out)
 {
    get_neighbor_sets_1_arr_impl(inputVector, inputSets, out,
                                   std::make_index_sequence<NROWS>{});
 }

 template<size_t NROWS, size_t... Is>
 constexpr bool is_fake_vector_impl(const std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& inputSet,
                                   std::index_sequence<Is...>)
 {
    return ((inputSet[Is][0].isAllUnset()) || ...);
 }

 template<size_t NROWS>
 constexpr bool is_fake_vector(const std::array<std::array<ConstexprBitset<256>, 8>, NROWS>& inputSet) {
    return is_fake_vector_impl(inputSet, std::make_index_sequence<NROWS>{});
 } 

 /**
 * This expands neighbors from the bitsets to flat array,
 * slower than neighbor sets method
 * */
 constexpr size_t get_all_neighbors(uint8_t byte, const ConstexprBitset<256>& set,
                                       std::array<uint8_t, 256>& out) {
    constexpr auto& table1 = hamming_set<1>;
    constexpr auto& table2 = hamming_set<2>;
    constexpr auto& table3 = hamming_set<3>;
    constexpr auto& table4 = hamming_set<4>;
    constexpr auto& table5 = hamming_set<5>;
    constexpr auto& table6 = hamming_set<6>;
    constexpr auto& table7 = hamming_set<7>;

    size_t total = 0;

    const ConstexprBitset myNeighbors1 = set & table1[byte];
    const ConstexprBitset myNeighbors2 = set & table2[byte];
    const ConstexprBitset myNeighbors3 = set & table3[byte];
    const ConstexprBitset myNeighbors4 = set & table4[byte];
    const ConstexprBitset myNeighbors5 = set & table5[byte];
    const ConstexprBitset myNeighbors6 = set & table6[byte];
    const ConstexprBitset myNeighbors7 = set & table7[byte];

    total += myNeighbors1.getSetIndexes(out.data() + total);
    total += myNeighbors2.getSetIndexes(out.data() + total);
    total += myNeighbors3.getSetIndexes(out.data() + total);
    total += myNeighbors4.getSetIndexes(out.data() + total);
    total += myNeighbors5.getSetIndexes(out.data() + total);
    total += myNeighbors6.getSetIndexes(out.data() + total);
    total += myNeighbors7.getSetIndexes(out.data() + total);

    return total;
 }

 static void quantizePerfTesting() {
    const std::array<float, 100> numbers = {randomFloatInRange(0.0, 2.6)};
    size_t total = 0;

    auto start = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < 10000000; ++i){
        const auto q1 = quantize_unrolled<100, 8>(numbers);
        total += q1[0];

    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "Function 8bit Quantize execution time: " << duration.count() << " microseconds" << std::endl;
    printf("total is %zu\n", total);

 }

 static void quantizePerfTesting16() {
    const std::array<float, 100> numbers = {randomFloatInRange(0.0, 2.6)};

    auto start = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < 10000000; ++i){
        volatile const auto q1 = quantize_unrolled16<100, 16>(numbers);

    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "Function 16bit Quantize execution time: " << duration.count() << " microseconds" << std::endl;

 }

 static void quantizePerfTesting32() {
    const std::array<float, 100> numbers = {randomFloatInRange(0.0, 2.6)};
    size_t total = 0;

    auto start = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < 10000000; ++i){
        const auto q1 = quantize_unrolled32<100>(numbers);
        total += q1[0];

    }
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "Function 32bit Quantize execution time: " << duration.count() << " microseconds" << std::endl;
    printf("total is %zu\n", total);

 }

 static void endToEndPerfTest() {
    constexpr size_t testRowSize = 96;
    static constexpr size_t collectionSize = 100000000;
    static constexpr size_t queryTestCount = 1000000;
    std::array<ConstexprBitset<256>, testRowSize> myCollection;
    // fill collection
    for (size_t i = 0; i < collectionSize; ++i)
    {
        const std::array<float, testRowSize> generatedRow = make_random_array<testRowSize>();
        const std::array<uint8_t, testRowSize> myByteRow = quantize_unrolled<testRowSize, 8>(generatedRow);
        for (size_t j = 0; j < myByteRow.size(); ++j)
        {
            myCollection[j] &= get_identity_bitset(myByteRow[j]);
        }
    }
    const auto queryRow = make_random_array<testRowSize>();
    std::array<std::array<ConstexprBitset<256>, 8>, testRowSize> queryResult;

    auto start = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < queryTestCount; ++i)
    {
        const auto myByteQuery = quantize_unrolled<testRowSize, 8>(queryRow);
        get_neighbor_sets_1_arr<testRowSize>( myByteQuery, myCollection, queryResult);
        //PermutationIterator<testRowSize> iter(queryResult);
        //iter.next();
        //Function endToEndPerfTest execution time: 144397 microseconds
    }

    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
    std::cout << "Function " << __FUNCTION__ << " execution time: " << duration.count() << " microseconds" << std::endl;
 }

 static void perfTesting(char* argv[]) {
    {
        int int_value = std::stoi(argv[1]);
        size_t total = 0;

        auto start = std::chrono::high_resolution_clock::now();

        std::array<uint8_t, 256> myArray;
        ConstexprBitset<256> mySet;
        mySet.set(39);
        mySet.set(143);

        for (int i = 0; i < 10000000; ++i)
        {
            total += get_all_neighbors((uint8_t)int_value, mySet,
                                           myArray);
        }

        auto end = std::chrono::high_resolution_clock::now();

        // Calculate the duration
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);

        printf("total was %zu\n", total);

        // Print the elapsed time
        std::cout << "Function execution time: " << duration.count() << " microseconds" << std::endl;
    }

    {
        int int_value = std::stoi(argv[1]);

        auto start = std::chrono::high_resolution_clock::now();
        volatile size_t number = 0xff;
        std::array<ConstexprBitset<256>, 8> myArray;
        ConstexprBitset<256> mySet;
        mySet.set(59);
        mySet.set(113);
        mySet.set(8);
        mySet.set(91);

        int i = 10000000;

        while(--i)
        {
            get_all_neighbor_sets(i & number, mySet,
                                           myArray);
        }

        auto end = std::chrono::high_resolution_clock::now();

        // Calculate the duration
        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

        // Print the elapsed time
        std::cout << "Function2 execution time: " << duration.count() << " nanoseconds" << std::endl;
    }
 /**
 -------------
 total was 20000000
 Function execution time: 114794 microseconds
 total was 6988044573398270032, set is 134217728
 Function2 execution time: 6623583 nanoseconds

 * */
 }

 static void perfTesting2(char* argv[]) {
    {
        int int_value = std::stoi(argv[1]);
        size_t total = 0;

        auto start = std::chrono::high_resolution_clock::now();

        std::array<uint8_t, 256> myArray;
        ConstexprBitset<256> mySet;
        mySet.set(39);
        mySet.set(143);

        for (int i = 0; i < 10000000; ++i)
        {
            total += get_all_neighbors((uint8_t)int_value, mySet,
                                           myArray);
        }

        auto end = std::chrono::high_resolution_clock::now();

        // Calculate the duration
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);

        printf("total was %zu\n", total);

        // Print the elapsed time
        std::cout << "Function execution time: " << duration.count() << " microseconds" << std::endl;
    }

    {
        int int_value = std::stoi(argv[1]);
        volatile size_t number = 0xff;

        auto start = std::chrono::high_resolution_clock::now();

        std::array<ConstexprBitset<256>, 8> myArray;
        ConstexprBitset<256> mySet;
        mySet.set(56);
        mySet.set(113);
        mySet.set(8);
        mySet.set(91);

        for (int i = 0; i < 10000000; ++i)
        {
            get_neighbor_set_1(i & number, mySet,
                                           myArray);
            //total += myArray[1].blocks[1];
        }

        auto end = std::chrono::high_resolution_clock::now();

        // Calculate the duration
        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);

        // Print the elapsed time
        std::cout << "Function3 test 2 execution time: " << duration.count() << " nanoseconds" << std::endl;
    }
 }

 int main(int argc, char* argv[]) {
    constexpr uint8_t v = 0b00001111;
    constexpr auto neighbors = hamming_neighbors<2, v>();
    constexpr auto neighbors2 = hamming_neighbors<1, v>();

    for (auto n : neighbors) {
        std::cout << std::bitset<8>(n) << " (" << int(n) << ")\n";
    }

    puts("------------");

    for (auto n : neighbors2) {
        std::cout << std::bitset<8>(n) << " (" << int(n) << ")\n";
    }

    constexpr auto& table1 = hamming_table<1>;
    constexpr auto& table2 = hamming_table<1>;

    puts("------------");

    for (auto n : table1[0b1101]) {
        std::cout << std::bitset<8>(n) << " (" << int(n) << ")\n";
    }

    puts("-------------");
    //perfTesting(argv);
    //perfTesting2(argv);
    //quantizePerfTesting();
    //quantizePerfTesting16();
    //quantizePerfTesting32();
    endToEndPerfTest();
 /**total was 20000000
 Function execution time: 120658 microseconds
 i is now 80
 Function2 execution time: 3108209 nanoseconds
 total was 20000000
 Function execution time: 109372 microseconds
 Function3 test 2 execution time: 3118209 nanoseconds
 Function 8bit Quantize execution time: 35470 microseconds
 total is 0
 Function 16bit Quantize execution time: 39884 microseconds
 total is 224130000000
 Function 32bit Quantize execution time: 119789 microseconds
 total is 41428131840000000*/
 }
	#include <array>
	#include <cstdint>
	#include <cstddef>

	// ---------- ConstexprBitset ----------
	template<std::size_t N>
	struct ConstexprBitset {
	static constexpr std::size_t numBlocks = (N + 63) / 64;
	std::array<uint64_t, numBlocks> blocks{};

	constexpr void set(std::size_t idx) {
	blocks[idx / 64] \|= (1ULL << (idx & 63));
	}

	template<typename T>
	constexpr std::size_t getSetIndexes(T* out) const {
	std::size_t count = 0;
	for (std::size_t block = 0; block < numBlocks; block++) {
	uint64_t bits = blocks[block];
	while (bits) {
	int idx = __builtin_ctzll(bits);
	out[count++] = static_cast<T>(block * 64 + idx);
	bits &= bits - 1;
	}
	}
	return count;
	}
	};

	// ---------- Product Metafunction ----------
	template<std::size_t... Ns>
	struct product;

	template<>
	struct product<> {
	static constexpr std::size_t value = 1;
	};

	template<std::size_t N, std::size_t... Ns>
	struct product<N, Ns...> {
	static constexpr std::size_t value = N * product<Ns...>::value;
	};

	// ---------- Generate Neighbors ----------
	template<std::size_t NROWS, std::size_t... COUNTS>
	constexpr auto generate_neighbors(
	const std::array<std::array<uint8_t, 256>, NROWS>& candidates,
	const std::array<std::size_t, NROWS>& counts)
	{
	constexpr std::size_t total_size = product<COUNTS...>::value;

	std::array<std::array<uint8_t, NROWS>, total_size> result{};

	std::array<std::size_t, NROWS> idx{};
	std::size_t outIdx = 0;
	bool done = false;

	while (!done) {
	// capture permutation
	for (std::size_t j = 0; j < NROWS; j++) {
	result[outIdx][j] = candidates[j][idx[j]];
	}
	++outIdx;

	// advance counters
	for (std::size_t pos = NROWS; pos-- > 0;) {
	if (++idx[pos] < counts[pos]) {
	for (std::size_t j = pos + 1; j < NROWS; j++) idx[j] = 0;
	goto continue_outer;
	}
	}
	done = true;
	continue_outer:;
	}

	return result;
	}