Skip to content

Instantly share code, notes, and snippets.

@jweinst1
Last active September 26, 2025 00:26
Show Gist options
  • Save jweinst1/49ea69d5dfd81a9bf5d01a8ff3625ee8 to your computer and use it in GitHub Desktop.
Save jweinst1/49ea69d5dfd81a9bf5d01a8ff3625ee8 to your computer and use it in GitHub Desktop.
closest sub mask in a mash set
>>> bin(0b10100 & ~0b1111)
'0b10000'
>>> bin(0b11000 & 0b1111)
'0b1000'
>>> bin(0b11000 & ~0b1111)
'0b10000'
>>> bin(0b01000 & ~0b1111)
'0b0'
>>> bin(0b11000 & ~0b1111)
'0b10000'
>>> bin(0b11000 & 0b1111)
'0b1000'
>>> bin((0b11000 & ~0b1111) & (0b11000 & 0b1111))
'0b0'
>>> bin((0b11000 & ~0b1111) & (0b11000 & 0b1111))
'0b0'
#include <iostream>
#include <bitset>
#include <array>
/**
 * Splits `set` around the bit width of `key`.
 *
 * Let `len` be the number of significant bits in `key` (0 when key == 0).
 *
 * @param set bit set to split
 * @param key value whose bit width selects the split position
 * @return a two-element array:
 *         [0] the bits of `set` strictly below bit position `len`,
 *         [1] the single bit of `set` at position `len` (always 0 when
 *             `len` is 8, because that position is outside a uint8_t).
 */
std::array<uint8_t, 2> closest(uint8_t set, uint8_t key) {
    // __builtin_clz has an undefined result for 0, so treat an empty key as
    // having bit width 0 instead of passing it to the builtin.
    // The `32 -` mirrors the original and relies on a 32-bit unsigned int.
    const int len = (key == 0)
        ? 0
        : 32 - __builtin_clz(static_cast<unsigned int>(key));

    // len is in [0, 8], so the shift cannot overflow an unsigned int.
    const unsigned int boundary = 1u << len;

    // must consider both 1 bit higher and lower
    const uint8_t ent_lo = static_cast<uint8_t>((boundary - 1u) & set);
    const uint8_t ent_hi = static_cast<uint8_t>(boundary & set);
    return {ent_lo, ent_hi};
}
// Demo: split a sample set around the bit width of a sample key and print
// both resulting masks as 8-bit binary strings.
int main(int argc, char const *argv[])
{
    const auto masks = closest(0b10110101, 0b11001);
    for (const uint8_t mask : masks) {
        std::cout << std::bitset<8>(mask) << "\n";
    }
    // Expected output:
    //00010101
    //00100000
    return 0;
}
#include <iostream>
#include <bitset>
#include <array>
/**
 * Splits `set` around the bit width of `key`.
 *
 * Let `len` be the number of significant bits in `key` (0 when key == 0).
 *
 * @param set bit set to split
 * @param key value whose bit width selects the split position
 * @return a two-element array:
 *         [0] the bits of `set` strictly below bit position `len`,
 *         [1] the single bit of `set` at position `len` (always 0 when
 *             `len` is 8, because that position is outside a uint8_t).
 */
std::array<uint8_t, 2> closest(uint8_t set, uint8_t key) {
    // __builtin_clz has an undefined result for 0, so treat an empty key as
    // having bit width 0 instead of passing it to the builtin.
    // The `32 -` mirrors the original and relies on a 32-bit unsigned int.
    const int len = (key == 0)
        ? 0
        : 32 - __builtin_clz(static_cast<unsigned int>(key));

    // len is in [0, 8], so the shift cannot overflow an unsigned int.
    const unsigned int boundary = 1u << len;

    const uint8_t ent_lo = static_cast<uint8_t>((boundary - 1u) & set);
    const uint8_t ent_hi = static_cast<uint8_t>(boundary & set);
    return {ent_lo, ent_hi};
}
// Packs the most significant bit of each element into one byte.
// arr[0] supplies bit 7 of the result and arr[7] supplies bit 0.
// 1 -> element is 128 or above
// 0 -> element is 127 or below
uint8_t arr_to_byte(const std::array<uint8_t, 8>& arr) {
    uint8_t packed = 0;
    for (const uint8_t value : arr) {
        // Shift the bits collected so far up by one and append this
        // element's top bit; after eight rounds arr[0] sits in bit 7.
        packed = static_cast<uint8_t>((packed << 1) | (value >> 7));
    }
    return packed;
}
// Returns the sum of squared element-wise differences between x and y.
// No square root is taken, so the result is the squared Euclidean distance.
size_t euc_dist(const std::array<uint8_t, 8>& x, const std::array<uint8_t, 8>& y) {
    size_t sum_sq = 0;
    for (size_t idx = 0; idx != x.size(); ++idx) {
        // Subtract in int so a smaller-minus-larger pair goes negative
        // instead of wrapping; the square is non-negative either way.
        const int delta = static_cast<int>(x[idx]) - static_cast<int>(y[idx]);
        sum_sq += static_cast<size_t>(delta * delta);
    }
    return sum_sq;
}
/**
 * This discretizes the vector further into a byte that can be used to index the partition of the mash set.
 * This discretization retains some entropy, but does not distort distance in Euclidean space.
 */
// Demo driver: prints the closest() split for a sample set/key, then for two
// pairs of 8-element vectors prints each vector's packed byte, the squared
// difference of the packed bytes, and the euc_dist() of the full vectors.
int main(int argc, char const *argv[])
{
    const auto masks = closest(0b10110101, 0b11001);
    std::cout << std::bitset<8>(masks[0]) << "\n";
    std::cout << std::bitset<8>(masks[1]) << "\n";

    // First pair of vectors.
    const std::array<uint8_t, 8> first_a = {221, 45, 129, 170, 48, 58, 92, 240};
    const std::array<uint8_t, 8> first_b = {121, 145, 129, 70, 97, 18, 32, 210};
    const uint8_t first_a_byte = arr_to_byte(first_a);
    const uint8_t first_b_byte = arr_to_byte(first_b);
    // Difference taken in int so it may be negative before squaring.
    const int first_delta = first_b_byte - first_a_byte;
    std::cout << std::bitset<8>(first_a_byte) << "\n";
    std::cout << std::bitset<8>(first_b_byte) << "\n";
    std::cout << static_cast<size_t>(first_delta * first_delta) << "\n";
    std::cout << euc_dist(first_a, first_b) << "\n";

    // Second pair of vectors.
    const std::array<uint8_t, 8> second_a = {21, 45, 129, 170, 48, 58, 82, 240};
    const std::array<uint8_t, 8> second_b = {121, 145, 29, 70, 197, 19, 32, 210};
    const uint8_t second_a_byte = arr_to_byte(second_a);
    const uint8_t second_b_byte = arr_to_byte(second_b);
    const int second_delta = second_b_byte - second_a_byte;
    std::cout << std::bitset<8>(second_a_byte) << "\n";
    std::cout << std::bitset<8>(second_b_byte) << "\n";
    std::cout << static_cast<size_t>(second_delta * second_delta) << "\n";
    std::cout << euc_dist(second_a, second_b) << "\n";

    return 0;
}
>>> 0b1100 & 0b10100
4
>>> (0b1100 & 0b10100) ^ 0b1100
8
>>> (0b1100 & 0b10100) ^ 0b11100
24
>>> (0b1100 & 0b10100) ^ 0b1100
8
>>> (0b1100 & 0b11100) ^ 0b1100
0
>>> (0b1100 & 0b11100) ^ ~0b1100
-1
>>> (0b1100 & 0b11100) ^ 0b11110011
255
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment