Last active
September 13, 2025 20:48
-
-
Save jweinst1/433a36e2162ba6db12503529febc0f48 to your computer and use it in GitHub Desktop.
transform raw floats into thermometer encoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <array> | |
| #include <vector> | |
| #include <cstdint> | |
| #include <cstring> | |
| #include <cstdlib> | |
| #include <ctime> | |
| #include <cmath> | |
| #include <algorithm> | |
| #include <iostream> | |
| #include <numeric> | |
| #include <random> | |
| #include <unordered_set> | |
| /* | |
| Query vector: [ 0.437273 0.391428 0.991635 0.67261 0.150629 0.322869 0.474333 0.63942 ] [ 111 100 253 172 38 82 121 163 ] | |
| euc: 917 euc-dist: 0.413847 [ 0.412045 0.321312 0.989504 0.705086 0.508689 0.49563 0.499615 0.562188 ] abs: 917 abs-dist: 194 [ 105 82 253 180 130 126 127 143 ] | |
| euc: 491 euc-dist: 0.361312 [ 0.505731 0.511975 0.790707 0.724153 0.298089 0.282344 0.262437 0.643086 ] abs: 491 abs-dist: 216 [ 129 131 202 185 76 72 67 164 ] | |
| */ | |
| using namespace std; | |
| // ---- Quantizer ---- | |
| template<size_t arrSize> | |
| void convertFloatToInt(std::array<uint32_t, arrSize>& dst, const std::array<float, arrSize>& src) { | |
| static_assert(sizeof(uint32_t) == sizeof(float)); | |
| static_assert(sizeof(float) == 4); | |
| memcpy(dst.data(), src.data(), arrSize * sizeof(float)); | |
| } | |
| static void printRawFloat(uint32_t number) { | |
| float foo; | |
| memcpy(&foo, &number, sizeof(float)); | |
| std::cout << foo << " "; | |
| } | |
| static uint32_t discretizeMantissa(uint32_t significand, uint32_t mantissa) { | |
| // todo check if this makes sense?? | |
| uint32_t discrete = significand; | |
| uint8_t divisior = 1; | |
| size_t i = 22; | |
| do { | |
| if( (mantissa >> i) & 1) discrete += significand >> divisior; | |
| ++divisior; | |
| if (i == 0) break; | |
| --i; | |
| } while (true); | |
| return discrete; | |
| } | |
| inline uint32_t quantizeRawFloat(uint32_t number) { | |
| /* | |
| Average Spearman correlation: 0.999967 | |
| Average Top-10 Recall: 0.9919 | |
| */ | |
| uint32_t exponent = (number >> 23) & 0xFF; | |
| std::cout << "exponent " << exponent; | |
| uint32_t mantissa = number & 0x7FFFFF; | |
| uint32_t exp_lvl = exponent & 0b111; | |
| if (exponent < 120) | |
| exp_lvl = 0; | |
| if (exponent >= 120) | |
| exp_lvl += 1; | |
| // todo, must be 1 << exp_level (discrete powers of 2 between exponents) | |
| uint32_t powered = 1 << (exp_lvl); | |
| uint32_t discrete_mode = discretizeMantissa(powered, mantissa); | |
| std::cout << " The discrete of " << powered << " is " << discrete_mode << " "; | |
| uint32_t completed = discrete_mode; | |
| printRawFloat(number); | |
| std::cout << std::bitset<23>(mantissa) << " " << completed << "\n"; | |
| return completed; | |
| } | |
| std::array<uint32_t, 8> quantizeToUintVec(const std::array<float, 8>& src) { | |
| std::array<uint32_t, 8> rawFloats; | |
| convertFloatToInt(rawFloats, src); | |
| std::array<uint32_t, 8> result; | |
| for (int i = 0; i < 8; i++) { | |
| result[i] = quantizeRawFloat(rawFloats[i]); | |
| } | |
| return result; | |
| } | |
| // ---- Distances ---- | |
| double euclidean(const std::array<float, 8>& a, const std::array<float, 8>& b) { | |
| double sum = 0.0; | |
| for (int i = 0; i < 8; i++) { | |
| double d = double(a[i]) - double(b[i]); | |
| sum += d * d; | |
| } | |
| return std::sqrt(sum); | |
| } | |
| int hamming(const std::array<uint32_t, 8>& a, const std::array<uint32_t, 8>& b) { | |
| int dist = 0; | |
| for (int i = 0; i < 8; i++) { | |
| dist += __builtin_popcount(a[i] ^ b[i]); | |
| } | |
| return dist; | |
| } | |
| int absolute_dist(const std::array<uint32_t, 8>& a, const std::array<uint32_t, 8>& b) { | |
| int dist = 0; | |
| for (int i = 0; i < 8; i++) { | |
| dist += (abs(int(a[i] - b[i])) * abs(int(a[i] - b[i]))); | |
| } | |
| return dist; | |
| } | |
| // ---- Ranking / Spearman ---- | |
| vector<int> rankOrder(const vector<double>& distances) { | |
| vector<int> idx(distances.size()); | |
| iota(idx.begin(), idx.end(), 0); | |
| sort(idx.begin(), idx.end(), [&](int i, int j) { | |
| return distances[i] < distances[j]; | |
| }); | |
| return idx; | |
| } | |
| double spearman(const vector<int>& a, const vector<int>& b) { | |
| int n = a.size(); | |
| vector<int> rankA(n), rankB(n); | |
| for (int i = 0; i < n; i++) { | |
| rankA[a[i]] = i; | |
| rankB[b[i]] = i; | |
| } | |
| double num = 0.0; | |
| for (int i = 0; i < n; i++) { | |
| double d = rankA[i] - rankB[i]; | |
| num += d * d; | |
| } | |
| return 1.0 - (6.0 * num) / (n * (n * n - 1)); | |
| } | |
| // ---- Recall@k ---- | |
| double topKRecall(const vector<int>& eRank, const vector<int>& hRank, int k) { | |
| unordered_set<int> eTop(eRank.begin(), eRank.begin() + k); | |
| int hit = 0; | |
| for (int i = 0; i < k; i++) { | |
| if (eTop.count(hRank[i])) hit++; | |
| } | |
| return double(hit) / k; | |
| } | |
| template<class T> | |
| void printTypedArr(const std::array<T, 8>& flv) { | |
| std::cout << " ["; | |
| for (const auto& f : flv) { | |
| std::cout << " " << f << " "; | |
| } | |
| std::cout << "] "; | |
| } | |
| static void printFloatQuant(float number) { | |
| uint32_t raw = 0; | |
| memcpy(&raw, &number, sizeof(float)); | |
| std::cout << number << " -> " << quantizeRawFloat(raw) << "\n"; | |
| } | |
| // ---- Main ---- | |
| int main() { | |
| mt19937 rng(time(nullptr)); | |
| uniform_real_distribution<float> dist(0.0, 1.0); | |
| const int N = 1000; // number of vectors | |
| const int D = 8; // dimensions | |
| const int K = 10; // top-k recall | |
| vector<array<float, D>> data(N); | |
| for (int i = 0; i < N; i++) { | |
| for (int j = 0; j < D; j++) { | |
| data[i][j] = dist(rng); | |
| } | |
| } | |
| vector<array<uint32_t, D>> qdata(N); | |
| for (int i = 0; i < N; i++) | |
| qdata[i] = quantizeToUintVec(data[i]); | |
| double totalSpearman = 0.0; | |
| double totalRecall = 0.0; | |
| for (int i = 0; i < N; i++) { | |
| vector<double> edist, hdist; | |
| for (int j = 0; j < N; j++) { | |
| if (i == j) continue; | |
| edist.push_back(euclidean(data[i], data[j])); | |
| hdist.push_back(absolute_dist(qdata[i], qdata[j])); | |
| } | |
| auto eRank = rankOrder(edist); | |
| auto hRank = rankOrder(hdist); | |
| totalSpearman += spearman(eRank, hRank); | |
| totalRecall += topKRecall(eRank, hRank, K); | |
| } | |
| cout << "Average Spearman correlation: " << totalSpearman / N << "\n"; | |
| cout << "Average Top-" << K << " Recall: " << totalRecall / N << "\n"; | |
| cout << "Starting Query Test\n"; | |
| std::array<float, D> newQuery; | |
| for (int i = 0; i < D; ++i) | |
| { | |
| newQuery[i] = dist(rng); | |
| } | |
| const auto queryVec = quantizeToUintVec(newQuery); | |
| std::vector<std::pair<int, double>> queryEucDists; | |
| std::vector<std::pair<int, int>> queryAbsDists; | |
| for (int i = 0; i < N; i++) { | |
| queryEucDists.push_back(std::make_pair(i, euclidean(newQuery, data[i]))); | |
| queryAbsDists.push_back(std::make_pair(i, absolute_dist(queryVec, qdata[i]))); | |
| } | |
| std::sort(queryEucDists.begin(), queryEucDists.end(), [](auto a, auto b) { | |
| return a.second < b.second; // Returns true if 'a' should come before 'b' | |
| }); | |
| std::sort(queryAbsDists.begin(), queryAbsDists.end(), [](auto a, auto b) { | |
| return a.second < b.second; // Returns true if 'a' should come before 'b' | |
| }); | |
| std::cout << "Query vector: "; | |
| printTypedArr<float>(newQuery); | |
| printTypedArr<uint32_t>(queryVec); | |
| std::cout << "\n"; | |
| for (int i = 0; i < 10; ++i) | |
| { | |
| std::cout << "euc: " << (uint32_t)queryEucDists[i].first | |
| << " euc-dist: " << (double)queryEucDists[i].second; | |
| printTypedArr<float>(data[queryEucDists[i].first]); | |
| std::cout << " abs: " << (uint32_t)queryAbsDists[i].first | |
| << " abs-dist: " << (uint32_t)queryAbsDists[i].second; | |
| printTypedArr<uint32_t>(qdata[queryAbsDists[i].first]); | |
| std::cout << "\n"; | |
| } | |
| return 0; | |
| } | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <array> | |
| #include <vector> | |
| #include <cstdint> | |
| #include <cstring> | |
| #include <cstdlib> | |
| #include <ctime> | |
| #include <cmath> | |
| #include <algorithm> | |
| #include <iostream> | |
| #include <numeric> | |
| #include <random> | |
| #include <unordered_set> | |
| #include <bitset> | |
| static uint8_t determineExponent(uint8_t quant) { | |
| #if defined(__GNUC__) || defined(__clang__) | |
| return quant ? 31 - __builtin_clz(quant) : 0; | |
| #else | |
| if (quant & (1 << 7)) | |
| return 7; | |
| if (quant & (1 << 6)) | |
| return 6; | |
| if (quant & (1 << 5)) | |
| return 5; | |
| if (quant & (1 << 4)) | |
| return 4; | |
| if (quant & (1 << 3)) | |
| return 3; | |
| if (quant & (1 << 2)) | |
| return 2; | |
| if (quant & (1 << 1)) | |
| return 1; | |
| return 0; | |
| #endif | |
| } | |
| int main(int argc, char const *argv[]) | |
| { | |
| float start = 0.82; | |
| printf("%f\n", start); | |
| uint32_t raw = 0; | |
| memcpy(&raw, &start, sizeof(float)); | |
| int8_t exponent = -(120 - ((raw >> 23) & 0xFF)); | |
| if (exponent < 0) | |
| exponent = 0; | |
| else | |
| ++exponent; | |
| uint32_t mantissa = raw & 0x7FFFFF; | |
| std::cout << std::bitset<23>(mantissa) << "\n"; | |
| uint8_t discrete = (1 << exponent) | (mantissa >> (23 - exponent)); | |
| printf("%u\n", discrete); | |
| uint32_t recovered_mant = discrete & ((1 << exponent) - 1); | |
| uint8_t determined_exp = determineExponent(discrete); | |
| printf("exponent is %u\n", determined_exp); | |
| exponent = determined_exp; | |
| uint32_t constructed_f_raw = (120 + (exponent > 0 ? exponent - 1 : 0)) << 23 | (recovered_mant << (23 - exponent)); | |
| float new_f = 0.0; | |
| memcpy(&new_f, &constructed_f_raw, sizeof(uint32_t)); | |
| printf("%f\n", new_f); | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <array> | |
| #include <vector> | |
| #include <cstdint> | |
| #include <cstring> | |
| #include <ctime> | |
| #include <cmath> | |
| #include <algorithm> | |
| #include <iostream> | |
| #include <numeric> | |
| #include <random> | |
| #include <unordered_set> | |
| using namespace std; | |
| // ---- Quantizer ---- | |
| template<size_t arrSize> | |
| void convertFloatToInt(std::array<uint32_t, arrSize>& dst, const std::array<float, arrSize>& src) { | |
| static_assert(sizeof(uint32_t) == sizeof(float)); | |
| static_assert(sizeof(float) == 4); | |
| memcpy(dst.data(), src.data(), arrSize * sizeof(float)); | |
| } | |
| static void printRawFloat(uint32_t number) { | |
| float foo; | |
| memcpy(&foo, &number, sizeof(float)); | |
| std::cout << foo << " "; | |
| } | |
| inline uint16_t quantizeRawFloat(uint32_t number) { | |
| /* | |
| Average Spearman correlation: 0.930783 | |
| Average Top-10 Recall: 0.746 | |
| */ | |
| uint32_t exponent = (number >> 23) & 0xFF; | |
| uint32_t mantissa = number & 0x7FFFFF; | |
| uint32_t exp_lvl = exponent & 0b111; | |
| if (exponent < 120) { | |
| exp_lvl = 0; | |
| } | |
| if (exp_lvl == 4) { | |
| // scaled up one | |
| exp_lvl += 1; | |
| exp_lvl += (mantissa >> 22); | |
| } else if (exp_lvl == 5) { | |
| // scaled up two | |
| exp_lvl += 2; | |
| exp_lvl += (mantissa >> 21); | |
| } else if (exp_lvl == 6) { | |
| // scaled up four | |
| // the highest exponent segment gets the most mantissa bits | |
| exp_lvl += 4; | |
| uint32_t mant6_val = mantissa >> 20; | |
| if (mant6_val > 6) | |
| mant6_val = 6; | |
| exp_lvl += mant6_val; | |
| } | |
| uint32_t exp_expand = (1 << (exp_lvl)) - 1; | |
| uint16_t completed = exp_expand; | |
| printRawFloat(number); | |
| std::cout << std::bitset<23>(mantissa) << " " << std::bitset<16>(completed) << "\n"; | |
| return completed; | |
| } | |
| std::array<uint16_t, 8> quantizeToUintVec(const std::array<float, 8>& src) { | |
| std::array<uint32_t, 8> rawFloats; | |
| convertFloatToInt(rawFloats, src); | |
| std::array<uint16_t, 8> result; | |
| for (int i = 0; i < 8; i++) { | |
| result[i] = quantizeRawFloat(rawFloats[i]); | |
| } | |
| return result; | |
| } | |
| // ---- Distances ---- | |
| double euclidean(const std::array<float, 8>& a, const std::array<float, 8>& b) { | |
| double sum = 0.0; | |
| for (int i = 0; i < 8; i++) { | |
| double d = double(a[i]) - double(b[i]); | |
| sum += d * d; | |
| } | |
| return std::sqrt(sum); | |
| } | |
| int hamming(const std::array<uint16_t, 8>& a, const std::array<uint16_t, 8>& b) { | |
| int dist = 0; | |
| for (int i = 0; i < 8; i++) { | |
| dist += __builtin_popcount(a[i] ^ b[i]); | |
| } | |
| return dist; | |
| } | |
| // ---- Ranking / Spearman ---- | |
| vector<int> rankOrder(const vector<double>& distances) { | |
| vector<int> idx(distances.size()); | |
| iota(idx.begin(), idx.end(), 0); | |
| sort(idx.begin(), idx.end(), [&](int i, int j) { | |
| return distances[i] < distances[j]; | |
| }); | |
| return idx; | |
| } | |
| double spearman(const vector<int>& a, const vector<int>& b) { | |
| int n = a.size(); | |
| vector<int> rankA(n), rankB(n); | |
| for (int i = 0; i < n; i++) { | |
| rankA[a[i]] = i; | |
| rankB[b[i]] = i; | |
| } | |
| double num = 0.0; | |
| for (int i = 0; i < n; i++) { | |
| double d = rankA[i] - rankB[i]; | |
| num += d * d; | |
| } | |
| return 1.0 - (6.0 * num) / (n * (n * n - 1)); | |
| } | |
| // ---- Recall@k ---- | |
| double topKRecall(const vector<int>& eRank, const vector<int>& hRank, int k) { | |
| unordered_set<int> eTop(eRank.begin(), eRank.begin() + k); | |
| int hit = 0; | |
| for (int i = 0; i < k; i++) { | |
| if (eTop.count(hRank[i])) hit++; | |
| } | |
| return double(hit) / k; | |
| } | |
| // ---- Main ---- | |
| int main() { | |
| mt19937 rng(time(nullptr)); | |
| uniform_real_distribution<float> dist(0.0, 1.0); | |
| const int N = 100; // number of vectors | |
| const int D = 8; // dimensions | |
| const int K = 10; // top-k recall | |
| vector<array<float, D>> data(N); | |
| for (int i = 0; i < N; i++) { | |
| for (int j = 0; j < D; j++) { | |
| data[i][j] = dist(rng); | |
| } | |
| } | |
| vector<array<uint16_t, D>> qdata(N); | |
| for (int i = 0; i < N; i++) | |
| qdata[i] = quantizeToUintVec(data[i]); | |
| double totalSpearman = 0.0; | |
| double totalRecall = 0.0; | |
| for (int i = 0; i < N; i++) { | |
| vector<double> edist, hdist; | |
| for (int j = 0; j < N; j++) { | |
| if (i == j) continue; | |
| edist.push_back(euclidean(data[i], data[j])); | |
| hdist.push_back(hamming(qdata[i], qdata[j])); | |
| } | |
| auto eRank = rankOrder(edist); | |
| auto hRank = rankOrder(hdist); | |
| totalSpearman += spearman(eRank, hRank); | |
| totalRecall += topKRecall(eRank, hRank, K); | |
| } | |
| cout << "Average Spearman correlation: " << totalSpearman / N << "\n"; | |
| cout << "Average Top-" << K << " Recall: " << totalRecall / N << "\n"; | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment