Created
August 21, 2025 05:21
-
-
Save jweinst1/3476cdd85465f16a05beb26e9e1f0f3c to your computer and use it in GitHub Desktop.
quantize floats to small values
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <bitset>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>
/// @brief Quantizes a float vector into a bit code of at most 16 bits.
///
/// The vector is cut into `numBits` contiguous groups whose sizes differ by at
/// most one element (the first `vec.size() % numBits` groups get the extra
/// element). Bit `b` of the result is set when the mean of group `b` is
/// strictly greater than the threshold. When the vector has fewer elements
/// than `numBits`, the trailing groups are empty and their bits stay clear.
///
/// Threshold selection, in priority order:
///   1. `customThreshold`, when it is not NaN;
///   2. the mean of the whole vector, when `useGlobalMean` is true;
///   3. 0 otherwise.
///
/// Sums are accumulated in double precision so that small elements are not
/// absorbed by large ones within a group (e.g. {1e8, 1, -1e8} sums to 1, not 0).
///
/// @param vec             Input values.
/// @param numBits         Number of groups / output bits; must be in [1, 16].
/// @param useGlobalMean   Use the mean of `vec` as threshold (ignored when a
///                        custom threshold is supplied).
/// @param customThreshold Explicit threshold; NaN means "not specified".
/// @return The bit code, or 0 when `vec` is empty or `numBits` is out of range.
uint16_t quantizeVectorToBits(const std::vector<float>& vec,
                              int numBits,
                              bool useGlobalMean = false,
                              float customThreshold = NAN)
{
    if (vec.empty() || numBits < 1 || numBits > 16) return 0;

    const std::size_t total = vec.size();
    // Safe: numBits was validated to be in [1, 16] above.
    const std::size_t bitCount = static_cast<std::size_t>(numBits);

    // Determine threshold (kept in double so it is compared at the same
    // precision as the group means).
    double threshold = 0.0;
    if (!std::isnan(customThreshold)) {
        threshold = customThreshold;  // user-specified
    } else if (useGlobalMean) {
        threshold = std::accumulate(vec.begin(), vec.end(), 0.0) /
                    static_cast<double>(total);  // global mean
    }

    // Divide dimensions into groups as evenly as possible.
    const std::size_t baseGroupSize = total / bitCount;
    const std::size_t remainder = total % bitCount;

    uint16_t result = 0;
    auto groupBegin = vec.begin();
    for (std::size_t bit = 0; bit < bitCount; ++bit) {
        const std::size_t groupSize = baseGroupSize + (bit < remainder ? 1 : 0);
        if (groupSize == 0) continue;  // more bits than elements: leave bit clear

        const auto groupEnd = groupBegin + static_cast<std::ptrdiff_t>(groupSize);
        const double mean = std::accumulate(groupBegin, groupEnd, 0.0) /
                            static_cast<double>(groupSize);
        groupBegin = groupEnd;

        if (mean > threshold) {
            result = static_cast<uint16_t>(result | (1u << bit));
        }
    }
    return result;
}
/// @brief Quantizes a float vector into a 10-bit code.
///
/// The vector is cut into 10 contiguous groups whose sizes differ by at most
/// one element (the first `vec.size() % 10` groups get the extra element).
/// Bit `b` of the result is set when the mean of group `b` is strictly greater
/// than `threshold`. With fewer than 10 elements the trailing groups are empty
/// and their bits stay clear. A NaN threshold never compares greater, so no
/// bit is set in that case.
///
/// Group sums are accumulated in double precision so that small elements are
/// not absorbed by large ones (e.g. {1e8, 1, -1e8} sums to 1, not 0).
///
/// @param vec       Input values.
/// @param threshold Value each group mean is compared against (default 0).
/// @return The 10-bit code in the low bits of the result; 0 for an empty vector.
uint16_t quantizeVectorTo10Bits(const std::vector<float>& vec, float threshold = 0.0f) {
    constexpr std::size_t kBitCount = 10;

    const std::size_t total = vec.size();
    if (total == 0) return 0;

    // Divide dimensions into 10 groups as evenly as possible.
    const std::size_t baseGroupSize = total / kBitCount;
    const std::size_t remainder = total % kBitCount;

    uint16_t result = 0;
    auto groupBegin = vec.begin();
    for (std::size_t bit = 0; bit < kBitCount; ++bit) {
        // Group size (the first `remainder` groups get one extra element).
        const std::size_t groupSize = baseGroupSize + (bit < remainder ? 1 : 0);
        if (groupSize == 0) continue;  // fewer than 10 elements: leave bit clear

        // Compute group mean in double precision.
        const auto groupEnd = groupBegin + static_cast<std::ptrdiff_t>(groupSize);
        const double mean = std::accumulate(groupBegin, groupEnd, 0.0) /
                            static_cast<double>(groupSize);
        groupBegin = groupEnd;

        // Compare to threshold -> set bit.
        if (mean > threshold) {
            result = static_cast<uint16_t>(result | (1u << bit));
        }
    }
    return result;
}
// Example usage | |
int main() { | |
std::vector<float> vec(50); | |
for (int i = 0; i < 50; i++) { | |
vec[i] = std::sin(i * 0.2f); // example values | |
} | |
uint16_t q = quantizeVectorTo10Bits(vec, 0.0f); | |
std::cout << "Quantized 10-bit code: " << std::bitset<10>(q) << "\n"; | |
std::cout << "As integer: " << q << "\n"; | |
{ | |
std::vector<float> vec1(42); | |
for (int i = 0; i < 42; i++) { | |
vec1[i] = (i % 7) - 3; // example pattern | |
} | |
std::cout << "Custom threshold = 0.5\n"; | |
uint16_t q1 = quantizeVectorToBits(vec1, 10, false, 0.5f); | |
std::cout << "10 bits -> " << std::bitset<16>(q1) << " (int=" << q1 << ")\n"; | |
std::cout << "\nGlobal mean threshold\n"; | |
uint16_t q2 = quantizeVectorToBits(vec1, 10, true); | |
std::cout << "10 bits -> " << std::bitset<16>(q2) << " (int=" << q2 << ")\n"; | |
std::cout << "\nFixed threshold = 0.0\n"; | |
uint16_t q3 = quantizeVectorToBits(vec1, 10); | |
std::cout << "10 bits -> " << std::bitset<16>(q3) << " (int=" << q3 << ")\n"; | |
} | |
return 0; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment