Created
February 6, 2025 01:28
-
-
Save muminoff/8db643031ec97685fbde8d991f8e9672 to your computer and use it in GitHub Desktop.
HyperLogLog Rust Implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::hash::{Hash, Hasher}; | |
use std::collections::hash_map::DefaultHasher; | |
struct HyperLogLog { | |
registers: Vec<u8>, // Vector to store the maximum number of leading zeros seen. | |
m: usize, // Number of registers (m = 2^b). | |
b: u8, // Number of bits used for indexing. | |
} | |
impl HyperLogLog { | |
fn new(b: u8) -> Self { | |
let m = 1 << b; // m = 2^b | |
HyperLogLog { | |
registers: vec![0; m], | |
m, | |
b, | |
} | |
} | |
// Add an element to the HyperLogLog structure. | |
fn add(&mut self, value: &str) { | |
let hash = Self::hash(value); // Hash the input value. | |
let index = (hash >> (32 - self.b)) as usize; // Use the first `b` bits as the index. | |
// Safely compute the mask to extract the remaining bits. | |
let mask = if self.b < 32 { | |
(1_u32 << (32 - self.b)).wrapping_sub(1) | |
} else { | |
0 // If b == 32, there are no remaining bits. | |
}; | |
let w = hash & mask; // Mask to extract the remaining bits. | |
let p = w.leading_zeros() as u8 + 1; // Count leading zeros in the remaining bits. | |
// Update the register if this value has more leading zeros than previously seen. | |
self.registers[index] = self.registers[index].max(p); | |
} | |
// Estimate the number of unique elements. | |
fn estimate(&self) -> f64 { | |
let mut z = 0.0; | |
for ® in &self.registers { | |
z += 1.0 / (1u64 << reg) as f64; // Use 64-bit shift to prevent overflow | |
} | |
let alpha = if self.m == 16 { | |
0.673 | |
} else if self.m == 32 { | |
0.697 | |
} else if self.m == 64 { | |
0.709 | |
} else { | |
0.7213 / (1.0 + 1.079 / self.m as f64) | |
}; | |
let estimate = alpha * self.m as f64 * self.m as f64 / z; | |
// Small range correction | |
if estimate <= 2.5 * self.m as f64 { | |
let zeros = self.registers.iter().filter(|&&x| x == 0).count(); | |
if zeros != 0 { | |
return self.m as f64 * (self.m as f64 / zeros as f64).ln(); | |
} | |
} | |
estimate | |
} | |
// Simple hash function (for demonstration purposes only). | |
fn hash(value: &str) -> u32 { | |
let mut hasher = DefaultHasher::new(); | |
value.hash(&mut hasher); // Hash the string. | |
hasher.finish() as u32 // Return the hash as a 32-bit integer. | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment