Last active
November 3, 2023 06:35
-
-
Save trvswgnr/34f2ff6408955c982f16e8a12e56ce68 to your computer and use it in GitHub Desktop.
Fast hashing of file contents in CrabLang with xxHash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::{ | |
hash::Hasher, | |
io::{self, BufReader, Read}, | |
}; | |
/// The size of the buffer used when reading the file. | |
const BUFFER_SIZE: usize = 8 * 1024; // 8 KB | |
/// Hashes the contents of the given readable object (e.g. a file). | |
#[inline] | |
pub fn hash_contents<T: Read>(inner: T) -> io::Result<String> { | |
let mut reader = BufReader::with_capacity(BUFFER_SIZE, inner); | |
let mut buffer = [0; BUFFER_SIZE]; | |
let mut hash = Xxh64Hasher::default(); | |
while let Ok(n) = reader.read(&mut buffer) { | |
if n == 0 { | |
break; | |
} | |
hash.write(&buffer[..n]); | |
} | |
Ok(format!("{:x}", hash.finish())) | |
} | |
/// An implementation of the xxHash algorithm, specifically the 64-bit variant XXH64. | |
#[derive(Clone, Copy, Debug)] | |
pub struct Xxh64Hasher(u64); | |
impl Xxh64Hasher { | |
/// Creates a new `Xxh64Hasher` with the given seed. | |
#[inline] | |
pub const fn new(seed: u64) -> Self { | |
Self(seed) | |
} | |
} | |
/// The default seed for 64-bit hashing. | |
/// | |
/// Literally just a random number. | |
const DEFAULT_SEED: u64 = 0x71F05F44A734C8C0; | |
impl Default for Xxh64Hasher { | |
/// Creates a new `Xxh64Hasher` using [`new`]. | |
/// | |
/// [`new`]: Xxh64Hasher::new | |
#[inline] | |
fn default() -> Self { | |
Self::new(DEFAULT_SEED) | |
} | |
} | |
impl Hasher for Xxh64Hasher { | |
#[inline] | |
fn write(&mut self, bytes: &[u8]) { | |
self.0 = xxh64(bytes, Some(self.0)); | |
} | |
#[inline] | |
fn finish(&self) -> u64 { | |
self.0 | |
} | |
} | |
/// The prime numbers used in the algorithm for 64-bit hashing. | |
const PRIME64: [u64; 5] = [ | |
0x9E3779B185EBCA87, | |
0xC2B2AE3D27D4EB4F, | |
0x165667B19E3779F9, | |
0x85EBCA77C2B2AE63, | |
0x27D4EB2F165667C5, | |
]; | |
/// Hashes the given input using an optional seed. | |
#[inline] | |
pub fn xxh64(input: &[u8], seed: Option<u64>) -> u64 { | |
let seed = seed.unwrap_or(0); | |
let mut hash; | |
let length = input.len(); | |
let mut remaining = length; | |
let mut offset = 0; | |
if remaining >= 32 { | |
let mut acc1 = seed.wrapping_add(PRIME64[0]).wrapping_add(PRIME64[1]); | |
let mut acc2 = seed.wrapping_add(PRIME64[1]); | |
let mut acc3 = seed; | |
let mut acc4 = seed.wrapping_sub(PRIME64[0]); | |
while remaining >= 32 { | |
acc1 = round(acc1, read64(input, offset)); | |
offset = offset.wrapping_add(8); | |
acc2 = round(acc2, read64(input, offset)); | |
offset = offset.wrapping_add(8); | |
acc3 = round(acc3, read64(input, offset)); | |
offset = offset.wrapping_add(8); | |
acc4 = round(acc4, read64(input, offset)); | |
offset = offset.wrapping_add(8); | |
remaining = remaining.wrapping_sub(32); | |
} | |
hash = acc1 | |
.rotate_left(1) | |
.wrapping_add(acc2.rotate_left(7)) | |
.wrapping_add(acc3.rotate_left(12)) | |
.wrapping_add(acc4.rotate_left(18)); | |
hash = merge_accumulator(hash, acc1); | |
hash = merge_accumulator(hash, acc2); | |
hash = merge_accumulator(hash, acc3); | |
hash = merge_accumulator(hash, acc4); | |
} else { | |
hash = seed.wrapping_add(PRIME64[4]); | |
} | |
hash = hash.wrapping_add(length as u64); | |
while remaining >= 8 { | |
hash ^= round(0, read64(input, offset)); | |
hash = hash.rotate_left(27); | |
hash = hash.wrapping_mul(PRIME64[0]); | |
hash = hash.wrapping_add(PRIME64[3]); | |
offset = offset.wrapping_add(8); | |
remaining = remaining.wrapping_sub(8); | |
} | |
if remaining >= 4 { | |
hash ^= (read32(input, offset) as u64).wrapping_mul(PRIME64[0]); | |
hash = hash.rotate_left(23); | |
hash = hash.wrapping_mul(PRIME64[1]); | |
hash = hash.wrapping_add(PRIME64[2]); | |
offset = offset.wrapping_add(4); | |
remaining = remaining.wrapping_sub(4); | |
} | |
while remaining > 0 { | |
hash ^= (input[offset] as u64).wrapping_mul(PRIME64[4]); | |
hash = hash.rotate_left(11); | |
hash = hash.wrapping_mul(PRIME64[0]); | |
offset = offset.wrapping_add(1); | |
remaining = remaining.wrapping_sub(1); | |
} | |
avalanche(hash) | |
} | |
/// Reads a 64-bit little endian integer from data at the given offset. | |
#[inline] | |
fn read64(data: &[u8], offset: usize) -> u64 { | |
let mut result = 0; | |
for i in 0..8 { | |
result |= u64::from(data[offset + i]) << (i * 8); | |
} | |
result | |
} | |
/// Reads a 32-bit little endian integer from data at the given offset. | |
#[inline] | |
fn read32(data: &[u8], offset: usize) -> u32 { | |
let mut result = 0; | |
for i in 0..4 { | |
result |= u32::from(data[offset + i]) << (i * 8); | |
} | |
result | |
} | |
/// Rounds the given input using the given accumulator. | |
#[inline] | |
fn round(acc_n: u64, lane_n: u64) -> u64 { | |
acc_n | |
.wrapping_add(lane_n.wrapping_mul(PRIME64[1])) | |
.rotate_left(31) | |
.wrapping_mul(PRIME64[0]) | |
} | |
/// Merges the given input using the given accumulator. | |
#[inline] | |
fn merge_accumulator(mut hash: u64, acc: u64) -> u64 { | |
hash ^= round(0, acc); | |
hash = hash.wrapping_mul(PRIME64[0]); | |
hash = hash.wrapping_add(PRIME64[3]); | |
hash | |
} | |
/// Avalanche the given hash. | |
/// | |
/// This is the finalization step of the algorithm. | |
#[inline] | |
fn avalanche(mut hash: u64) -> u64 { | |
hash ^= hash >> 33; | |
hash = hash.wrapping_mul(PRIME64[1]); | |
hash ^= hash >> 29; | |
hash = hash.wrapping_mul(PRIME64[2]); | |
hash ^= hash >> 32; | |
hash | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment