Skip to content

Instantly share code, notes, and snippets.

@trvswgnr
Last active November 3, 2023 06:35
Show Gist options
  • Save trvswgnr/34f2ff6408955c982f16e8a12e56ce68 to your computer and use it in GitHub Desktop.
Save trvswgnr/34f2ff6408955c982f16e8a12e56ce68 to your computer and use it in GitHub Desktop.
Fast hashing of file contents in CrabLang with xxHash
use std::{
hash::Hasher,
io::{self, BufReader, Read},
};
/// The size of the buffer used when reading the file.
const BUFFER_SIZE: usize = 8 * 1024; // 8 KB
/// Hashes the contents of the given readable object (e.g. a file).
#[inline]
pub fn hash_contents<T: Read>(inner: T) -> io::Result<String> {
let mut reader = BufReader::with_capacity(BUFFER_SIZE, inner);
let mut buffer = [0; BUFFER_SIZE];
let mut hash = Xxh64Hasher::default();
while let Ok(n) = reader.read(&mut buffer) {
if n == 0 {
break;
}
hash.write(&buffer[..n]);
}
Ok(format!("{:x}", hash.finish()))
}
/// An implementation of the xxHash algorithm, specifically the 64-bit variant XXH64.
#[derive(Clone, Copy, Debug)]
pub struct Xxh64Hasher(u64);
impl Xxh64Hasher {
/// Creates a new `Xxh64Hasher` with the given seed.
#[inline]
pub const fn new(seed: u64) -> Self {
Self(seed)
}
}
/// The default seed for 64-bit hashing.
///
/// Literally just a random number.
const DEFAULT_SEED: u64 = 0x71F05F44A734C8C0;
impl Default for Xxh64Hasher {
/// Creates a new `Xxh64Hasher` using [`new`].
///
/// [`new`]: Xxh64Hasher::new
#[inline]
fn default() -> Self {
Self::new(DEFAULT_SEED)
}
}
impl Hasher for Xxh64Hasher {
#[inline]
fn write(&mut self, bytes: &[u8]) {
self.0 = xxh64(bytes, Some(self.0));
}
#[inline]
fn finish(&self) -> u64 {
self.0
}
}
/// The prime numbers used in the algorithm for 64-bit hashing.
const PRIME64: [u64; 5] = [
0x9E3779B185EBCA87,
0xC2B2AE3D27D4EB4F,
0x165667B19E3779F9,
0x85EBCA77C2B2AE63,
0x27D4EB2F165667C5,
];
/// Hashes the given input using an optional seed.
#[inline]
pub fn xxh64(input: &[u8], seed: Option<u64>) -> u64 {
let seed = seed.unwrap_or(0);
let mut hash;
let length = input.len();
let mut remaining = length;
let mut offset = 0;
if remaining >= 32 {
let mut acc1 = seed.wrapping_add(PRIME64[0]).wrapping_add(PRIME64[1]);
let mut acc2 = seed.wrapping_add(PRIME64[1]);
let mut acc3 = seed;
let mut acc4 = seed.wrapping_sub(PRIME64[0]);
while remaining >= 32 {
acc1 = round(acc1, read64(input, offset));
offset = offset.wrapping_add(8);
acc2 = round(acc2, read64(input, offset));
offset = offset.wrapping_add(8);
acc3 = round(acc3, read64(input, offset));
offset = offset.wrapping_add(8);
acc4 = round(acc4, read64(input, offset));
offset = offset.wrapping_add(8);
remaining = remaining.wrapping_sub(32);
}
hash = acc1
.rotate_left(1)
.wrapping_add(acc2.rotate_left(7))
.wrapping_add(acc3.rotate_left(12))
.wrapping_add(acc4.rotate_left(18));
hash = merge_accumulator(hash, acc1);
hash = merge_accumulator(hash, acc2);
hash = merge_accumulator(hash, acc3);
hash = merge_accumulator(hash, acc4);
} else {
hash = seed.wrapping_add(PRIME64[4]);
}
hash = hash.wrapping_add(length as u64);
while remaining >= 8 {
hash ^= round(0, read64(input, offset));
hash = hash.rotate_left(27);
hash = hash.wrapping_mul(PRIME64[0]);
hash = hash.wrapping_add(PRIME64[3]);
offset = offset.wrapping_add(8);
remaining = remaining.wrapping_sub(8);
}
if remaining >= 4 {
hash ^= (read32(input, offset) as u64).wrapping_mul(PRIME64[0]);
hash = hash.rotate_left(23);
hash = hash.wrapping_mul(PRIME64[1]);
hash = hash.wrapping_add(PRIME64[2]);
offset = offset.wrapping_add(4);
remaining = remaining.wrapping_sub(4);
}
while remaining > 0 {
hash ^= (input[offset] as u64).wrapping_mul(PRIME64[4]);
hash = hash.rotate_left(11);
hash = hash.wrapping_mul(PRIME64[0]);
offset = offset.wrapping_add(1);
remaining = remaining.wrapping_sub(1);
}
avalanche(hash)
}
/// Reads a 64-bit little endian integer from data at the given offset.
#[inline]
fn read64(data: &[u8], offset: usize) -> u64 {
let mut result = 0;
for i in 0..8 {
result |= u64::from(data[offset + i]) << (i * 8);
}
result
}
/// Reads a 32-bit little endian integer from data at the given offset.
#[inline]
fn read32(data: &[u8], offset: usize) -> u32 {
let mut result = 0;
for i in 0..4 {
result |= u32::from(data[offset + i]) << (i * 8);
}
result
}
/// Rounds the given input using the given accumulator.
#[inline]
fn round(acc_n: u64, lane_n: u64) -> u64 {
acc_n
.wrapping_add(lane_n.wrapping_mul(PRIME64[1]))
.rotate_left(31)
.wrapping_mul(PRIME64[0])
}
/// Merges the given input using the given accumulator.
#[inline]
fn merge_accumulator(mut hash: u64, acc: u64) -> u64 {
hash ^= round(0, acc);
hash = hash.wrapping_mul(PRIME64[0]);
hash = hash.wrapping_add(PRIME64[3]);
hash
}
/// Avalanche the given hash.
///
/// This is the finalization step of the algorithm.
#[inline]
fn avalanche(mut hash: u64) -> u64 {
hash ^= hash >> 33;
hash = hash.wrapping_mul(PRIME64[1]);
hash ^= hash >> 29;
hash = hash.wrapping_mul(PRIME64[2]);
hash ^= hash >> 32;
hash
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment