/* This code is MIT/Apache-2.0 licensed or whatever you'd like if you ask me.

test shpfx       ... bench: 8,433,171 ns/iter (+/- 145,316) = 5843 MB/s
test shpfx_short ... bench:     4,885 ns/iter (+/- 117)     = 9852 MB/s
*/

#![feature(test)]

use std::ptr;
use std::cmp::min;

/// Read 8 bytes from `buf` starting at byte `i` as a little-endian u64.
/// This is an unaligned load, which is cheap on x86-64 (see the notes at the bottom).
unsafe fn load_u64_le(buf: &[u8], i: usize) -> u64 {
    debug_assert!(i + 8 <= buf.len());
    let mut data = 0u64;
    ptr::copy_nonoverlapping(buf.get_unchecked(i), &mut data as *mut _ as *mut u8, 8);
    // Byte-order fixup so the first byte in memory ends up in the least
    // significant bits; a no-op on little-endian targets and, as noted at the
    // bottom, a redundant leftover in this version.
    data.to_le()
}

#[inline(never)]
#[no_mangle]
pub fn shared_prefix(a: &[u8], b: &[u8]) -> usize {
    let len = min(a.len(), b.len());
    let mut a = &a[..len];
    let mut b = &b[..len];
    let mut offset = 0;
    // Compare 16 bytes per iteration using two unaligned 64-bit loads per side.
    while a.len() >= 16 {
        unsafe {
            let a0 = load_u64_le(a, 0);
            let a1 = load_u64_le(a, 8);
            let b0 = load_u64_le(b, 0);
            let b1 = load_u64_le(b, 8);
            let d0 = a0 ^ b0;
            let d1 = a1 ^ b1;
            // `d0 | d1` is zero only if all 16 bytes match. (Combining with
            // `^` would miss the case where both halves differ in exactly the
            // same bits; writing this as `d0 != 0 || d1 != 0` is equivalent,
            // but can change whether the optimizer emits two 64-bit loads or
            // one 128-bit load per side, as noted at the bottom.)
            if (d0 | d1) != 0 {
                break;
            }
        }
        offset += 16;
        a = &a[16..];
        b = &b[16..];
    }
    // Byte-by-byte scan of what's left; if the loop above broke out, the first
    // mismatch lies within the next 16 bytes.
    for i in 0..a.len() {
        if a[i] != b[i] {
            return offset + i;
        }
    }
    len
}

extern crate test;

use test::Bencher;

#[test]
fn correct() {
    let mut a = vec![0xff; 1024];
    let b = vec![0xff; 1024];
    // Skip byte 0xff: writing it into `a` would not create a mismatch with `b`.
    for byte in 0..255 {
        for i in 0..a.len() {
            a[i] = byte;
            let ans = shared_prefix(&a, &b);
            assert!(ans == i, "failed for index {} and byte {:x} (got ans={})",
                    i, byte, ans);
            a[i] = 0xff;
        }
    }
}

#[bench]
fn shpfx(bench: &mut Bencher) {
    let a = vec![0; 64 * 1024 * 1024];
    let mut b = vec![0; 64 * 1024 * 1024];
    const OFF: usize = 47 * 1024 * 1024;
    b[OFF] = 1;

    bench.iter(|| {
        shared_prefix(&a, &b);
    });
    bench.bytes = OFF as u64;
}

#[bench]
fn shpfx_short(bench: &mut Bencher) {
    let a = vec![0; 64 * 1024];
    let mut b = vec![0; 64 * 1024];
    const OFF: usize = 47 * 1024;
    b[OFF] = 1;

    bench.iter(|| {
        shared_prefix(&a, &b);
    });
    bench.bytes = OFF as u64;
}

I'm thinking it's pretty unlikely that both slices will be well aligned to 8- or 16-byte boundaries, so you'd simply go for unaligned loads, which are fast on x86-64 anyway. This code is probably similar to what you were already doing, and to what the memchr crate does. The optimizer switches between two separate 64-bit loads and a single 128-bit load depending on how the d0 and d1 variables are combined.
If more throughput is needed (probably only achievable for inputs that fit in one of the CPU caches, I guess), you could unroll the loop further, e.g. as in the sketch below.
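For illustration only, a rough sketch of a further-unrolled inner loop that compares 32 bytes per iteration; the step size and the way the four difference words are OR-combined are my own choices here, not something taken from the code above or measured:

    // Hypothetical 32-bytes-per-iteration variant of the inner loop in
    // shared_prefix; `a`, `b` and `offset` are the same locals as above.
    while a.len() >= 32 {
        unsafe {
            let d0 = load_u64_le(a, 0) ^ load_u64_le(b, 0);
            let d1 = load_u64_le(a, 8) ^ load_u64_le(b, 8);
            let d2 = load_u64_le(a, 16) ^ load_u64_le(b, 16);
            let d3 = load_u64_le(a, 24) ^ load_u64_le(b, 24);
            if (d0 | d1 | d2 | d3) != 0 {
                break;
            }
        }
        offset += 32;
        a = &a[32..];
        b = &b[32..];
    }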
You can also use trailing_zeros() / 8 on the xor result to get the mismatch offset directly, instead of re-scanning byte by byte (that's the reason for the .to_le() call in load_u64_le, so in this version it's actually a redundant leftover).
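A sketch of what that could look like, assuming it replaces the `break` inside the 16-byte loop of shared_prefix (placement and index arithmetic are my reading of the idea, not benchmarked here):

            // On a mismatch, the lowest differing byte sits in the lowest
            // non-zero bits of the little-endian xor word, so its index within
            // the 8-byte word is trailing_zeros() / 8.
            if d0 != 0 {
                return offset + (d0.trailing_zeros() / 8) as usize;
            }
            if d1 != 0 {
                return offset + 8 + (d1.trailing_zeros() / 8) as usize;
            }

With that in place, the trailing byte-by-byte loop only has to handle the final tail of fewer than 16 bytes, and the .to_le() in load_u64_le stops being redundant.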