Created
May 21, 2025 14:29
-
-
Save folkertdev/0561efe5f779f5b5cd57d14cc57fbe18 to your computer and use it in GitHub Desktop.
benchmark `compare256` performance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "compare256-benchmark"
version = "0.1.0"
edition = "2024"

[dev-dependencies]
divan = "0.1.21"

[[bench]]
name = "compare256"
harness = false
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8};
fn main() { | |
// Run registered benchmarks. | |
divan::main(); | |
} | |
/// Base input buffer: 256 bytes where the byte at index `i` holds the value `i`.
const X: [u8; 256] = {
    let mut result = [0; 256];

    // `for` loops are not usable in const evaluation, hence the manual `while`.
    let mut i = 0;
    while i < result.len() {
        // `i` is always below 256 here, so the cast never truncates.
        result[i] = i as u8;
        i += 1;
    }

    result
};
const A: &[u8; 256] = &X; | |
const B: &[u8; 256] = &{ | |
let mut result = X; | |
result[128] += 1; | |
result | |
}; | |
const ARGS: [(&[u8; 256], &[u8; 256]); 2] = [(A, A), (A, B)]; | |
#[divan::bench(args = ARGS)] | |
pub fn compare256_old((src0, src1): (&[u8; 256], &[u8; 256])) -> usize { | |
src0.iter().zip(src1).take_while(|(x, y)| x == y).count() | |
} | |
#[divan::bench(args = ARGS)] | |
pub fn compare256_new(args: (&[u8; 256], &[u8; 256])) -> usize { | |
unsafe { compare256_new_helper(args) } | |
} | |
/// Counts how many leading bytes of `src0` and `src1` are equal, comparing
/// 16 bytes per step with SSE2 instructions.
///
/// Returns the index of the first differing byte, or 256 when the buffers are
/// identical.
///
/// # Safety
///
/// The caller must ensure that the executing CPU supports the `sse2` and
/// `bmi1` target features this function is compiled with.
#[target_feature(enable = "sse2,bmi1")]
pub unsafe fn compare256_new_helper((src0, src1): (&[u8; 256], &[u8; 256])) -> usize {
    // Number of bytes held by one 128-bit SSE register.
    const LANES: usize = 16;

    let src0 = src0.chunks_exact(LANES);
    let src1 = src1.chunks_exact(LANES);

    // Number of bytes already known to be equal.
    let mut len = 0;

    for (chunk0, chunk1) in src0.zip(src1) {
        // SAFETY: `chunks_exact(16)` only yields slices of exactly 16 bytes, so
        // each unaligned 16-byte load stays inside its chunk. The required
        // target features are guaranteed by this function's safety contract.
        let mask = unsafe {
            // load the next chunks into a simd register
            let xmm_src0 = _mm_loadu_si128(chunk0.as_ptr() as *const __m128i);
            let xmm_src1 = _mm_loadu_si128(chunk1.as_ptr() as *const __m128i);

            // element-wise compare of the 8-bit elements: equal lanes become
            // 0xFF, differing lanes become 0x00
            let xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

            // collapse the 16 * 8-bit vector into a 16-bit integer by taking
            // the most significant bit of every lane, so bit `i` is set iff
            // byte `i` matched. Only the low 16 bits of the `i32` are used, so
            // the cast to `u16` loses nothing.
            _mm_movemask_epi8(xmm_cmp) as u16
        };

        // Anything other than all-ones means at least one lane differs; the
        // number of trailing one bits is the count of matching bytes before it.
        if mask != u16::MAX {
            return len + mask.trailing_ones() as usize;
        }

        len += LANES;
    }

    // Every chunk matched.
    256
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment