Skip to content

Instantly share code, notes, and snippets.

@raphlinus
Last active October 29, 2024 21:24
Show Gist options
  • Save raphlinus/cbfe5f39eddc7a378995168e6a02a245 to your computer and use it in GitHub Desktop.
Save raphlinus/cbfe5f39eddc7a378995168e6a02a245 to your computer and use it in GitHub Desktop.
Comparison of scalar and SIMD max reduction
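// Run the benchmarks with `RUSTFLAGS='-C target-cpu=native' cargo +nightly bench`
// (nightly is required because the unstable `test` crate feature is enabled below).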
#![feature(test)]
/// Smoke-checks both reductions on the same input and prints each result on its own line.
fn main() {
    // Mostly-zero input whose only non-zero element (42) sits at index 1.
    let input = {
        let mut buf = [0u32; 65536];
        buf[1] = 42;
        buf
    };

    let scalar_result = scalar_max(&input);
    println!("{}", scalar_result);

    let simd_result = avx2_max(&input);
    println!("{}", simd_result);
}
use core::arch::x86_64;
/// Returns the largest element of `a` using a plain scalar running maximum.
///
/// The array length is fixed at 65536, so there is always a first element
/// to seed the running maximum with.
fn scalar_max(a: &[u32; 65536]) -> u32 {
    let mut best = a[0];
    for &value in &a[1..] {
        best = best.max(value);
    }
    best
}
/// Returns the largest element of `a`, computed with AVX2 unsigned 32-bit
/// max instructions.
///
/// # Panics
///
/// Panics if the CPU the program is running on does not support AVX2.
/// Executing AVX2 instructions on such a CPU would otherwise be undefined
/// behaviour, which a safe function must never expose.
fn avx2_max(a: &[u32; 65536]) -> u32 {
    assert!(
        is_x86_feature_detected!("avx2"),
        "avx2_max requires a CPU with AVX2 support"
    );
    // SAFETY: the assertion above guarantees AVX2 is available at runtime,
    // which is the only precondition of `avx2_max_impl`.
    unsafe { avx2_max_impl(a) }
}

/// AVX2 kernel behind `avx2_max`.
///
/// Compiled with the `avx2` target feature enabled so the intrinsics can be
/// inlined regardless of the crate-wide `target-cpu` setting.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports AVX2.
#[target_feature(enable = "avx2")]
unsafe fn avx2_max_impl(a: &[u32; 65536]) -> u32 {
    // Two independent accumulators, each holding eight running maxima.
    // Zero is the identity element for an unsigned max, so starting from
    // zero gives the same result as seeding with the first 16 elements.
    let mut acc0 = x86_64::_mm256_setzero_si256();
    let mut acc1 = x86_64::_mm256_setzero_si256();

    // 65536 is a multiple of 16, so `chunks_exact` leaves no remainder.
    for chunk in a.chunks_exact(16) {
        let p = chunk.as_ptr() as *const x86_64::__m256i;
        // SAFETY: `chunk` is exactly 16 `u32`s (64 bytes), so both 32-byte
        // loads are in bounds; `loadu` has no alignment requirement.
        let (v0, v1) = unsafe {
            (
                x86_64::_mm256_loadu_si256(p),
                x86_64::_mm256_loadu_si256(p.add(1)),
            )
        };
        acc0 = x86_64::_mm256_max_epu32(acc0, v0);
        acc1 = x86_64::_mm256_max_epu32(acc1, v1);
    }

    // Merge the two accumulators, then reduce the eight remaining lanes.
    let acc = x86_64::_mm256_max_epu32(acc0, acc1);
    let mut lanes = [0u32; 8];
    // SAFETY: `lanes` is exactly 32 bytes of writable memory and `storeu`
    // has no alignment requirement.
    unsafe {
        x86_64::_mm256_storeu_si256(lanes.as_mut_ptr() as *mut x86_64::__m256i, acc);
    }
    lanes.iter().copied().fold(0, u32::max)
}
extern crate test;
use test::{Bencher, black_box};
/// Benchmarks `scalar_max` on a 65536-element array whose only non-zero
/// element (42) is at index 1.
#[bench]
fn bench_scalar(b: &mut Bencher) {
    let mut a = [0u32; 65536];
    a[1] = 42;
    b.iter(|| {
        // Pass the input through `black_box` as well as the result: the
        // array contents are known at compile time, so without this the
        // optimizer is free to fold or hoist the reduction out of the
        // timed loop.
        black_box(scalar_max(black_box(&a)))
    });
}
/// Benchmarks `avx2_max` on a 65536-element array whose only non-zero
/// element (42) is at index 1.
#[bench]
fn bench_avx2(b: &mut Bencher) {
    let mut a = [0u32; 65536];
    a[1] = 42;
    b.iter(|| {
        // Pass the input through `black_box` as well as the result: the
        // array contents are known at compile time, so without this the
        // optimizer is free to fold or hoist the reduction out of the
        // timed loop.
        black_box(avx2_max(black_box(&a)))
    });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment