Last active
April 12, 2025 20:33
-
-
Save raphlinus/cbfe5f39eddc7a378995168e6a02a245 to your computer and use it in GitHub Desktop.
Comparison of scalar and SIMD max reduction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// run with `RUSTFLAGS='-C target-cpu=native' cargo +nightly bench`
#![feature(test)] | |
fn main() { | |
let mut a = [0u32; 65536]; | |
a[1] = 42; | |
println!("{}", scalar_max(&a)); | |
println!("{}", avx2_max(&a)); | |
} | |
use core::arch::x86_64; | |
/// Returns the largest element of `a` using a plain iterator fold.
///
/// `u32::MIN` is the identity for unsigned `max`, so seeding the fold with it
/// yields the same value as reducing over the (never empty) array directly.
fn scalar_max(a: &[u32; 65536]) -> u32 {
    a.iter()
        .copied()
        .fold(u32::MIN, |best, value| best.max(value))
}
/// Returns the largest element of `a`, computed with AVX2 256-bit unsigned
/// integer maximum instructions.
///
/// # Panics
///
/// Panics if the CPU running the program does not report AVX2 support.
/// Without this check the function would execute AVX2 instructions
/// unconditionally and fault on such CPUs.
fn avx2_max(a: &[u32; 65536]) -> u32 {
    assert!(
        is_x86_feature_detected!("avx2"),
        "avx2_max requires a CPU with AVX2 support"
    );
    // SAFETY: the assertion above guarantees AVX2 is available, which is the
    // only requirement `avx2_max_impl` places on its caller.
    unsafe { avx2_max_impl(a) }
}

/// AVX2 kernel behind `avx2_max`: keeps two independent 8-lane running maxima
/// over the array, then reduces them to a single scalar.
///
/// `#[target_feature]` lets the compiler emit AVX2 code for this function
/// even when the crate as a whole is not built with AVX2 enabled.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports AVX2.
#[target_feature(enable = "avx2")]
unsafe fn avx2_max_impl(a: &[u32; 65536]) -> u32 {
    // Zero is the identity for unsigned max, so both accumulators can start
    // empty and every chunk is handled uniformly inside the loop.
    let mut acc_even = x86_64::_mm256_setzero_si256();
    let mut acc_odd = x86_64::_mm256_setzero_si256();

    // 16 u32 values = two 256-bit vectors per iteration; 65536 is a multiple
    // of 16, so `chunks_exact` leaves no remainder.
    for chunk in a.chunks_exact(16) {
        let ptr = chunk.as_ptr() as *const x86_64::__m256i;
        // SAFETY: `chunk` is exactly 16 `u32`s (64 bytes), so the two
        // unaligned 32-byte loads at `ptr` and `ptr + 1` stay inside it.
        let (v_even, v_odd) = unsafe {
            (
                x86_64::_mm256_loadu_si256(ptr),
                x86_64::_mm256_loadu_si256(ptr.add(1)),
            )
        };
        acc_even = x86_64::_mm256_max_epu32(acc_even, v_even);
        acc_odd = x86_64::_mm256_max_epu32(acc_odd, v_odd);
    }

    // Fold 2 x 256 bits -> 256 bits -> 128 bits.
    let max256 = x86_64::_mm256_max_epu32(acc_even, acc_odd);
    let low = x86_64::_mm256_castsi256_si128(max256);
    let high = x86_64::_mm256_extracti128_si256::<1>(max256);
    let max128 = x86_64::_mm_max_epu32(low, high);

    // Spill the four remaining lanes and finish the reduction in scalar code.
    let mut lanes = [0u32; 4];
    // SAFETY: `lanes` is 16 bytes, exactly the size of the unaligned store.
    unsafe {
        x86_64::_mm_storeu_si128(lanes.as_mut_ptr() as *mut x86_64::__m128i, max128);
    }
    lanes[0].max(lanes[1]).max(lanes[2].max(lanes[3]))
}
extern crate test; | |
use test::{Bencher, black_box}; | |
#[bench] | |
fn bench_scalar(b: &mut Bencher) { | |
let mut a = [0u32; 65536]; | |
a[1] = 42; | |
b.iter(|| { | |
black_box(scalar_max(&a)) | |
}); | |
} | |
#[bench] | |
fn bench_avx2(b: &mut Bencher) { | |
let mut a = [0u32; 65536]; | |
a[1] = 42; | |
b.iter(|| { | |
black_box(avx2_max(&a)) | |
}); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment