Last active
October 29, 2024 21:24
-
-
Save raphlinus/cbfe5f39eddc7a378995168e6a02a245 to your computer and use it in GitHub Desktop.
Comparison of scalar and SIMD max reduction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// run with `RUSTFLAGS='-C target-cpu=native' cargo +nightly bench` | |
#![feature(test)] | |
fn main() { | |
let mut a = [0u32; 65536]; | |
a[1] = 42; | |
println!("{}", scalar_max(&a)); | |
println!("{}", avx2_max(&a)); | |
} | |
use core::arch::x86_64; | |
/// Returns the largest element of `a` using a plain scalar reduction.
///
/// This is the baseline the SIMD implementation is compared against.
fn scalar_max(a: &[u32; 65536]) -> u32 {
    // The array has a fixed, non-zero length, so seeding the fold with the
    // first element is always in bounds and never panics.
    a[1..]
        .iter()
        .fold(a[0], |best, &value| if value > best { value } else { best })
}
/// Returns the largest element of `a`, using AVX2 when the CPU supports it.
///
/// The SIMD path keeps two independent 256-bit accumulators (8 `u32` lanes
/// each) so that consecutive `max` operations do not depend on each other,
/// then folds them down to a single scalar at the end.
///
/// If AVX2 is not available at run time the function falls back to a scalar
/// reduction instead of executing unsupported instructions. When the crate is
/// built with AVX2 enabled at compile time (e.g. `-C target-cpu=native` on an
/// AVX2 machine) the feature check is resolved statically.
fn avx2_max(a: &[u32; 65536]) -> u32 {
    #[target_feature(enable = "avx2")]
    unsafe fn max_with_avx2(a: &[u32; 65536]) -> u32 {
        // Number of `u32` lanes in one 256-bit vector.
        const LANES: usize = 8;
        // Elements consumed per loop iteration (two vectors).
        const STRIDE: usize = 2 * LANES;

        let base = a.as_ptr();
        // SAFETY: the array length (65536) is a multiple of STRIDE. Every load
        // below reads LANES elements starting at `block * STRIDE` or
        // `block * STRIDE + LANES` with `block < a.len() / STRIDE`, so the
        // highest index touched is `a.len() - 1`. Unaligned loads are used, so
        // no alignment beyond that of `u32` is required.
        unsafe {
            let mut acc0 = x86_64::_mm256_loadu_si256(base as *const x86_64::__m256i);
            let mut acc1 =
                x86_64::_mm256_loadu_si256(base.add(LANES) as *const x86_64::__m256i);

            for block in 1..a.len() / STRIDE {
                let chunk = base.add(block * STRIDE);
                let v0 = x86_64::_mm256_loadu_si256(chunk as *const x86_64::__m256i);
                let v1 =
                    x86_64::_mm256_loadu_si256(chunk.add(LANES) as *const x86_64::__m256i);
                acc0 = x86_64::_mm256_max_epu32(acc0, v0);
                acc1 = x86_64::_mm256_max_epu32(acc1, v1);
            }

            // Horizontal reduction: 2 x 256 bits -> 256 -> 128 -> 4 scalars -> 1.
            let acc = x86_64::_mm256_max_epu32(acc0, acc1);
            let low = x86_64::_mm256_extracti128_si256(acc, 0);
            let high = x86_64::_mm256_extracti128_si256(acc, 1);
            let max128 = x86_64::_mm_max_epu32(low, high);

            // `_mm_extract_epi32` returns `i32`; the `as u32` cast only
            // reinterprets the bits, which is what an unsigned max needs.
            let x0 = x86_64::_mm_extract_epi32(max128, 0) as u32;
            let x1 = x86_64::_mm_extract_epi32(max128, 1) as u32;
            let x2 = x86_64::_mm_extract_epi32(max128, 2) as u32;
            let x3 = x86_64::_mm_extract_epi32(max128, 3) as u32;
            x0.max(x1).max(x2.max(x3))
        }
    }

    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 support was just verified, which is the only
        // precondition of `max_with_avx2`.
        unsafe { max_with_avx2(a) }
    } else {
        // Portable fallback; 0 is the identity for an unsigned max.
        a.iter().copied().fold(0, u32::max)
    }
}
extern crate test; | |
use test::{Bencher, black_box}; | |
#[bench] | |
fn bench_scalar(b: &mut Bencher) { | |
let mut a = [0u32; 65536]; | |
a[1] = 42; | |
b.iter(|| { | |
black_box(scalar_max(&a)) | |
}); | |
} | |
#[bench] | |
fn bench_avx2(b: &mut Bencher) { | |
let mut a = [0u32; 65536]; | |
a[1] = 42; | |
b.iter(|| { | |
black_box(avx2_max(&a)) | |
}); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment