Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save bitshifter/2fb86002961f33869b9873d5841e33a2 to your computer and use it in GitHub Desktop.
Save bitshifter/2fb86002961f33869b9873d5841e33a2 to your computer and use it in GitHub Desktop.
feature detection inlining
// test_dynamic.rs
#![crate_type="lib"]
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Portable element-wise addition: writes `a[i] + b[i]` into `c[i]`.
///
/// Iteration stops at the end of the shortest of the three slices; any
/// elements of `c` beyond that point are left as they were.
pub fn add_scalar(a: &[f32], b: &[f32], c: &mut [f32]) {
    c.iter_mut()
        .zip(a.iter().zip(b.iter()))
        .for_each(|(sum, (lhs, rhs))| *sum = *lhs + *rhs);
}
/// Element-wise addition `c[i] = a[i] + b[i]` using 128-bit SSE vectors
/// (four `f32` lanes per operation).
///
/// Only the first `min(a.len(), b.len(), c.len())` elements are processed.
/// That range is split into a prefix whose length is a multiple of four, which
/// is handled with SIMD loads/stores, and a tail of up to three elements that
/// is handled with plain scalar adds. Elements of `c` past the common length
/// are left untouched.
///
/// This function only exists on x86/x86_64 because its body uses x86
/// intrinsics.
///
/// # Safety
///
/// The caller must ensure the running CPU supports SSE2, e.g. by checking
/// `is_x86_feature_detected!("sse2")` first.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
pub unsafe fn add_sse2(a: &[f32], b: &[f32], c: &mut [f32]) {
    const LANES: usize = 4;

    // Clamp to the common length so that no chunk below can be shorter than
    // the vector width, whatever lengths the caller passed in.
    let len = a.len().min(b.len()).min(c.len());
    let simd_len = len - len % LANES;

    let (a_body, a_tail) = a[..len].split_at(simd_len);
    let (b_body, b_tail) = b[..len].split_at(simd_len);
    let (c_body, c_tail) = c[..len].split_at_mut(simd_len);

    let body = a_body
        .chunks(LANES)
        .zip(b_body.chunks(LANES))
        .zip(c_body.chunks_mut(LANES));
    for ((lhs, rhs), out) in body {
        // SAFETY: each `*_body` slice is a whole multiple of `LANES` long, so
        // every chunk holds exactly `LANES` floats and the 4-wide load/store
        // stays in bounds. The `loadu`/`storeu` forms are the unaligned
        // variants, so no alignment is required of the slices.
        unsafe {
            let sum = _mm_add_ps(_mm_loadu_ps(lhs.as_ptr()), _mm_loadu_ps(rhs.as_ptr()));
            _mm_storeu_ps(out.as_mut_ptr(), sum);
        }
    }

    // Fewer than `LANES` elements remain; finish them one at a time.
    for ((lhs, rhs), out) in a_tail.iter().zip(b_tail.iter()).zip(c_tail.iter_mut()) {
        *out = lhs + rhs;
    }
}
/// Element-wise addition `c[i] = a[i] + b[i]` using 256-bit vectors
/// (eight `f32` lanes per operation).
///
/// Only the first `min(a.len(), b.len(), c.len())` elements are processed.
/// That range is split into a prefix whose length is a multiple of eight,
/// which is handled with SIMD loads/stores, and a tail of up to seven elements
/// that is handled with plain scalar adds. Elements of `c` past the common
/// length are left untouched.
///
/// This function only exists on x86/x86_64 because its body uses x86
/// intrinsics.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2, e.g. by checking
/// `is_x86_feature_detected!("avx2")` first.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn add_avx2(a: &[f32], b: &[f32], c: &mut [f32]) {
    const LANES: usize = 8;

    // Clamp to the common length so that no chunk below can be shorter than
    // the vector width, whatever lengths the caller passed in.
    let len = a.len().min(b.len()).min(c.len());
    let simd_len = len - len % LANES;

    let (a_body, a_tail) = a[..len].split_at(simd_len);
    let (b_body, b_tail) = b[..len].split_at(simd_len);
    let (c_body, c_tail) = c[..len].split_at_mut(simd_len);

    let body = a_body
        .chunks(LANES)
        .zip(b_body.chunks(LANES))
        .zip(c_body.chunks_mut(LANES));
    for ((lhs, rhs), out) in body {
        // SAFETY: each `*_body` slice is a whole multiple of `LANES` long, so
        // every chunk holds exactly `LANES` floats and the 8-wide load/store
        // stays in bounds. The `loadu`/`storeu` forms are the unaligned
        // variants, so no alignment is required of the slices.
        unsafe {
            let sum = _mm256_add_ps(_mm256_loadu_ps(lhs.as_ptr()), _mm256_loadu_ps(rhs.as_ptr()));
            _mm256_storeu_ps(out.as_mut_ptr(), sum);
        }
    }

    // Fewer than `LANES` elements remain; finish them one at a time.
    for ((lhs, rhs), out) in a_tail.iter().zip(b_tail.iter()).zip(c_tail.iter_mut()) {
        *out = lhs + rhs;
    }
}
/// Adds `a` and `b` element-wise into `c`, choosing the implementation at run
/// time.
///
/// On x86 and x86_64 the CPU is probed first: `add_avx2` is used when AVX2 is
/// reported, otherwise `add_sse2` when SSE2 is reported. In every other case,
/// including all non-x86 targets, the work is done by `add_scalar`.
pub fn add(a: &[f32], b: &[f32], c: &mut [f32]) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: the runtime check above confirmed the AVX2 target
            // feature that `add_avx2` is compiled with.
            unsafe { add_avx2(a, b, c) };
            return;
        } else if is_x86_feature_detected!("sse2") {
            // SAFETY: the runtime check above confirmed the SSE2 target
            // feature that `add_sse2` is compiled with.
            unsafe { add_sse2(a, b, c) };
            return;
        }
    }

    // No usable SIMD path was found (or this is not an x86 target).
    add_scalar(a, b, c);
}
// compiled with rust 1.27
// rustc -g -O -C lto=fat test_dynamic.rs
// objdump -dr libtest_dynamic.rlib | rustfilt
0000000000000000 <test_dynamic::add>:
0: 55 push %rbp
1: 41 57 push %r15
3: 41 56 push %r14
5: 41 55 push %r13
7: 41 54 push %r12
9: 53 push %rbx
a: 50 push %rax
b: 4d 89 ce mov %r9,%r14
e: 4d 89 c7 mov %r8,%r15
11: 49 89 cc mov %rcx,%r12
14: 49 89 d5 mov %rdx,%r13
17: 48 89 f3 mov %rsi,%rbx
1a: 48 89 fd mov %rdi,%rbp
1d: bf 0f 00 00 00 mov $0xf,%edi
22: e8 00 00 00 00 callq 27 <test_dynamic::add+0x27>
23: R_X86_64_PLT32 std::stdsimd::arch::detect::os::check_for-0x4
27: 48 89 ef mov %rbp,%rdi
2a: 48 89 de mov %rbx,%rsi
2d: 4c 89 ea mov %r13,%rdx
30: 4c 89 e1 mov %r12,%rcx
33: 4d 89 f8 mov %r15,%r8
36: 4d 89 f1 mov %r14,%r9
39: 48 83 c4 08 add $0x8,%rsp
3d: 84 c0 test %al,%al
3f: 74 0f je 50 <test_dynamic::add+0x50>
41: 5b pop %rbx
42: 41 5c pop %r12
44: 41 5d pop %r13
46: 41 5e pop %r14
48: 41 5f pop %r15
4a: 5d pop %rbp
4b: e9 00 00 00 00 jmpq 50 <test_dynamic::add+0x50>
4c: R_X86_64_PLT32 test_dynamic::add_avx2-0x4
50: 5b pop %rbx
51: 41 5c pop %r12
53: 41 5d pop %r13
55: 41 5e pop %r14
57: 41 5f pop %r15
59: 5d pop %rbp
5a: e9 00 00 00 00 jmpq 5f <test_dynamic::add+0x5f>
5b: R_X86_64_PLT32 test_dynamic::add_sse2-0x4
// compiled with rust nightly + latest stdsimd
// rustc -g -O -C lto=fat test_dynamic.rs
// objdump -dr libtest_dynamic.rlib | rustfilt
0000000000000000 <test_dynamic::add>:
0: 55 push %rbp
1: 41 57 push %r15
3: 41 56 push %r14
5: 41 55 push %r13
7: 41 54 push %r12
9: 53 push %rbx
a: 50 push %rax
b: 4d 89 c7 mov %r8,%r15
e: 49 89 cc mov %rcx,%r12
11: 49 89 d5 mov %rdx,%r13
14: 48 89 f3 mov %rsi,%rbx
17: 48 89 fd mov %rdi,%rbp
1a: 4c 8b 35 00 00 00 00 mov 0x0(%rip),%r14 # 21 <test_dynamic::add+0x21>
1d: R_X86_64_GOTPCREL std::stdsimd::arch::detect::cache::CACHE-0x4
21: 49 8b 06 mov (%r14),%rax
24: 48 83 f8 ff cmp $0xffffffffffffffff,%rax
28: 75 10 jne 3a <test_dynamic::add+0x3a>
2a: 4c 89 0c 24 mov %r9,(%rsp)
2e: e8 00 00 00 00 callq 33 <test_dynamic::add+0x33>
2f: R_X86_64_PLT32 std::stdsimd::arch::detect::os::detect_features-0x4
33: 4c 8b 0c 24 mov (%rsp),%r9
37: 49 89 06 mov %rax,(%r14)
3a: 49 8b 06 mov (%r14),%rax
3d: 48 89 ef mov %rbp,%rdi
40: 48 89 de mov %rbx,%rsi
43: 4c 89 ea mov %r13,%rdx
46: 4c 89 e1 mov %r12,%rcx
49: 4d 89 f8 mov %r15,%r8
4c: 48 83 c4 08 add $0x8,%rsp
50: 66 85 c0 test %ax,%ax
53: 78 0f js 64 <test_dynamic::add+0x64>
55: 5b pop %rbx
56: 41 5c pop %r12
58: 41 5d pop %r13
5a: 41 5e pop %r14
5c: 41 5f pop %r15
5e: 5d pop %rbp
5f: e9 00 00 00 00 jmpq 64 <test_dynamic::add+0x64>
60: R_X86_64_PLT32 test_dynamic::add_sse2-0x4
64: 5b pop %rbx
65: 41 5c pop %r12
67: 41 5d pop %r13
69: 41 5e pop %r14
6b: 41 5f pop %r15
6d: 5d pop %rbp
6e: e9 00 00 00 00 jmpq 73 <test_dynamic::add+0x73>
6f: R_X86_64_PLT32 test_dynamic::add_avx2-0x4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment