Created
June 25, 2018 10:18
-
-
Save bitshifter/2fb86002961f33869b9873d5841e33a2 to your computer and use it in GitHub Desktop.
feature detection inlining
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// test_dynamic.rs | |
#![crate_type="lib"] | |
#[cfg(target_arch = "x86")] | |
use std::arch::x86::*; | |
#[cfg(target_arch = "x86_64")] | |
use std::arch::x86_64::*; | |
/// Portable element-wise addition: writes `a[i] + b[i]` into `c[i]`.
///
/// Iteration stops at the end of the shortest of the three slices; any
/// remaining elements of `c` are left untouched.
pub fn add_scalar(a: &[f32], b: &[f32], c: &mut [f32]) {
    // Lazily produce the pairwise sums, then pour them into the output.
    let sums = a.iter().zip(b).map(|(lhs, rhs)| lhs + rhs);
    c.iter_mut().zip(sums).for_each(|(out, sum)| *out = sum);
}
/// Element-wise `c[i] = a[i] + b[i]` using 128-bit SSE2 operations (four `f32` lanes).
///
/// Only the first `min(a.len(), b.len(), c.len())` elements are processed.
/// Complete groups of four go through the SIMD path; the 0-3 leftover
/// elements are added one at a time, so no load or store ever reaches past
/// the end of a slice. Elements of `c` beyond the common length are untouched.
///
/// # Safety
///
/// The CPU executing this function must support SSE2, e.g. as confirmed by
/// `is_x86_feature_detected!("sse2")`.
// Gated with `cfg` rather than `cfg_attr`: the body uses x86-only intrinsics,
// so the function cannot compile on any other architecture.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
pub unsafe fn add_sse2(a: &[f32], b: &[f32], c: &mut [f32]) {
    const LANES: usize = 4;

    // Clamp to the common length, then split into a prefix that is an exact
    // multiple of LANES and a short scalar tail.
    let len = a.len().min(b.len()).min(c.len());
    let simd_len = len - len % LANES;

    let (a_simd, a_tail) = a[..len].split_at(simd_len);
    let (b_simd, b_tail) = b[..len].split_at(simd_len);
    let (c_simd, c_tail) = c[..len].split_at_mut(simd_len);

    // Every chunk here is exactly LANES long, so each unaligned 128-bit
    // load/store stays inside its slice.
    for ((a, b), c) in a_simd
        .chunks(LANES)
        .zip(b_simd.chunks(LANES))
        .zip(c_simd.chunks_mut(LANES))
    {
        _mm_storeu_ps(
            c.as_mut_ptr(),
            _mm_add_ps(_mm_loadu_ps(a.as_ptr()), _mm_loadu_ps(b.as_ptr())),
        );
    }

    // Leftover elements that do not fill a whole vector.
    for ((a, b), c) in a_tail.iter().zip(b_tail.iter()).zip(c_tail.iter_mut()) {
        *c = a + b;
    }
}
/// Element-wise `c[i] = a[i] + b[i]` using 256-bit AVX operations (eight `f32` lanes).
///
/// Only the first `min(a.len(), b.len(), c.len())` elements are processed.
/// Complete groups of eight go through the SIMD path; the 0-7 leftover
/// elements are added one at a time, so no load or store ever reaches past
/// the end of a slice. Elements of `c` beyond the common length are untouched.
///
/// # Safety
///
/// The CPU executing this function must support AVX2, e.g. as confirmed by
/// `is_x86_feature_detected!("avx2")`.
// Gated with `cfg` rather than `cfg_attr`: the body uses x86-only intrinsics,
// so the function cannot compile on any other architecture.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
pub unsafe fn add_avx2(a: &[f32], b: &[f32], c: &mut [f32]) {
    const LANES: usize = 8;

    // Clamp to the common length, then split into a prefix that is an exact
    // multiple of LANES and a short scalar tail.
    let len = a.len().min(b.len()).min(c.len());
    let simd_len = len - len % LANES;

    let (a_simd, a_tail) = a[..len].split_at(simd_len);
    let (b_simd, b_tail) = b[..len].split_at(simd_len);
    let (c_simd, c_tail) = c[..len].split_at_mut(simd_len);

    // Every chunk here is exactly LANES long, so each unaligned 256-bit
    // load/store stays inside its slice.
    for ((a, b), c) in a_simd
        .chunks(LANES)
        .zip(b_simd.chunks(LANES))
        .zip(c_simd.chunks_mut(LANES))
    {
        _mm256_storeu_ps(
            c.as_mut_ptr(),
            _mm256_add_ps(_mm256_loadu_ps(a.as_ptr()), _mm256_loadu_ps(b.as_ptr())),
        );
    }

    // Leftover elements that do not fill a whole vector.
    for ((a, b), c) in a_tail.iter().zip(b_tail.iter()).zip(c_tail.iter_mut()) {
        *c = a + b;
    }
}
pub fn add(a: &[f32], b: &[f32], c: &mut [f32]) { | |
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] | |
{ | |
// const TEST: u32 = _MM_SHUFFLE(3, 3, 0, 0); | |
if is_x86_feature_detected!("avx2") { | |
return unsafe { add_avx2(a, b, c) }; | |
} | |
if is_x86_feature_detected!("sse2") { | |
return unsafe { add_sse2(a, b, c) }; | |
} | |
} | |
add_scalar(a, b, c); | |
} | |
// Compiled with Rust 1.27:
//   rustc -g -O -C lto=fat test_dynamic.rs
//   objdump -dr libtest_dynamic.rlib | rustfilt
0000000000000000 <test_dynamic::add>: | |
0: 55 push %rbp | |
1: 41 57 push %r15 | |
3: 41 56 push %r14 | |
5: 41 55 push %r13 | |
7: 41 54 push %r12 | |
9: 53 push %rbx | |
a: 50 push %rax | |
b: 4d 89 ce mov %r9,%r14 | |
e: 4d 89 c7 mov %r8,%r15 | |
11: 49 89 cc mov %rcx,%r12 | |
14: 49 89 d5 mov %rdx,%r13 | |
17: 48 89 f3 mov %rsi,%rbx | |
1a: 48 89 fd mov %rdi,%rbp | |
1d: bf 0f 00 00 00 mov $0xf,%edi | |
22: e8 00 00 00 00 callq 27 <test_dynamic::add+0x27> | |
23: R_X86_64_PLT32 std::stdsimd::arch::detect::os::check_for-0x4 | |
27: 48 89 ef mov %rbp,%rdi | |
2a: 48 89 de mov %rbx,%rsi | |
2d: 4c 89 ea mov %r13,%rdx | |
30: 4c 89 e1 mov %r12,%rcx | |
33: 4d 89 f8 mov %r15,%r8 | |
36: 4d 89 f1 mov %r14,%r9 | |
39: 48 83 c4 08 add $0x8,%rsp | |
3d: 84 c0 test %al,%al | |
3f: 74 0f je 50 <test_dynamic::add+0x50> | |
41: 5b pop %rbx | |
42: 41 5c pop %r12 | |
44: 41 5d pop %r13 | |
46: 41 5e pop %r14 | |
48: 41 5f pop %r15 | |
4a: 5d pop %rbp | |
4b: e9 00 00 00 00 jmpq 50 <test_dynamic::add+0x50> | |
4c: R_X86_64_PLT32 test_dynamic::add_avx2-0x4 | |
50: 5b pop %rbx | |
51: 41 5c pop %r12 | |
53: 41 5d pop %r13 | |
55: 41 5e pop %r14 | |
57: 41 5f pop %r15 | |
59: 5d pop %rbp | |
5a: e9 00 00 00 00 jmpq 5f <test_dynamic::add+0x5f> | |
5b: R_X86_64_PLT32 test_dynamic::add_sse2-0x4 | |
// Compiled with Rust nightly + latest stdsimd:
//   rustc -g -O -C lto=fat test_dynamic.rs
//   objdump -dr libtest_dynamic.rlib | rustfilt
0000000000000000 <test_dynamic::add>: | |
0: 55 push %rbp | |
1: 41 57 push %r15 | |
3: 41 56 push %r14 | |
5: 41 55 push %r13 | |
7: 41 54 push %r12 | |
9: 53 push %rbx | |
a: 50 push %rax | |
b: 4d 89 c7 mov %r8,%r15 | |
e: 49 89 cc mov %rcx,%r12 | |
11: 49 89 d5 mov %rdx,%r13 | |
14: 48 89 f3 mov %rsi,%rbx | |
17: 48 89 fd mov %rdi,%rbp | |
1a: 4c 8b 35 00 00 00 00 mov 0x0(%rip),%r14 # 21 <test_dynamic::add+0x21> | |
1d: R_X86_64_GOTPCREL std::stdsimd::arch::detect::cache::CACHE-0x4 | |
21: 49 8b 06 mov (%r14),%rax | |
24: 48 83 f8 ff cmp $0xffffffffffffffff,%rax | |
28: 75 10 jne 3a <test_dynamic::add+0x3a> | |
2a: 4c 89 0c 24 mov %r9,(%rsp) | |
2e: e8 00 00 00 00 callq 33 <test_dynamic::add+0x33> | |
2f: R_X86_64_PLT32 std::stdsimd::arch::detect::os::detect_features-0x4 | |
33: 4c 8b 0c 24 mov (%rsp),%r9 | |
37: 49 89 06 mov %rax,(%r14) | |
3a: 49 8b 06 mov (%r14),%rax | |
3d: 48 89 ef mov %rbp,%rdi | |
40: 48 89 de mov %rbx,%rsi | |
43: 4c 89 ea mov %r13,%rdx | |
46: 4c 89 e1 mov %r12,%rcx | |
49: 4d 89 f8 mov %r15,%r8 | |
4c: 48 83 c4 08 add $0x8,%rsp | |
50: 66 85 c0 test %ax,%ax | |
53: 78 0f js 64 <test_dynamic::add+0x64> | |
55: 5b pop %rbx | |
56: 41 5c pop %r12 | |
58: 41 5d pop %r13 | |
5a: 41 5e pop %r14 | |
5c: 41 5f pop %r15 | |
5e: 5d pop %rbp | |
5f: e9 00 00 00 00 jmpq 64 <test_dynamic::add+0x64> | |
60: R_X86_64_PLT32 test_dynamic::add_sse2-0x4 | |
64: 5b pop %rbx | |
65: 41 5c pop %r12 | |
67: 41 5d pop %r13 | |
69: 41 5e pop %r14 | |
6b: 41 5f pop %r15 | |
6d: 5d pop %rbp | |
6e: e9 00 00 00 00 jmpq 73 <test_dynamic::add+0x73> | |
6f: R_X86_64_PLT32 test_dynamic::add_avx2-0x4 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment