Last active
August 29, 2015 14:16
-
-
Save robert-nix/b2e8d064ee0f807b9f44 to your computer and use it in GitHub Desktop.
Clock-count comparison of three mask-generation variants (technique from the slides: https://deplinenoise.wordpress.com/2015/03/06/slides-simd-at-insomniac-games-gdc-2015/ ), measured using http://agner.org/optimize/#testp as a base:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| naive unrolled mask gen: | |
| Processor 0 | |
| Clock | |
| 12072 | |
| 11996 | |
| 11996 | |
| 11996 | |
| 11996 | |
| 11996 | |
| 11996 | |
| 11996 | |
| SSE2 mask gen: | |
| Processor 0 | |
| Clock | |
| 3792 | |
| 3024 | |
| 3020 | |
| 3020 | |
| 3020 | |
| 3016 | |
| 3016 | |
| 3020 | |
| SSSE3 mask gen: | |
| Processor 0 | |
| Clock | |
| 2624 | |
| 2552 | |
| 2556 | |
| 2556 | |
| 2556 | |
| 2556 | |
| 2556 | |
| 2556 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // includes: | |
| #include <tmmintrin.h> | |
| #include <stdint.h> | |
| // remember to enable AVX instructions for the 3-op VEX mode if you want it | |
| #define USE_SSSE3_MASK_GEN 1 | |
| // initializations: | |
| __m128i ns_x[1000]; | |
| __m128i masks[1000]; | |
| srand(6); // very random | |
| for (uint32_t n = 0; n < 4000; n++) *((uint32_t *)ns_x + n) = rand() % 32; | |
| // test code: | |
| // For every 32-bit lane n of ns_x[i] (0 <= n <= 31) build the mask with the low | |
| // n bits set, i.e. (1u << n) - 1, and store it in the matching lane of masks[i]. | |
| // All three variants write masks[i] on every iteration so their output can be | |
| // compared lane by lane. | |
| // `i` is expected to be declared by the surrounding test harness. | |
| __m128i result{ }; | |
| for (i = 0; i < 1000; i++) { | |
| #if USE_SSSE3_MASK_GEN | |
|     // Shuffle control: copy the low byte of each lane into all four bytes of that lane. | |
|     const __m128i c_bytes = _mm_setr_epi32(0x00000000, 0x04040404, 0x08080808, 0x0c0c0c0c); | |
|     // Byte k of a lane (k = 0 is the least significant byte) gets the bias 247 - 8k: | |
|     // 0xf7, 0xef, 0xe7, 0xdf. A saturating add of the bias followed by a saturating | |
|     // subtract of 247 leaves clamp(n - 8k, 0, 8), the number of mask bits in byte k. | |
|     // The bias for byte 0 must sit in the low byte of the dword (little-endian). | |
|     const __m128i c_ceil = _mm_set1_epi32((int)0xdfe7eff7); | |
|     const __m128i c_floor = _mm_set1_epi8((char)0xf7); | |
|     // c_lut[b] is the byte with its low b bits set, for b = 0..8. | |
|     const __m128i c_lut = _mm_setr_epi8(0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, | |
|                                         (char)0xff, 0, 0, 0, 0, 0, 0, 0); | |
|     __m128i ns = ns_x[i]; | |
|     __m128i ii = _mm_shuffle_epi8(ns, c_bytes); | |
|     __m128i si = _mm_adds_epu8(ii, c_ceil); | |
|     __m128i fi = _mm_subs_epu8(si, c_floor); | |
|     result = _mm_shuffle_epi8(c_lut, fi); | |
|     memcpy(masks + i, &result, 16); | |
| #elif USE_SSE2_MASK_GEN | |
|     // Per-lane 32-bit constants; set1_epi32 keeps the value in the low byte of the lane. | |
|     const __m128i c_1 = _mm_set1_epi32(1); | |
|     const __m128i c_127 = _mm_set1_epi32(127); | |
|     __m128i ns = ns_x[i]; | |
|     // Build the float 2^n directly from its biased exponent (n + 127) << 23. | |
|     __m128i biased = _mm_add_epi32(ns, c_127); | |
|     __m128i fltv = _mm_slli_epi32(biased, 23); | |
|     // For n == 31 the conversion overflows and yields 0x80000000, which is | |
|     // still the bit pattern wanted here, so the subtraction gives 0x7fffffff. | |
|     __m128i intv = _mm_cvtps_epi32(_mm_castsi128_ps(fltv)); | |
|     result = _mm_sub_epi32(intv, c_1); | |
|     memcpy(masks + i, &result, 16); | |
| #else | |
|     for (int j = 0; j < 4; j++) { | |
|         // Unsigned shift: 1 << 31 on a signed int would be undefined behaviour. | |
|         uint32_t mask32 = (1u << ((uint32_t *)ns_x)[4 * i + j]) - 1; | |
|         memcpy(((uint32_t *)masks) + (4 * i) + j, &mask32, 4); | |
|     } | |
| #endif | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ; Excerpt of the compiler listing for the SSSE3 variant of the test loop | |
| ; (Intel syntax, VEX-encoded 3-operand forms). | |
| ; The four vector constants are loaded once, before the loop label. Each | |
| ; __xmm@ symbol name spells the 16 constant bytes from most significant to | |
| ; least significant. Going by the source-line annotations below: | |
| ; xmm4 = c_bytes, xmm5 = c_ceil, xmm6 = c_floor, xmm7 = c_lut. | |
| ; NOTE(review): in the xmm5 constant the lowest byte of every dword is 0xdf, | |
| ; so the low mask byte only fills for counts above 24 - this looks | |
| ; byte-reversed for a little-endian mask; confirm against the intended output. | |
| vmovdqu xmm4, XMMWORD PTR __xmm@0c0c0c0c080808080404040400000000 | |
| vmovdqu xmm5, XMMWORD PTR __xmm@f7efe7dff7efe7dff7efe7dff7efe7df | |
| vmovdqu xmm6, XMMWORD PTR __xmm@f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7 | |
| vmovdqu xmm7, XMMWORD PTR __xmm@00000000000000ff7f3f1f0f07030100 | |
| ; Loop body: rax = address of the current 16-byte input vector, | |
| ; rcx = number of iterations still to run. | |
| $LL6@TestLoop: | |
| ; File d:\dev\testp\pmctestb.cpp | |
| ; 248 : __m128i ii = _mm_shuffle_epi8(ns, c_bytes); | |
| ; Load the input vector, then step rax to the next one (16 bytes further). | |
| vmovdqu xmm0, XMMWORD PTR [rax] | |
| lea rax, QWORD PTR [rax+16] | |
| vpshufb xmm1, xmm0, xmm4 | |
| ; 249 : __m128i si = _mm_adds_epu8(ii, c_ceil); | |
| vpaddusb xmm2, xmm1, xmm5 | |
| ; 250 : __m128i fi = _mm_subs_epu8(si, c_floor); | |
| vpsubusb xmm3, xmm2, xmm6 | |
| ; 251 : result = _mm_shuffle_epi8(c_lut, fi); | |
| ; Table lookup: xmm7 holds the table, xmm3 holds the per-byte indices. | |
| vpshufb xmm0, xmm7, xmm3 | |
| ; `result` lives in a stack slot and is rewritten on every iteration. | |
| vmovdqu XMMWORD PTR result$25[rbp-256], xmm0 | |
| ; Count down and repeat until rcx reaches zero. | |
| dec rcx | |
| jne SHORT $LL6@TestLoop |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment