Skip to content

Instantly share code, notes, and snippets.

@robert-nix
Last active August 29, 2015 14:16
Show Gist options
  • Select an option

  • Save robert-nix/b2e8d064ee0f807b9f44 to your computer and use it in GitHub Desktop.

Select an option

Save robert-nix/b2e8d064ee0f807b9f44 to your computer and use it in GitHub Desktop.
naive unrolled mask gen:
Processor 0
Clock
12072
11996
11996
11996
11996
11996
11996
11996
SSE2 mask gen:
Processor 0
Clock
3792
3024
3020
3020
3020
3016
3016
3020
SSSE3 mask gen:
Processor 0
Clock
2624
2552
2556
2556
2556
2556
2556
2556
// includes:
#include <tmmintrin.h>
#include <stdint.h>
// remember to enable AVX instructions for the 3-op VEX mode if you want it
#define USE_SSSE3_MASK_GEN 1
// initializations:
__m128i ns_x[1000];
__m128i masks[1000];
srand(6); // very random
for (uint32_t n = 0; n < 4000; n++) *((uint32_t *)ns_x + n) = rand() % 32;
// test code:
// Every path computes, for each 32-bit lane of ns_x[i] holding a bit count n
// in [0, 31], the low-bit mask (1 << n) - 1.
// * worth noting, the vector paths were checked mainly by reading the compiler
// output (right instructions/dependencies), not by an exhaustive correctness run.
// * the brace-initialized __m128i constants list their 16 bytes in memory
// order (this is what the MSVC listing for this code shows), so within each
// little-endian 32-bit lane the least-significant byte comes first.
__m128i result{ };
for (i = 0; i < 1000; i++) {
#if USE_SSSE3_MASK_GEN
    // Shuffle control: copy the low byte of each 32-bit lane (n) into all four
    // bytes of that lane.
    const __m128i c_bytes = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 };
    // Bias for byte k of a lane is 0xf7 - 8*k. After the saturating add and the
    // saturating subtract of 0xf7, byte k holds clamp(n - 8*k, 0, 8): the number
    // of mask bits that fall into that byte. Byte 0 must carry the 0xf7 bias so
    // that the low bits of the mask end up in the low byte of the lane.
    const __m128i c_ceil = {0xf7, 0xef, 0xe7, 0xdf, 0xf7, 0xef, 0xe7, 0xdf, 0xf7, 0xef, 0xe7, 0xdf, 0xf7, 0xef, 0xe7, 0xdf};
    const __m128i c_floor = {0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7, 0xf7};
    // Lookup table: c_lut[b] = (1 << b) - 1 for b in 0..8.
    const __m128i c_lut = {0, 1, 3, 7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0, 0, 0, 0, 0, 0, 0};
    __m128i ns = ns_x[i];
    __m128i ii = _mm_shuffle_epi8(ns, c_bytes);
    __m128i si = _mm_adds_epu8(ii, c_ceil);  // saturates at 0xff -> upper clamp
    __m128i fi = _mm_subs_epu8(si, c_floor); // saturates at 0    -> lower clamp
    result = _mm_shuffle_epi8(c_lut, fi);    // per-byte bit count -> byte mask
#elif USE_SSE2_MASK_GEN
    // 32-bit constants 1 and 127 in every lane (low byte first).
    const __m128i c_1 = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0};
    const __m128i c_127 = {127, 0, 0, 0, 127, 0, 0, 0, 127, 0, 0, 0, 127, 0, 0, 0};
    __m128i ns = ns_x[i];
    // Build the float 2^n by placing the biased exponent n + 127 in bits 23..30,
    // convert it back to an integer, then subtract 1 to get the mask.
    // For n == 31 the conversion is out of range and yields 0x80000000, which is
    // still the bit pattern of 2^31, so the subtraction gives 0x7fffffff.
    __m128i exp = _mm_add_epi32(ns, c_127);
    __m128i fltv = _mm_slli_epi32(exp, 23);
    __m128i intv = _mm_cvtps_epi32(_mm_castsi128_ps(fltv));
    result = _mm_sub_epi32(intv, c_1);
    memcpy(masks + i, &result, 16);
#else
    for (int j = 0; j < 4; j++) {
        // Shift an unsigned one: n can be 31, and 1 << 31 overflows a signed int.
        uint32_t mask32 = ((uint32_t)1 << ((uint32_t *)ns_x)[4 * i + j]) - 1;
        memcpy(((uint32_t *)masks) + (4 * i) + j, &mask32, 4);
    }
#endif
}
#if USE_SSSE3_MASK_GEN
// The SSSE3 loop does not write masks[i]; only the last result is copied to
// masks[0] here. NOTE(review): presumably this just keeps `result` live for the
// benchmark -- confirm, since the other two paths store every iteration.
memcpy(masks, &result, 16);
#endif
; Compiler listing for the SSSE3 mask-gen loop (VEX-encoded, 3-operand forms).
; The four loop-invariant constants are loaded once, before the loop label.
; Judging by how they are used below: xmm4 = c_bytes, xmm5 = c_ceil,
; xmm6 = c_floor, xmm7 = c_lut. The hex digits in each __xmm@ name list the
; 16 constant bytes from most-significant to least-significant.
vmovdqu xmm4, XMMWORD PTR __xmm@0c0c0c0c080808080404040400000000
vmovdqu xmm5, XMMWORD PTR __xmm@f7efe7dff7efe7dff7efe7dff7efe7df
vmovdqu xmm6, XMMWORD PTR __xmm@f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7
vmovdqu xmm7, XMMWORD PTR __xmm@00000000000000ff7f3f1f0f07030100
$LL6@TestLoop:
; File d:\dev\testp\pmctestb.cpp
; 248 : __m128i ii = _mm_shuffle_epi8(ns, c_bytes);
; Load one 16-byte vector from [rax] (presumably the current ns_x element),
; then advance rax by 16 bytes for the next iteration.
vmovdqu xmm0, XMMWORD PTR [rax]
lea rax, QWORD PTR [rax+16]
vpshufb xmm1, xmm0, xmm4
; 249 : __m128i si = _mm_adds_epu8(ii, c_ceil);
vpaddusb xmm2, xmm1, xmm5
; 250 : __m128i fi = _mm_subs_epu8(si, c_floor);
vpsubusb xmm3, xmm2, xmm6
; 251 : result = _mm_shuffle_epi8(c_lut, fi);
vpshufb xmm0, xmm7, xmm3
; The result is written to its rbp-relative slot on every iteration.
vmovdqu XMMWORD PTR result$25[rbp-256], xmm0
; rcx is the remaining-iteration counter; loop until it reaches zero.
dec rcx
jne SHORT $LL6@TestLoop
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment