Last active
May 21, 2020 15:24
-
-
Save xigh/063848652a066ff11048dff466383171 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <inttypes.h> | |
#include <stdio.h> | |
// Format the eight single-precision lanes of v as text, highest lane first.
//
// Returns a pointer to a function-local static buffer; the text is
// overwritten by the next call to sprint_m256, so copy it if it has to
// outlive that call.
char *sprint_m256(__m256 v)
{
    static char tmp[256];
    float f[8];

    // f has no 32-byte alignment guarantee, so the aligned store could
    // fault; use the unaligned variant.
    _mm256_storeu_ps(f, v);
    snprintf(tmp, sizeof tmp,
             "[%8.2f, %8.2f, %8.2f, %8.2f, %8.2f, %8.2f, %8.2f, %8.2f]",
             f[7], f[6], f[5], f[4], f[3], f[2], f[1], f[0]);
    return tmp;
}
// Format the eight 32-bit lanes of v as zero-padded hexadecimal, highest
// lane first.
//
// Returns a pointer to a function-local static buffer; the text is
// overwritten by the next call to sprint_m256i.
char *sprint_m256i(__m256i v)
{
    static char tmp[256];
    uint32_t u[8];

    // u has no 32-byte alignment guarantee, so use the unaligned integer
    // store. Storing the integer vector directly also avoids the
    // compiler-specific (__m256) vector cast.
    _mm256_storeu_si256((__m256i *) u, v);
    snprintf(tmp, sizeof tmp,
             "[%08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32
             ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 "]",
             u[7], u[6], u[5], u[4], u[3], u[2], u[1], u[0]);
    return tmp;
}
// Format the four single-precision lanes of v as text, highest lane first.
//
// Returns a pointer to a function-local static buffer; the text is
// overwritten by the next call to sprint_m128.
char *sprint_m128(__m128 v)
{
    static char tmp[256];
    float f[4];

    // f has no 16-byte alignment guarantee, so use the unaligned store.
    _mm_storeu_ps(f, v);
    snprintf(tmp, sizeof tmp,
             "[%8.2f, %8.2f, %8.2f, %8.2f]",
             f[3], f[2], f[1], f[0]);
    return tmp;
}
// Format the four 32-bit lanes of v as zero-padded hexadecimal, highest
// lane first.
//
// Returns a pointer to a function-local static buffer; the text is
// overwritten by the next call to sprint_m128i.
char *sprint_m128i(__m128i v)
{
    static char tmp[256];
    uint32_t u[4];

    // u has no 16-byte alignment guarantee, so use the unaligned integer
    // store. Storing the integer vector directly also avoids the
    // compiler-specific (__m128) vector cast.
    _mm_storeu_si128((__m128i *) u, v);
    snprintf(tmp, sizeof tmp,
             "[%08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 "]",
             u[3], u[2], u[1], u[0]);
    return tmp;
}
// Render n as "0b" followed by its eight binary digits, most significant
// bit first.
//
// Returns a pointer to a function-local static buffer; the text is
// overwritten by the next call to sprint_uint8.
char *sprint_uint8(uint8_t n)
{
    static char tmp[256];
    int bit[8];

    // bit[k] holds the value (0 or 1) of bit k of n.
    for (int k = 0; k < 8; k++) {
        bit[k] = (n >> k) & 1;
    }

    snprintf(tmp, sizeof tmp, "0b%d%d%d%d%d%d%d%d",
             bit[7], bit[6], bit[5], bit[4],
             bit[3], bit[2], bit[1], bit[0]);
    return tmp;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// my answer to https://twitter.com/rzidane360/status/1262610710732791813 | |
#include <immintrin.h> | |
#include <inttypes.h> | |
#include <stdio.h> | |
char *sprint_m256(__m256 v); | |
char *sprint_m256i(__m256i v); | |
char *sprint_m128(__m128 v); | |
char *sprint_m128i(__m128i v); | |
char *sprint_uint8(uint8_t n); | |
// Merge two vectors: lanes [8-n, 7] of the result are taken from a, and
// lanes [0, 7-n] are filled with the top 8-n lanes of b moved down by n
// positions (result[i] = b[i + n]).
//
// n is the number of lanes kept from a; for n >= 8 the result is a.
__m256 uncompress(__m256 a, __m256 b, uint8_t n)
{
    __m128 u = _mm256_extractf128_ps(b, 0);
    __m128 v = _mm256_extractf128_ps(b, 1);

    // b.lo replicated in both 128-bit halves
    __m256 l = _mm256_set_m128(u, u);
    // b.hi replicated in both 128-bit halves
    __m256 h = _mm256_set_m128(v, v);

    // lane indices 0..7 (the last argument is lane 0)
    __m256i m = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    // r[i] = i + n - 8: negative exactly for the lanes that b has to fill.
    // Its low two bits equal (i + n) & 3, which is the in-half source index
    // consumed by the permute below.
    __m256i r = _mm256_sub_epi32(m, _mm256_set1_epi32(8 - n));

    // rotate each 128-bit half by n positions
    __m256 x = _mm256_permutevar_ps(h, r);
    __m256 y = _mm256_permutevar_ps(l, r);

    // blend with b.hi: the sign bit of r selects x wherever i + n < 8.
    // _mm256_castsi256_ps reinterprets the bits portably, unlike a
    // compiler-specific vector cast.
    __m256 o = _mm256_blendv_ps(a, x, _mm256_castsi256_ps(r));

    // blend with b.lo: s[i] = i + n - 4 is negative wherever the source
    // lane i + n lies in the low half of b, so y overrides x there.
    __m256i s = _mm256_sub_epi32(m, _mm256_set1_epi32(4 - n));
    return _mm256_blendv_ps(o, y, _mm256_castsi256_ps(s));
}
// Compare two vectors lane by lane.
//
// Returns 1 when all eight float lanes compare equal with ==, 0 otherwise
// (so a NaN lane always makes the vectors differ).
int same_m256(__m256 a, __m256 b)
{
    float fa[8];
    float fb[8];

    // The stack arrays have no 32-byte alignment guarantee, so use the
    // unaligned stores.
    _mm256_storeu_ps(fa, a);
    _mm256_storeu_ps(fb, b);

    for (int i = 0; i < 8; i++) {
        if (fa[i] != fb[i]) {
            return 0;
        }
    }
    return 1;
}
// n: number of element from a | |
// [0, 1, 2, 3, 4, 5, 6, 7, 8]; | |
int test(uint8_t n) | |
{ | |
printf("n=%d\n", n); | |
float f = 1.0f; | |
// prepare a | |
float va[8] __attribute__((__aligned__(32))) = {0}; | |
for (uint8_t i = 0; i < n; i++) | |
{ | |
va[7 - i] = f; | |
f += 1.0f; | |
} | |
__m256 a = _mm256_load_ps(va); | |
printf("a= %s\n", sprint_m256(a)); | |
// prepare b | |
float vb[8] __attribute__((__aligned__(32))) = {0}; | |
for (uint8_t i = 0; i < 8 - n; i++) | |
{ | |
vb[7 - i] = f + i; | |
} | |
__m256 b = _mm256_load_ps(vb); | |
printf("b= %s\n", sprint_m256(b)); | |
// compute c | |
float vc[8] __attribute__((__aligned__(32))) = {0}; | |
for (uint8_t i = 0; i < 8; i++) | |
{ | |
vc[7 - i] = (float) (i + 1); | |
f += 1.0f; | |
} | |
__m256 c = _mm256_load_ps(vc); | |
printf("c= %s\n", sprint_m256(c)); | |
// --- | |
__m256 d = uncompress(a, b, n); | |
printf("d= %s\n", sprint_m256(d)); | |
// --- | |
if (!same_m256(c, d)) { | |
printf("failed:\n"); | |
printf("\t%s\n", sprint_m256(c)); | |
printf("\t%s\n", sprint_m256(d)); | |
return 0; | |
} | |
return 1; | |
} | |
// Run test() for every split point n in [0, 8], stopping at the first
// failure.
//
// Returns 0 when every case passes and 1 as soon as one fails, so the exit
// status reflects the outcome.
int main(int argc, char **argv)
{
    (void) argc;
    (void) argv;

    for (uint8_t n = 0; n <= 8; n++) {
        if (!test(n)) {
            return 1;
        }
        printf("\n");
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment