Skip to content

Instantly share code, notes, and snippets.

@xigh
Last active May 21, 2020 15:24
Show Gist options
  • Save xigh/063848652a066ff11048dff466383171 to your computer and use it in GitHub Desktop.
Save xigh/063848652a066ff11048dff466383171 to your computer and use it in GitHub Desktop.
#include <immintrin.h>
#include <inttypes.h>
#include <stdio.h>
// Format the eight float lanes of v as text, highest lane (index 7) first.
//
// Returns a pointer to a function-local static buffer: the text is
// overwritten by the next call to this function, so copy it if it must
// outlive that call.
char *sprint_m256(__m256 v)
{
    static char tmp[256];
    float f[8];

    // f has no 32-byte alignment guarantee, so the aligned store
    // (_mm256_store_ps) could fault; use the unaligned variant.
    _mm256_storeu_ps(f, v);
    snprintf(tmp, sizeof tmp,
             "[%8.2f, %8.2f, %8.2f, %8.2f, %8.2f, %8.2f, %8.2f, %8.2f]",
             f[7], f[6], f[5], f[4], f[3], f[2], f[1], f[0]);
    return tmp;
}
// Format v as eight 32-bit lanes in zero-padded hexadecimal, highest lane
// (index 7) first.
//
// Returns a pointer to a function-local static buffer: the text is
// overwritten by the next call to this function.
char *sprint_m256i(__m256i v)
{
    static char tmp[256];
    uint32_t u[8];

    // Store the integer vector directly: no float reinterpretation is
    // needed, and the unaligned store is safe for a plain local array.
    _mm256_storeu_si256((__m256i *) u, v);
    // PRIx32 is the conversion that matches uint32_t on every platform.
    snprintf(tmp, sizeof tmp,
             "[%08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32
             ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 "]",
             u[7], u[6], u[5], u[4], u[3], u[2], u[1], u[0]);
    return tmp;
}
// Format the four float lanes of v as text, highest lane (index 3) first.
//
// Returns a pointer to a function-local static buffer: the text is
// overwritten by the next call to this function.
char *sprint_m128(__m128 v)
{
    static char tmp[256];
    float f[4];

    // f has no 16-byte alignment guarantee, so use the unaligned store
    // instead of _mm_store_ps.
    _mm_storeu_ps(f, v);
    snprintf(tmp, sizeof tmp,
             "[%8.2f, %8.2f, %8.2f, %8.2f]",
             f[3], f[2], f[1], f[0]);
    return tmp;
}
// Format v as four 32-bit lanes in zero-padded hexadecimal, highest lane
// (index 3) first.
//
// Returns a pointer to a function-local static buffer: the text is
// overwritten by the next call to this function.
char *sprint_m128i(__m128i v)
{
    static char tmp[256];
    uint32_t u[4];

    // Store the integer vector directly: no float reinterpretation is
    // needed, and the unaligned store is safe for a plain local array.
    _mm_storeu_si128((__m128i *) u, v);
    // PRIx32 is the conversion that matches uint32_t on every platform.
    snprintf(tmp, sizeof tmp,
             "[%08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 ", %08" PRIx32 "]",
             u[3], u[2], u[1], u[0]);
    return tmp;
}
// Render n as "0b" followed by its eight bits, most significant bit first.
//
// The result lives in a function-local static buffer and is overwritten by
// the next call to this function.
char *sprint_uint8(uint8_t n)
{
    static char text[256];
    int bit[8];

    // bit[k] holds the value (0 or 1) of bit k of n
    for (int k = 0; k < 8; k++) {
        bit[k] = (n >> k) & 1;
    }
    snprintf(text, sizeof text, "0b%d%d%d%d%d%d%d%d",
             bit[7], bit[6], bit[5], bit[4],
             bit[3], bit[2], bit[1], bit[0]);
    return text;
}
// my answer to https://twitter.com/rzidane360/status/1262610710732791813
#include <immintrin.h>
#include <inttypes.h>
#include <stdio.h>
// Debug formatters for vector and byte values (their bodies are in the
// first listing of this gist). Each one returns a pointer to its own static
// buffer, so the text is only valid until the next call of the same function.
char *sprint_m256(__m256 v);
char *sprint_m256i(__m256i v);
char *sprint_m128(__m128 v);
char *sprint_m128i(__m128i v);
char *sprint_uint8(uint8_t n);
// Combine the top n lanes of a with the top 8-n lanes of b.
//
// For n in 0..8 the result is:
//   lane j, 8-n <= j < 8 : a[j]      (kept in place)
//   lane j, 0 <= j < 8-n : b[j + n]  (b's top lanes moved down to lane 0)
//
// Uses _mm256_sub_epi32, so AVX2 is required.
__m256 uncompress(__m256 a, __m256 b, uint8_t n)
{
    __m128 b_lo = _mm256_extractf128_ps(b, 0);
    __m128 b_hi = _mm256_extractf128_ps(b, 1);

    // _mm256_permutevar_ps only selects within a 128-bit half, so make each
    // half of b available in both halves: [b.lo, b.lo] and [b.hi, b.hi].
    __m256 lo_lo = _mm256_set_m128(b_lo, b_lo);
    __m256 hi_hi = _mm256_set_m128(b_hi, b_hi);

    __m256i lane = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    // take_b[j] = j - (8 - n). It is negative (sign bit set) exactly for the
    // lanes that must come from b, and its low two bits equal (j + n) & 3,
    // which is the in-half index of b[j + n].
    __m256i take_b = _mm256_sub_epi32(lane, _mm256_set1_epi32(8 - n));

    // rotate lanes
    __m256 from_hi = _mm256_permutevar_ps(hi_hi, take_b);
    __m256 from_lo = _mm256_permutevar_ps(lo_lo, take_b);

    // blend with hi lane: every lane below 8-n first receives its candidate
    // from b's upper half. _mm256_castsi256_ps reinterprets the bits without
    // relying on the compiler-specific vector cast syntax.
    __m256 merged = _mm256_blendv_ps(a, from_hi, _mm256_castsi256_ps(take_b));

    // blend with lo lane: take_b_lo[j] = j - (4 - n) is negative for
    // j + n < 4, i.e. where b[j + n] actually lives in b's lower half.
    __m256i take_b_lo = _mm256_sub_epi32(lane, _mm256_set1_epi32(4 - n));
    return _mm256_blendv_ps(merged, from_lo, _mm256_castsi256_ps(take_b_lo));
}
// Compare two 8-float vectors lane by lane with the C != operator.
//
// Returns 1 when no lane differs, 0 as soon as one does. Because the
// comparison is a plain float comparison, a NaN lane never matches and
// +0.0f matches -0.0f.
int same_m256(__m256 a, __m256 b)
{
    float fa[8];
    float fb[8];

    // The local arrays are not guaranteed to be 32-byte aligned, so the
    // aligned store (_mm256_store_ps) could fault; use the unaligned one.
    _mm256_storeu_ps(fa, a);
    _mm256_storeu_ps(fb, b);
    for (int i = 0; i < 8; i++) {
        if (fa[i] != fb[i]) {
            return 0;
        }
    }
    return 1;
}
// Run uncompress() for one split point and print the vectors involved.
//
// n: number of elements taken from a; valid values are
// [0, 1, 2, 3, 4, 5, 6, 7, 8].
//
// a gets 1..n in its top n lanes (lane 7 = 1), b gets n+1..8 in its top
// 8-n lanes (lane 7 = n+1), and the expected result c has lane 7 = 1 down
// to lane 0 = 8.
//
// Returns 1 when uncompress(a, b, n) equals c, 0 on mismatch or when n is
// out of range.
int test(uint8_t n)
{
    printf("n=%d\n", n);

    // Larger n would index below va[0] in the loop that fills a.
    if (n > 8) {
        printf("failed: n must not exceed 8\n");
        return 0;
    }

    float f = 1.0f;

    // prepare a
    float va[8] __attribute__((__aligned__(32))) = {0};
    for (uint8_t i = 0; i < n; i++) {
        va[7 - i] = f;
        f += 1.0f;
    }
    __m256 a = _mm256_load_ps(va);
    printf("a= %s\n", sprint_m256(a));

    // prepare b: continues the sequence where a stopped (f == n + 1 here)
    float vb[8] __attribute__((__aligned__(32))) = {0};
    for (uint8_t i = 0; i < 8 - n; i++) {
        vb[7 - i] = f + i;
    }
    __m256 b = _mm256_load_ps(vb);
    printf("b= %s\n", sprint_m256(b));

    // compute c: the expected result
    float vc[8] __attribute__((__aligned__(32))) = {0};
    for (uint8_t i = 0; i < 8; i++) {
        vc[7 - i] = (float) (i + 1);
    }
    __m256 c = _mm256_load_ps(vc);
    printf("c= %s\n", sprint_m256(c));

    // ---
    __m256 d = uncompress(a, b, n);
    printf("d= %s\n", sprint_m256(d));

    // ---
    if (!same_m256(c, d)) {
        printf("failed:\n");
        printf("\t%s\n", sprint_m256(c));
        printf("\t%s\n", sprint_m256(d));
        return 0;
    }
    return 1;
}
// Run test() for every split point n = 0..8, stopping at the first failure.
//
// Exit status is 0 when every case passes and 1 when one fails, so scripts
// and CI can detect a failure without parsing the output.
int main(int argc, char **argv)
{
    // command-line arguments are not used
    (void) argc;
    (void) argv;

    for (uint8_t n = 0; n <= 8; n++) {
        if (!test(n)) {
            return 1;
        }
        printf("\n");
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment