Skip to content

Instantly share code, notes, and snippets.

@pranasblk
Forked from rikusalminen/dot.c
Created December 9, 2019 00:18
Show Gist options
  • Save pranasblk/edc5762f506558ae0f417ec708ad7b75 to your computer and use it in GitHub Desktop.
Save pranasblk/edc5762f506558ae0f417ec708ad7b75 to your computer and use it in GitHub Desktop.
SIMD dot products: ARM NEON, SSE3, SSE
#if defined(__ARM_NEON__)
/*
 * Four-lane dot product, ARM NEON variant.
 *
 * Multiplies x and y lane by lane, then sums the four products so that
 * every lane of the result holds the same total.
 *
 * Named vdot to match the SSE3 and SSE variants in this file, so callers
 * can use one name on every target.
 *
 * NOTE(review): assumes vec4 is (compatible with) float32x4_t, since it is
 * passed straight to the *_f32 intrinsics -- confirm against its typedef.
 */
static inline vec4 vdot(vec4 x, vec4 y)
{
    vec4 prod = vmulq_f32(x, y);

    /* vrev64q_f32 swaps the two lanes inside each 64-bit half, so the add
     * leaves p0+p1 in both low lanes and p2+p3 in both high lanes. */
    vec4 pair_sums = vaddq_f32(prod, vrev64q_f32(prod));

    /* Exchange the low and high halves, then add: every lane becomes
     * (p0+p1) + (p2+p3). */
    vec4 swapped_halves = vcombine_f32(vget_high_f32(pair_sums), vget_low_f32(pair_sums));
    return vaddq_f32(pair_sums, swapped_halves);
}

/*
 * Original NEON entry point, kept with its original name and linkage so
 * existing callers continue to work. Forwards to vdot().
 */
vec4 dot(vec4 a, vec4 b)
{
    return vdot(a, b);
}
#else if defined(__SSE3__)
static inline vec4 vdot(vec4 x, vec4 y)
{
vec4 prod = x * y;
vec4 sum1 = _mm_hadd_ps(prod, prod);
vec4 sum2 = _mm_hadd_ps(sum1, sum1);
return sum2;
}
#else // SSE
/*
 * Four-lane dot product, plain SSE fallback (no horizontal-add instruction).
 *
 * Multiplies x and y lane by lane, then performs two shuffle-and-add steps
 * on the products.
 *
 * NOTE(review): presumably vshuffle(v, v, a, b, c, d) yields lanes
 * [v[a], v[b], v[c], v[d]], in which case every lane of the result holds
 * the full sum. vshuffle is defined elsewhere -- verify its lane ordering.
 */
static inline vec4 vdot(vec4 x, vec4 y)
{
    vec4 products = x * y;

    /* First step: add each lane to its neighbour within the pair. */
    vec4 pair_sums = products + vshuffle(products, products, 1, 0, 3, 2);

    /* Second step: add the other pair's partial sum. */
    return pair_sums + vshuffle(pair_sums, pair_sums, 2, 2, 0, 0);
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment