-
-
Save pranasblk/edc5762f506558ae0f417ec708ad7b75 to your computer and use it in GitHub Desktop.
SIMD dot products: ARM NEON, SSE3, SSE
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
// ARM NEON variant.
// Both macro spellings are tested: ACLE specifies __ARM_NEON, while
// __ARM_NEON__ is an older spelling that not every toolchain defines.

// Four-lane dot product of x and y; every lane of the result holds the sum
// x0*y0 + x1*y1 + x2*y2 + x3*y3.
// Named vdot to match the SSE3/SSE variants in this conditional.
// NOTE(review): assumes vec4 is (or is interchangeable with) float32x4_t --
// confirm against its declaration.
static inline vec4 vdot(vec4 x, vec4 y)
{
    const vec4 prod = vmulq_f32(x, y);
    // vrev64q_f32 swaps the two floats inside each 64-bit half, so after the
    // add the lanes are (p0+p1, p0+p1, p2+p3, p2+p3).
    const vec4 pairs = vaddq_f32(prod, vrev64q_f32(prod));
    // Swap the low and high halves and add: every lane becomes the full sum.
    const vec4 total = vaddq_f32(pairs, vcombine_f32(vget_high_f32(pairs), vget_low_f32(pairs)));
    return total;
}

// Original NEON entry point, kept under its existing name and linkage so
// current callers keep working; it forwards to vdot.
vec4 dot(vec4 a, vec4 b)
{
    return vdot(a, b);
}
#else if defined(__SSE3__) | |
static inline vec4 vdot(vec4 x, vec4 y) | |
{ | |
vec4 prod = x * y; | |
vec4 sum1 = _mm_hadd_ps(prod, prod); | |
vec4 sum2 = _mm_hadd_ps(sum1, sum1); | |
return sum2; | |
} | |
#else // SSE
// Fallback variant using only multiply, add and the vshuffle helper.

// Four-lane dot product of x and y, intended to leave the sum
// x0*y0 + x1*y1 + x2*y2 + x3*y3 in every lane.
// NOTE(review): assumes vshuffle(a, b, i0, i1, i2, i3) picks source lanes
// i0..i3 for result lanes 0..3 in that order, and that vec4 supports lane-wise
// operator* and operator+ -- confirm against their definitions.
static inline vec4 vdot(vec4 x, vec4 y)
{
    const vec4 prod = x * y;
    // Swap neighbours within each pair and add (under the assumption above:
    // (p0+p1, p0+p1, p2+p3, p2+p3)).
    const vec4 pairs = prod + vshuffle(prod, prod, 1, 0, 3, 2);
    // Bring the other pair's sum into each lane and add.
    const vec4 total = pairs + vshuffle(pairs, pairs, 2, 2, 0, 0);
    return total;
}
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment