Created
July 3, 2012 14:55
-
-
Save rikusalminen/3040241 to your computer and use it in GitHub Desktop.
SIMD dot products: ARM NEON, SSE3, SSE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Four-component float dot product, with one implementation per SIMD target.
//
// Each variant multiplies the inputs lane by lane and then adds the four
// products together with two pairwise-add steps, so the scalar dot product
// ends up replicated in every lane of the returned vector.
//
// `vec4` (and `vshuffle` in the plain-SSE fallback) are expected to be
// provided by the including code; they are not declared here.
#if defined(__ARM_NEON__)
// NEON variant.
//   x, y    : input vectors
//   returns : vector with x.y in all four lanes
static inline vec4 vdot(vec4 x, vec4 y)
{
    vec4 prod = vmulq_f32(x, y);
    // vrev64q_f32 swaps the two floats inside each 64-bit half, so after the
    // add each half holds the sum of its own pair in both of its lanes.
    vec4 sum1 = vaddq_f32(prod, vrev64q_f32(prod));
    // Swap the low and high halves and add: every lane now holds the total.
    vec4 sum2 = vaddq_f32(sum1, vcombine_f32(vget_high_f32(sum1), vget_low_f32(sum1)));
    return sum2;
}

// Original NEON-only entry point, kept with its original signature and
// linkage so existing callers keep working; new code should call vdot(),
// which has the same name on every target.
vec4 dot(vec4 a, vec4 b)
{
    return vdot(a, b);
}
#elif defined(__SSE3__)
// SSE3 variant: two horizontal adds collapse the four products.
//   x, y    : input vectors
//   returns : vector with x.y in all four lanes
static inline vec4 vdot(vec4 x, vec4 y)
{
    vec4 prod = x * y;
    // (p0+p1, p2+p3, p0+p1, p2+p3)
    vec4 sum1 = _mm_hadd_ps(prod, prod);
    // Adding adjacent pairs once more leaves the full sum in every lane.
    vec4 sum2 = _mm_hadd_ps(sum1, sum1);
    return sum2;
}
#else // SSE
// Plain SSE fallback (no horizontal add available): shuffle + add instead.
//   x, y    : input vectors
//   returns : vector with x.y in all four lanes
// NOTE(review): assumes vshuffle(a, b, i0, i1, i2, i3) selects lanes in
// that order -- confirm against the vshuffle definition.
static inline vec4 vdot(vec4 x, vec4 y)
{
    vec4 prod = x * y;
    // Pair up neighbouring lanes: lanes 0/1 get p0+p1, lanes 2/3 get p2+p3.
    vec4 sum1 = prod + vshuffle(prod, prod, 1, 0, 3, 2);
    // Bring the other pair's partial sum into each lane and add.
    vec4 sum2 = sum1 + vshuffle(sum1, sum1, 2, 2, 0, 0);
    return sum2;
}
#endif
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment