-
-
Save GavinRay97/0b3b3377e6ee6fea470917cf24100ce1 to your computer and use it in GitHub Desktop.
JSFX extension: dot product
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// params: y0, y1, interleaved x, coeff, n
//
// Two-channel dot product against a single coefficient vector.
//
//   parms[0] (out) y0 = sum over i in [0, n) of x[2*i]     * coeff[i]
//   parms[1] (out) y1 = sum over i in [0, n) of x[2*i + 1] * coeff[i]
//   parms[2] (in)  RAM offset of the interleaved x data (2*n items)
//   parms[3] (in)  RAM offset of the coefficients (n items)
//   parms[4] (in)  n, the number of coefficients
//
// Offsets and n are truncated toward zero when converted to int.
// Both outputs are set to 0.0 and nothing else is computed when n <= 0, an
// offset is outside [0, NSEEL_RAM_BLOCKS*NSEEL_RAM_ITEMSPERBLOCK), either range
// would run past the end of the RAM block it starts in, or the RAM lookup fails.
// The function's own return value is always 0.0; results are delivered only
// through parms[0] and parms[1].
//
// np is not inspected: parms[0..4] are dereferenced unconditionally, so the
// caller must supply at least five parameters.
static EEL_F NSEEL_CGEN_CALL dotprod2(void *opaque, INT_PTR np, EEL_F **parms) {
  EEL_F **blocks = (EEL_F **)opaque;
  (void)np;

  // Inputs are read before the outputs are cleared so the result is unaffected
  // if an output slot happens to alias an input slot.
  const EEL_F xarg = *parms[2];
  const EEL_F carg = *parms[3];
  const EEL_F narg = *parms[4];
  *parms[0] = 0.0;
  *parms[1] = 0.0;

  // Validate in the floating-point domain first: converting a NaN, an infinity
  // or an out-of-range value to int is undefined behaviour. The conditions are
  // written in positive form so that NaN fails every comparison and is rejected.
  // "> -1.0" keeps values in (-1, 0) valid, since they truncate to offset 0.
  // Bounding n by one block here also guarantees that the 2*n and offset+n
  // sums below cannot overflow int.
  if (unlikely(!(narg >= 1.0 && narg <= NSEEL_RAM_ITEMSPERBLOCK) ||
               !(xarg > -1.0 && xarg < NSEEL_RAM_BLOCKS*NSEEL_RAM_ITEMSPERBLOCK) ||
               !(carg > -1.0 && carg < NSEEL_RAM_BLOCKS*NSEEL_RAM_ITEMSPERBLOCK))) {
    return 0.0;
  }
  const int xofs = (int)xarg;
  const int cofs = (int)carg;
  const int n = (int)narg;

  // Position of each range inside its RAM block.
  // NOTE(review): the mask assumes NSEEL_RAM_ITEMSPERBLOCK is a power of two - confirm.
  const int x_in_block = xofs & (NSEEL_RAM_ITEMSPERBLOCK-1);
  const int c_in_block = cofs & (NSEEL_RAM_ITEMSPERBLOCK-1);
  if (unlikely(x_in_block + 2*n > NSEEL_RAM_ITEMSPERBLOCK ||
               c_in_block + n > NSEEL_RAM_ITEMSPERBLOCK)) {
    // A range that crosses a block boundary cannot be read through one pointer.
    return 0.0;
  }

  const EEL_F *xptr = __NSEEL_RAMAlloc(blocks, xofs);
  if (unlikely(!xptr || xptr == nseel_ramalloc_onfail)) {
    return 0.0;
  }
  const EEL_F *cptr = __NSEEL_RAMAlloc(blocks, cofs);
  if (unlikely(!cptr || cptr == nseel_ramalloc_onfail)) {
    return 0.0;
  }

  // Four independent accumulators; each holds a (channel 0, channel 1) pair.
  __m128d acc0 = _mm_setzero_pd();
  __m128d acc1 = _mm_setzero_pd();
  __m128d acc2 = _mm_setzero_pd();
  __m128d acc3 = _mm_setzero_pd();

  // Main loop: four coefficients (eight x items) per iteration. Each step
  // multiplies one interleaved x pair by one coefficient broadcast to both lanes.
  for (int quads = n / 4; quads > 0; --quads) {
    acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    acc2 = _mm_add_pd(acc2, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    acc3 = _mm_add_pd(acc3, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
  }

  // Tail: the remaining 0..3 coefficients. Cases fall through on purpose so
  // that "3" handles three items, "2" two and "1" one.
  switch (n % 4) {
    case 3:
      acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
      xptr += 2;
      /* fallthrough */
    case 2:
      acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
      xptr += 2;
      /* fallthrough */
    case 1:
      acc2 = _mm_add_pd(acc2, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr)));
      break;
    default:
      break;
  }

  // Combine the partial sums; lane 0 is channel 0, lane 1 is channel 1.
  EEL_F y[2];
  _mm_storeu_pd(y, _mm_add_pd(_mm_add_pd(acc0, acc1), _mm_add_pd(acc2, acc3)));
  *parms[0] = y[0];
  *parms[1] = y[1];
  return 0.0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment