Created
October 16, 2018 04:30
-
-
Save ess7/81b69e48ec55752d8497cae642e4ec12 to your computer and use it in GitHub Desktop.
JSFX extension: dot product
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// dotprod2(y0, y1, x, coeff, n)
//
// Dot product of n coefficients against an interleaved two-lane buffer:
//   y0 = sum(x[2*i]     * coeff[i])
//   y1 = sum(x[2*i + 1] * coeff[i])      for i in [0, n)
//
// parms[0], parms[1]: outputs y0 and y1. Always written; both are 0.0 when
//                     the arguments are rejected or a RAM lookup fails.
// parms[2]: offset of the interleaved x data (2*n values), resolved through
//           __NSEEL_RAMAlloc.
// parms[3]: offset of the coefficients (n values), resolved the same way.
// parms[4]: element count n; must be >= 1.
//
// Both ranges must fit inside a single NSEEL_RAM_ITEMSPERBLOCK-sized block,
// because each range is read through one contiguous pointer.
//
// The return value is always 0.0; results are delivered through parms[0..1].
// NOTE(review): np is not consulted, so this assumes the function is only
// ever invoked with five parameters -- confirm against the registration call.
static EEL_F NSEEL_CGEN_CALL dotprod2(void *opaque, INT_PTR np, EEL_F **parms) {
  EEL_F **blocks = (EEL_F **)opaque;

  // Read every input before the outputs are cleared, in case an output
  // parameter refers to the same variable as an input.
  const EEL_F xofs_f = *parms[2];
  const EEL_F cofs_f = *parms[3];
  const EEL_F n_f = *parms[4];
  *parms[0] = 0.0;
  *parms[1] = 0.0;

  // Validate in floating point first: converting NaN or a value outside the
  // range of int is undefined behavior. The negated comparisons reject NaN.
  // Lower bounds of -1.0 / 1.0 keep the truncate-toward-zero semantics of the
  // integer conversion (e.g. an offset of -0.5 still maps to 0).
  const int ram_items = NSEEL_RAM_BLOCKS*NSEEL_RAM_ITEMSPERBLOCK;
  if (unlikely(!(xofs_f > -1.0 && xofs_f < (EEL_F)ram_items) ||
               !(cofs_f > -1.0 && cofs_f < (EEL_F)ram_items) ||
               !(n_f >= 1.0 && n_f < (EEL_F)NSEEL_RAM_ITEMSPERBLOCK + 1.0))) {
    return 0.0;
  }
  const int xofs = (int)xofs_f;
  const int cofs = (int)cofs_f;
  int n = (int)n_f;

  // Neither range may cross a block boundary. Written as subtractions so that
  // no intermediate value (such as 2*n) can overflow.
  const int x_in_block = xofs & (NSEEL_RAM_ITEMSPERBLOCK-1);
  const int c_in_block = cofs & (NSEEL_RAM_ITEMSPERBLOCK-1);
  if (unlikely(n > (NSEEL_RAM_ITEMSPERBLOCK - x_in_block) / 2 ||
               n > NSEEL_RAM_ITEMSPERBLOCK - c_in_block)) {
    return 0.0;
  }

  EEL_F *xptr = __NSEEL_RAMAlloc(blocks, xofs);
  if (unlikely(!xptr || xptr == nseel_ramalloc_onfail)) {
    return 0.0;
  }
  EEL_F *cptr = __NSEEL_RAMAlloc(blocks, cofs);
  if (unlikely(!cptr || cptr == nseel_ramalloc_onfail)) {
    return 0.0;
  }

  // Main loop handles four coefficients per iteration, each feeding its own
  // accumulator; the 0..3 leftover coefficients are handled afterwards.
  const int remaining = n % 4;
  n -= remaining;

  // Each accumulator holds a (y0, y1) partial-sum pair.
  __m128d acc0 = _mm_setzero_pd();
  __m128d acc1 = _mm_setzero_pd();
  __m128d acc2 = _mm_setzero_pd();
  __m128d acc3 = _mm_setzero_pd();

  while (n > 0) {
    // One x pair (two doubles) times one coefficient broadcast to both lanes.
    acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    acc2 = _mm_add_pd(acc2, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    acc3 = _mm_add_pd(acc3, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
    xptr += 2;
    n -= 4;
  }

  // Tail: entering at case k processes exactly k elements via fallthrough.
  switch (remaining) {
    case 3:
      acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
      xptr += 2;
      /* fallthrough */
    case 2:
      acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr++)));
      xptr += 2;
      /* fallthrough */
    case 1:
      acc2 = _mm_add_pd(acc2, _mm_mul_pd(_mm_loadu_pd(xptr), _mm_load1_pd(cptr)));
      break;
    default:
      break;
  }

  // Combine the four partial sums pairwise and store the two lanes.
  EEL_F y[2];
  _mm_storeu_pd(y, _mm_add_pd(_mm_add_pd(acc0, acc1), _mm_add_pd(acc2, acc3)));
  *parms[0] = y[0];
  *parms[1] = y[1];
  return 0.0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment