Created
April 21, 2012 20:09
-
-
Save engie/2439378 to your computer and use it in GitHub Desktop.
Unrolled hand-crafted SSE matrix & vector product
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const double* b_p = &(B.data()[0]); | |
double* w_p = &(W.data()[0]); | |
typedef double v2df __attribute__ ((vector_size (16))); | |
for( uint32 i = 0; i < rows; i++ ) //For each row | |
{ | |
double* w_p_row = w_p; | |
double dp = 0; | |
/* Loop over 400 elements | |
* Sum into sum | |
* Leave the pointers munged | |
*/ | |
asm ( | |
"xorpd %%xmm4, %%xmm4\n\t" //Clear xmm4 (will sum up into xmm4) | |
"xorpd %%xmm5, %%xmm5\n\t" //Clear xmm5 (will sum up into xmm5) | |
//First run through - as normal loop, no deps | |
"movapd (%[va]), %%xmm0\n\t" //0 <- Va[0] | |
"movapd 16(%[va]), %%xmm1\n\t" //1 <- Va[2] | |
"mulpd (%[vb]), %%xmm0\n\t" //0 *= Vb[0] | |
"movapd 32(%[va]), %%xmm2\n\t" //2 <- Va[4] | |
"mulpd 16(%[vb]), %%xmm1\n\t" //1 *= Vb[2] | |
"addpd %%xmm0, %%xmm4\n\t" //4 += 0 | |
"movapd 48(%[va]), %%xmm3\n\t" //3 <- Va[6] | |
"mulpd 32(%[vb]), %%xmm2\n\t" //2 *= Vb[4] | |
"addpd %%xmm1, %%xmm5\n\t" //5 <- 1 | |
"lea 64( %[va] ), %[va]\n\t" //Increment the pointers | |
"lea 64( %[vb] ), %[vb]\n\t" | |
"mov $49, %%ecx\n\t" //Set ecx to 49 (8 per loop, above & below count as 1 iteration | |
"jmp bottom\n\t" //Go to the comparison at the bottom | |
"top:\n\t" | |
//Centre of the loop | |
"movapd (%[va]), %%xmm0\n\t" //0 <- Va[0] | |
"mulpd -16(%[vb]), %%xmm3\n\t" //3 *= Vb[-2] | |
"addpd %%xmm2, %%xmm4\n\t" //4 += 2 | |
"movapd 16(%[va]), %%xmm1\n\t" //1 <- Va[2] | |
"mulpd (%[vb]), %%xmm0\n\t" //0 *= Vb[0] | |
"addpd %%xmm3, %%xmm5\n\t" //5 += 3 | |
"movapd 32(%[va]), %%xmm2\n\t" //2 <- Va[4] | |
"mulpd 16(%[vb]), %%xmm1\n\t" //1 *= Vb[2] | |
"addpd %%xmm0, %%xmm4\n\t" //4 += 0 | |
"movapd 48(%[va]), %%xmm3\n\t" //3 <- Va[6] | |
"mulpd 32(%[vb]), %%xmm2\n\t" //2 *= Vb[4] | |
"addpd %%xmm1, %%xmm5\n\t" //5 <- 1 | |
"lea 64( %[va] ), %[va]\n\t" //Increment the pointers | |
"lea 64( %[vb] ), %[vb]\n\t" | |
"dec %%ecx\n\t" //Decrement ecx | |
"bottom:\n\t" | |
"cmp $0, %%ecx\n\t" //Jump back if necessary | |
"jne top\n\t" | |
"mulpd -16(%[vb]), %%xmm3\n\t" //3 *= Vb[-2] | |
"addpd %%xmm2, %%xmm4\n\t" //4 += 2 | |
"addpd %%xmm3, %%xmm5\n\t" //5 += 3 | |
//Combine the accumulators | |
"addpd %%xmm4, %%xmm5\n\t" //5 += 4 | |
//Sum now in xmm5 | |
"movapd %%xmm5, %%xmm0\n\t" //Copy the accumulator into xmm0 | |
"shufpd $1, %%xmm0, %%xmm0\n\t" //Move the top of xmm0 into the bottom of xmm0 | |
"addpd %%xmm0, %%xmm5\n\t" //Add the bottoms of xmm5 and xmm0 | |
"movq %%xmm5, %[sum]\n\t" //Move that total out to a normal register | |
: [sum] "=r" (dp), [va] "+r" (b_p), [vb] "+r" (w_p_row) | |
: | |
: "ecx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" //list of clobbered registers | |
); | |
myX[i] = dp; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment