Created
April 21, 2012 19:59
-
-
Save engie/2439337 to your computer and use it in GitHub Desktop.
Hand-written SSE matrix * vector product
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const double* b_p = &(B.data()[0]); | |
double* w_p = &(W.data()[0]); | |
for( uint32 i = 0; i < rows; i++ ) //For each row | |
{ | |
double* w_p_row = w_p; | |
double dp = 0; | |
/* Loop over 400 elements | |
* Sum into sum | |
* Leave the pointers munged | |
*/ | |
asm ( | |
"xorpd %%xmm1, %%xmm1\n\t" //Clear xmm1 (will sum up into xmm1) | |
"mov $200, %%ecx\n\t" //Set ecx to 200 | |
"jmp bottom\n\t" //Go to the comparison at the bottom | |
"top:\n\t" | |
//Centre of the loop | |
"movapd (%[va]), %%xmm0\n\t" //Move 2 doubles from b_p to xmm0 | |
"mulpd (%[vb]), %%xmm0\n\t" //Multiple 2 doubles in xmm0 by 2 from w_p_row | |
"addpd %%xmm0, %%xmm1\n\t" //Add the doubles to the accumulator in xmm1 | |
"lea 16( %[va] ), %[va]\n\t" //Increment the pointers | |
"lea 16( %[vb] ), %[vb]\n\t" | |
"dec %%ecx\n\t" //Decrement ecx | |
"bottom:\n\t" | |
"cmp $0, %%ecx\n\t" //Jump back if necessary | |
"jne top\n\t" | |
//Sum now in xmm1 | |
"movapd %%xmm1, %%xmm0\n\t" //Copy athe accumulator into xmm0 | |
"shufpd $1, %%xmm0, %%xmm0\n\t" //Move the top of xmm0 into the bottom of xmm0 | |
"addpd %%xmm0, %%xmm1\n\t" //Add the bottoms of xmm1 and xmm0 | |
"movq %%xmm1, %[sum]\n\t" //Move that total out to a normal register | |
: [sum] "=r" (dp), [va] "+r" (b_p), [vb] "+r" (w_p_row) | |
: | |
: "ecx", "xmm0", "xmm1" //list of clobbered registers | |
); | |
myX[i] = dp; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment