Created
May 10, 2019 01:56
-
-
Save magical/3beed59f0ae92c157b6b1e1c4cfc7c58 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "simd.p4.h" | |
// July 13, 2017: mjb -- changed rbx register to r8.
// The convention is that rbx must be saved by the callee (i.e., pushed and popped),
// but rcx, rdx, r8, and r9 need not be.
// This fixed the bug that showed up in CS 475/575 project #5 in SQ 2017.
void | |
SimdMul( float *a, float *b, float *c, int len ) | |
{ | |
int limit = ( len/SSE_WIDTH ) * SSE_WIDTH; | |
float *atmp = a, *btmp = b, *ctmp = c; | |
for( int i = 0; i < limit; i += SSE_WIDTH ) | |
{ | |
__asm | |
( | |
".att_syntax\n\t" | |
"movups (%0), %%xmm0\n\t" // load the first sse register | |
"movups (%1), %%xmm1\n\t" // load the second sse register | |
"mulps %%xmm1, %%xmm0\n\t" // do the multiply | |
"movups %%xmm0, (%2)\n\t" // store the result | |
"addq $16, %0\n\t" | |
"addq $16, %1\n\t" | |
"addq $16, %2\n\t" | |
: /* outputs */ "+r" (atmp), "+r" (btmp), "+r" (ctmp), "=m" (*ctmp) | |
: /* inputs */ "m" (*atmp), "m" (*btmp) | |
: /* clobbers */ "xmm0", "xmm1" | |
); | |
} | |
for( int i = limit; i < len; i++ ) | |
{ | |
c[i] = a[i] * b[i]; | |
} | |
} | |
float | |
SimdMulSum( float *a, float *b, int len ) | |
{ | |
float sum[4] = { 0., 0., 0., 0. }; | |
int limit = ( len/SSE_WIDTH ) * SSE_WIDTH; | |
{ | |
float *atmp = a, *btmp = b; | |
register __int128 tmp __asm ("xmm2") = 0; | |
for (int i = 0; i < limit; i += SSE_WIDTH) { | |
__asm ( | |
".att_syntax\n\t" | |
"movups (%0), %%xmm0\n\t" // load the first sse register | |
"movups (%1), %%xmm1\n\t" // load the second sse register | |
"mulps %%xmm1, %%xmm0\n\t" // do the multiply | |
"addps %%xmm0, %2\n\t" // do the add | |
"addq $16, %0\n\t" | |
"addq $16, %1\n\t" | |
: /* outputs */ "+r" (atmp), "+r" (btmp), "+x" (tmp) | |
: /* inputs */ "m" (*atmp), "m" (*btmp) | |
: /* clobbers */ "xmm0", "xmm1" | |
); | |
} | |
__asm ("movups %1,%0" : "=m" (sum) : "x" (tmp) : /*no clobbers*/); | |
} | |
for( int i = limit; i < len; i++ ) | |
{ | |
sum[0] += a[i] * b[i]; | |
} | |
return sum[0] + sum[1] + sum[2] + sum[3]; | |
} | |
/*
 * NonSimdMul -- scalar element-wise product: c[i] = a[i] * b[i]
 * for 0 <= i < len.
 *
 * a, b : input arrays holding at least len floats each
 * c    : output array with room for at least len floats
 * len  : number of elements; when len <= 0 nothing is written
 */
void
NonSimdMul( float *a, float *b, float *c, int len )
{
	for( int i = 0; i < len; i++ )
	{
		c[i] = a[i] * b[i];
	}
}
/*
 * NonSimdMulSum -- scalar sum of the element-wise products a[i] * b[i],
 * 0 <= i < len, accumulated in index order.
 *
 * a, b : input arrays holding at least len floats each
 * len  : number of elements; when len <= 0 the result is 0
 *
 * Returns the accumulated sum as a float.
 */
float
NonSimdMulSum( float *a, float *b, int len )
{
	// a single running total is all the scalar version needs
	float sum = 0.;

	for( int i = 0; i < len; i++ )
	{
		sum += a[i] * b[i];
	}

	return sum;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef SIMD_H
#define SIMD_H

// SSE stands for Streaming SIMD Extensions

// number of floats the SIMD routines process per step:
#define SSE_WIDTH	4

// attribute that requests 16-byte alignment for a declaration:
#define ALIGNED		__attribute__((aligned(16)))

// c[i] = a[i] * b[i] for 0 <= i < len, using SSE for whole groups of SSE_WIDTH:
void	SimdMul( float *a, float *b, float *c, int len );

// returns the sum of a[i] * b[i] for 0 <= i < len, using SSE for whole groups of SSE_WIDTH:
float	SimdMulSum( float *a, float *b, int len );

// c[i] = a[i] * b[i] for 0 <= i < len, plain scalar loop:
void	NonSimdMul( float *a, float *b, float *c, int len );

// returns the sum of a[i] * b[i] for 0 <= i < len, plain scalar loop:
float	NonSimdMulSum( float *a, float *b, int len );

#endif // SIMD_H
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment