old matrix mat code
// #include <emmintrin.h>
#include <immintrin.h>
// #if defined (__SSE4_2__) || defined (__SSE4_1__)
// for some reason i can't get anything newer than sse4.2 to work with both gcc and clang
// #include <smmintrin.h>
// #endif
#include <string.h> // for memcpy
/*
-- aligned packed load VMOVAPD
__m128d _mm_load_pd (double const * p);
-- aligned packed store
void _mm_store_pd (double * p, __m128d a);
-- add
__m128d _mm_add_pd (__m128d a, __m128d b)
-- mult
__m128d _mm_mul_pd (__m128d a, __m128d b)
-- dot product
__m128d _mm_dp_pd (__m128d a, __m128d b, const int mask);
*/
#define restrict __restrict
typedef __attribute__((__aligned__(16))) double doubleAl;
void SimpleMatMult4x4( doubleAl * restrict , doubleAl * restrict , doubleAl * restrict );
/*
BLENDPD: __m128d _mm_blend_pd (__m128d v1, __m128d v2, const int mask);
*/
/*
__m256d
DPPD: __m128d _mm_dp_pd ( __m128d a, __m128d b, const int mask);
for the dot product masking, an 8 bit (byte) mask is used:
the lower nibble is where to place the result in the vector (which subset of the entries),
the upper nibble is which entries to include in the dot product.
we use the full nibble in the 4 entry (floats) case, and the lower half of each
nibble in the 2 entry (doubles) case.
0xF1 (0b11110001) = 4d float dot, put result in the first slot
0xFF = 4d float dot, put result in all 4 slots
0x31 (0b00110001) = 2d dot (eg for doubles), put result in first slot
0x32 = 2d dot, put result in 2nd slot
*/
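/* a quick worked example of the DPPD mask (a sketch for illustration, not
   part of the original kernels): with mask 0x31 the upper nibble 0b0011 says
   "include both lanes in the product sum" and the low bit of the lower
   nibble says "write the sum into lane 0, zero lane 1".
   so {1,2} dot {3,4} = 11 lands in lane 0. */
static inline double dppdMaskExample(void){
    __m128d a = _mm_set_pd(2.0, 1.0); // lanes {1.0, 2.0}; _mm_set_pd takes the high lane first
    __m128d b = _mm_set_pd(4.0, 3.0); // lanes {3.0, 4.0}
    __m128d d = _mm_dp_pd(a, b, 0x31); // lanes {1*3 + 2*4, 0.0} = {11.0, 0.0}
    return _mm_cvtsd_f64(d); // pull lane 0 out -> 11.0
}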
// this is ok for stand alone 2x2, can do better for 4x4 if careful
static inline void matMultAvx2x2(double* res, double* leftM, double* rightM){
    __m128d resRow1 = _mm_load_pd(res); // 1
    __m128d resRow2 = _mm_load_pd(res + 2); // 2
    __m128d leftRow1 = _mm_load_pd(leftM); // 3
    __m128d leftRow2 = _mm_load_pd(leftM + 2); // 4
    __m128d rightCol1 = _mm_load_pd(rightM); // 5
    __m128d rightCol2 = _mm_load_pd(rightM + 2); // 6
    // i'm using 6 xmm registers at this point, plenty left!
    // mask 0x31 leaves the dot product in lane 0 (lane 1 zeroed),
    // mask 0x32 leaves it in lane 1 (lane 0 zeroed),
    // so adding each pair merges both entries of a result row
    __m128d res11 = _mm_dp_pd(leftRow1, rightCol1, 0x31); // 7
    __m128d res12 = _mm_dp_pd(leftRow1, rightCol2, 0x32); // 8
    __m128d res21 = _mm_dp_pd(leftRow2, rightCol1, 0x31); // 9
    __m128d res22 = _mm_dp_pd(leftRow2, rightCol2, 0x32); // 10
    // accumulate into the loaded result rows, with the explicit intrinsic
    // rather than the gcc/clang vector-extension `+`
    resRow1 = _mm_add_pd(resRow1, _mm_add_pd(res11, res12));
    resRow2 = _mm_add_pd(resRow2, _mm_add_pd(res21, res22));
    _mm_store_pd(res, resRow1);
    _mm_store_pd(res + 2, resRow2);
}
void fatDotProduct(doubleAl * restrict result, doubleAl * restrict leftMat, doubleAl * restrict rightMat, int sizeN){
    // we assume we're given sizeN x sizeN matrices stored as 4x4 tiles of
    // 16 contiguous doubles each, so sizeN needs to be a multiple of 4
    int scaledSized = sizeN / 4 ; // number of 4x4 tiles per side
    int yIx = 0 ;
    int xIx = 0 ;
    int nIx = 0 ;
    for(yIx = 0 ; yIx < scaledSized ; yIx ++ ){
        for (xIx = 0 ; xIx < scaledSized ; ++xIx)
        {
            for (nIx = 0 ; nIx < scaledSized ; ++nIx)
            {
                SimpleMatMult4x4(result + (((yIx * scaledSized) + xIx) * 16),
                    leftMat + (((yIx * scaledSized) + nIx) * 16),
                    rightMat + (((xIx * scaledSized) + nIx) * 16)) ;
            }
        }
    }
}
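/* a minimal packing sketch, assuming the tiled layout fatDotProduct reads:
   4x4 tiles of 16 contiguous doubles, tiles laid out row-major for the
   result/left operands. the helper name and the row-major source layout
   are assumptions for illustration; the right operand would need the
   transposed packing, since SimpleMatMult4x4 reads its tiles column-major. */
void packRowMajorTo4x4Tiles(doubleAl * restrict dst, const double * restrict src, int sizeN){
    int tilesPerSide = sizeN / 4 ;
    int by, bx, r, c ;
    for (by = 0 ; by < tilesPerSide ; ++by){
        for (bx = 0 ; bx < tilesPerSide ; ++bx){
            for (r = 0 ; r < 4 ; ++r){
                for (c = 0 ; c < 4 ; ++c){
                    dst[((by * tilesPerSide + bx) * 16) + r * 4 + c]
                        = src[(by * 4 + r) * sizeN + (bx * 4 + c)] ;
                }
            }
        }
    }
}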
/* col col row
   want to see if the compiler packed-vectorizes this scalar version
   NOPE, IT DOESN'T
*/
void toyMMult2x2(doubleAl * restrict resultG, doubleAl * restrict leftMatG, doubleAl * restrict rightMatG){
    double result[4] ;
    double leftMat[4] ;
    double rightMat[4] ;
    // memcpy(target, source, number of bytes)
    memcpy(result, resultG, 4 * sizeof(double)) ;
    memcpy(leftMat, leftMatG, 4 * sizeof(double)) ;
    memcpy(rightMat, rightMatG, 4 * sizeof(double)) ;
    result[0] += leftMat[0] * rightMat[0] + leftMat[1] * rightMat[1] ;
    result[1] += leftMat[0] * rightMat[2] + leftMat[1] * rightMat[3] ;
    result[2] += leftMat[2] * rightMat[0] + leftMat[3] * rightMat[1] ;
    result[3] += leftMat[2] * rightMat[2] + leftMat[3] * rightMat[3] ;
    memcpy(resultG, result, 4 * sizeof(double)) ;
}
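/* a minimal sanity check, assuming the kernels above: run the SSE 2x2
   kernel and the scalar 2x2 kernel on the same made-up inputs and report
   whether they agree. both accumulate into the result, so both start from
   zero. exact equality is fine here: the inputs are small integers whose
   products and sums are exactly representable. */
static int check2x2Kernels(void){
    double left[4] __attribute__((__aligned__(16))) = {1.0, 2.0, 3.0, 4.0};
    double right[4] __attribute__((__aligned__(16))) = {5.0, 6.0, 7.0, 8.0};
    double resSimd[4] __attribute__((__aligned__(16))) = {0.0, 0.0, 0.0, 0.0};
    double resScalar[4] __attribute__((__aligned__(16))) = {0.0, 0.0, 0.0, 0.0};
    int i, ok = 1;
    matMultAvx2x2(resSimd, left, right);
    toyMMult2x2(resScalar, left, right);
    for (i = 0 ; i < 4 ; ++i){ ok = ok && (resSimd[i] == resScalar[i]); }
    return ok;
}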
/* only well-defined in the AVXMatMult4x4 codes, and that's ok
   i think there's JUST enough xmm registers for this to squeak by
*/
#define dotProd4x4RowBlock(ix) { \
    __m128d resRowLeft##ix = _mm_load_pd(resF + (ix)) ; \
    __m128d resRowRight##ix = _mm_load_pd(resF + 2 + (ix)) ; \
    __m128d lMatRowLeft##ix = _mm_load_pd(leftMF + (ix)) ; \
    __m128d lMatRowRight##ix = _mm_load_pd(leftMF + (ix) + 2) ; \
    /* columns 1 and 2 of the right matrix accumulate into the two lanes of resRowLeft */ \
    __m128d resRowLeftResultA##ix = _mm_dp_pd(lMatRowLeft##ix , rMatCol1Up, 0x31) ; \
    __m128d resRowLeftResultB##ix = _mm_dp_pd(lMatRowRight##ix, rMatCol1Down, 0x31) ; \
    resRowLeft##ix = _mm_add_pd(resRowLeft##ix, resRowLeftResultA##ix) ; \
    resRowLeft##ix = _mm_add_pd(resRowLeft##ix, resRowLeftResultB##ix) ; \
    resRowLeftResultA##ix = _mm_dp_pd(lMatRowLeft##ix , rMatCol2Up, 0x32) ; \
    resRowLeftResultB##ix = _mm_dp_pd(lMatRowRight##ix, rMatCol2Down, 0x32) ; \
    resRowLeft##ix = _mm_add_pd(resRowLeft##ix, resRowLeftResultA##ix) ; \
    resRowLeft##ix = _mm_add_pd(resRowLeft##ix, resRowLeftResultB##ix) ; \
    /* columns 3 and 4 accumulate into the two lanes of resRowRight */ \
    __m128d resRowRightResultA##ix = _mm_dp_pd(lMatRowLeft##ix , rMatCol3Up, 0x31) ; \
    __m128d resRowRightResultB##ix = _mm_dp_pd(lMatRowRight##ix, rMatCol3Down, 0x31) ; \
    resRowRight##ix = _mm_add_pd(resRowRight##ix, resRowRightResultA##ix) ; \
    resRowRight##ix = _mm_add_pd(resRowRight##ix, resRowRightResultB##ix) ; \
    resRowRightResultA##ix = _mm_dp_pd(lMatRowLeft##ix , rMatCol4Up, 0x32) ; \
    resRowRightResultB##ix = _mm_dp_pd(lMatRowRight##ix, rMatCol4Down, 0x32) ; \
    resRowRight##ix = _mm_add_pd(resRowRight##ix, resRowRightResultA##ix) ; \
    resRowRight##ix = _mm_add_pd(resRowRight##ix, resRowRightResultB##ix) ; \
    _mm_store_pd(resF + (ix), resRowLeft##ix) ; \
    _mm_store_pd(resF + 2 + (ix), resRowRight##ix) ; \
}
// leftover zeroing scraps from an earlier cut of the macro:
// resRowRight = _mm_xor_pd(resRowRight,resRowRight);
// resRowLeft = _mm_xor_pd(resRowLeft,resRowLeft) ;
// lMatRowRight = _mm_xor_pd(lMatRowRight,lMatRowRight) ;
// lMatRowLeft = _mm_xor_pd(lMatRowLeft,lMatRowLeft) ;
// void AVXMatMult4x4( doubleAl * restrict resF, doubleAl * restrict leftMF, doubleAl * restrict rightMF){
inline void SimpleMatMult4x4( doubleAl * restrict resF, doubleAl * restrict leftMF, doubleAl * restrict rightMF ){
    // the right matrix tile is column-major: each column's upper and lower halves
    __m128d rMatCol1Up = _mm_load_pd(rightMF);
    __m128d rMatCol1Down = _mm_load_pd(rightMF + 2);
    __m128d rMatCol2Up = _mm_load_pd(rightMF + 4);
    __m128d rMatCol2Down = _mm_load_pd(rightMF + 6);
    __m128d rMatCol3Up = _mm_load_pd(rightMF + 8);
    __m128d rMatCol3Down = _mm_load_pd(rightMF + 10);
    __m128d rMatCol4Up = _mm_load_pd(rightMF + 12);
    __m128d rMatCol4Down = _mm_load_pd(rightMF + 14);
    // the 8 column halves occupy 8 of the 16 XMM registers; each macro
    // expansion below needs at most 6 more live values at a time, so
    // everything should fit without spilling!
    // i do this macro 4 times, once per row of the left matrix!
    dotProd4x4RowBlock(0) ;
    dotProd4x4RowBlock(4) ;
    dotProd4x4RowBlock(8) ;
    dotProd4x4RowBlock(12) ;
}
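/* a scalar reference for the 4x4 kernel, assuming my reading of the
   intrinsics above: res (row-major tile) += left (row-major tile) times
   right (column-major tile), each tile 16 contiguous doubles. the function
   name is made up; it's handy for testing the SIMD path. */
void refMatMult4x4(double * restrict resT, const double * restrict leftT, const double * restrict rightT){
    int r, c, k ;
    for (r = 0 ; r < 4 ; ++r){
        for (c = 0 ; c < 4 ; ++c){
            double acc = 0.0 ;
            for (k = 0 ; k < 4 ; ++k){
                acc += leftT[r * 4 + k] * rightT[c * 4 + k] ; // column c of the right tile is contiguous
            }
            resT[r * 4 + c] += acc ;
        }
    }
}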
/*void SimpleMatMult4x4( doubleAl * restrict res, doubleAl * restrict leftM, doubleAl * restrict rightM
){
// initialize these things
// copyFromTo(resF,res,16);
// copyFromTo(leftMatF,leftM, 16);
// copyFromTo(rightMatF,rightM,16);
// quadrant 1
// __builtin_prefetch(leftM+4,0); // 1
// __builtin_prefetch(rightM+4,0); // 2
matMultAvx2x2(res,leftM, rightM);
// __builtin_prefetch(res+4,1); // 3
// __builtin_prefetch(rightM+8,0); // 4
matMultAvx2x2(res, leftM+4,rightM + 4 );
// quadrant 2
// __builtin_prefetch(rightM+12,0); // 5
matMultAvx2x2(res + 4, leftM, rightM + 8);
// __builtin_prefetch(res+8,1); // 6
// __builtin_prefetch(leftM+8, 0); // 7
matMultAvx2x2(res+4, leftM + 4 , rightM + 12);
//quadrant 3
// __builtin_prefetch(leftM+12, 0); // 8
matMultAvx2x2(res + 8, leftM + 8 , rightM);
// __builtin_prefetch(res + 12,1); // 9, yes i've done all the prefetches
matMultAvx2x2(res+8, leftM+12, rightM + 4);
//quadrant 4
matMultAvx2x2(res + 12 , leftM+ 8, rightM + 8);
matMultAvx2x2(res + 12 , leftM + 12, rightM + 12);
//write the results back, hopefully clang can do the write coalescing this way!!
// copyFromTo(res,resF,16);
}*/
void SimpleRowRowColMatrixMultViaDots(doubleAl * restrict res, doubleAl * restrict leftM, doubleAl * restrict rightM, int nSize ){
    int row = 0 ;
    int col = 0 ;
    int ix = 0 ;
    double accum = 0.0 ;
    for(row = 0 ; row < nSize ; row ++ ){
        for(col = 0 ; col < nSize ; col ++ ){
            accum = 0.0 ;
            for(ix = 0 ; ix < nSize ; ix ++ ){
                accum += leftM[row + ix * nSize] * rightM[col + nSize * ix] ;
            }
            // note: unlike the tiled kernels, this overwrites res rather than accumulating
            res[row + nSize * col] = accum ;
        }
    }
}
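/* a sketch of driving fatDotProduct, assuming the tiled layout above:
   _mm_load_pd/_mm_store_pd need 16-byte aligned pointers, so the buffers
   come from _mm_malloc. the function name, sizeN = 8, and the fill values
   are all arbitrary choices for illustration. */
int exampleDriver(void){
    int sizeN = 8 ;
    int i ;
    size_t bytes = (size_t)sizeN * sizeN * sizeof(double) ;
    double *res = _mm_malloc(bytes, 16) ;
    double *left = _mm_malloc(bytes, 16) ;
    double *right = _mm_malloc(bytes, 16) ;
    if (!res || !left || !right) return 0 ;
    for (i = 0 ; i < sizeN * sizeN ; ++i){
        res[i] = 0.0 ;        // fatDotProduct accumulates, so start from zero
        left[i] = (double)i ; // arbitrary fill, interpreted in tiled order
        right[i] = 1.0 ;
    }
    fatDotProduct(res, left, right, sizeN) ;
    _mm_free(res) ; _mm_free(left) ; _mm_free(right) ;
    return 1 ;
}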