Last active
May 24, 2019 08:47
-
-
Save yui0/70063ae84ea9ab358383090cd589185f to your computer and use it in GitHub Desktop.
gemm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // clang -Ofast -o dgemm_sse dgemm_sse.c | |
| // http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/page02/index.html | |
| #ifdef _MSC_VER | |
| #include <intrin.h> | |
| #else | |
| #include <x86intrin.h> | |
| #endif | |
| #if defined(__STDC_VERSION__) && __STDC_VERSION__ < 201102L | |
| #ifdef _MSC_VER | |
| #define _Alignas(n) __declspec(align(n)) | |
| #else | |
| #define _Alignas(n) __attribute__((aligned(n))) | |
| #endif // _MSC_VER | |
| #endif // defined(__cplusplus) && __cplusplus < 201103L | |
| #if defined(_MSC_VER) || defined(__MINGW32__) | |
| #include <malloc.h> | |
| #else | |
| #include <stdlib.h> | |
| #endif /* defined(_MSC_VER) || defined(__MINGW32__) */ | |
| #ifndef __cplusplus | |
| #if defined(_MSC_VER) | |
| #define inline __inline | |
| #define __inline__ __inline | |
| #elif !defined(__GNUC__) && !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L | |
| #define inline | |
| #define __inline | |
| #endif | |
| #endif | |
| static inline void* amalloc(size_t size, size_t alignment) | |
| { | |
| #if defined(_MSC_VER) || defined(__MINGW32__) | |
| return _aligned_malloc(size, alignment); | |
| #else | |
| void* p; | |
| return posix_memalign((void**) &p, alignment, size) == 0 ? p : NULL; | |
| #endif /* _MSC_VER */ | |
| } | |
| static inline void afree(void* ptr) | |
| { | |
| #if defined(_MSC_VER) || defined(__MINGW32__) | |
| _aligned_free(ptr); | |
| #else | |
| free(ptr); | |
| #endif /* _MSC_VER */ | |
| } | |
| #define MC 384 | |
| #define KC 384 | |
| #define NC 4096 | |
| #define MR 4 | |
| #define NR 4 | |
| // | |
| // Local buffers for storing panels from A, B and C | |
| // | |
| static double _A[MC*KC] __attribute__ ((aligned (16))); | |
| static double _B[KC*NC] __attribute__ ((aligned (16))); | |
| static double _C[MR*NR] __attribute__ ((aligned (16))); | |
| // | |
| // Packing complete panels from A (i.e. without padding) | |
| // | |
| static void pack_MRxk(int k, const double *A, int incRowA, int incColA, double *buffer) | |
| { | |
| int i, j; | |
| for (j=0; j<k; ++j) { | |
| for (i=0; i<MR; ++i) { | |
| buffer[i] = A[i*incRowA]; | |
| } | |
| buffer += MR; | |
| A += incColA; | |
| } | |
| } | |
| // | |
| // Packing panels from A with padding if required | |
| // | |
| static void | |
| pack_A(int mc, int kc, const double *A, int incRowA, int incColA, | |
| double *buffer) | |
| { | |
| int mp = mc / MR; | |
| int _mr = mc % MR; | |
| int i, j; | |
| for (i=0; i<mp; ++i) { | |
| pack_MRxk(kc, A, incRowA, incColA, buffer); | |
| buffer += kc*MR; | |
| A += MR*incRowA; | |
| } | |
| if (_mr>0) { | |
| for (j=0; j<kc; ++j) { | |
| for (i=0; i<_mr; ++i) { | |
| buffer[i] = A[i*incRowA]; | |
| } | |
| for (i=_mr; i<MR; ++i) { | |
| buffer[i] = 0.0; | |
| } | |
| buffer += MR; | |
| A += incColA; | |
| } | |
| } | |
| } | |
| // | |
| // Packing complete panels from B (i.e. without padding) | |
| // | |
| static void | |
| pack_kxNR(int k, const double *B, int incRowB, int incColB, | |
| double *buffer) | |
| { | |
| int i, j; | |
| for (i=0; i<k; ++i) { | |
| for (j=0; j<NR; ++j) { | |
| buffer[j] = B[j*incColB]; | |
| } | |
| buffer += NR; | |
| B += incRowB; | |
| } | |
| } | |
| // | |
| // Packing panels from B with padding if required | |
| // | |
| static void | |
| pack_B(int kc, int nc, const double *B, int incRowB, int incColB, | |
| double *buffer) | |
| { | |
| int np = nc / NR; | |
| int _nr = nc % NR; | |
| int i, j; | |
| for (j=0; j<np; ++j) { | |
| pack_kxNR(kc, B, incRowB, incColB, buffer); | |
| buffer += kc*NR; | |
| B += NR*incColB; | |
| } | |
| if (_nr>0) { | |
| for (i=0; i<kc; ++i) { | |
| for (j=0; j<_nr; ++j) { | |
| buffer[j] = B[j*incColB]; | |
| } | |
| for (j=_nr; j<NR; ++j) { | |
| buffer[j] = 0.0; | |
| } | |
| buffer += NR; | |
| B += incRowB; | |
| } | |
| } | |
| } | |
| // | |
| // Micro kernel for multiplying panels from A and B. | |
| // | |
| static void | |
| dgemm_micro_kernel(long kc, | |
| double alpha, const double *A, const double *B, | |
| double beta, | |
| double *C, long incRowC, long incColC) | |
| { | |
| double AB[MR*NR] __attribute__ ((aligned (16))); | |
| int i, j, l; | |
| // | |
| // Compute AB = A*B | |
| // | |
| register __m128d ab_00_11, ab_20_31; | |
| register __m128d ab_01_10, ab_21_30; | |
| register __m128d ab_02_13, ab_22_33; | |
| register __m128d ab_03_12, ab_23_32; | |
| register __m128d tmp0, tmp1, tmp2, tmp3; | |
| register __m128d tmp4, tmp5, tmp6, tmp7; | |
| tmp0 = _mm_load_pd(A); // (1) | |
| tmp1 = _mm_load_pd(A+2); // (2) | |
| tmp2 = _mm_load_pd(B); // (3) | |
| ab_00_11 = _mm_setzero_pd(); | |
| ab_20_31 = _mm_setzero_pd(); | |
| ab_01_10 = _mm_setzero_pd(); | |
| ab_21_30 = _mm_setzero_pd(); | |
| ab_02_13 = _mm_setzero_pd(); | |
| ab_22_33 = _mm_setzero_pd(); | |
| ab_03_12 = _mm_setzero_pd(); | |
| ab_23_32 = _mm_setzero_pd(); | |
| tmp3 = _mm_setzero_pd(); | |
| tmp4 = _mm_setzero_pd(); | |
| tmp5 = _mm_setzero_pd(); | |
| tmp6 = _mm_setzero_pd(); | |
| tmp7 = _mm_setzero_pd(); | |
| for (l=0; l<kc; ++l) { | |
| ab_02_13 = _mm_add_pd(ab_02_13, tmp3); // (9) | |
| tmp3 = _mm_load_pd(B+2); | |
| ab_22_33 = _mm_add_pd(ab_22_33, tmp6); // (10) | |
| tmp6 = tmp2; | |
| tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)); // (8) | |
| tmp2 = _mm_mul_pd(tmp2, tmp0); | |
| tmp6 = _mm_mul_pd(tmp6, tmp1); | |
| ab_03_12 = _mm_add_pd(ab_03_12, tmp5); // (15) | |
| ab_23_32 = _mm_add_pd(ab_23_32, tmp7); // (16) | |
| tmp7 = tmp4; | |
| tmp4 = _mm_mul_pd(tmp4, tmp0); | |
| tmp7 = _mm_mul_pd(tmp7, tmp1); | |
| ab_00_11 = _mm_add_pd(ab_00_11, tmp2); // (11) | |
| tmp2 = _mm_load_pd(B+4); // (6) | |
| ab_20_31 = _mm_add_pd(ab_20_31, tmp6); // (12) | |
| tmp6 = tmp3; | |
| tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)); // (7) | |
| tmp3 = _mm_mul_pd(tmp3, tmp0); | |
| tmp6 = _mm_mul_pd(tmp6, tmp1); | |
| ab_01_10 = _mm_add_pd(ab_01_10, tmp4); // (13) | |
| ab_21_30 = _mm_add_pd(ab_21_30, tmp7); // (14) | |
| tmp7 = tmp5; | |
| tmp5 = _mm_mul_pd(tmp5, tmp0); | |
| tmp0 = _mm_load_pd(A+4); // (4) | |
| tmp7 = _mm_mul_pd(tmp7, tmp1); | |
| tmp1 = _mm_load_pd(A+6); // (5) | |
| A += 4; | |
| B += 4; | |
| } | |
| ab_02_13 = _mm_add_pd(ab_02_13, tmp3); // (9) | |
| ab_22_33 = _mm_add_pd(ab_22_33, tmp6); // (10) | |
| ab_03_12 = _mm_add_pd(ab_03_12, tmp5); // (15) | |
| ab_23_32 = _mm_add_pd(ab_23_32, tmp7); // (16) | |
| _mm_storel_pd(&AB[0+0*4], ab_00_11); | |
| _mm_storeh_pd(&AB[1+0*4], ab_01_10); | |
| _mm_storel_pd(&AB[2+0*4], ab_20_31); | |
| _mm_storeh_pd(&AB[3+0*4], ab_21_30); | |
| _mm_storel_pd(&AB[0+1*4], ab_01_10); | |
| _mm_storeh_pd(&AB[1+1*4], ab_00_11); | |
| _mm_storel_pd(&AB[2+1*4], ab_21_30); | |
| _mm_storeh_pd(&AB[3+1*4], ab_20_31); | |
| _mm_storel_pd(&AB[0+2*4], ab_02_13); | |
| _mm_storeh_pd(&AB[1+2*4], ab_03_12); | |
| _mm_storel_pd(&AB[2+2*4], ab_22_33); | |
| _mm_storeh_pd(&AB[3+2*4], ab_23_32); | |
| _mm_storel_pd(&AB[0+3*4], ab_03_12); | |
| _mm_storeh_pd(&AB[1+3*4], ab_02_13); | |
| _mm_storel_pd(&AB[2+3*4], ab_23_32); | |
| _mm_storeh_pd(&AB[3+3*4], ab_22_33); | |
| // | |
| // Update C <- beta*C | |
| // | |
| if (beta==0.0) { | |
| for (j=0; j<NR; ++j) { | |
| for (i=0; i<MR; ++i) { | |
| C[i*incRowC+j*incColC] = 0.0; | |
| } | |
| } | |
| } else if (beta!=1.0) { | |
| for (j=0; j<NR; ++j) { | |
| for (i=0; i<MR; ++i) { | |
| C[i*incRowC+j*incColC] *= beta; | |
| } | |
| } | |
| } | |
| // | |
| // Update C <- C + alpha*AB (note: the case alpha==0.0 was already treated in | |
| // the above layer dgemm_nn) | |
| // | |
| if (alpha==1.0) { | |
| for (j=0; j<NR; ++j) { | |
| for (i=0; i<MR; ++i) { | |
| C[i*incRowC+j*incColC] += AB[i+j*MR]; | |
| } | |
| } | |
| } else { | |
| for (j=0; j<NR; ++j) { | |
| for (i=0; i<MR; ++i) { | |
| C[i*incRowC+j*incColC] += alpha*AB[i+j*MR]; | |
| } | |
| } | |
| } | |
| } | |
| // | |
| // Compute Y += alpha*X | |
| // | |
| static void | |
| dgeaxpy(int m, | |
| int n, | |
| double alpha, | |
| const double *X, | |
| int incRowX, | |
| int incColX, | |
| double *Y, | |
| int incRowY, | |
| int incColY) | |
| { | |
| int i, j; | |
| if (alpha!=1.0) { | |
| for (j=0; j<n; ++j) { | |
| for (i=0; i<m; ++i) { | |
| Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; | |
| } | |
| } | |
| } else { | |
| for (j=0; j<n; ++j) { | |
| for (i=0; i<m; ++i) { | |
| Y[i*incRowY+j*incColY] += X[i*incRowX+j*incColX]; | |
| } | |
| } | |
| } | |
| } | |
| // | |
| // Compute X *= alpha | |
| // | |
| static void | |
| dgescal(int m, | |
| int n, | |
| double alpha, | |
| double *X, | |
| int incRowX, | |
| int incColX) | |
| { | |
| int i, j; | |
| if (alpha!=0.0) { | |
| for (j=0; j<n; ++j) { | |
| for (i=0; i<m; ++i) { | |
| X[i*incRowX+j*incColX] *= alpha; | |
| } | |
| } | |
| } else { | |
| for (j=0; j<n; ++j) { | |
| for (i=0; i<m; ++i) { | |
| X[i*incRowX+j*incColX] = 0.0; | |
| } | |
| } | |
| } | |
| } | |
| // | |
| // Macro Kernel for the multiplication of blocks of A and B. We assume that | |
| // these blocks were previously packed to buffers _A and _B. | |
| // | |
| static void | |
| dgemm_macro_kernel(int mc, | |
| int nc, | |
| int kc, | |
| double alpha, | |
| double beta, | |
| double *C, | |
| int incRowC, | |
| int incColC) | |
| { | |
| int mp = (mc+MR-1) / MR; | |
| int np = (nc+NR-1) / NR; | |
| int _mr = mc % MR; | |
| int _nr = nc % NR; | |
| int mr, nr; | |
| int i, j; | |
| for (j=0; j<np; ++j) { | |
| nr = (j!=np-1 || _nr==0) ? NR : _nr; | |
| for (i=0; i<mp; ++i) { | |
| mr = (i!=mp-1 || _mr==0) ? MR : _mr; | |
| if (mr==MR && nr==NR) { | |
| dgemm_micro_kernel(kc, alpha, &_A[i*kc*MR], &_B[j*kc*NR], | |
| beta, | |
| &C[i*MR*incRowC+j*NR*incColC], | |
| incRowC, incColC); | |
| } else { | |
| dgemm_micro_kernel(kc, alpha, &_A[i*kc*MR], &_B[j*kc*NR], | |
| 0.0, | |
| _C, 1, MR); | |
| dgescal(mr, nr, beta, | |
| &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); | |
| dgeaxpy(mr, nr, 1.0, _C, 1, MR, | |
| &C[i*MR*incRowC+j*NR*incColC], incRowC, incColC); | |
| } | |
| } | |
| } | |
| } | |
| // | |
| // Compute C <- beta*C + alpha*A*B | |
| // | |
| void dgemm_nn(int m, int n, int k, | |
| double alpha, const double *A, int incRowA, int incColA, const double *B, int incRowB, int incColB, | |
| double beta, double *C, int incRowC, int incColC) | |
| { | |
| int mb = (m+MC-1) / MC; | |
| int nb = (n+NC-1) / NC; | |
| int kb = (k+KC-1) / KC; | |
| int _mc = m % MC; | |
| int _nc = n % NC; | |
| int _kc = k % KC; | |
| int mc, nc, kc; | |
| int i, j, l; | |
| double _beta; | |
| if (alpha==0.0 || k==0) { | |
| dgescal(m, n, beta, C, incRowC, incColC); | |
| return; | |
| } | |
| for (j=0; j<nb; ++j) { | |
| nc = (j!=nb-1 || _nc==0) ? NC : _nc; | |
| for (l=0; l<kb; ++l) { | |
| kc = (l!=kb-1 || _kc==0) ? KC : _kc; | |
| _beta = (l==0) ? beta : 1.0; | |
| pack_B(kc, nc, | |
| &B[l*KC*incRowB+j*NC*incColB], incRowB, incColB, | |
| _B); | |
| for (i=0; i<mb; ++i) { | |
| mc = (i!=mb-1 || _mc==0) ? MC : _mc; | |
| pack_A(mc, kc, | |
| &A[i*MC*incRowA+l*KC*incColA], incRowA, incColA, | |
| _A); | |
| dgemm_macro_kernel(mc, nc, kc, alpha, _beta, | |
| &C[i*MC*incRowC+j*NC*incColC], | |
| incRowC, incColC); | |
| } | |
| } | |
| } | |
| } | |
| //#define gemm(m, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) \ | |
| // dgemm_nn(m, n, k, alpha, A, 1, ldA, B, 1, ldB, beta, C, 1, ldC) | |
| // https://github.com/pjreddie/darknet/blob/master/src/gemm.c | |
| #include <stdio.h> | |
| #include <time.h> | |
| #define real double | |
| real *random_matrix(int rows, int cols) | |
| { | |
| real *m = amalloc(rows*cols*sizeof(real), 16); | |
| for (int i=0; i<rows*cols; i++) { | |
| m[i] = (real)rand()/RAND_MAX; | |
| } | |
| return m; | |
| } | |
| void time_random_matrix(int TA, int TB, int m, int k, int n) | |
| { | |
| real *a; | |
| if (!TA) a = random_matrix(m, k); | |
| else a = random_matrix(k, m); | |
| int lda = (!TA)?k:m; | |
| real *b; | |
| if (!TB) b = random_matrix(k, n); | |
| else b = random_matrix(n, k); | |
| int ldb = (!TB)?n:k; | |
| real *c = random_matrix(m, n); | |
| clock_t start = clock(); | |
| for (int i=0; i<10; i++) { | |
| // gemm_cpu(TA, TB, m, n, k, 1, a, lda, b, ldb, 1, c, n); | |
| dgemm_nn(m, n, k, 1.0, a, 1, lda, b, 1, ldb, 1.0, c, 1, n); | |
| } | |
| clock_t end = clock(); | |
| printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n", m, k, k, n, TA, TB, (real)(end-start)/CLOCKS_PER_SEC); | |
| free(a); | |
| free(b); | |
| free(c); | |
| } | |
| int main() | |
| { | |
| //time_random_matrix(0, 0, 64, 75, 12544); | |
| time_random_matrix(0, 0, 10, 10, 10); | |
| time_random_matrix(0, 0, 100, 100, 100); | |
| time_random_matrix(0, 0, 1000, 1000, 1000); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment