Created
March 23, 2015 19:11
-
-
Save zhangce/b4b2d4e2e5c138aa809e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <x86intrin.h> | |
| #include "timer.h" | |
| #include "cblas.h" | |
| #define BLOCKSIZE 256 | |
/*
 * cstore16x1_rstore1x6_gemm(A_c, B_r)
 *
 * One rank-1 update of a 16x6 register-blocked single-precision GEMM tile.
 *
 *   A_c : pointer to 16 consecutive floats (one column of A). It is read
 *         with _mm256_load_ps, so it must be 32-byte aligned.
 *   B_r : pointer to 6 consecutive floats (one row of B). Each value is
 *         broadcast with _mm256_broadcast_ss; no alignment is required.
 *
 * For every j in 1..6 the macro performs
 *   C_cj_first8  += A_c[0..7]  * B_r[j-1]
 *   C_cj_second8 += A_c[8..15] * B_r[j-1]
 *
 * The macro is intentionally not hygienic: the caller must have the
 * following __m256 variables in scope, and the accumulators must hold a
 * defined value before the first invocation:
 *   accumulators : C_c1_first8 .. C_c6_first8, C_c1_second8 .. C_c6_second8
 *   scratch      : A_ci_first8, A_ci_second8, B_ri_num1, B_ri_num2
 *
 * The next 16 floats after A_c are prefetched for reading; a prefetch is
 * only a hint, so this is harmless on the last column of a panel.
 *
 * The body is wrapped in do { ... } while (0) so that an invocation followed
 * by ';' is exactly one statement (safe inside an unbraced if/else), and
 * each argument is evaluated exactly once.
 */
#define cstore16x1_rstore1x6_gemm(A_c, B_r)                                        \
  do {                                                                             \
    const float * const gemm_a_col_ = (A_c);                                       \
    const float * const gemm_b_row_ = (B_r);                                       \
    A_ci_first8  = _mm256_load_ps(gemm_a_col_);                                    \
    A_ci_second8 = _mm256_load_ps(gemm_a_col_ + 8);                                \
    __builtin_prefetch(gemm_a_col_ + 16, 0);                                       \
    __builtin_prefetch(gemm_a_col_ + 24, 0);                                       \
    /* columns 1 and 2 of the C tile */                                            \
    B_ri_num1 = _mm256_broadcast_ss(gemm_b_row_);                                  \
    B_ri_num2 = _mm256_broadcast_ss(gemm_b_row_ + 1);                              \
    C_c1_first8  = _mm256_fmadd_ps(A_ci_first8,  B_ri_num1, C_c1_first8);          \
    C_c1_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num1, C_c1_second8);         \
    C_c2_first8  = _mm256_fmadd_ps(A_ci_first8,  B_ri_num2, C_c2_first8);          \
    C_c2_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num2, C_c2_second8);         \
    /* columns 3 and 4 of the C tile */                                            \
    B_ri_num1 = _mm256_broadcast_ss(gemm_b_row_ + 2);                              \
    B_ri_num2 = _mm256_broadcast_ss(gemm_b_row_ + 3);                              \
    C_c3_first8  = _mm256_fmadd_ps(A_ci_first8,  B_ri_num1, C_c3_first8);          \
    C_c3_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num1, C_c3_second8);         \
    C_c4_first8  = _mm256_fmadd_ps(A_ci_first8,  B_ri_num2, C_c4_first8);          \
    C_c4_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num2, C_c4_second8);         \
    /* columns 5 and 6 of the C tile */                                            \
    B_ri_num1 = _mm256_broadcast_ss(gemm_b_row_ + 4);                              \
    B_ri_num2 = _mm256_broadcast_ss(gemm_b_row_ + 5);                              \
    C_c5_first8  = _mm256_fmadd_ps(A_ci_first8,  B_ri_num1, C_c5_first8);          \
    C_c5_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num1, C_c5_second8);         \
    C_c6_first8  = _mm256_fmadd_ps(A_ci_first8,  B_ri_num2, C_c6_first8);          \
    C_c6_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num2, C_c6_second8);         \
  } while (0)
| /** | |
| void cstore16x5x5x96_rstore28x28x96_CONV( | |
| const float * const K, const float * const D, float * O){ | |
| // D is blocked by two things -- feature map, and every 6 output position, | |
| // First, get the pointer for each feature map | |
| for(int d=0;d<96;d++){ | |
| const float * const D_ = D[d*28*28]; | |
| const float * const K_ = K[d*5*5*16]; | |
| for(int ) | |
| do(K_, D_) | |
| } | |
| } | |
| **/ | |
| inline void cstore16x4_rstore4x6_gemm( | |
| const float * const A, | |
| const float * const B, | |
| float * const C){ | |
| __m256 C_c1_first8 ; __m256 C_c1_second8 ; | |
| __m256 C_c2_first8 ; __m256 C_c2_second8 ; | |
| __m256 C_c3_first8 ; __m256 C_c3_second8 ; | |
| __m256 C_c4_first8 ; __m256 C_c4_second8 ; | |
| __m256 C_c5_first8 ; __m256 C_c5_second8 ; | |
| __m256 C_c6_first8 ; __m256 C_c6_second8 ; | |
| __m256 A_ci_first8; | |
| __m256 A_ci_second8; | |
| __m256 B_ri_num1; | |
| __m256 B_ri_num2; | |
| for(int i=0;i<BLOCKSIZE;i++){ | |
| cstore16x1_rstore1x6_gemm(A+i*16, B+i*6); | |
| } | |
| _mm256_store_ps(C, C_c1_first8); | |
| _mm256_store_ps(C+8, C_c1_second8); | |
| _mm256_store_ps(C+16, C_c2_first8); | |
| _mm256_store_ps(C+24, C_c2_second8); | |
| _mm256_store_ps(C+32, C_c3_first8); | |
| _mm256_store_ps(C+40, C_c3_second8); | |
| _mm256_store_ps(C+48, C_c4_first8); | |
| _mm256_store_ps(C+56, C_c4_second8); | |
| _mm256_store_ps(C+64, C_c5_first8); | |
| _mm256_store_ps(C+72, C_c5_second8); | |
| _mm256_store_ps(C+80, C_c6_first8); | |
| _mm256_store_ps(C+88, C_c6_second8); | |
| } | |
| void test_throughput(){ | |
| const int N_ABLOCK = 1; | |
| const int N_BBLOCK = N_ABLOCK; | |
| const int N_CBLOCK = N_ABLOCK; | |
| float * const _A = (float*) _mm_malloc(16*BLOCKSIZE*sizeof(float) * N_ABLOCK, 32); | |
| float * const _B = (float*) _mm_malloc(BLOCKSIZE*6*sizeof(float) * N_BBLOCK, 32); | |
| float * const _C = (float*) _mm_malloc(16*6*sizeof(float) * N_CBLOCK, 32); | |
| for(int i=0;i<16*BLOCKSIZE;i++){ | |
| _A[i] = i % 100; | |
| } | |
| for(int i=0;i<BLOCKSIZE*6;i++){ | |
| _B[i] = i * 100; | |
| } | |
| Timer t; | |
| float * A = _A; | |
| float * B = _B; | |
| float * C = _C; | |
| const int inc_a = 1; | |
| const int inc_b = 1; | |
| const int inc_c = 1; | |
| for(int i=0;i<N_ABLOCK;i++){ | |
| //cstore16x4_rstore4x6_gemm( | |
| // A, A+16, A+32, A+48, | |
| // B, B+6, A+12, B+18, | |
| // C, C+16, C+32, C+48, C+64, C+80); | |
| cstore16x4_rstore4x6_gemm(A, B, C); | |
| //cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 16, 6, BLOCKSIZE, | |
| // 1.0, A, 16, B, 6, 1.0, C, 16); | |
| //cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, 16, 6, BLOCKSIZE, | |
| // 1.0, A, 16, B, BLOCKSIZE, 1.0, C, 16); | |
| A += inc_a * 16*4; | |
| B += inc_b * 4*6; | |
| C += inc_c * 16*6; | |
| } | |
| double elapsed = t.elapsed(); | |
| printf("Elapsed: %f\n seconds\n", elapsed); | |
| float flop = 1.0* N_ABLOCK * 16 * BLOCKSIZE * 6 * 2; | |
| float gflops = flop/elapsed/1024/1024/1024; | |
| printf("GFlops: %f GFLOPS\n", gflops); | |
| float io = 1.0 * N_ABLOCK * (16*BLOCKSIZE*inc_a + BLOCKSIZE*6*inc_b + 16*6*inc_c) * sizeof(float); | |
| float gb = 1.0* io / elapsed / 1024/1024/1024; | |
| printf("GB: %f GB/s\n", gb); | |
| printf("C=\n"); | |
| for(int r=0;r<16;r++){ | |
| for(int c=0;c<6;c++){ | |
| printf("%f ", _C[r+c*16]); | |
| } | |
| printf("\n"); | |
| } | |
| } | |
/**
 * Prints a small 16x4 A, 4x6 B and 16x6 C for eyeballing the data layouts.
 *
 * A is printed as a 16-row column-stored matrix (A[r + c*16]), B as a
 * 6-wide row-stored matrix (B[c + r*6]) and C as a 16-row column-stored
 * matrix (C[r + c*16]).
 *
 * No product is computed here. The hand-written kernel always consumes
 * BLOCKSIZE-deep panels, which would read past these 4-deep buffers. To
 * compare against BLAS, insert before the printing:
 *   cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 16, 6, 4,
 *               1.0, A, 16, B, 6, 1.0, C, 16);
 *
 * Fixes relative to the previous version: C is zero-filled instead of being
 * printed uninitialised (which also makes the beta = 1.0 BLAS call above
 * well defined), allocations are checked, and all buffers are freed.
 */
void test_16x6(){
  const int rows = 16;
  const int depth = 4;
  const int cols = 6;

  float * const A = (float*) _mm_malloc(rows*depth*sizeof(float), 32);
  float * const B = (float*) _mm_malloc(depth*cols*sizeof(float), 32);
  float * const C = (float*) _mm_malloc(rows*cols*sizeof(float), 32);
  if(A == NULL || B == NULL || C == NULL){
    fprintf(stderr, "test_16x6: allocation failed\n");
    _mm_free(A);
    _mm_free(B);
    _mm_free(C);
    return;
  }

  for(int i=0;i<rows*depth;i++){
    A[i] = (float)i;
  }
  for(int i=0;i<depth*cols;i++){
    B[i] = (float)i;
  }
  for(int i=0;i<rows*cols;i++){
    C[i] = 0.0f;
  }

  printf("A=\n");
  for(int r=0;r<rows;r++){
    for(int c=0;c<depth;c++){
      printf("%f ", A[r+c*rows]);
    }
    printf("\n");
  }

  printf("B=\n");
  for(int r=0;r<depth;r++){
    for(int c=0;c<cols;c++){
      printf("%f ", B[c+r*cols]);
    }
    printf("\n");
  }

  printf("C=\n");
  for(int r=0;r<rows;r++){
    for(int c=0;c<cols;c++){
      printf("%f ", C[r+c*rows]);
    }
    printf("\n");
  }

  _mm_free(A);
  _mm_free(B);
  _mm_free(C);
}
| void test_blas_speed_16x6(){ | |
| const int M = 1000; | |
| const int N = M; | |
| const int K = M; | |
| float * const A = (float*) _mm_malloc(M*N*sizeof(float), 32); | |
| for(int i=0;i<16*4;i++){ | |
| A[i] = i; | |
| } | |
| float * const B = (float*) _mm_malloc(N*K*sizeof(float), 32); | |
| for(int i=0;i<4*6;i++){ | |
| B[i] = i; | |
| } | |
| float * const C = (float*) _mm_malloc(N*K*sizeof(float), 32); | |
| Timer t; | |
| cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K, | |
| 1.0, A, M, B, M, 1.0, C, M); | |
| float elapsed = t.elapsed(); | |
| printf("Elapsed: %f\n seconds\n", elapsed); | |
| float flop = 1.0* M * N * K * 2; | |
| float gflops = flop/elapsed/1024/1024/1024; | |
| printf("GFlops: %f GFLOPS\n", gflops); | |
| printf("A=\n"); | |
| for(int r=0;r<16;r++){ | |
| for(int c=0;c<4;c++){ | |
| printf("%f ", A[r+c*16]); | |
| } | |
| printf("\n"); | |
| } | |
| printf("B=\n"); | |
| for(int r=0;r<4;r++){ | |
| for(int c=0;c<6;c++){ | |
| printf("%f ", B[c+r*6]); | |
| } | |
| printf("\n"); | |
| } | |
| printf("C=\n"); | |
| for(int r=0;r<16;r++){ | |
| for(int c=0;c<6;c++){ | |
| printf("%f ", C[r+c*16]); | |
| } | |
| printf("\n"); | |
| } | |
| /* | |
| for(int i=0;i<16*6;i++){ | |
| printf("%f ", C[i]); | |
| } | |
| printf("\n"); | |
| */ | |
| } | |
| int main(int argc, char** argv){ | |
| //cstore16x4_rstore4x6_gemm(NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| // NULL, NULL, NULL, NULL, NULL, NULL); | |
| openblas_set_num_threads(1); | |
| //test_16x6(); | |
| //test_blas_speed_16x6(); | |
| test_throughput(); | |
| return 0; | |
| } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment