Skip to content

Instantly share code, notes, and snippets.

@zhangce
Created March 23, 2015 19:11
Show Gist options
  • Select an option

  • Save zhangce/b4b2d4e2e5c138aa809e to your computer and use it in GitHub Desktop.

Select an option

Save zhangce/b4b2d4e2e5c138aa809e to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
#include "timer.h"
#include "cblas.h"
#define BLOCKSIZE 256
/*
 * One accumulation step of the 16x6 micro-kernel:
 *   for c in 0..5:  C_c{c+1}_{first8,second8} += A_c[0..15] * B_r[c]
 *
 * A_c : pointer to 16 floats, read as two aligned 8-float vectors
 *       (_mm256_load_ps, so the address must be 32-byte aligned).
 * B_r : pointer to 6 floats; each one is broadcast across a vector.
 *
 * The macro does not declare anything. The expansion site must already have
 * these __m256 variables in scope:
 *   A_ci_first8, A_ci_second8, B_ri_num1, B_ri_num2,
 *   C_c1_first8 .. C_c6_first8, C_c1_second8 .. C_c6_second8.
 *
 * The two prefetches target the 16 floats immediately after A_c, i.e. the
 * data the next step loads when the caller advances A_c by 16 floats.
 *
 * Wrapped in do { } while (0) so the expansion behaves as a single statement
 * (safe inside an unbraced if/else); arguments are parenthesized so that
 * arbitrary pointer expressions can be passed.
 */
#define cstore16x1_rstore1x6_gemm(A_c, B_r) \
do { \
    A_ci_first8 = _mm256_load_ps((A_c)); \
    A_ci_second8 = _mm256_load_ps((A_c)+8); \
    __builtin_prefetch((A_c)+16, 0); \
    __builtin_prefetch((A_c)+24, 0); \
    B_ri_num1 = _mm256_broadcast_ss((B_r)); \
    B_ri_num2 = _mm256_broadcast_ss((B_r)+1); \
    C_c1_first8 = _mm256_fmadd_ps(A_ci_first8, B_ri_num1, C_c1_first8); \
    C_c1_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num1, C_c1_second8); \
    C_c2_first8 = _mm256_fmadd_ps(A_ci_first8, B_ri_num2, C_c2_first8); \
    C_c2_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num2, C_c2_second8); \
    B_ri_num1 = _mm256_broadcast_ss((B_r)+2); \
    B_ri_num2 = _mm256_broadcast_ss((B_r)+3); \
    C_c3_first8 = _mm256_fmadd_ps(A_ci_first8, B_ri_num1, C_c3_first8); \
    C_c3_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num1, C_c3_second8); \
    C_c4_first8 = _mm256_fmadd_ps(A_ci_first8, B_ri_num2, C_c4_first8); \
    C_c4_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num2, C_c4_second8); \
    B_ri_num1 = _mm256_broadcast_ss((B_r)+4); \
    B_ri_num2 = _mm256_broadcast_ss((B_r)+5); \
    C_c5_first8 = _mm256_fmadd_ps(A_ci_first8, B_ri_num1, C_c5_first8); \
    C_c5_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num1, C_c5_second8); \
    C_c6_first8 = _mm256_fmadd_ps(A_ci_first8, B_ri_num2, C_c6_first8); \
    C_c6_second8 = _mm256_fmadd_ps(A_ci_second8, B_ri_num2, C_c6_second8); \
} while (0)
/**
 * Unfinished sketch of a convolution driver, kept for reference only.
 * It is pseudo-code (note the incomplete inner loop) and is never compiled.
 *
void cstore16x5x5x96_rstore28x28x96_CONV(
const float * const K, const float * const D, float * O){
// D is blocked by two things -- the feature map, and every 6 output positions.
// First, get the pointer for each feature map
for(int d=0;d<96;d++){
const float * const D_ = D[d*28*28];
const float * const K_ = K[d*5*5*16];
for(int )
do(K_, D_)
}
}
**/
/**
 * 16x6 single-precision micro-kernel: C = A * B.
 *
 * Despite the "16x4 / 4x6" in the name, the shared dimension is BLOCKSIZE:
 * the loop consumes BLOCKSIZE slices of A and of B.
 *
 * @param A  BLOCKSIZE consecutive slices of 16 floats (slice i at A + i*16).
 *           Read with aligned vector loads, so A must be 32-byte aligned.
 * @param B  BLOCKSIZE consecutive slices of 6 floats (slice i at B + i*6).
 * @param C  Output of 6 groups of 16 floats (96 floats in total):
 *             C[r + c*16] = sum over i of A[i*16 + r] * B[i*6 + c].
 *           Written with aligned vector stores, so C must be 32-byte aligned.
 *           The previous contents of C are not read; they are overwritten.
 */
inline void cstore16x4_rstore4x6_gemm(
    const float * const A,
    const float * const B,
    float * const C){
    // Accumulators must start at zero: every FMA in the macro reads them
    // before writing, so leaving them uninitialized yields garbage results.
    __m256 C_c1_first8 = _mm256_setzero_ps(); __m256 C_c1_second8 = _mm256_setzero_ps();
    __m256 C_c2_first8 = _mm256_setzero_ps(); __m256 C_c2_second8 = _mm256_setzero_ps();
    __m256 C_c3_first8 = _mm256_setzero_ps(); __m256 C_c3_second8 = _mm256_setzero_ps();
    __m256 C_c4_first8 = _mm256_setzero_ps(); __m256 C_c4_second8 = _mm256_setzero_ps();
    __m256 C_c5_first8 = _mm256_setzero_ps(); __m256 C_c5_second8 = _mm256_setzero_ps();
    __m256 C_c6_first8 = _mm256_setzero_ps(); __m256 C_c6_second8 = _mm256_setzero_ps();

    // Scratch registers assigned by the macro on every step.
    __m256 A_ci_first8;
    __m256 A_ci_second8;
    __m256 B_ri_num1;
    __m256 B_ri_num2;

    // One rank-1 update per slice: 16 floats of A times 6 floats of B.
    for(int i=0;i<BLOCKSIZE;i++){
        cstore16x1_rstore1x6_gemm(A+i*16, B+i*6);
    }

    // Write the six 16-float result groups back, two vectors per group.
    _mm256_store_ps(C,    C_c1_first8);
    _mm256_store_ps(C+8,  C_c1_second8);
    _mm256_store_ps(C+16, C_c2_first8);
    _mm256_store_ps(C+24, C_c2_second8);
    _mm256_store_ps(C+32, C_c3_first8);
    _mm256_store_ps(C+40, C_c3_second8);
    _mm256_store_ps(C+48, C_c4_first8);
    _mm256_store_ps(C+56, C_c4_second8);
    _mm256_store_ps(C+64, C_c5_first8);
    _mm256_store_ps(C+72, C_c5_second8);
    _mm256_store_ps(C+80, C_c6_first8);
    _mm256_store_ps(C+88, C_c6_second8);
}
/**
 * Times cstore16x4_rstore4x6_gemm over N_ABLOCK independent blocks and prints
 * the elapsed time, the arithmetic rate, the memory traffic rate, and the
 * first 16x6 result block.
 *
 * Each block is 16*BLOCKSIZE floats of A, BLOCKSIZE*6 floats of B and 16*6
 * floats of C, which is exactly what one kernel call reads and writes.
 * The inc_a / inc_b / inc_c switches (0 or 1) select whether the
 * corresponding pointer advances to a fresh block between calls.
 *
 * Note: the printed rates divide by 1024^3, not 10^9.
 */
void test_throughput(){
    const int N_ABLOCK = 1;
    const int N_BBLOCK = N_ABLOCK;
    const int N_CBLOCK = N_ABLOCK;

    const int a_len = 16*BLOCKSIZE * N_ABLOCK;
    const int b_len = BLOCKSIZE*6 * N_BBLOCK;
    const int c_len = 16*6 * N_CBLOCK;

    // 32-byte alignment is required by the kernel's aligned loads and stores.
    float * const a_buf = (float*) _mm_malloc(a_len*sizeof(float), 32);
    float * const b_buf = (float*) _mm_malloc(b_len*sizeof(float), 32);
    float * const c_buf = (float*) _mm_malloc(c_len*sizeof(float), 32);
    if(!a_buf || !b_buf || !c_buf){
        fprintf(stderr, "test_throughput: allocation failed\n");
        if(a_buf){ _mm_free(a_buf); }
        if(b_buf){ _mm_free(b_buf); }
        if(c_buf){ _mm_free(c_buf); }
        return;
    }

    // Fill every block that the kernel will read, not just the first one.
    for(int i=0;i<a_len;i++){
        a_buf[i] = i % 100;
    }
    for(int i=0;i<b_len;i++){
        b_buf[i] = i * 100;
    }

    Timer t;
    float * A = a_buf;
    float * B = b_buf;
    float * C = c_buf;
    const int inc_a = 1;
    const int inc_b = 1;
    const int inc_c = 1;
    for(int i=0;i<N_ABLOCK;i++){
        cstore16x4_rstore4x6_gemm(A, B, C);
        // Reference BLAS call left by the author for comparison (note that
        // beta = 1.0 there accumulates into C, whereas the kernel overwrites):
        //   cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 16, 6, BLOCKSIZE,
        //               1.0, A, 16, B, 6, 1.0, C, 16);

        // Advance by what one kernel call actually consumes, matching the
        // block sizes used in the io formula below.
        A += inc_a * 16*BLOCKSIZE;
        B += inc_b * BLOCKSIZE*6;
        C += inc_c * 16*6;
    }
    double elapsed = t.elapsed();
    printf("Elapsed: %f\n seconds\n", elapsed);

    // 2 flops (multiply + add) per element of the 16 x BLOCKSIZE x 6 product.
    float flop = 1.0* N_ABLOCK * 16 * BLOCKSIZE * 6 * 2;
    float gflops = flop/elapsed/1024/1024/1024;
    printf("GFlops: %f GFLOPS\n", gflops);

    float io = 1.0 * N_ABLOCK * (16*BLOCKSIZE*inc_a + BLOCKSIZE*6*inc_b + 16*6*inc_c) * sizeof(float);
    float gb = 1.0* io / elapsed / 1024/1024/1024;
    printf("GB: %f GB/s\n", gb);

    // Print the first result block, one row of 6 columns per line.
    printf("C=\n");
    for(int r=0;r<16;r++){
        for(int c=0;c<6;c++){
            printf("%f ", c_buf[r+c*16]);
        }
        printf("\n");
    }

    _mm_free(a_buf);
    _mm_free(b_buf);
    _mm_free(c_buf);
}
/**
 * Small visual check: builds a 16x4 matrix A and a 4x6 matrix B filled with
 * 0, 1, 2, ... and prints A, B and the 16x6 result buffer C.
 *
 * No multiplication is currently performed here, so C is printed as zeros.
 * Do NOT re-enable a call to cstore16x4_rstore4x6_gemm(A, B, C) with these
 * buffers as they are: that kernel reads 16*BLOCKSIZE floats from A and
 * BLOCKSIZE*6 floats from B, far more than the 16*4 and 4*6 allocated here.
 * The author's reference BLAS call for this shape was:
 *   cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, 16, 6, 4,
 *               1.0, A, 16, B, 6, 1.0, C, 16);
 */
void test_16x6(){
    float * const A = (float*) _mm_malloc(16*4*sizeof(float), 32);
    float * const B = (float*) _mm_malloc(4*6*sizeof(float), 32);
    float * const C = (float*) _mm_malloc(16*6*sizeof(float), 32);
    if(!A || !B || !C){
        fprintf(stderr, "test_16x6: allocation failed\n");
        if(A){ _mm_free(A); }
        if(B){ _mm_free(B); }
        if(C){ _mm_free(C); }
        return;
    }

    for(int i=0;i<16*4;i++){
        A[i] = i;
    }
    for(int i=0;i<4*6;i++){
        B[i] = i;
    }
    // C was previously printed without ever being written; give it a
    // defined value so the output is deterministic.
    for(int i=0;i<16*6;i++){
        C[i] = 0.0f;
    }

    // A is indexed as 16 rows with a column stride of 16.
    printf("A=\n");
    for(int r=0;r<16;r++){
        for(int c=0;c<4;c++){
            printf("%f ", A[r+c*16]);
        }
        printf("\n");
    }

    // B is indexed as 4 rows with a row stride of 6.
    printf("B=\n");
    for(int r=0;r<4;r++){
        for(int c=0;c<6;c++){
            printf("%f ", B[c+r*6]);
        }
        printf("\n");
    }

    // C is indexed like A: 16 rows with a column stride of 16.
    printf("C=\n");
    for(int r=0;r<16;r++){
        for(int c=0;c<6;c++){
            printf("%f ", C[r+c*16]);
        }
        printf("\n");
    }

    _mm_free(A);
    _mm_free(B);
    _mm_free(C);
}
/**
 * Times one cblas_sgemm call on M x N x K = 1000^3 single-precision
 * matrices, prints the elapsed time and arithmetic rate, then prints the
 * leading elements of A, B and C using the same 16x4 / 4x6 / 16x6 indexing
 * as test_16x6.
 *
 * All three buffers are fully initialized before the call: sgemm reads every
 * element of A and B, and with beta = 1.0 it also reads every element of C.
 *
 * Note: the printed rate divides by 1024^3, not 10^9.
 */
void test_blas_speed_16x6(){
    const int M = 1000;
    const int N = M;
    const int K = M;

    float * const A = (float*) _mm_malloc(M*K*sizeof(float), 32);
    float * const B = (float*) _mm_malloc(N*K*sizeof(float), 32);
    float * const C = (float*) _mm_malloc(M*N*sizeof(float), 32);
    if(!A || !B || !C){
        fprintf(stderr, "test_blas_speed_16x6: allocation failed\n");
        if(A){ _mm_free(A); }
        if(B){ _mm_free(B); }
        if(C){ _mm_free(C); }
        return;
    }

    // Zero everything first so sgemm never touches indeterminate memory,
    // then seed the leading entries with 0, 1, 2, ... as before.
    for(int i=0;i<M*K;i++){
        A[i] = 0.0f;
    }
    for(int i=0;i<16*4;i++){
        A[i] = i;
    }
    for(int i=0;i<N*K;i++){
        B[i] = 0.0f;
    }
    for(int i=0;i<4*6;i++){
        B[i] = i;
    }
    for(int i=0;i<M*N;i++){
        C[i] = 0.0f;
    }

    Timer t;
    // M, N and K are all equal here, so every leading dimension is M.
    cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, M, N, K,
                1.0, A, M, B, M, 1.0, C, M);
    float elapsed = t.elapsed();
    printf("Elapsed: %f\n seconds\n", elapsed);

    // 2 flops (multiply + add) per element of the M x N x K product.
    float flop = 1.0* M * N * K * 2;
    float gflops = flop/elapsed/1024/1024/1024;
    printf("GFlops: %f GFLOPS\n", gflops);

    printf("A=\n");
    for(int r=0;r<16;r++){
        for(int c=0;c<4;c++){
            printf("%f ", A[r+c*16]);
        }
        printf("\n");
    }

    printf("B=\n");
    for(int r=0;r<4;r++){
        for(int c=0;c<6;c++){
            printf("%f ", B[c+r*6]);
        }
        printf("\n");
    }

    printf("C=\n");
    for(int r=0;r<16;r++){
        for(int c=0;c<6;c++){
            printf("%f ", C[r+c*16]);
        }
        printf("\n");
    }

    _mm_free(A);
    _mm_free(B);
    _mm_free(C);
}
/**
 * Program entry point: runs the micro-kernel throughput benchmark.
 * Command-line arguments are accepted but not used.
 */
int main(int argc, char** argv){
    // Ask OpenBLAS for a single thread before any test runs.
    openblas_set_num_threads(1);

    // Only the throughput benchmark is enabled. The other drivers in this
    // file, test_16x6() and test_blas_speed_16x6(), can be called here instead.
    test_throughput();

    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment