TNick · February 12, 2019 15:33
diff --git a/test_blas.c b/test_blas.c
 /*
 * Author:  Nicu Tofan
 * License: BSD
 *
 * See below for getRealTime() license.
 */


 #include <cblas.h>

 #include <stdlib.h>
 #include <stdio.h>

 #define test_m          2
 #define test_k          3
 #define test_n          4
 #define test_m_extra    5
 #define test_k_extra    7
 #define test_n_extra    8

 #define test_row_major  1
 #define test_col_major  0


 double getRealTime();


 //! prints a matrix in either row major or column major order
 static void printMatrix (double *data, int rows, int cols, int row_major)
 {
    int i,j;

    // column header
    printf ("        ");
    for(j=0; j<cols;j++) {
        printf (" Col %2d  ", j+1);
    }
    printf ("\n");

    for(i=0; i<rows;i++) {
        printf ("Row %2d: ", i+1);
        for(j=0; j<cols;j++) {
            if (row_major) {
                printf ("%7.4f  ", data[i*cols+j]);
            } else {
                printf ("%7.4f  ", data[j*rows+i]);
            }
        }
        printf ("\n");
    }
    printf ("\n--------------------------------------------------------\n\n");
 }

 //! test printMatrix() to make sure it works
 static void testPrintMatrix()
 {
    printf ("==========================================================\n");
    printf ("Test print function\n");
    printf ("==========================================================\n\n");

    printf ("Matrix in row major order\n\n");
    double A[2*3] = {
        1, 2, 3,
        4, 5, 6
    };
    printMatrix (A, 2, 3, test_row_major);

    printf ("Matrix in column major order\n\n");
    double B[2*3] = {
        1, 4,
        2, 5,
        3, 6
    };
    printMatrix (B, 2, 3, test_col_major);
 }


 //! testing the extra-space
 static void testMulColMajorExtraSpace()
 {
    printf ("==========================================================\n");
    printf ("Extra space in matrix not used in ops\n");
    printf ("==========================================================\n\n");

    /*          MATRIX A[2x3]

                Col  1  Col  2  Col  3
        Row  1: 1.0000  2.0000  3.0000
        Row  2: 4.0000  5.0000  6.0000

    */

    printf ("A\n\n");
    double A[] = {
        1, 4, 9, 9, 9, // 9's will not affect the computation
        2, 5, 9, 9, 9,
        3, 6, 9, 9, 9
    };
    printMatrix (A, test_m_extra, test_k, test_col_major);

    /*          MATRIX B[3x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1: 1.0000  2.0000  3.0000  4.0000
        Row  2: 0.1000  0.2000  0.3000  0.4000
        Row  3: 0.0100  0.0200  0.0300  0.0400

    */
    printf ("B\n\n");
    double B[] = {
        1.0, 0.1, 0.01, 9.0, 9.0, 9.0, 9.0, // 9's will not affect the computation
        2.0, 0.2, 0.02, 9.0, 9.0, 9.0, 9.0,
        3.0, 0.3, 0.03, 9.0, 9.0, 9.0, 9.0,
        4.0, 0.4, 0.04, 9.0, 9.0, 9.0, 9.0,
    };
    printMatrix (B, test_k_extra, test_n, test_col_major);

    /*          MATRIX C[2x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1:  1.2300  2.4600  3.6900  4.9200
        Row  2:  4.5600  9.1200 13.6800 18.2400
    */
    double C[test_n_extra*test_n] = {0};

    printf ("C\n\n");
    // C = alpha*A*B+beta*C
    cblas_dgemm(
          CblasColMajor,
          CblasNoTrans, // TRANSA:
          CblasNoTrans, // TRANSB:
          test_m,   // A matrix: m rows, k columns
          test_n,   // B matrix: k rows, n columns
          test_k,   // C matrix: m rows, n columns
          1.0,      // alpha = 1.0
          A,        // A source matrix
          test_m_extra,   // true lenght of a column in A
                    //   if TRANSA = 'N' then >= m
                    //   if TRANSA = 'T' then >= k
          B,        // B source matrix
          test_k_extra,   // true lenght of a column in B
                    //   if TRANSB = 'N' then >= k
                    //   if TRANSB = 'T' then >= n
          0.0,      // beta = 0.0
          C,        // C destination matrix
          test_n_extra);  // true lenght of a row in C
                    //   must be >= m
    printMatrix (C, test_n_extra, test_n, test_col_major);
 }

 //! blas expects column major order
 static void testMulColMajor()
 {
    printf ("==========================================================\n");
    printf ("Test BLAS multiply - column major order\n");
    printf ("==========================================================\n\n");

    /*          MATRIX A[2x3]

                Col  1  Col  2  Col  3
        Row  1: 1.0000  2.0000  3.0000
        Row  2: 4.0000  5.0000  6.0000

    */

    printf ("A\n\n");
    double A[test_m*test_k] = {
        1, 4,
        2, 5,
        3, 6
    };
    printMatrix (A, test_m, test_k, test_col_major);

    /*          MATRIX B[3x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1: 1.0000  2.0000  3.0000  4.0000
        Row  2: 0.1000  0.2000  0.3000  0.4000
        Row  3: 0.0100  0.0200  0.0300  0.0400

    */
    printf ("B\n\n");
    double B[test_k*test_n] = {
        1.0, 0.1, 0.01,
        2.0, 0.2, 0.02,
        3.0, 0.3, 0.03,
        4.0, 0.4, 0.04
    };
    printMatrix (B, test_k, test_n, test_col_major);

    /*          MATRIX C[2x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1:  1.2300  2.4600  3.6900  4.9200
        Row  2:  4.5600  9.1200 13.6800 18.2400
    */
    double C[test_m*test_n] = {0};

    printf ("C\n\n");
    // http://www.math.utah.edu/software/lapack/lapack-blas/dgemm.html
    /*
    cblas_dgemm(
          'N',      // TRANSA: N - do not transpose A, T - transpose A
          'N',      // TRANSB: N - do not transpose A, T - transpose A
          test_m,   // A matrix: m rows, k columns
          test_n,   // B matrix: k rows, n columns
          test_k,   // C matrix: m rows, n columns
          1.0,      // alpha = 1.0
          A,        // A source matrix
          test_m,   // true lenght of a column in A
                    //   if TRANSA = 'N' then >= m
                    //   if TRANSA = 'T' then >= k
          B,        // B source matrix
          test_k,   // true lenght of a column in B
                    //   if TRANSB = 'N' then >= k
                    //   if TRANSB = 'T' then >= n
          0.0,      // beta = 0.0
          C,        // C destination matrix
          test_m);  // true lenght of a row in C
                    //   must be >= m
                    */
    // C = alpha*A*B+beta*C
    cblas_dgemm(
          CblasColMajor,
          CblasNoTrans, // TRANSA:
          CblasNoTrans, // TRANSB:
          test_m,   // A matrix: m rows, k columns
          test_n,   // B matrix: k rows, n columns
          test_k,   // C matrix: m rows, n columns
          1.0,      // alpha = 1.0
          A,        // A source matrix
          test_m,   // true lenght of a column in A
                    //   if TRANSA = 'N' then >= m
                    //   if TRANSA = 'T' then >= k
          B,        // B source matrix
          test_k,   // true lenght of a column in B
                    //   if TRANSB = 'N' then >= k
                    //   if TRANSB = 'T' then >= n
          0.0,      // beta = 0.0
          C,        // C destination matrix
          test_m);  // true lenght of a row in C
                    //   must be >= m
    printMatrix (C, test_m, test_n, test_col_major);
 }

 //! attempt the trickery
 static void testMulRowMajor_trickery()
 {
    printf ("==========================================================\n");
    printf ("Test BLAS multiply - row major order with trickery\n");
    printf ("==========================================================\n\n");

    /*          MATRIX A[2x3]

                Col  1  Col  2  Col  3
        Row  1: 1.0000  2.0000  3.0000
        Row  2: 4.0000  5.0000  6.0000

    */

    printf ("A\n\n");
    double A[test_m*test_k] = {
        1, 2, 3,
        4, 5, 6
    };
    printMatrix (A, test_m, test_k, test_row_major);

    /*          MATRIX B[3x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1: 1.0000  2.0000  3.0000  4.0000
        Row  2: 0.1000  0.2000  0.3000  0.4000
        Row  3: 0.0100  0.0200  0.0300  0.0400

    */
    printf ("B\n\n");
    double B[test_k*test_n] = {
        1.00, 2.00, 3.00, 4.00,
        0.10, 0.20, 0.30, 0.40,
        0.01, 0.02, 0.03, 0.04
    };
    printMatrix (B, test_k, test_n, test_row_major);

    /*          MATRIX C[2x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1:  1.2300  2.4600  3.6900  4.9200
        Row  2:  4.5600  9.1200 13.6800 18.2400
    */
    double C[test_m*test_n] = {0};

    printf ("C\n\n");

    // C = alpha*A*B+beta*C
    cblas_dgemm(
        CblasColMajor,
        CblasNoTrans, // TRANSA:
        CblasNoTrans, // TRANSB:
        test_n,   // B matrix: k rows, n columns
        test_m,   // A matrix: m rows, k columns
        test_k,   // C matrix: m rows, n columns
        1.0,      // alpha = 1.0
        B,        // B source matrix
        test_n,   // true lenght of a column in B
                  //   if TRANSB = 'N' then >= k
                  //   if TRANSB = 'T' then >= n
        A,        // A source matrix
        test_k,   // true lenght of a column in A
                  //   if TRANSA = 'N' then >= m
                  //   if TRANSA = 'T' then >= k
        0.0,      // beta = 0.0
        C,        // C destination matrix
        test_n);  // true lenght of a row in C
                  //   must be >= m
    printMatrix (C, test_m, test_n, test_row_major);
 }

 //! build in version
 static void testMulRowMajor()
 {
    printf ("==========================================================\n");
    printf ("Test BLAS multiply - row major order without trickery\n");
    printf ("==========================================================\n\n");

    /*          MATRIX A[2x3]

                Col  1  Col  2  Col  3
        Row  1: 1.0000  2.0000  3.0000
        Row  2: 4.0000  5.0000  6.0000

    */

    printf ("A\n\n");
    double A[test_m*test_k] = {
        1, 2, 3,
        4, 5, 6
    };
    printMatrix (A, test_m, test_k, test_row_major);

    /*          MATRIX B[3x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1: 1.0000  2.0000  3.0000  4.0000
        Row  2: 0.1000  0.2000  0.3000  0.4000
        Row  3: 0.0100  0.0200  0.0300  0.0400

    */
    printf ("B\n\n");
    double B[test_k*test_n] = {
        1.00, 2.00, 3.00, 4.00,
        0.10, 0.20, 0.30, 0.40,
        0.01, 0.02, 0.03, 0.04
    };
    printMatrix (B, test_k, test_n, test_row_major);

    /*          MATRIX C[2x4]

                Col  1  Col  2  Col  3  Col  4
        Row  1:  1.2300  2.4600  3.6900  4.9200
        Row  2:  4.5600  9.1200 13.6800 18.2400
    */
    double C[test_m*test_n] = {0};

    printf ("C\n\n");

    // C = alpha*A*B+beta*C
    cblas_dgemm(
          CblasRowMajor,
          CblasNoTrans, // TRANSA:
          CblasNoTrans, // TRANSB:
          test_m,   // A matrix: m rows, k columns
          test_n,   // B matrix: k rows, n columns
          test_k,   // C matrix: m rows, n columns
          1.0,      // alpha = 1.0
          A,        // A source matrix
          test_k,   // true lenght of a column in A
                    //   if TRANSA = 'N' then >= m
                    //   if TRANSA = 'T' then >= k
          B,        // B source matrix
          test_n,   // true lenght of a column in B
                    //   if TRANSB = 'N' then >= k
                    //   if TRANSB = 'T' then >= n
          0.0,      // beta = 0.0
          C,        // C destination matrix
          test_n);  // true lenght of a row in C
                    //   must be >= m
    printMatrix (C, test_m, test_n, test_col_major);
 }

 #undef test_m
 #undef test_n
 #undef test_k

 #define test_m          500
 #define test_k          600
 #define test_n          700

 #define test_spins      5

 // ---------------------------------------------------------------------
 #define IMPLEMENTATION_ROW_MAJOR \
    cblas_dgemm( \
                CblasRowMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
                test_k, 1.0, A, test_k, B, test_n, 0.0, C, test_n); \
    startTime = getRealTime (); \
    for(i=0; i<test_spins;i++) { \
        cblas_dgemm( \
                    CblasRowMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
                    test_k, 1.0, A, test_k, B, test_n, 0.0, C, test_n); \
    } \
    endTime = getRealTime (); \
    printf ("- Build-in implementation - row major (%d) = %lf\n", test_spins, (endTime - startTime) );
 // ---------------------------------------------------------------------
 #define IMPLEMENTATION_TRICKERY \
    cblas_dgemm( \
                CblasColMajor, CblasNoTrans, CblasNoTrans, test_n, test_m, \
                test_k, 1.0, B, test_n, A, test_k, 0.0, C, test_n); \
    startTime = getRealTime (); \
    for(i=0; i<test_spins;i++) { \
        cblas_dgemm( \
                    CblasColMajor, CblasNoTrans, CblasNoTrans, test_n, test_m, \
                    test_k, 1.0, B, test_n, A, test_k, 0.0, C, test_n); \
    } \
    endTime = getRealTime (); \
    printf ("- Trickery implementation (%d)= %lf\n", test_spins, (endTime - startTime) );
 // ---------------------------------------------------------------------
 #define IMPLEMENTATION_COL_MAJOR \
    cblas_dgemm( \
                CblasColMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
                test_k, 1.0, A, test_m, B, test_k, 0.0, C, test_m); \
    startTime = getRealTime (); \
    for(i=0; i<test_spins;i++) { \
        cblas_dgemm( \
                    CblasColMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
                    test_k, 1.0, A, test_m, B, test_k, 0.0, C, test_m); \
    } \
    endTime = getRealTime (); \
    printf ("- Build-in implementation - column major (%d) = %lf\n", test_spins, (endTime - startTime) );
 // ---------------------------------------------------------------------


 //! test my undestanding of the parameters
 void compareBuildInWithTrickery()
 {

    printf ("==========================================================\n");
    printf ("Compare these two methods\n");
    printf ("==========================================================\n\n");

    // srand((unsigned)time(NULL));

    double startTime, endTime;
    int i;

    double *A = (double*)malloc(test_m*test_k*sizeof(double));
    int a_tot = test_m*test_k;
    double *iter = A;
    for (i = 0; i<a_tot; i++) {
        *iter = ((double)rand()/(double)RAND_MAX);;
        iter++;
    }

    double *B = (double*)malloc(test_k*test_n*sizeof(double));
    int b_tot = test_k*test_n;
    iter = B;
    for (i = 0; i<b_tot; i++) {
        *iter = ((double)rand()/(double)RAND_MAX);;
        iter++;
    }

    double *C = (double*)malloc(test_m*test_n*sizeof(double));
    iter = C;
    int c_tot = test_m*test_n;
    for (i = 0; i<c_tot; i++) {
        *iter = 0.0;
        iter++;
    }

    IMPLEMENTATION_ROW_MAJOR;
    IMPLEMENTATION_TRICKERY;
    IMPLEMENTATION_COL_MAJOR;

    IMPLEMENTATION_TRICKERY;
    IMPLEMENTATION_ROW_MAJOR;
    IMPLEMENTATION_COL_MAJOR;

    IMPLEMENTATION_ROW_MAJOR;
    IMPLEMENTATION_COL_MAJOR;
    IMPLEMENTATION_TRICKERY;

    IMPLEMENTATION_COL_MAJOR;
    IMPLEMENTATION_ROW_MAJOR;
    IMPLEMENTATION_TRICKERY;

    IMPLEMENTATION_COL_MAJOR;
    IMPLEMENTATION_TRICKERY;
    IMPLEMENTATION_ROW_MAJOR;

    IMPLEMENTATION_TRICKERY;
    IMPLEMENTATION_COL_MAJOR;
    IMPLEMENTATION_ROW_MAJOR;

 }


 int main(int argc, char **argv)
 {
    testPrintMatrix();
    testMulColMajorExtraSpace();
    testMulColMajor();
    testMulRowMajor_trickery();
    testMulRowMajor();

    compareBuildInWithTrickery();

    return 0;
 }






 /*
 * Author:  David Robert Nadeau
 * Site:    http://NadeauSoftware.com/
 * License: Creative Commons Attribution 3.0 Unported License
 *          http://creativecommons.org/licenses/by/3.0/deed.en_US
 *
 * http://nadeausoftware.com/articles/2012/04/c_c_tip_how_measure_elapsed_real_time_benchmarking
 */
 #if defined(_WIN32)
 #include <Windows.h>

 #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
 #include <unistd.h>	/* POSIX flags */
 #include <time.h>	/* clock_gettime(), time() */
 #include <sys/time.h>	/* gethrtime(), gettimeofday() */

 #if defined(__MACH__) && defined(__APPLE__)
 #include <mach/mach.h>
 #include <mach/mach_time.h>
 #endif

 #else
 #error "Unable to define getRealTime( ) for an unknown OS."
 #endif





 /**
 * Returns the real time, in seconds, or -1.0 if an error occurred.
 *
 * Time is measured since an arbitrary and OS-dependent start time.
 * The returned real time is only useful for computing an elapsed time
 * between two calls to this function.
 */
 double getRealTime( )
 {
 #if defined(_WIN32)
    FILETIME tm;
    ULONGLONG t;
 #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
    /* Windows 8, Windows Server 2012 and later. ---------------- */
    GetSystemTimePreciseAsFileTime( &tm );
 #else
    /* Windows 2000 and later. ---------------------------------- */
    GetSystemTimeAsFileTime( &tm );
 #endif
    t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime;
    return (double)t / 10000000.0;

 #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__)))
    /* HP-UX, Solaris. ------------------------------------------ */
    return (double)gethrtime( ) / 1000000000.0;

 #elif defined(__MACH__) && defined(__APPLE__)
    /* OSX. ----------------------------------------------------- */
    static double timeConvert = 0.0;
    if ( timeConvert == 0.0 )
    {
        mach_timebase_info_data_t timeBase;
        (void)mach_timebase_info( &timeBase );
        timeConvert = (double)timeBase.numer /
            (double)timeBase.denom /
            1000000000.0;
    }
    return (double)mach_absolute_time( ) * timeConvert;

 #elif defined(_POSIX_VERSION)
    /* POSIX. --------------------------------------------------- */
 #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
    {
        struct timespec ts;
 #if defined(CLOCK_MONOTONIC_PRECISE)
        /* BSD. --------------------------------------------- */
        const clockid_t id = CLOCK_MONOTONIC_PRECISE;
 #elif defined(CLOCK_MONOTONIC_RAW)
        /* Linux. ------------------------------------------- */
        const clockid_t id = CLOCK_MONOTONIC_RAW;
 #elif defined(CLOCK_HIGHRES)
        /* Solaris. ----------------------------------------- */
        const clockid_t id = CLOCK_HIGHRES;
 #elif defined(CLOCK_MONOTONIC)
        /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
        const clockid_t id = CLOCK_MONOTONIC;
 #elif defined(CLOCK_REALTIME)
        /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
        const clockid_t id = CLOCK_REALTIME;
 #else
        const clockid_t id = (clockid_t)-1;	/* Unknown. */
 #endif /* CLOCK_* */
        if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
            return (double)ts.tv_sec +
                (double)ts.tv_nsec / 1000000000.0;
        /* Fall thru. */
    }
 #endif /* _POSIX_TIMERS */

    /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
    struct timeval tm;
    gettimeofday( &tm, NULL );
    return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
 #else
    return -1.0;		/* Failed. */
 #endif
 }
	/*
	* Author: Nicu Tofan
	* License: BSD
	*
	* See below for getRealTime() license.
	*/


	#include <cblas.h>

	#include <stdlib.h>
	#include <stdio.h>

	#define test_m 2
	#define test_k 3
	#define test_n 4
	#define test_m_extra 5
	#define test_k_extra 7
	#define test_n_extra 8

	#define test_row_major 1
	#define test_col_major 0


	double getRealTime();


	//! prints a matrix in either row major or column major order
	static void printMatrix (double *data, int rows, int cols, int row_major)
	{
	int i,j;

	// column header
	printf (" ");
	for(j=0; j<cols;j++) {
	printf (" Col %2d ", j+1);
	}
	printf ("\n");

	for(i=0; i<rows;i++) {
	printf ("Row %2d: ", i+1);
	for(j=0; j<cols;j++) {
	if (row_major) {
	printf ("%7.4f ", data[i*cols+j]);
	} else {
	printf ("%7.4f ", data[j*rows+i]);
	}
	}
	printf ("\n");
	}
	printf ("\n--------------------------------------------------------\n\n");
	}

	//! test printMatrix() to make sure it works
	static void testPrintMatrix()
	{
	printf ("==========================================================\n");
	printf ("Test print function\n");
	printf ("==========================================================\n\n");

	printf ("Matrix in row major order\n\n");
	double A[2*3] = {
	1, 2, 3,
	4, 5, 6
	};
	printMatrix (A, 2, 3, test_row_major);

	printf ("Matrix in column major order\n\n");
	double B[2*3] = {
	1, 4,
	2, 5,
	3, 6
	};
	printMatrix (B, 2, 3, test_col_major);
	}


	//! testing the extra-space
	static void testMulColMajorExtraSpace()
	{
	printf ("==========================================================\n");
	printf ("Extra space in matrix not used in ops\n");
	printf ("==========================================================\n\n");

	/* MATRIX A[2x3]

	Col 1 Col 2 Col 3
	Row 1: 1.0000 2.0000 3.0000
	Row 2: 4.0000 5.0000 6.0000

	*/

	printf ("A\n\n");
	double A[] = {
	1, 4, 9, 9, 9, // 9's will not affect the computation
	2, 5, 9, 9, 9,
	3, 6, 9, 9, 9
	};
	printMatrix (A, test_m_extra, test_k, test_col_major);

	/* MATRIX B[3x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.0000 2.0000 3.0000 4.0000
	Row 2: 0.1000 0.2000 0.3000 0.4000
	Row 3: 0.0100 0.0200 0.0300 0.0400

	*/
	printf ("B\n\n");
	double B[] = {
	1.0, 0.1, 0.01, 9.0, 9.0, 9.0, 9.0, // 9's will not affect the computation
	2.0, 0.2, 0.02, 9.0, 9.0, 9.0, 9.0,
	3.0, 0.3, 0.03, 9.0, 9.0, 9.0, 9.0,
	4.0, 0.4, 0.04, 9.0, 9.0, 9.0, 9.0,
	};
	printMatrix (B, test_k_extra, test_n, test_col_major);

	/* MATRIX C[2x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.2300 2.4600 3.6900 4.9200
	Row 2: 4.5600 9.1200 13.6800 18.2400
	*/
	double C[test_n_extra*test_n] = {0};

	printf ("C\n\n");
	// C = alphaAB+beta*C
	cblas_dgemm(
	CblasColMajor,
	CblasNoTrans, // TRANSA:
	CblasNoTrans, // TRANSB:
	test_m, // A matrix: m rows, k columns
	test_n, // B matrix: k rows, n columns
	test_k, // C matrix: m rows, n columns
	1.0, // alpha = 1.0
	A, // A source matrix
	test_m_extra, // true lenght of a column in A
	// if TRANSA = 'N' then >= m
	// if TRANSA = 'T' then >= k
	B, // B source matrix
	test_k_extra, // true lenght of a column in B
	// if TRANSB = 'N' then >= k
	// if TRANSB = 'T' then >= n
	0.0, // beta = 0.0
	C, // C destination matrix
	test_n_extra); // true lenght of a row in C
	// must be >= m
	printMatrix (C, test_n_extra, test_n, test_col_major);
	}

	//! blas expects column major order
	static void testMulColMajor()
	{
	printf ("==========================================================\n");
	printf ("Test BLAS multiply - column major order\n");
	printf ("==========================================================\n\n");

	/* MATRIX A[2x3]

	Col 1 Col 2 Col 3
	Row 1: 1.0000 2.0000 3.0000
	Row 2: 4.0000 5.0000 6.0000

	*/

	printf ("A\n\n");
	double A[test_m*test_k] = {
	1, 4,
	2, 5,
	3, 6
	};
	printMatrix (A, test_m, test_k, test_col_major);

	/* MATRIX B[3x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.0000 2.0000 3.0000 4.0000
	Row 2: 0.1000 0.2000 0.3000 0.4000
	Row 3: 0.0100 0.0200 0.0300 0.0400

	*/
	printf ("B\n\n");
	double B[test_k*test_n] = {
	1.0, 0.1, 0.01,
	2.0, 0.2, 0.02,
	3.0, 0.3, 0.03,
	4.0, 0.4, 0.04
	};
	printMatrix (B, test_k, test_n, test_col_major);

	/* MATRIX C[2x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.2300 2.4600 3.6900 4.9200
	Row 2: 4.5600 9.1200 13.6800 18.2400
	*/
	double C[test_m*test_n] = {0};

	printf ("C\n\n");
	// http://www.math.utah.edu/software/lapack/lapack-blas/dgemm.html
	/*
	cblas_dgemm(
	'N', // TRANSA: N - do not transpose A, T - transpose A
	'N', // TRANSB: N - do not transpose A, T - transpose A
	test_m, // A matrix: m rows, k columns
	test_n, // B matrix: k rows, n columns
	test_k, // C matrix: m rows, n columns
	1.0, // alpha = 1.0
	A, // A source matrix
	test_m, // true lenght of a column in A
	// if TRANSA = 'N' then >= m
	// if TRANSA = 'T' then >= k
	B, // B source matrix
	test_k, // true lenght of a column in B
	// if TRANSB = 'N' then >= k
	// if TRANSB = 'T' then >= n
	0.0, // beta = 0.0
	C, // C destination matrix
	test_m); // true lenght of a row in C
	// must be >= m
	*/
	// C = alphaAB+beta*C
	cblas_dgemm(
	CblasColMajor,
	CblasNoTrans, // TRANSA:
	CblasNoTrans, // TRANSB:
	test_m, // A matrix: m rows, k columns
	test_n, // B matrix: k rows, n columns
	test_k, // C matrix: m rows, n columns
	1.0, // alpha = 1.0
	A, // A source matrix
	test_m, // true lenght of a column in A
	// if TRANSA = 'N' then >= m
	// if TRANSA = 'T' then >= k
	B, // B source matrix
	test_k, // true lenght of a column in B
	// if TRANSB = 'N' then >= k
	// if TRANSB = 'T' then >= n
	0.0, // beta = 0.0
	C, // C destination matrix
	test_m); // true lenght of a row in C
	// must be >= m
	printMatrix (C, test_m, test_n, test_col_major);
	}

	//! attempt the trickery
	static void testMulRowMajor_trickery()
	{
	printf ("==========================================================\n");
	printf ("Test BLAS multiply - row major order with trickery\n");
	printf ("==========================================================\n\n");

	/* MATRIX A[2x3]

	Col 1 Col 2 Col 3
	Row 1: 1.0000 2.0000 3.0000
	Row 2: 4.0000 5.0000 6.0000

	*/

	printf ("A\n\n");
	double A[test_m*test_k] = {
	1, 2, 3,
	4, 5, 6
	};
	printMatrix (A, test_m, test_k, test_row_major);

	/* MATRIX B[3x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.0000 2.0000 3.0000 4.0000
	Row 2: 0.1000 0.2000 0.3000 0.4000
	Row 3: 0.0100 0.0200 0.0300 0.0400

	*/
	printf ("B\n\n");
	double B[test_k*test_n] = {
	1.00, 2.00, 3.00, 4.00,
	0.10, 0.20, 0.30, 0.40,
	0.01, 0.02, 0.03, 0.04
	};
	printMatrix (B, test_k, test_n, test_row_major);

	/* MATRIX C[2x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.2300 2.4600 3.6900 4.9200
	Row 2: 4.5600 9.1200 13.6800 18.2400
	*/
	double C[test_m*test_n] = {0};

	printf ("C\n\n");

	// C = alphaAB+beta*C
	cblas_dgemm(
	CblasColMajor,
	CblasNoTrans, // TRANSA:
	CblasNoTrans, // TRANSB:
	test_n, // B matrix: k rows, n columns
	test_m, // A matrix: m rows, k columns
	test_k, // C matrix: m rows, n columns
	1.0, // alpha = 1.0
	B, // B source matrix
	test_n, // true lenght of a column in B
	// if TRANSB = 'N' then >= k
	// if TRANSB = 'T' then >= n
	A, // A source matrix
	test_k, // true lenght of a column in A
	// if TRANSA = 'N' then >= m
	// if TRANSA = 'T' then >= k
	0.0, // beta = 0.0
	C, // C destination matrix
	test_n); // true lenght of a row in C
	// must be >= m
	printMatrix (C, test_m, test_n, test_row_major);
	}

	//! build in version
	static void testMulRowMajor()
	{
	printf ("==========================================================\n");
	printf ("Test BLAS multiply - row major order without trickery\n");
	printf ("==========================================================\n\n");

	/* MATRIX A[2x3]

	Col 1 Col 2 Col 3
	Row 1: 1.0000 2.0000 3.0000
	Row 2: 4.0000 5.0000 6.0000

	*/

	printf ("A\n\n");
	double A[test_m*test_k] = {
	1, 2, 3,
	4, 5, 6
	};
	printMatrix (A, test_m, test_k, test_row_major);

	/* MATRIX B[3x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.0000 2.0000 3.0000 4.0000
	Row 2: 0.1000 0.2000 0.3000 0.4000
	Row 3: 0.0100 0.0200 0.0300 0.0400

	*/
	printf ("B\n\n");
	double B[test_k*test_n] = {
	1.00, 2.00, 3.00, 4.00,
	0.10, 0.20, 0.30, 0.40,
	0.01, 0.02, 0.03, 0.04
	};
	printMatrix (B, test_k, test_n, test_row_major);

	/* MATRIX C[2x4]

	Col 1 Col 2 Col 3 Col 4
	Row 1: 1.2300 2.4600 3.6900 4.9200
	Row 2: 4.5600 9.1200 13.6800 18.2400
	*/
	double C[test_m*test_n] = {0};

	printf ("C\n\n");

	// C = alphaAB+beta*C
	cblas_dgemm(
	CblasRowMajor,
	CblasNoTrans, // TRANSA:
	CblasNoTrans, // TRANSB:
	test_m, // A matrix: m rows, k columns
	test_n, // B matrix: k rows, n columns
	test_k, // C matrix: m rows, n columns
	1.0, // alpha = 1.0
	A, // A source matrix
	test_k, // true lenght of a column in A
	// if TRANSA = 'N' then >= m
	// if TRANSA = 'T' then >= k
	B, // B source matrix
	test_n, // true lenght of a column in B
	// if TRANSB = 'N' then >= k
	// if TRANSB = 'T' then >= n
	0.0, // beta = 0.0
	C, // C destination matrix
	test_n); // true lenght of a row in C
	// must be >= m
	printMatrix (C, test_m, test_n, test_col_major);
	}

	#undef test_m
	#undef test_n
	#undef test_k

	#define test_m 500
	#define test_k 600
	#define test_n 700

	#define test_spins 5

	// ---------------------------------------------------------------------
	#define IMPLEMENTATION_ROW_MAJOR \
	cblas_dgemm( \
	CblasRowMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
	test_k, 1.0, A, test_k, B, test_n, 0.0, C, test_n); \
	startTime = getRealTime (); \
	for(i=0; i<test_spins;i++) { \
	cblas_dgemm( \
	CblasRowMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
	test_k, 1.0, A, test_k, B, test_n, 0.0, C, test_n); \
	} \
	endTime = getRealTime (); \
	printf ("- Build-in implementation - row major (%d) = %lf\n", test_spins, (endTime - startTime) );
	// ---------------------------------------------------------------------
	#define IMPLEMENTATION_TRICKERY \
	cblas_dgemm( \
	CblasColMajor, CblasNoTrans, CblasNoTrans, test_n, test_m, \
	test_k, 1.0, B, test_n, A, test_k, 0.0, C, test_n); \
	startTime = getRealTime (); \
	for(i=0; i<test_spins;i++) { \
	cblas_dgemm( \
	CblasColMajor, CblasNoTrans, CblasNoTrans, test_n, test_m, \
	test_k, 1.0, B, test_n, A, test_k, 0.0, C, test_n); \
	} \
	endTime = getRealTime (); \
	printf ("- Trickery implementation (%d)= %lf\n", test_spins, (endTime - startTime) );
	// ---------------------------------------------------------------------
	#define IMPLEMENTATION_COL_MAJOR \
	cblas_dgemm( \
	CblasColMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
	test_k, 1.0, A, test_m, B, test_k, 0.0, C, test_m); \
	startTime = getRealTime (); \
	for(i=0; i<test_spins;i++) { \
	cblas_dgemm( \
	CblasColMajor, CblasNoTrans, CblasNoTrans, test_m, test_n, \
	test_k, 1.0, A, test_m, B, test_k, 0.0, C, test_m); \
	} \
	endTime = getRealTime (); \
	printf ("- Build-in implementation - column major (%d) = %lf\n", test_spins, (endTime - startTime) );
	// ---------------------------------------------------------------------


	//! test my undestanding of the parameters
	void compareBuildInWithTrickery()
	{

	printf ("==========================================================\n");
	printf ("Compare these two methods\n");
	printf ("==========================================================\n\n");

	// srand((unsigned)time(NULL));

	double startTime, endTime;
	int i;

	double A = (double)malloc(test_mtest_ksizeof(double));
	int a_tot = test_m*test_k;
	double *iter = A;
	for (i = 0; i<a_tot; i++) {
	*iter = ((double)rand()/(double)RAND_MAX);;
	iter++;
	}

	double B = (double)malloc(test_ktest_nsizeof(double));
	int b_tot = test_k*test_n;
	iter = B;
	for (i = 0; i<b_tot; i++) {
	*iter = ((double)rand()/(double)RAND_MAX);;
	iter++;
	}

	double C = (double)malloc(test_mtest_nsizeof(double));
	iter = C;
	int c_tot = test_m*test_n;
	for (i = 0; i<c_tot; i++) {
	*iter = 0.0;
	iter++;
	}

	IMPLEMENTATION_ROW_MAJOR;
	IMPLEMENTATION_TRICKERY;
	IMPLEMENTATION_COL_MAJOR;

	IMPLEMENTATION_TRICKERY;
	IMPLEMENTATION_ROW_MAJOR;
	IMPLEMENTATION_COL_MAJOR;

	IMPLEMENTATION_ROW_MAJOR;
	IMPLEMENTATION_COL_MAJOR;
	IMPLEMENTATION_TRICKERY;

	IMPLEMENTATION_COL_MAJOR;
	IMPLEMENTATION_ROW_MAJOR;
	IMPLEMENTATION_TRICKERY;

	IMPLEMENTATION_COL_MAJOR;
	IMPLEMENTATION_TRICKERY;
	IMPLEMENTATION_ROW_MAJOR;

	IMPLEMENTATION_TRICKERY;
	IMPLEMENTATION_COL_MAJOR;
	IMPLEMENTATION_ROW_MAJOR;

	}


	int main(int argc, char **argv)
	{
	testPrintMatrix();
	testMulColMajorExtraSpace();
	testMulColMajor();
	testMulRowMajor_trickery();
	testMulRowMajor();

	compareBuildInWithTrickery();

	return 0;
	}






	/*
	* Author: David Robert Nadeau
	* Site: http://NadeauSoftware.com/
	* License: Creative Commons Attribution 3.0 Unported License
	* http://creativecommons.org/licenses/by/3.0/deed.en_US
	*
	* http://nadeausoftware.com/articles/2012/04/c_c_tip_how_measure_elapsed_real_time_benchmarking
	*/
	#if defined(_WIN32)
	#include <Windows.h>

	#elif defined(__unix__) \|\| defined(__unix) \|\| defined(unix) \|\| (defined(__APPLE__) && defined(__MACH__))
	#include <unistd.h> /* POSIX flags */
	#include <time.h> /* clock_gettime(), time() */
	#include <sys/time.h> /* gethrtime(), gettimeofday() */

	#if defined(__MACH__) && defined(__APPLE__)
	#include <mach/mach.h>
	#include <mach/mach_time.h>
	#endif

	#else
	#error "Unable to define getRealTime( ) for an unknown OS."
	#endif





	/**
	* Returns the real time, in seconds, or -1.0 if an error occurred.
	*
	* Time is measured since an arbitrary and OS-dependent start time.
	* The returned real time is only useful for computing an elapsed time
	* between two calls to this function.
	*/
	double getRealTime( )
	{
	#if defined(_WIN32)
	FILETIME tm;
	ULONGLONG t;
	#if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
	/* Windows 8, Windows Server 2012 and later. ---------------- */
	GetSystemTimePreciseAsFileTime( &tm );
	#else
	/* Windows 2000 and later. ---------------------------------- */
	GetSystemTimeAsFileTime( &tm );
	#endif
	t = ((ULONGLONG)tm.dwHighDateTime << 32) \| (ULONGLONG)tm.dwLowDateTime;
	return (double)t / 10000000.0;

	#elif (defined(__hpux) \|\| defined(hpux)) \|\| ((defined(__sun__) \|\| defined(__sun) \|\| defined(sun)) && (defined(__SVR4) \|\| defined(__svr4__)))
	/* HP-UX, Solaris. ------------------------------------------ */
	return (double)gethrtime( ) / 1000000000.0;

	#elif defined(__MACH__) && defined(__APPLE__)
	/* OSX. ----------------------------------------------------- */
	static double timeConvert = 0.0;
	if ( timeConvert == 0.0 )
	{
	mach_timebase_info_data_t timeBase;
	(void)mach_timebase_info( &timeBase );
	timeConvert = (double)timeBase.numer /
	(double)timeBase.denom /
	1000000000.0;
	}
	return (double)mach_absolute_time( ) * timeConvert;

	#elif defined(_POSIX_VERSION)
	/* POSIX. --------------------------------------------------- */
	#if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
	{
	struct timespec ts;
	#if defined(CLOCK_MONOTONIC_PRECISE)
	/* BSD. --------------------------------------------- */
	const clockid_t id = CLOCK_MONOTONIC_PRECISE;
	#elif defined(CLOCK_MONOTONIC_RAW)
	/* Linux. ------------------------------------------- */
	const clockid_t id = CLOCK_MONOTONIC_RAW;
	#elif defined(CLOCK_HIGHRES)
	/* Solaris. ----------------------------------------- */
	const clockid_t id = CLOCK_HIGHRES;
	#elif defined(CLOCK_MONOTONIC)
	/* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
	const clockid_t id = CLOCK_MONOTONIC;
	#elif defined(CLOCK_REALTIME)
	/* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
	const clockid_t id = CLOCK_REALTIME;
	#else
	const clockid_t id = (clockid_t)-1; /* Unknown. */
	#endif /* CLOCK_* */
	if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
	return (double)ts.tv_sec +
	(double)ts.tv_nsec / 1000000000.0;
	/* Fall thru. */
	}
	#endif /* _POSIX_TIMERS */

	/* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
	struct timeval tm;
	gettimeofday( &tm, NULL );
	return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
	#else
	return -1.0; /* Failed. */
	#endif
	}