sedflix · April 1, 2018 14:53
diff --git a/idk_matrix.c b/idk_matrix.c
    #include <stdlib.h>
    #include <xmmintrin.h>
    #include <stdio.h>
    #include <sys/time.h>
    
    // gcc -DCLS=$(getconf LEVEL1_DCACHE_LINESIZE) -fopenmp idk_matrix.c

    // the nxn matrix you want
    #define N 1024
    // CLS is Cache Line Size
    // The block size
    #define SM (CLS / sizeof (double))
    #define AVG 20

    /*
    * We align the data in the cache and memory so that it can be easily cached and it is not affected by other processes memory
    * 64 is assumed to the cache line size on my system
    */
    // Input array initialisation
    double mul1[N][N] __attribute__ ((aligned (CLS)));
    double mul2[N][N] __attribute__ ((aligned (CLS)));

    // will store the result
    double res[N][N] __attribute__ ((aligned (CLS)));

    void multiply(int n) {

        // filling array with random numbers
        for (int r = 0; r < n; ++r) {
            for (int c = 0; c < n; ++c) {
                mul1[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0);
                mul2[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0);
            }
        }


        int i, i2, j, j2, k, k2;

        double *restrict rres;
        double *restrict rmul1;
        double *restrict rmul2;

        #pragma omp parallel shared(res, mul1, mul2) private(i, j, k, i2, k2, j2, rres, rmul1, rmul2)
        {
        #pragma omp for
            for (i = 0; i < N; i += SM) {

                for (j = 0; j < N; j += SM) {

                    for (k = 0; k < N; k += SM) {
                        
                        // We are multipying the matrix in blocks.
                        // We fetch the instructions that can fit into memory use it all.
                        for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM; ++i2, rres += N, rmul1 += N) {

                            _mm_prefetch (&rmul1[8], _MM_HINT_NTA);
                            for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM; ++k2, rmul2 += N) {
                                __m128d m1d = _mm_load_sd(&rmul1[k2]);
                                m1d = _mm_unpacklo_pd(m1d, m1d);
                                for (j2 = 0; j2 < SM; j2 += 2) {
                                    __m128d m2 = _mm_load_pd(&rmul2[j2]);
                                    __m128d r2 = _mm_load_pd(&rres[j2]);
                                    _mm_store_pd(&rres[j2],
                                                 _mm_add_pd(_mm_mul_pd(m2, m1d), r2));
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    void getTime(int n) {


        struct timeval t0, t1;
        double elapsed=0;

        for (int i = 0; i < AVG; ++i) {


            gettimeofday(&t0, 0);
            multiply(n);
            gettimeofday(&t1, 0);

            elapsed += (t1.tv_sec - t0.tv_sec) * 1.0f + (t1.tv_usec - t0.tv_usec) / 1000000.0f;
        }

        printf("%d \t %f\n", n, elapsed/AVG);
    }


   void main() {
            getTime(N);
    }
	#include <stdlib.h>
	#include <xmmintrin.h>
	#include <stdio.h>
	#include <sys/time.h>

	// gcc -DCLS=$(getconf LEVEL1_DCACHE_LINESIZE) -fopenmp idk_matrix.c

	// the nxn matrix you want
	#define N 1024
	// CLS is Cache Line Size
	// The block size
	#define SM (CLS / sizeof (double))
	#define AVG 20

	/*
	* We align the data in the cache and memory so that it can be easily cached and it is not affected by other processes memory
	* 64 is assumed to the cache line size on my system
	*/
	// Input array initialisation
	double mul1[N][N] __attribute__ ((aligned (CLS)));
	double mul2[N][N] __attribute__ ((aligned (CLS)));

	// will store the result
	double res[N][N] __attribute__ ((aligned (CLS)));

	void multiply(int n) {

	// filling array with random numbers
	for (int r = 0; r < n; ++r) {
	for (int c = 0; c < n; ++c) {
	mul1[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0);
	mul2[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0);
	}
	}


	int i, i2, j, j2, k, k2;

	double *restrict rres;
	double *restrict rmul1;
	double *restrict rmul2;

	#pragma omp parallel shared(res, mul1, mul2) private(i, j, k, i2, k2, j2, rres, rmul1, rmul2)
	{
	#pragma omp for
	for (i = 0; i < N; i += SM) {

	for (j = 0; j < N; j += SM) {

	for (k = 0; k < N; k += SM) {

	// We are multipying the matrix in blocks.
	// We fetch the instructions that can fit into memory use it all.
	for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM; ++i2, rres += N, rmul1 += N) {

	_mm_prefetch (&rmul1[8], _MM_HINT_NTA);
	for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM; ++k2, rmul2 += N) {
	__m128d m1d = _mm_load_sd(&rmul1[k2]);
	m1d = _mm_unpacklo_pd(m1d, m1d);
	for (j2 = 0; j2 < SM; j2 += 2) {
	__m128d m2 = _mm_load_pd(&rmul2[j2]);
	__m128d r2 = _mm_load_pd(&rres[j2]);
	_mm_store_pd(&rres[j2],
	_mm_add_pd(_mm_mul_pd(m2, m1d), r2));
	}
	}
	}
	}
	}
	}
	}
	}

	void getTime(int n) {


	struct timeval t0, t1;
	double elapsed=0;

	for (int i = 0; i < AVG; ++i) {


	gettimeofday(&t0, 0);
	multiply(n);
	gettimeofday(&t1, 0);

	elapsed += (t1.tv_sec - t0.tv_sec) * 1.0f + (t1.tv_usec - t0.tv_usec) / 1000000.0f;
	}

	printf("%d \t %f\n", n, elapsed/AVG);
	}


	void main() {
	getTime(N);
	}