Last active
April 1, 2018 14:53
-
-
Save sedflix/c27e7a3ab1df05fe26e9aa76bd9e8695 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <xmmintrin.h> | |
#include <stdio.h> | |
#include <sys/time.h> | |
// gcc -DCLS=$(getconf LEVEL1_DCACHE_LINESIZE) -fopenmp idk_matrix.c | |
// the nxn matrix you want | |
#define N 1024 | |
// CLS is Cache Line Size | |
// The block size | |
#define SM (CLS / sizeof (double)) | |
#define AVG 20 | |
/* | |
* We align the data in the cache and memory so that it can be easily cached and it is not affected by other processes memory | |
* 64 is assumed to the cache line size on my system | |
*/ | |
// Input array initialisation | |
double mul1[N][N] __attribute__ ((aligned (CLS))); | |
double mul2[N][N] __attribute__ ((aligned (CLS))); | |
// will store the result | |
double res[N][N] __attribute__ ((aligned (CLS))); | |
void multiply(int n) { | |
// filling array with random numbers | |
for (int r = 0; r < n; ++r) { | |
for (int c = 0; c < n; ++c) { | |
mul1[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0); | |
mul2[r][c] = (rand() + 0.0) / (RAND_MAX + 0.0); | |
} | |
} | |
int i, i2, j, j2, k, k2; | |
double *restrict rres; | |
double *restrict rmul1; | |
double *restrict rmul2; | |
#pragma omp parallel shared(res, mul1, mul2) private(i, j, k, i2, k2, j2, rres, rmul1, rmul2) | |
{ | |
#pragma omp for | |
for (i = 0; i < N; i += SM) { | |
for (j = 0; j < N; j += SM) { | |
for (k = 0; k < N; k += SM) { | |
// We are multipying the matrix in blocks. | |
// We fetch the instructions that can fit into memory use it all. | |
for (i2 = 0, rres = &res[i][j], rmul1 = &mul1[i][k]; i2 < SM; ++i2, rres += N, rmul1 += N) { | |
_mm_prefetch (&rmul1[8], _MM_HINT_NTA); | |
for (k2 = 0, rmul2 = &mul2[k][j]; k2 < SM; ++k2, rmul2 += N) { | |
__m128d m1d = _mm_load_sd(&rmul1[k2]); | |
m1d = _mm_unpacklo_pd(m1d, m1d); | |
for (j2 = 0; j2 < SM; j2 += 2) { | |
__m128d m2 = _mm_load_pd(&rmul2[j2]); | |
__m128d r2 = _mm_load_pd(&rres[j2]); | |
_mm_store_pd(&rres[j2], | |
_mm_add_pd(_mm_mul_pd(m2, m1d), r2)); | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
void getTime(int n) { | |
struct timeval t0, t1; | |
double elapsed=0; | |
for (int i = 0; i < AVG; ++i) { | |
gettimeofday(&t0, 0); | |
multiply(n); | |
gettimeofday(&t1, 0); | |
elapsed += (t1.tv_sec - t0.tv_sec) * 1.0f + (t1.tv_usec - t0.tv_usec) / 1000000.0f; | |
} | |
printf("%d \t %f\n", n, elapsed/AVG); | |
} | |
void main() { | |
getTime(N); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment