-
-
Save metallurgix/8ee5262ed818730b5cc0 to your computer and use it in GitHub Desktop.
#include <stdio.h> | |
#include <stdlib.h> | |
#include <sys/time.h> | |
void Multiply(int n, double** a, double** b, double** c) | |
{ | |
int bi=0; | |
int bj=0; | |
int bk=0; | |
int i=0; | |
int j=0; | |
int k=0; | |
int blockSize=block; | |
for(bi=0; bi<n; bi+=blockSize) | |
for(bj=0; bj<n; bj+=blockSize) | |
for(bk=0; bk<n; bk+=blockSize) | |
for(i=0; i<blockSize; i++) | |
for(j=0; j<blockSize; j++) | |
for(k=0; k<blockSize; k++) | |
c[bi+i][bj+j] += a[bi+i][bk+k]*b[bk+k][bj+j]; | |
} | |
int main(void) | |
{ | |
int n; | |
double** A; | |
double** B; | |
double** C; | |
int numreps = 5; | |
int i=0; | |
int j=0; | |
struct timeval tv1, tv2; | |
struct timezone tz; | |
double elapsed; | |
printf ("Please enter matrix dimension n : "); | |
scanf("%d", &n); | |
// allocate memory for the matrices | |
///////////////////// Matrix A ////////////////////////// | |
A =(double **)malloc(n*sizeof(double *)); | |
A[0] = (double *)malloc(n*n*sizeof(double)); | |
if(!A) | |
{ | |
printf("memory failed \n"); | |
exit(1); | |
} | |
for(i=1; i<n; i++) | |
{ | |
A[i] = A[0]+i*n; | |
if (!A[i]) | |
{ | |
printf("memory failed \n"); | |
exit(1); | |
} | |
} | |
///////////////////// Matrix B ////////////////////////// | |
B =(double **)malloc(n*sizeof(double *)); | |
B[0] = (double *)malloc(n*n*sizeof(double)); | |
if(!B) | |
{ | |
printf("memory failed \n"); | |
exit(1); | |
} | |
for(i=1; i<n; i++) | |
{ | |
B[i] = B[0]+i*n; | |
if (!B[i]) | |
{ | |
printf("memory failed \n"); | |
exit(1); | |
} | |
} | |
///////////////////// Matrix C ////////////////////////// | |
C =(double **)malloc(n*sizeof(double *)); | |
C[0] = (double *)malloc(n*n*sizeof(double)); | |
if(!C) | |
{ | |
printf("memory failed \n"); | |
exit(1); | |
} | |
for(i=1; i<n; i++) | |
{ | |
C[i] = C[0]+i*n; | |
if (!C[i]) | |
{ | |
printf("memory failed \n"); | |
exit(1); | |
} | |
} | |
// initialize the matrices | |
for(i=0; i<n; i++) | |
{ | |
for(j=0; j<n; j++) | |
{ | |
A[i][j] = 1; | |
B[i][j] = 2; | |
} | |
} | |
//multiply matrices | |
printf("Multiply matrices %d times...\n", numreps); | |
for (i=0; i<numreps; i++) | |
{ | |
gettimeofday(&tv1, &tz); | |
Multiply(n,A,B,C); | |
gettimeofday(&tv2, &tz); | |
elapsed += (double) (tv2.tv_sec-tv1.tv_sec) + (double) (tv2.tv_usec-tv1.tv_usec) * 1.e-6; | |
} | |
printf("Time = %lf \n",elapsed); | |
//deallocate memory | |
free(A[0]); | |
free(A); | |
free(B[0]); | |
free(B); | |
free(C[0]); | |
free(C); | |
return 0; | |
} |
how about A(mxn) and B(nxp)
I don't think this is the correct approach to blocked matrix multiplication. Loading the elements of matrix B will always suffer cache misses as there is no reuse of the loaded block. I suppose it can be parallelized, but so can the naive algorithm. So inherently, this algorithm wouldn't speed up matrix multiplication.
I have a quest.
How much is the Blocksize ? I don't know how much write.
Is for a homework and I'm trying to undestand
@raymond1242 The blocksize is determined by the L1 cachesize. the formulation is b < (Mfast/3)^0.5 where Mfast is the size of the L1 cache.
if you are using integers of 4 byte, you can calculate the block size by Mfast = 256000/4 which gives b < 146 but I think the problem is caused because of remaining columns and rows. The block size b has to be a divisor without remainder of matrix dimension.
@bpirlepeli Thank you so much. When I put the block size of <= 100, it works. I still do not understand.
Is this supposed to be faster than the naive approach? I'm experimenting with 1000X1000 matrices and I observe that the naive approach is about 2X faster on a MacBook pro. I can't figure out the reason.