Created April 1, 2014 18:17
Graphics Processing Unit (GPU) programming: matrix multiplication implemented in parallel with CUDA
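A typical way to build and run the listing below (assuming it is saved as matmul.cu and the CUDA toolkit's nvcc compiler is on the PATH):

nvcc matmul.cu -o matmul
./matmul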
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda.h>

// Thread block size: the kernel is launched with BLOCK_SIZE x BLOCK_SIZE
// threads per block, so this can be raised to 16 or 32
// (32 x 32 = 1024 is the per-block thread limit on current GPUs).
#define BLOCK_SIZE 8

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity;
// each can be changed to 16, 32, 64, 128, 256, 512 or 1024)
#define WA 8      // Matrix A width
#define HA 8      // Matrix A height
#define WB 8      // Matrix B width
#define HB WA     // Matrix B height
#define WC WB     // Matrix C width
#define HC HA     // Matrix C height

// Fills a matrix with random float entries in [0, 1].
void randomInit(float* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = rand() / (float)RAND_MAX;
}
// Computes one element of Pd = Md * Nd per thread (square Width x Width matrices).
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // Calculate the row index of the Pd element and of Md
    int Row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    // Calculate the column index of Pd and of Nd
    int Col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float Pvalue = 0;
    // Each thread computes one element of the block sub-matrix
    for (int k = 0; k < Width; ++k)
        Pvalue += Md[Row * Width + k] * Nd[k * Width + Col];
    Pd[Row * Width + Col] = Pvalue;
}
// Straightforward CPU reference: triple loop over square Width x Width matrices.
void MatrixMulOnHost(float* M, float* N, float* P, int Width)
{
    for (int i = 0; i < Width; ++i)
        for (int j = 0; j < Width; ++j) {
            double sum = 0;
            for (int k = 0; k < Width; ++k) {
                double a = M[i * Width + k];
                double b = N[k * Width + j];
                sum += a * b;
            }
            P[i * Width + j] = (float)sum;
        }
}
int main()
{
    //cudaSetDevice( cutGetMaxGflopsDeviceId() );
    srand(2006);
    double ctime, cudatime, htime;
    clock_t stime, stime2, etime, etime2;
    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);
    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);
    stime = clock();
    // allocate device memory
    float* d_A;
    cudaMalloc((void**) &d_A, mem_size_A);
    float* d_B;
    cudaMalloc((void**) &d_B, mem_size_B);
    // copy host memory to device
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    // allocate device memory for result
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* d_C;
    cudaMalloc((void**) &d_C, mem_size_C);
    stime2 = clock();
    // setup execution parameters:
    // one BLOCK_SIZE x BLOCK_SIZE thread block per tile of C, with enough
    // blocks in the grid to cover the whole WC x HC result matrix
    dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(WC / BLOCK_SIZE, HC / BLOCK_SIZE);
    // execute the kernel
    MatrixMulKernel<<< grid, threads >>>(d_A, d_B, d_C, WB);
    cudaDeviceSynchronize();
    etime2 = clock();
    cudatime = (double)(etime2 - stime2) / CLOCKS_PER_SEC;
    // allocate host memory for the result
    float* h_C = (float*) malloc(mem_size_C);
    // copy result from device to host
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    etime = clock();
    ctime = (double)(etime - stime) / CLOCKS_PER_SEC;
    // compute reference solution
    float* reference = (float*) malloc(mem_size_C);
    stime = clock();
    MatrixMulOnHost(h_A, h_B, reference, WB);
    etime = clock();
    htime = (double)(etime - stime) / CLOCKS_PER_SEC;
    printf("host: %.10f\ncuda: %.10f\ncuda, w/copy: %.10f\n", htime, cudatime, ctime);
    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaDeviceReset();
    return 0;
}
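The program times the host and device multiplications but never checks that they agree. A minimal verification sketch (not part of the original gist) that could be inserted just before the clean-up section of main, reusing the existing h_C, reference, and size_C variables and a small tolerance to absorb float-vs-double rounding differences:

    // Hypothetical check: compare the GPU result against the CPU reference.
    int mismatches = 0;
    for (unsigned int i = 0; i < size_C; ++i)
        if (fabs(h_C[i] - reference[i]) > 1e-4)
            ++mismatches;
    printf("%s (%d mismatching elements)\n", mismatches ? "FAILED" : "PASSED", mismatches);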