andreinechaev · March 31, 2018 00:29
diff --git a/matrix_dot.cu b/matrix_dot.cu
 #include <stdio.h>

 #define N  64

 /*
  * Report a failed CUDA runtime call on stderr.
  *
  * err: status code produced by a CUDA runtime call.
  * msg: short label for the call site; it is echoed in the diagnostic.
  *
  * Returns err unchanged so the caller can still inspect it. Nothing is
  * printed when err equals cudaSuccess.
  */
 inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
   if (err == cudaSuccess) {
     return err;
   }

   const char* reason = cudaGetErrorString(err);
   fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, reason);
   return err;
 }

 /*
  * Computes c = a * b for N x N int matrices indexed as [row * N + col].
  *
  * Each thread produces at most one output element: the x dimension of the
  * launch selects the column and the y dimension selects the row. Threads
  * whose (row, col) falls outside the N x N matrix exit without writing.
  */
 __global__ void matrixMulGPU( int * a, int * b, int * c )
 {
   const int col = blockIdx.x * blockDim.x + threadIdx.x;
   const int row = blockIdx.y * blockDim.y + threadIdx.y;

   // Nothing to do for threads past the matrix edge.
   if (row >= N || col >= N) {
     return;
   }

   // Dot product of row `row` of a with column `col` of b.
   const int * a_row = a + row * N;
   int acc = 0;
   for (int k = 0; k < N; ++k) {
     acc += a_row[k] * b[k * N + col];
   }

   c[row * N + col] = acc;
 }

 /*
 * This CPU function already works, and will run to create a solution matrix
 * against which to verify your work building out the matrixMulGPU kernel.
 */

 /*
  * Host reference implementation: c = a * b for N x N int matrices indexed
  * as [row * N + col]. Output elements are produced in row-major order.
  */
 void matrixMulCPU( int * a, int * b, int * c )
 {
   // Walk the N * N output elements in storage order.
   for (int out = 0; out < N * N; ++out)
   {
     const int row = out / N;
     const int col = out % N;

     int acc = 0;
     for (int k = 0; k < N; ++k)
     {
       acc += a[row * N + k] * b[k * N + col];
     }

     c[out] = acc;
   }
 }

 /*
  * Multiplies two N x N int matrices on the GPU with matrixMulGPU, repeats the
  * multiplication on the CPU with matrixMulCPU, and compares the two results.
  *
  * Prints "Success!" when every element matches, otherwise the position of
  * the first mismatch. Returns 0 on success and 1 when an allocation fails,
  * the kernel launch or execution reports an error, or the results differ.
  */
 int main()
 {
   // a and b are the inputs; c_cpu and c_gpu receive the CPU and GPU products.
   // NULL-initialised so the failure path below can free all four unconditionally.
   int *a = NULL, *b = NULL, *c_cpu = NULL, *c_gpu = NULL;

   const size_t size = N * N * sizeof (int); // Number of bytes of an N x N matrix

   // Allocate managed memory; every result is checked so a failed allocation
   // is reported instead of being dereferenced below.
   bool alloc_ok = true;
   if (checkCudaErr(cudaMallocManaged (&a, size), "cudaMallocManaged(a)") != cudaSuccess)
     alloc_ok = false;
   if (checkCudaErr(cudaMallocManaged (&b, size), "cudaMallocManaged(b)") != cudaSuccess)
     alloc_ok = false;
   if (checkCudaErr(cudaMallocManaged (&c_cpu, size), "cudaMallocManaged(c_cpu)") != cudaSuccess)
     alloc_ok = false;
   if (checkCudaErr(cudaMallocManaged (&c_gpu, size), "cudaMallocManaged(c_gpu)") != cudaSuccess)
     alloc_ok = false;

   if (!alloc_ok)
   {
     cudaFree(a); cudaFree(b);
     cudaFree( c_cpu ); cudaFree( c_gpu );
     return 1;
   }

   // Initialize memory; create 2D matrices
   for( int row = 0; row < N; ++row )
     for( int col = 0; col < N; ++col )
     {
       a[row*N + col] = row;
       b[row*N + col] = col+2;
       c_cpu[row*N + col] = 0;
       c_gpu[row*N + col] = 0;
     }

   // 32 x 32 threads per block; the block count is rounded up (ceil-div) so the
   // grid covers all N x N elements without launching a spare row/column of
   // blocks when N is an exact multiple of the block dimension.
   dim3 threads_per_block(32, 32, 1);
   dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
                         (N + threads_per_block.y - 1) / threads_per_block.y,
                         1);

   matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

   // Launch-configuration errors are reported by cudaGetLastError(); errors
   // raised while the kernel runs surface from the synchronizing call.
   const cudaError_t launch_status = checkCudaErr(cudaGetLastError(), "GPU");
   const cudaError_t sync_status = checkCudaErr(cudaDeviceSynchronize(), "Syncronization");

   bool error = (launch_status != cudaSuccess) || (sync_status != cudaSuccess);

   // Only verify when the GPU run itself succeeded; otherwise the failure has
   // already been reported above.
   if (!error)
   {
     // Call the CPU version to check our work
     matrixMulCPU( a, b, c_cpu );

     // Compare the two answers to make sure they are equal; stop at the first mismatch
     for( int row = 0; row < N && !error; ++row )
       for( int col = 0; col < N && !error; ++col )
         if (c_cpu[row * N + col] != c_gpu[row * N + col])
         {
           printf("FOUND ERROR at c[%d][%d]\n", row, col);
           error = true;
         }

     if (!error)
       printf("Success!\n");
   }

   // Free all our allocated memory
   cudaFree(a); cudaFree(b);
   cudaFree( c_cpu ); cudaFree( c_gpu );

   return error ? 1 : 0;
 }
	#include <stdio.h>

	#define N 64

	/*
	 * Print a diagnostic on stderr when a CUDA runtime call did not succeed.
	 *
	 * err: status code to inspect.
	 * msg: caller-supplied label included in the diagnostic.
	 *
	 * The status code is handed back unchanged in every case.
	 */
	inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	    const bool succeeded = (err == cudaSuccess);
	    if (succeeded) {
	        return err;
	    }

	    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	    return err;
	}

	/*
	 * GPU matrix product c = a * b over N x N int matrices indexed as
	 * [row * N + col].
	 *
	 * The thread's x coordinate in the launch picks the output column and its
	 * y coordinate picks the output row; threads outside the N x N range
	 * write nothing.
	 */
	__global__ void matrixMulGPU( int * a, int * b, int * c )
	{
	    const int col = blockDim.x * blockIdx.x + threadIdx.x;
	    const int row = blockDim.y * blockIdx.y + threadIdx.y;
	    const bool in_bounds = (row < N) && (col < N);

	    if (in_bounds) {
	        // Accumulate row `row` of a against column `col` of b.
	        int sum = 0;
	        int k = 0;
	        while (k < N) {
	            sum += a[row * N + k] * b[k * N + col];
	            ++k;
	        }

	        c[row * N + col] = sum;
	    }
	}

	/*
	* This CPU function already works, and will run to create a solution matrix
	* against which to verify your work building out the matrixMulGPU kernel.
	*/

	/*
	 * CPU reference for the matrix product: fills c with a * b, where all
	 * three are N x N int matrices indexed as [row * N + col].
	 */
	void matrixMulCPU( int * a, int * b, int * c )
	{
	    for (int r = 0; r < N; ++r) {
	        const int * lhs_row = a + r * N;
	        int * out_row = c + r * N;

	        for (int col = 0; col < N; ++col) {
	            // Dot product of one row of a with one column of b.
	            int dot = 0;
	            for (int k = 0; k < N; ++k) {
	                dot += lhs_row[k] * b[k * N + col];
	            }
	            out_row[col] = dot;
	        }
	    }
	}

	/*
	 * Multiplies two N x N int matrices on the GPU with matrixMulGPU, repeats
	 * the multiplication on the CPU with matrixMulCPU, and compares the results.
	 *
	 * Prints "Success!" when every element matches, otherwise the position of
	 * the first mismatch. Returns 0 on success and 1 when an allocation fails,
	 * the kernel launch or execution reports an error, or the results differ.
	 */
	int main()
	{
	    // These must be pointers: they receive managed allocations and are indexed
	    // as arrays below. NULL-initialised so the failure path can free all four.
	    int *a = NULL, *b = NULL, *c_cpu = NULL, *c_gpu = NULL;

	    const size_t size = N * N * sizeof (int); // Number of bytes of an N x N matrix

	    // Allocate managed memory; every result is checked so a failed allocation
	    // is reported instead of being dereferenced below.
	    bool alloc_ok = true;
	    if (checkCudaErr(cudaMallocManaged (&a, size), "cudaMallocManaged(a)") != cudaSuccess)
	        alloc_ok = false;
	    if (checkCudaErr(cudaMallocManaged (&b, size), "cudaMallocManaged(b)") != cudaSuccess)
	        alloc_ok = false;
	    if (checkCudaErr(cudaMallocManaged (&c_cpu, size), "cudaMallocManaged(c_cpu)") != cudaSuccess)
	        alloc_ok = false;
	    if (checkCudaErr(cudaMallocManaged (&c_gpu, size), "cudaMallocManaged(c_gpu)") != cudaSuccess)
	        alloc_ok = false;

	    if (!alloc_ok)
	    {
	        cudaFree(a); cudaFree(b);
	        cudaFree( c_cpu ); cudaFree( c_gpu );
	        return 1;
	    }

	    // Initialize memory; create 2D matrices
	    for( int row = 0; row < N; ++row )
	        for( int col = 0; col < N; ++col )
	        {
	            a[row*N + col] = row;
	            b[row*N + col] = col+2;
	            c_cpu[row*N + col] = 0;
	            c_gpu[row*N + col] = 0;
	        }

	    // 32 x 32 threads per block; the block count is rounded up (ceil-div) so
	    // the grid covers all N x N elements without launching a spare row/column
	    // of blocks when N is an exact multiple of the block dimension.
	    dim3 threads_per_block(32, 32, 1);
	    dim3 number_of_blocks((N + threads_per_block.x - 1) / threads_per_block.x,
	                          (N + threads_per_block.y - 1) / threads_per_block.y,
	                          1);

	    matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

	    // Launch-configuration errors are reported by cudaGetLastError(); errors
	    // raised while the kernel runs surface from the synchronizing call.
	    const cudaError_t launch_status = checkCudaErr(cudaGetLastError(), "GPU");
	    const cudaError_t sync_status = checkCudaErr(cudaDeviceSynchronize(), "Syncronization");

	    bool error = (launch_status != cudaSuccess) || (sync_status != cudaSuccess);

	    // Only verify when the GPU run itself succeeded; otherwise the failure has
	    // already been reported above.
	    if (!error)
	    {
	        // Call the CPU version to check our work
	        matrixMulCPU( a, b, c_cpu );

	        // Compare the two answers to make sure they are equal; stop at the first mismatch
	        for( int row = 0; row < N && !error; ++row )
	            for( int col = 0; col < N && !error; ++col )
	                if (c_cpu[row * N + col] != c_gpu[row * N + col])
	                {
	                    printf("FOUND ERROR at c[%d][%d]\n", row, col);
	                    error = true;
	                }

	        if (!error)
	            printf("Success!\n");
	    }

	    // Free all our allocated memory
	    cudaFree(a); cudaFree(b);
	    cudaFree( c_cpu ); cudaFree( c_gpu );

	    return error ? 1 : 0;
	}