dniku · August 29, 2015 14:14
diff --git a/tiled_matrix_multiplication.cu b/tiled_matrix_multiplication.cu
 // Edge length, in elements, of the square shared-memory tiles used by
 // matrixMultiplyShared (A_tile and B_tile are TILE_WIDTH x TILE_WIDTH), and
 // the step by which that kernel advances along the shared dimension.
 // The kernel indexes the tiles directly with threadIdx, so thread blocks are
 // expected to be TILE_WIDTH x TILE_WIDTH.
 #define TILE_WIDTH 16

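 // NOTE: Because the tile size is the named constant TILE_WIDTH rather than a
 // literal, the number of tiles is not written as ceil(n / 16.0). It is
 // computed with the integer ceiling-division idiom
 // (n + TILE_WIDTH - 1) / TILE_WIDTH, which gives the same result for any
 // non-negative n and avoids a round trip through floating point.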

 // Tiled matrix multiplication: computes C = A * B for row-major float
 // matrices.
 //
 // Each thread produces at most one element of C. The block walks the shared
 // dimension (numAColumns) in steps of TILE_WIDTH; at every step its threads
 // cooperatively stage one tile of A and one tile of B in shared memory and
 // accumulate the partial dot product from there.
 //
 // Launch requirements:
 //   - blockDim.x == blockDim.y == TILE_WIDTH. The shared tiles are indexed
 //     directly with threadIdx, so other block shapes are not supported.
 //   - The grid must cover C: only elements whose (row, column) equals
 //     (blockIdx.y * blockDim.y + threadIdx.y,
 //      blockIdx.x * blockDim.x + threadIdx.x) for some thread are written.
 //   - Uses 2 * TILE_WIDTH * TILE_WIDTH floats of static shared memory.
 //
 // Parameters:
 //   A            input matrix, numARows x numAColumns, read only.
 //   B            input matrix, numBRows x numBColumns, read only.
 //   C            output matrix, numCRows x numCColumns.
 //   num*Rows/num*Columns  dimensions of the corresponding matrix.
 //
 // Matrix sizes need not be multiples of TILE_WIDTH: tile cells that fall
 // outside A or B are filled with zero, and threads that fall outside C do not
 // store anything. The kernel does not verify that numAColumns == numBRows or
 // that C has the shape numARows x numBColumns; that is the caller's job.
 __global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
                                      int numAColumns, int numBRows,
                                      int numBColumns, int numCRows,
                                      int numCColumns) {
  __shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
  __shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];

  // Position of this thread inside its block, which is also its cell inside
  // each shared tile.
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Column (Cx) and row (Cy) of the element of C owned by this thread. The
  // same row selects the row of A, and the same column selects the column
  // of B, that this thread loads.
  int Cx = blockIdx.x * blockDim.x + tx;
  int Cy = blockIdx.y * blockDim.y + ty;

  // Flattened offsets are computed in 64 bits: rows * columns exceeds INT_MAX
  // for large matrices and would overflow 32-bit int arithmetic.
  // The offset of this thread's row of A does not change between tiles.
  long long a_row_offset = (long long)Cy * numAColumns;

  float sum = 0.0f;

  int total_tiles = (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH;

  for (int tile_idx = 0; tile_idx < total_tiles; tile_idx++) {
    // Top-left corners of the tiles processed in this step:
    //   A: row = blockIdx.y * blockDim.y, col = tile_idx * TILE_WIDTH
    //   B: row = tile_idx * TILE_WIDTH,   col = blockIdx.x * blockDim.x
    int Ax = tile_idx * TILE_WIDTH + tx;  // column of A loaded by this thread
    int By = tile_idx * TILE_WIDTH + ty;  // row of B loaded by this thread

    // Load one element of each tile; cells outside the matrices become zero so
    // they contribute nothing to the dot product.
    A_tile[ty][tx] = (Ax < numAColumns && Cy < numARows)
                         ? A[a_row_offset + Ax]
                         : 0.0f;
    B_tile[ty][tx] = (Cx < numBColumns && By < numBRows)
                         ? B[(long long)By * numBColumns + Cx]
                         : 0.0f;
    // Every thread must have finished writing before anyone reads the tiles.
    __syncthreads();

    // Multiply the two tiles: row ty of A_tile by column tx of B_tile.
    for (int i = 0; i < TILE_WIDTH; i++) {
      sum += A_tile[ty][i] * B_tile[i][tx];
    }
    // Every thread must have finished reading before the next iteration
    // overwrites the tiles.
    __syncthreads();
  }

  // Store the result; threads that map outside C discard their sum.
  if (Cx < numCColumns && Cy < numCRows) {
    C[(long long)Cy * numCColumns + Cx] = sum;
  }
 }
	// Edge length, in elements, of the square shared-memory tiles used by
	// matrixMultiplyShared (A_tile and B_tile are TILE_WIDTH x TILE_WIDTH), and
	// the step by which that kernel advances along the shared dimension.
	// The kernel indexes the tiles directly with threadIdx, so thread blocks are
	// expected to be TILE_WIDTH x TILE_WIDTH.
	#define TILE_WIDTH 16

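	// NOTE: Because the tile size is the named constant TILE_WIDTH rather than a
	// literal, the number of tiles is not written as ceil(n / 16.0). It is
	// computed with the integer ceiling-division idiom
	// (n + TILE_WIDTH - 1) / TILE_WIDTH, which gives the same result for any
	// non-negative n and avoids a round trip through floating point.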

	// Tiled matrix multiplication: computes C = A * B for row-major float
	// matrices.
	//
	// Each thread produces at most one element of C. The block walks the shared
	// dimension (numAColumns) in steps of TILE_WIDTH; at every step its threads
	// cooperatively stage one tile of A and one tile of B in shared memory and
	// accumulate the partial dot product from there.
	//
	// Launch requirements:
	//   - blockDim.x == blockDim.y == TILE_WIDTH. The shared tiles are indexed
	//     directly with threadIdx, so other block shapes are not supported.
	//   - The grid must cover C: only elements whose (row, column) equals
	//     (blockIdx.y * blockDim.y + threadIdx.y,
	//      blockIdx.x * blockDim.x + threadIdx.x) for some thread are written.
	//   - Uses 2 * TILE_WIDTH * TILE_WIDTH floats of static shared memory.
	//
	// Parameters:
	//   A            input matrix, numARows x numAColumns, read only.
	//   B            input matrix, numBRows x numBColumns, read only.
	//   C            output matrix, numCRows x numCColumns.
	//   num*Rows/num*Columns  dimensions of the corresponding matrix.
	//
	// Matrix sizes need not be multiples of TILE_WIDTH: tile cells that fall
	// outside A or B are filled with zero, and threads that fall outside C do not
	// store anything. The kernel does not verify that numAColumns == numBRows or
	// that C has the shape numARows x numBColumns; that is the caller's job.
	__global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
			int numAColumns, int numBRows,
			int numBColumns, int numCRows,
			int numCColumns) {
		__shared__ float A_tile[TILE_WIDTH][TILE_WIDTH];
		__shared__ float B_tile[TILE_WIDTH][TILE_WIDTH];

		// Position of this thread inside its block, which is also its cell
		// inside each shared tile.
		int tx = threadIdx.x;
		int ty = threadIdx.y;

		// Column (Cx) and row (Cy) of the element of C owned by this thread.
		// The same row selects the row of A, and the same column selects the
		// column of B, that this thread loads.
		int Cx = blockIdx.x * blockDim.x + tx;
		int Cy = blockIdx.y * blockDim.y + ty;

		// Flattened offsets are computed in 64 bits: rows * columns exceeds
		// INT_MAX for large matrices and would overflow 32-bit int arithmetic.
		// The offset of this thread's row of A does not change between tiles.
		long long a_row_offset = (long long)Cy * numAColumns;

		float sum = 0.0f;

		int total_tiles = (numAColumns + TILE_WIDTH - 1) / TILE_WIDTH;

		for (int tile_idx = 0; tile_idx < total_tiles; tile_idx++) {
			// Top-left corners of the tiles processed in this step:
			//   A: row = blockIdx.y * blockDim.y, col = tile_idx * TILE_WIDTH
			//   B: row = tile_idx * TILE_WIDTH,   col = blockIdx.x * blockDim.x
			int Ax = tile_idx * TILE_WIDTH + tx;  // column of A loaded by this thread
			int By = tile_idx * TILE_WIDTH + ty;  // row of B loaded by this thread

			// Load one element of each tile; cells outside the matrices become
			// zero so they contribute nothing to the dot product.
			A_tile[ty][tx] = (Ax < numAColumns && Cy < numARows)
					? A[a_row_offset + Ax]
					: 0.0f;
			B_tile[ty][tx] = (Cx < numBColumns && By < numBRows)
					? B[(long long)By * numBColumns + Cx]
					: 0.0f;
			// Every thread must have finished writing before anyone reads the
			// tiles.
			__syncthreads();

			// Multiply the two tiles: row ty of A_tile by column tx of B_tile.
			for (int i = 0; i < TILE_WIDTH; i++) {
				sum += A_tile[ty][i] * B_tile[i][tx];
			}
			// Every thread must have finished reading before the next iteration
			// overwrites the tiles.
			__syncthreads();
		}

		// Store the result; threads that map outside C discard their sum.
		if (Cx < numCColumns && Cy < numCRows) {
			C[(long long)Cy * numCColumns + Cx] = sum;
		}
	}