CUDA Matrix Multiplication Beginner
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <stdlib.h> // malloc, free, exit
// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
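// The same layout expressed as a helper macro. This is an illustrative sketch
// only; the code below indexes manually with the expression above.
#define ELEM(M, row, col) ((M)->elements[(row) * (M)->width + (col)])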
struct Matrix {
  int width;
  int height;
  float *elements;
};
// Element-wise addition of two width x width row-major matrices: C = A + B.
__global__ void mat_add(float *A, float *B, float *C, int width) {
  int i = threadIdx.y + blockIdx.y * blockDim.y; // row
  int j = threadIdx.x + blockIdx.x * blockDim.x; // column
  C[i * width + j] = A[i * width + j] + B[i * width + j];
}
// Naive matrix product C = A * B. C is assumed to be A->height x B->width,
// and the launch is assumed to cover C exactly (there is no bounds check).
__global__ void mat_mul(struct Matrix *A, struct Matrix *B, struct Matrix *C) {
  int i = threadIdx.y + blockIdx.y * blockDim.y; // row of C
  int j = threadIdx.x + blockIdx.x * blockDim.x; // column of C
  float acc = 0;
  for (int k = 0; k < A->width; k++) {
    acc += A->elements[i * A->width + k] * B->elements[k * B->width + j];
  }
  C->elements[i * C->width + j] = acc;
}
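// A plain CPU reference for spot-checking the kernel's output on small
// matrices. This is a sketch only; mat_mul_host is a hypothetical helper that
// nothing below calls, and it assumes C is A->height x B->width.
void mat_mul_host(const struct Matrix *A, const struct Matrix *B,
                  struct Matrix *C) {
  for (int i = 0; i < C->height; i++) {
    for (int j = 0; j < C->width; j++) {
      float acc = 0;
      for (int k = 0; k < A->width; k++) {
        acc += A->elements[i * A->width + k] * B->elements[k * B->width + j];
      }
      C->elements[i * C->width + j] = acc;
    }
  }
}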
// Allocate a height x width matrix (the struct plus its element buffer) on
// the host.
struct Matrix *Matrix_init(int height, int width) {
  struct Matrix *m = (struct Matrix *)malloc(sizeof(struct Matrix));
  m->height = height;
  m->width = width;
  m->elements = (float *)malloc(sizeof(float) * width * height);
  return m;
}
struct Matrix *Matrix_fill(struct Matrix *m, float value) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      m->elements[i * m->width + j] = value;
    }
  }
  return m;
}
typedef void (*matrix_callback)(int i, int j, struct Matrix *m);
struct Matrix *Matrix_iter(struct Matrix *m, matrix_callback cb) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      cb(i, j, m);
    }
  }
  return m;
}
void Matrix_print(struct Matrix *m) {
  for (int i = 0; i < m->height; i++) {
    for (int j = 0; j < m->width; j++) {
      printf("M[%d][%d] = %f\n", i, j, m->elements[i * m->width + j]);
    }
  }
}
typedef void (*matrix_callback_old)(int i, int j, int n, float *A);
void iterate_and_call(float *A, int n, matrix_callback_old cb) {
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
      cb(i, j, n, A);
    }
  }
}
void print_matrix(int i, int j, int n, float *A) {
  printf("X[%d][%d] = %f\n", i, j, A[i * n + j]);
}
void init_matrix_1(int i, int j, int n, float *A) { A[i * n + j] = 1.0f; }
void init_matrix_2(int i, int j, int n, float *A) { A[i * n + j] = 2.0f; }
// dim3 threadsPerBlock(int x_dim, int y_dim) {
//   dim3 shape;
//   shape.x = x_dim;
//   shape.y = y_dim;
//   return shape;
// }
// void assert_cudaError(cudaError_t err) {
//   if (err != cudaSuccess) {
//     fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
//             cudaGetErrorString(err));
//     exit(EXIT_FAILURE);
//   }
// }
inline void _assert_cudaError(cudaError_t err, const char *file, int line,
                              bool abort = true) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error at %s:%d: %s\n", file, line,
            cudaGetErrorString(err));
    if (abort)
      exit(EXIT_FAILURE);
  }
}
inline void _cudaDeviceFlushPrintf() { cudaDeviceSynchronize(); }
#define assert_cudaError(ans)                                                  \
  {                                                                            \
    _assert_cudaError((ans), __FILE__, __LINE__);                              \
  }
int main(int argc, char *argv[]) {
  printf("Hello, World!\n");
  // `err` and `n` are only used by the commented-out mat_add experiment below.
  cudaError_t err = cudaSuccess;
  const int n = 1 << 10;
  // float *A = (float *)malloc(sizeof(float) * n * n);
  // float *B = (float *)malloc(sizeof(float) * n * n);
  // float *C = (float *)malloc(sizeof(float) * n * n);
  // float *A_d = NULL;
  // float *B_d = NULL;
  // float *C_d = NULL;
  // // Error code to check return values for CUDA calls
  // err = cudaMalloc((void **)&A_d, sizeof(float) * n * n);
  // assert_cudaError(err);
  // err = cudaMalloc((void **)&B_d, sizeof(float) * n * n);
  // assert_cudaError(err);
  // err = cudaMalloc((void **)&C_d, sizeof(float) * n * n);
  // assert_cudaError(err);
  // iterate_and_call(A, n, init_matrix_1);
  // iterate_and_call(B, n, init_matrix_2);
  // cudaMemcpy(A_d, A, sizeof(float) * n * n, cudaMemcpyHostToDevice);
  // cudaMemcpy(B_d, B, sizeof(float) * n * n, cudaMemcpyHostToDevice);
  // // cudaMemcpy(C_d, C, sizeof(float) * n * n, cudaMemcpyHostToDevice);
  //
  // dim3 threadsPerBlock(16, 16, 1);
  // dim3 numBlocks(n / threadsPerBlock.x, n / threadsPerBlock.y, 1);
  // mat_add<<<numBlocks, threadsPerBlock>>>(A_d, B_d, C_d, n);
  // assert_cudaError(cudaPeekAtLastError());
  // cudaMemcpy(C, C_d, sizeof(float) * n * n, cudaMemcpyDeviceToHost);
  // // iterate_and_call(C, n, print_matrix);
  // // printf("Goodbye, World!\n");
  // _cudaDeviceFlushPrintf();
  struct Matrix *X = Matrix_init(20, 10);
  X = Matrix_fill(X, 3.0f);
  struct Matrix *Y = Matrix_init(10, 5);
  Y = Matrix_fill(Y, 5.0f);
  struct Matrix *Z = Matrix_init(20, 5);
  // What's a good way to block this out?
  // The first obvious way is to block it over Z.
  // Let's say each block is 5 threads high and 1 thread wide
  // (y is the height dimension, x is the width dimension).
  dim3 threadsPerBlock1(1, 5);
  dim3 blocksPerGrid(Z->width / threadsPerBlock1.x,
                     Z->height / threadsPerBlock1.y);
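  // If Z's shape did not divide the block shape evenly, a ceil-division launch
  // would be needed instead (a sketch, not required for the 20x5 case here):
  //   dim3 blocksPerGridCeil((Z->width + threadsPerBlock1.x - 1) / threadsPerBlock1.x,
  //                          (Z->height + threadsPerBlock1.y - 1) / threadsPerBlock1.y);
  // together with an `if (i >= C->height || j >= C->width) return;` guard in mat_mul.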
  // X_d, Y_d, and Z_d live in device memory, so their members cannot be
  // assigned through on the host. Allocate the element buffers separately,
  // fill host-side shadow structs whose `elements` point at those device
  // buffers, and copy the shadow structs over.
  struct Matrix *X_d, *Y_d, *Z_d;
  assert_cudaError(cudaMalloc((void **)&X_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMalloc((void **)&Y_d, sizeof(struct Matrix)));
  assert_cudaError(cudaMalloc((void **)&Z_d, sizeof(struct Matrix)));
  float *X_d_e, *Y_d_e, *Z_d_e;
  assert_cudaError(cudaMalloc((void **)&X_d_e, sizeof(float) * X->width * X->height));
  assert_cudaError(cudaMalloc((void **)&Y_d_e, sizeof(float) * Y->width * Y->height));
  assert_cudaError(cudaMalloc((void **)&Z_d_e, sizeof(float) * Z->width * Z->height));
  assert_cudaError(cudaMemcpy(X_d_e, X->elements,
                              sizeof(float) * X->width * X->height,
                              cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_d_e, Y->elements,
                              sizeof(float) * Y->width * Y->height,
                              cudaMemcpyHostToDevice));
  // Z is overwritten by the kernel, so its host contents need not be copied.
  struct Matrix X_h = {X->width, X->height, X_d_e};
  struct Matrix Y_h = {Y->width, Y->height, Y_d_e};
  struct Matrix Z_h = {Z->width, Z->height, Z_d_e};
  assert_cudaError(cudaMemcpy(X_d, &X_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Y_d, &Y_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  assert_cudaError(cudaMemcpy(Z_d, &Z_h, sizeof(struct Matrix), cudaMemcpyHostToDevice));
  mat_mul<<<blocksPerGrid, threadsPerBlock1>>>(X_d, Y_d, Z_d);
  assert_cudaError(cudaPeekAtLastError());
  // Z_d is a device pointer, so read the result back via Z_d_e and host sizes.
  assert_cudaError(cudaMemcpy(Z->elements, Z_d_e,
                              sizeof(float) * Z->width * Z->height,
                              cudaMemcpyDeviceToHost));
  Matrix_print(Z);
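  // Optional cleanup (everything below is reclaimed at process exit anyway):
  // release the device buffers and the host-side matrices.
  cudaFree(X_d_e);
  cudaFree(Y_d_e);
  cudaFree(Z_d_e);
  cudaFree(X_d);
  cudaFree(Y_d);
  cudaFree(Z_d);
  free(X->elements);
  free(Y->elements);
  free(Z->elements);
  free(X);
  free(Y);
  free(Z);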
  return 0;
}