@AxoyTO
Last active September 30, 2021 19:17
CUDA CUBLAS/CURAND/THRUST MatrixMultiplication (TASK_12)
// High level matrix multiplication on GPU using CUDA with Thrust, CURAND and
// CUBLAS C(m,n) = A(m,k) * B(k,n)
#include <cublas_v2.h>
#include <curand.h>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// Fill the array A(nr_rows_A, nr_cols_A) with random numbers on GPU
void GPU_fill_rand(float* A, int nr_rows_A, int nr_cols_A) {
  // Create a pseudo-random number generator
  curandGenerator_t prng;
  curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);

  // Set the seed for the random number generator using the system clock
  curandSetPseudoRandomGeneratorSeed(prng, (unsigned long long)clock());

  // Fill the array with uniformly distributed random numbers on the device
  curandGenerateUniform(prng, A, nr_rows_A * nr_cols_A);

  // Release the generator
  curandDestroyGenerator(prng);
}
// Multiply the arrays A and B on GPU and save the result in C
// C(m,n) = A(m,k) * B(k,n)
void gpu_blas_mmul(const float* A,
                   const float* B,
                   float* C,
                   const int m,
                   const int k,
                   const int n) {
  // cuBLAS uses column-major storage, so each leading dimension is the
  // number of rows of the corresponding matrix
  int lda = m, ldb = k, ldc = m;
  const float alf = 1;
  const float bet = 0;
  const float* alpha = &alf;
  const float* beta = &bet;

  // Create a handle for CUBLAS
  cublasHandle_t handle;
  cublasCreate(&handle);

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);

  // Do the actual multiplication
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb,
              beta, C, ldc);

  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float msecs = 0;
  cudaEventElapsedTime(&msecs, start, stop);
  std::cout << "cublasSgemm Elapsed Time on GPU: " << msecs << " ms.\n";

  // A GEMM performs 2*m*n*k floating-point operations (one multiply and one
  // add per term); dividing by msecs * 1e6 converts ms to the 1e9 scale of GFLOPS
  float numOps = 2.0f * m * n * k;
  std::cout << "Efficiency of the program: " << numOps / (msecs * 1.0e6f)
            << " GFLOPS.\n\n";

  // Destroy the events and the handle
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cublasDestroy(handle);
}
// Print matrix A(nr_rows_A, nr_cols_A), stored in column-major format
void print_matrix(const thrust::device_vector<float>& A,
                  int nr_rows_A,
                  int nr_cols_A) {
  for (int i = 0; i < nr_rows_A; ++i) {
    for (int j = 0; j < nr_cols_A; ++j) {
      // Element (i, j) lives at offset j * nr_rows_A + i in column-major order
      std::cout << A[j * nr_rows_A + i] << " ";
    }
    std::cout << std::endl;
  }
  std::cout << std::endl;
}
int main() {
  // Allocate 3 arrays on the GPU; for simplicity we use square 3x3 matrices
  int nr_rows_A, nr_cols_A, nr_rows_B, nr_cols_B, nr_rows_C, nr_cols_C;
  nr_rows_A = nr_cols_A = nr_rows_B = nr_cols_B = nr_rows_C = nr_cols_C = 3;
  thrust::device_vector<float> d_A(nr_rows_A * nr_cols_A),
      d_B(nr_rows_B * nr_cols_B), d_C(nr_rows_C * nr_cols_C);

  // Fill the arrays A and B on GPU with random numbers
  GPU_fill_rand(thrust::raw_pointer_cast(&d_A[0]), nr_rows_A, nr_cols_A);
  GPU_fill_rand(thrust::raw_pointer_cast(&d_B[0]), nr_rows_B, nr_cols_B);

  // Optionally we can print the data
  std::cout << "A =" << std::endl;
  print_matrix(d_A, nr_rows_A, nr_cols_A);
  std::cout << "B =" << std::endl;
  print_matrix(d_B, nr_rows_B, nr_cols_B);

  // Multiply A and B on GPU
  gpu_blas_mmul(
      thrust::raw_pointer_cast(&d_A[0]), thrust::raw_pointer_cast(&d_B[0]),
      thrust::raw_pointer_cast(&d_C[0]), nr_rows_A, nr_cols_A, nr_cols_B);

  // Print the result
  std::cout << "C =" << std::endl;
  print_matrix(d_C, nr_rows_C, nr_cols_C);

  return 0;
}
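
An optional sanity check, not part of the original gist: copy the device data back with thrust::host_vector and recompute C with a naive column-major loop on the CPU, comparing against the cuBLAS result. A minimal sketch under that assumption; the helper name verify_result is hypothetical, and std::fabs additionally requires <cmath>:

// Hypothetical helper: recompute C = A * B on the CPU in column-major order
// and compare element-wise against the cuBLAS result within a tolerance.
bool verify_result(const thrust::device_vector<float>& d_A,
                   const thrust::device_vector<float>& d_B,
                   const thrust::device_vector<float>& d_C,
                   int m, int k, int n) {
  thrust::host_vector<float> A = d_A, B = d_B, C = d_C;  // device -> host copies
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int p = 0; p < k; ++p)
        acc += A[p * m + i] * B[j * k + p];  // A(i,p) * B(p,j), column-major
      if (std::fabs(acc - C[j * m + i]) > 1e-4f)
        return false;
    }
  }
  return true;
}

For reference, the file can typically be built and run with something like nvcc matmul.cu -o matmul -lcublas -lcurand followed by ./matmul (the file name is illustrative).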
AxoyTO commented Sep 30, 2021

Result