Last active
August 29, 2015 14:05
-
-
Save johnwalley/4f4d9c231e4d7ba9aec5 to your computer and use it in GitHub Desktop.
Simple matrix-vector multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
// Host-side wrapper that copies the operands to the GPU, launches
// multiplyKernel and copies the result back. Defined after main().
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size);
// Computes one element of the matrix-vector product c = A * b.
//
// Launch layout: 1-D grid of 1-D blocks. The thread whose global index
// (threadIdx.x + blockIdx.x * blockDim.x) is `row` produces c[row]. Threads
// with row >= size return without touching memory, so the grid may contain
// more threads than there are rows.
//
//   c    - output vector, `size` floats
//   a    - input matrix, `size` x `size` floats, indexed as a[row * size + col]
//   b    - input vector, `size` floats
//   size - number of rows and columns
__global__ void multiplyKernel(float *c, const float *a, const float *b, const int size) {
    const int row = threadIdx.x + blockIdx.x * blockDim.x;
    if (row >= size)
        return;

    // Widen before multiplying: row * size no longer fits in int once
    // size exceeds 46340.
    const float *rowPtr = a + (size_t)row * (size_t)size;

    // Accumulate in a local variable and store to c once, instead of a
    // read-modify-write on c[row] every iteration.
    float sum = 0.0f;
    for (int j = 0; j < size; ++j)
        sum += rowPtr[j] * b[j];  // b is indexed by column, not by row

    c[row] = sum;
}
// Builds an arraySize x arraySize matrix and an arraySize vector on the host,
// multiplies them nIter times through multiplyWithCuda, and prints the
// elapsed time measured with CUDA events.
//
// The timed region covers the complete multiplyWithCuda calls, not only the
// kernel execution.
//
// Returns EXIT_SUCCESS on success, EXIT_FAILURE if the host allocation or any
// CUDA call fails.
int main()
{
    const int arraySize = 1024 * 8;
    const int nIter = 1;

    float b[arraySize];
    float c[arraySize] = { 0 };

    float *a = (float*)malloc(sizeof(float) * arraySize * arraySize);
    if (a == NULL) {
        fprintf(stderr, "Failed to allocate the %d x %d host matrix\n", arraySize, arraySize);
        return EXIT_FAILURE;
    }

    // Initialize the host input matrix
    for (int i = 0; i < arraySize; ++i) {
        for (int j = 0; j < arraySize; ++j) {
            a[(size_t)i * arraySize + j] = (float)(i * j);
        }
    }

    // Initialize the host input vector (this must fill b, not the first
    // row of a)
    for (int i = 0; i < arraySize; ++i) {
        b[i] = (float)i;
    }

    int exitCode = EXIT_SUCCESS;
    float msecTotal = 0.0f;

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start = NULL;
    cudaEvent_t stop = NULL;
    cudaError_t status = cudaEventCreate(&start);
    if (status == cudaSuccess)
        status = cudaEventCreate(&stop);

    // Record the start event
    if (status == cudaSuccess)
        status = cudaEventRecord(start, NULL);

    // Execute the multiplication; stop at the first failure
    for (int iter = 0; status == cudaSuccess && iter < nIter; ++iter) {
        status = multiplyWithCuda(c, a, b, arraySize);
    }

    // Record the stop event
    if (status == cudaSuccess)
        status = cudaEventRecord(stop, NULL);

    // Wait for the stop event to complete
    if (status == cudaSuccess)
        status = cudaEventSynchronize(stop);

    if (status == cudaSuccess)
        status = cudaEventElapsedTime(&msecTotal, start, stop);

    if (status == cudaSuccess) {
        printf("Time= %.3f msec\n", msecTotal);
    } else {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        exitCode = EXIT_FAILURE;
    }

    // Release the events that were actually created, then the host matrix
    if (stop != NULL)
        cudaEventDestroy(stop);
    if (start != NULL)
        cudaEventDestroy(start);
    free(a);

    return exitCode;
}
// Helper function for using CUDA to multiply a matrix by a vector in parallel.
//
// Allocates device buffers, copies `a` and `b` to the GPU, launches
// multiplyKernel with one thread per output element, and copies the result
// back into `c`. The device buffers are released before returning on every
// path.
//
//   c    - host output vector, `size` floats
//   a    - host input matrix, `size` x `size` floats, row-major
//   b    - host input vector, `size` floats
//   size - number of rows and columns
//
// Returns cudaSuccess, or the status of the first CUDA call that failed
// (also reported on stderr). A size of 0 is a no-op that returns cudaSuccess;
// a size that does not fit the kernel's int parameter returns
// cudaErrorInvalidValue.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus = cudaSuccess;

    // Do the byte-count arithmetic in size_t; size * size wraps in 32-bit
    // unsigned arithmetic for size >= 65536.
    const size_t vectorBytes = (size_t)size * sizeof(float);
    const size_t matrixBytes = (size_t)size * vectorBytes;

    unsigned int threadsPerBlock = 0;
    unsigned int blocks = 0;

    if (size == 0)
        return cudaSuccess;
    if (size > 0x7fffffffu)  // the kernel takes size as a signed int
        return cudaErrorInvalidValue;

    // Launch exactly `size` threads: use the largest block size <= 256 that
    // divides size. This keeps every thread index in range and stays well
    // under the per-block thread limit, which the former <<<2, size/2>>>
    // launch exceeded for size > 2048.
    threadsPerBlock = size < 256u ? size : 256u;
    while (size % threadsPerBlock != 0)
        --threadsPerBlock;
    blocks = size / threadsPerBlock;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) goto Error;

    // Allocate GPU buffers for the output vector, the matrix and the input vector.
    cudaStatus = cudaMalloc((void**)&dev_c, vectorBytes);
    if (cudaStatus != cudaSuccess) goto Error;
    cudaStatus = cudaMalloc((void**)&dev_a, matrixBytes);
    if (cudaStatus != cudaSuccess) goto Error;
    cudaStatus = cudaMalloc((void**)&dev_b, vectorBytes);
    if (cudaStatus != cudaSuccess) goto Error;

    // Copy the inputs from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, matrixBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) goto Error;
    cudaStatus = cudaMemcpy(dev_b, b, vectorBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) goto Error;

    // Launch a kernel on the GPU with one thread for each output element.
    multiplyKernel<<<blocks, threadsPerBlock>>>(dev_c, dev_a, dev_b, (int)size);

    // A launch does not return a status; an invalid configuration is only
    // visible through cudaGetLastError.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) goto Error;

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered while it was running.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) goto Error;

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, vectorBytes, cudaMemcpyDeviceToHost);

Error:
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "multiplyWithCuda failed: %s\n", cudaGetErrorString(cudaStatus));

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment