Created
August 10, 2014 18:59
-
-
Save johnwalley/8d8032bdad2a3f8b72b6 to your computer and use it in GitHub Desktop.
Simple matrix-vector multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
// Forward declaration of the host-side wrapper so main() can call it
// before its definition further down in this file.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size);
// Matrix-vector product: c[i] = sum_j a[i][j] * b[j], where a is a
// row-major size x size matrix and b, c are vectors of length size.
// Launch with a 1-D grid, one thread per output element; the grid may
// overshoot size — the guard below handles the tail.
__global__ void multiplyKernel(float *c, const float *a, const float *b, const int size) {
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    if (row >= size)          // grid tail guard (grid rarely divides size evenly)
        return;
    // Accumulate in a register instead of read-modify-write on global
    // memory every iteration.
    float sum = 0.0f;
    for (int j = 0; j < size; ++j)
        sum += a[row * size + j] * b[j];   // was b[row]: used the wrong vector element
    c[row] = sum;
}
// Host driver: fills a row-major arraySize x arraySize matrix a with
// a[i][j] = i * j and a vector b with b[i] = i, then runs the GPU
// matrix-vector product nIter times. Returns 0 on success, 1 on failure.
int main()
{
    const int arraySize = 1024 * 8;
    int nIter = 1;
    cudaError_t cudaStatus = cudaSuccess;

    // a is arraySize^2 floats (256 MB) and must live on the heap; b and c
    // are heap-allocated too — as stack arrays they took 64 KB of stack.
    float *a = (float*)malloc((size_t)arraySize * arraySize * sizeof(float));
    float *b = (float*)malloc((size_t)arraySize * sizeof(float));
    float *c = (float*)calloc((size_t)arraySize, sizeof(float)); // zeroed, like the old `= { 0 }`
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(c);
        free(b);
        free(a);
        return 1;
    }

    // Initialize the host input matrix.
    for (int i = 0; i < arraySize; ++i) {
        for (int j = 0; j < arraySize; ++j) {
            a[(size_t)i * arraySize + j] = (float)(i * j);
        }
    }
    // Initialize the host input vector. (The original loop wrote a[i] = i,
    // clobbering the matrix's first row and leaving b uninitialized.)
    for (int i = 0; i < arraySize; ++i) {
        b[i] = (float)i;
    }

    // Execute the kernel, stopping at the first failure.
    for (int j = 0; j < nIter; j++) {
        cudaStatus = multiplyWithCuda(c, a, b, arraySize);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "multiplyWithCuda failed: %s\n",
                    cudaGetErrorString(cudaStatus));
            break;
        }
    }

    free(c);
    free(b);
    free(a);
    return (cudaStatus == cudaSuccess) ? 0 : 1;
}
// Helper function for using CUDA to multiply a matrix by a vector in parallel.
// c (host, length size) receives a * b, where a is a host row-major
// size x size matrix and b a host vector of length size. Prints the kernel
// time in milliseconds. Returns cudaSuccess or the first error encountered;
// device buffers and timing events are released on every exit path.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float msecTotal = 0.0f;
    // 256 threads per block, ceil-div for the grid. The original launch
    // <<<2, size/2>>> asked for 4096 threads per block — over the
    // 1024-thread hardware limit — so the kernel never ran, and the
    // failure went unnoticed because no status was checked.
    unsigned int threadsPerBlock = 256;
    unsigned int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    cudaError_t cudaStatus;

    // On the first failing call, report it and jump to cleanup.
#define CHECK(call)                                                       \
    do {                                                                  \
        cudaStatus = (call);                                              \
        if (cudaStatus != cudaSuccess) {                                  \
            fprintf(stderr, "%s failed: %s\n", #call,                     \
                    cudaGetErrorString(cudaStatus));                      \
            goto Error;                                                   \
        }                                                                 \
    } while (0)

    // Choose which GPU to run on; change this on a multi-GPU system.
    CHECK(cudaSetDevice(0));

    // Allocate GPU buffers: one size x size matrix plus two length-size vectors.
    CHECK(cudaMalloc((void**)&dev_c, size * sizeof(float)));
    CHECK(cudaMalloc((void**)&dev_a, (size_t)size * size * sizeof(float)));
    CHECK(cudaMalloc((void**)&dev_b, size * sizeof(float)));

    // Copy input data from host memory to GPU buffers.
    CHECK(cudaMemcpy(dev_a, a, (size_t)size * size * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice));

    // CUDA events used for timing the kernel.
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start, NULL));

    // Launch one thread per output element; the kernel guards the tail.
    multiplyKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b, (int)size);
    // Kernel launches return no status directly: pick up configuration
    // errors here and execution errors at the synchronize.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaEventRecord(stop, NULL));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&msecTotal, start, stop));
    printf("Time= %.3f msec\n", msecTotal);

    // Copy the output vector from the GPU buffer back to host memory.
    CHECK(cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost));

Error:
#undef CHECK
    // Events were leaked in the original (no cudaEventDestroy).
    if (start) cudaEventDestroy(start);
    if (stop) cudaEventDestroy(stop);
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment