Created
August 10, 2014 18:59
-
-
Save johnwalley/8d8032bdad2a3f8b72b6 to your computer and use it in GitHub Desktop.
Simple matrix-vector multiplication
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
// Forward declaration of the host-side wrapper so main() can call it
// before its definition further down in this file.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size);
// Matrix-vector product: c[i] = sum_j a[i][j] * b[j], where a is a
// row-major size x size matrix and b, c are vectors of length size.
// Launch with a 1-D grid, one thread per output element; the grid may
// overshoot size — the guard below handles the tail.
__global__ void multiplyKernel(float *c, const float *a, const float *b, const int size) {
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    if (row >= size)          // grid tail guard (grid rarely divides size evenly)
        return;
    // Accumulate in a register instead of read-modify-write on global
    // memory every iteration.
    float sum = 0.0f;
    for (int j = 0; j < size; ++j)
        sum += a[row * size + j] * b[j];   // was b[row]: used the wrong vector element
    c[row] = sum;
}
// Host driver: fills a row-major arraySize x arraySize matrix a with
// a[i][j] = i * j and a vector b with b[i] = i, then runs the GPU
// matrix-vector product nIter times. Returns 0 on success, 1 on failure.
int main()
{
    const int arraySize = 1024 * 8;
    int nIter = 1;
    cudaError_t cudaStatus = cudaSuccess;

    // a is arraySize^2 floats (256 MB) and must live on the heap; b and c
    // are heap-allocated too — as stack arrays they took 64 KB of stack.
    float *a = (float*)malloc((size_t)arraySize * arraySize * sizeof(float));
    float *b = (float*)malloc((size_t)arraySize * sizeof(float));
    float *c = (float*)calloc((size_t)arraySize, sizeof(float)); // zeroed, like the old `= { 0 }`
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(c);
        free(b);
        free(a);
        return 1;
    }

    // Initialize the host input matrix.
    for (int i = 0; i < arraySize; ++i) {
        for (int j = 0; j < arraySize; ++j) {
            a[(size_t)i * arraySize + j] = (float)(i * j);
        }
    }
    // Initialize the host input vector. (The original loop wrote a[i] = i,
    // clobbering the matrix's first row and leaving b uninitialized.)
    for (int i = 0; i < arraySize; ++i) {
        b[i] = (float)i;
    }

    // Execute the kernel, stopping at the first failure.
    for (int j = 0; j < nIter; j++) {
        cudaStatus = multiplyWithCuda(c, a, b, arraySize);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "multiplyWithCuda failed: %s\n",
                    cudaGetErrorString(cudaStatus));
            break;
        }
    }

    free(c);
    free(b);
    free(a);
    return (cudaStatus == cudaSuccess) ? 0 : 1;
}
// Helper function for using CUDA to multiply a matrix by a vector in parallel.
// c (host, length size) receives a * b, where a is a host row-major
// size x size matrix and b a host vector of length size. Prints the kernel
// time in milliseconds. Returns cudaSuccess or the first error encountered;
// device buffers and timing events are released on every exit path.
cudaError_t multiplyWithCuda(float *c, const float *a, const float *b, unsigned int size)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float msecTotal = 0.0f;
    // 256 threads per block, ceil-div for the grid. The original launch
    // <<<2, size/2>>> asked for 4096 threads per block — over the
    // 1024-thread hardware limit — so the kernel never ran, and the
    // failure went unnoticed because no status was checked.
    unsigned int threadsPerBlock = 256;
    unsigned int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    cudaError_t cudaStatus;

    // On the first failing call, report it and jump to cleanup.
#define CHECK(call)                                                       \
    do {                                                                  \
        cudaStatus = (call);                                              \
        if (cudaStatus != cudaSuccess) {                                  \
            fprintf(stderr, "%s failed: %s\n", #call,                     \
                    cudaGetErrorString(cudaStatus));                      \
            goto Error;                                                   \
        }                                                                 \
    } while (0)

    // Choose which GPU to run on; change this on a multi-GPU system.
    CHECK(cudaSetDevice(0));

    // Allocate GPU buffers: one size x size matrix plus two length-size vectors.
    CHECK(cudaMalloc((void**)&dev_c, size * sizeof(float)));
    CHECK(cudaMalloc((void**)&dev_a, (size_t)size * size * sizeof(float)));
    CHECK(cudaMalloc((void**)&dev_b, size * sizeof(float)));

    // Copy input data from host memory to GPU buffers.
    CHECK(cudaMemcpy(dev_a, a, (size_t)size * size * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, b, size * sizeof(float), cudaMemcpyHostToDevice));

    // CUDA events used for timing the kernel.
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start, NULL));

    // Launch one thread per output element; the kernel guards the tail.
    multiplyKernel<<<blocksPerGrid, threadsPerBlock>>>(dev_c, dev_a, dev_b, (int)size);
    // Kernel launches return no status directly: pick up configuration
    // errors here and execution errors at the synchronize.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    CHECK(cudaEventRecord(stop, NULL));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&msecTotal, start, stop));
    printf("Time= %.3f msec\n", msecTotal);

    // Copy the output vector from the GPU buffer back to host memory.
    CHECK(cudaMemcpy(c, dev_c, size * sizeof(float), cudaMemcpyDeviceToHost));

Error:
#undef CHECK
    // Events were leaked in the original (no cudaEventDestroy).
    if (start) cudaEventDestroy(start);
    if (stop) cudaEventDestroy(stop);
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment