Created
September 1, 2018 05:15
-
-
Save FernandoS27/b1ab4f8e861c9162f54952ed8273431b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <stdio.h> | |
#include <time.h> | |
#include <stdlib.h> | |
#include <math.h> | |
int main() | |
{ | |
int N = 1000; | |
size_t size = N * sizeof(float); | |
srand(time(NULL)); | |
// Allocate input vectors h_A and h_B in host memory | |
float* h_A = (float*)malloc(size); | |
float* h_B = (float*)malloc(size); | |
float* h_C = (float*)malloc(size); | |
// Initialize input vectors | |
unsigned int i; | |
for (i = 0; i < N; i++ ) { | |
h_A[i] = 1.0f - (float)(rand() % 20000) / 10000.0f; | |
float f = 1.0 - (float)(rand() % 20000) / 10000.0f; | |
h_B[i] = 64.0 + 128.0 * f; | |
} | |
// Initialize | |
cuInit(0); | |
// Get number of devices supporting CUDA | |
int deviceCount = 0; | |
cuDeviceGetCount(&deviceCount); | |
if (deviceCount == 0) { | |
printf("There is no device supporting CUDA.\n"); | |
exit (0); | |
} | |
// Get handle for device 0 | |
CUdevice cuDevice; | |
cuDeviceGet(&cuDevice, 0); | |
// Create context | |
CUcontext cuContext; | |
cuCtxCreate(&cuContext, 0, cuDevice); | |
// Create module from binary file | |
CUmodule cuModule; | |
if (cuModuleLoad(&cuModule, "test.cubin") != 0) { | |
printf("Failed to load cubin.\n"); | |
exit (0); | |
} | |
// Allocate vectors in device memory | |
CUdeviceptr d_A; | |
cuMemAlloc(&d_A, size); | |
CUdeviceptr d_B; | |
cuMemAlloc(&d_B, size); | |
CUdeviceptr d_C; | |
cuMemAlloc(&d_C, size); | |
// Copy vectors from host memory to device memory | |
cuMemcpyHtoD(d_A, h_A, size); | |
cuMemcpyHtoD(d_B, h_B, size); | |
// Get function handle from module | |
CUfunction test; | |
const char name[] = "_Z4TestPfS_S_"; | |
unsigned int j = cuModuleGetFunction(&test, cuModule, name); | |
if (j != 0) { | |
printf("Failed to get Function. %d\n", j); | |
exit (0); | |
} | |
// Invoke kernel | |
int threadsPerBlock = 256; | |
int blocksPerGrid = | |
(N + threadsPerBlock - 1) / threadsPerBlock; | |
void* args[] = { &d_A, &d_B, &d_C, &N }; | |
cuLaunchKernel(test, | |
blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, | |
0, 0, args, 0); | |
cuCtxSynchronize(); | |
cuMemcpyDtoH(h_C, d_C, size); | |
for (i = 0; i < 256; i++ ) { | |
printf("Result: RRO %f -> %.12f\n", h_B[i], h_C[i]); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment