Created
September 4, 2014 10:38
-
-
Save moznion/2daca45af3f9e5307af3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#define DATA_NUM 1024 | |
#define DATA_SIZE sizeof(double) * DATA_NUM | |
#define BLOCK_SIZE 128 | |
#define TRIALS 100 | |
void cudaSetDeviceSafe(int); | |
void cudaMallocSafe(void*, int); | |
void cudaMallocHostSafe(void*, int); | |
void cudaMemcpySafe(void*, const void*, size_t, cudaMemcpyKind); | |
void cudaMemcpyPeerSafe(void*, int, const void*, int, size_t); | |
void cudaFreeSafe(void*); | |
void cudaFreeHostSafe(void*); | |
__global__ void vadd_kernel(double* A, double* B, double* C) { | |
int id = blockIdx.x * blockDim.x + threadIdx.x; | |
C[id] = A[id] + B[id]; | |
} | |
int vadd(double* host_dataA, double* host_dataB, double* host_result_data) { | |
cudaError_t err; | |
int device0 = 0; | |
int device1 = 2; | |
// Check peer connection is available | |
int can_access_0_to_1, can_access_1_to_0; | |
err = cudaDeviceCanAccessPeer(&can_access_0_to_1, device0, device1); | |
err = cudaDeviceCanAccessPeer(&can_access_1_to_0, device1, device0); | |
if (can_access_0_to_1 != 1 || can_access_1_to_0 != 1) { | |
fprintf(stderr, "Cannot connect via peer between device0 and device1\n"); | |
return 1; | |
} | |
// Check UVA is available | |
cudaDeviceProp device_prop; | |
int uva0; | |
cudaSetDeviceSafe(device0); | |
err = cudaGetDeviceProperties(&device_prop, device0); | |
uva0 = device_prop.unifiedAddressing; | |
err = cudaDeviceEnablePeerAccess(device1, 0); | |
int uva1; | |
cudaSetDeviceSafe(device1); | |
err = cudaGetDeviceProperties(&device_prop, device1); | |
uva1 = device_prop.unifiedAddressing; | |
err = cudaDeviceEnablePeerAccess(device0, 0); | |
if (uva0 != 1 || uva1 != 1) { | |
fprintf(stderr, "Cannot use UVA on your device\n"); | |
return 1; | |
} | |
// Allocate array variables on device0 | |
cudaSetDeviceSafe(device0); | |
double *device_dataA_0, *device_dataB_0; | |
cudaMallocSafe(&device_dataA_0, DATA_SIZE); | |
cudaMallocSafe(&device_dataB_0, DATA_SIZE); | |
cudaMemcpySafe(device_dataA_0, host_dataA, DATA_SIZE, cudaMemcpyHostToDevice); | |
cudaMemcpySafe(device_dataB_0, host_dataB, DATA_SIZE, cudaMemcpyHostToDevice); | |
// Allocate array variables on device1 | |
cudaSetDeviceSafe(device1); | |
double *device_dataA_1, *device_dataB_1, *device_result_data_1; | |
cudaMallocSafe(&device_dataA_1, DATA_SIZE); | |
cudaMallocSafe(&device_dataB_1, DATA_SIZE); | |
cudaMallocSafe(&device_result_data_1, DATA_SIZE); | |
// core | |
dim3 block(BLOCK_SIZE); | |
dim3 grid(DATA_NUM / BLOCK_SIZE); | |
cudaSetDeviceSafe(device1); | |
for (int i = 0; i < TRIALS; i++) { | |
vadd_kernel<<<grid, block>>>(device_dataA_0, device_dataB_0, device_result_data_1); | |
} | |
cudaMemcpySafe(host_result_data, device_result_data_1, DATA_SIZE, cudaMemcpyDeviceToHost); | |
// Cleanup | |
cudaSetDeviceSafe(device0); | |
cudaFreeSafe(device_dataA_0); | |
cudaFreeSafe(device_dataB_0); | |
cudaSetDeviceSafe(device1); | |
cudaFreeSafe(device_dataA_1); | |
cudaFreeSafe(device_dataB_1); | |
cudaFreeSafe(device_result_data_1); | |
return 0; | |
} | |
int main(int argc, char** argv) { | |
double *host_dataA, *host_dataB, *host_result_data; | |
cudaMallocHostSafe(&host_dataA, DATA_SIZE); | |
cudaMallocHostSafe(&host_dataB, DATA_SIZE); | |
cudaMallocHostSafe(&host_result_data, DATA_SIZE); | |
for (int i = 0; i < DATA_NUM; i++) { | |
host_dataA[i] = i * 10; | |
host_dataB[i] = i * 20; | |
} | |
int err = vadd(host_dataA, host_dataB, host_result_data); | |
if (err) { | |
exit(err); | |
} | |
for (int i = 0; i < DATA_NUM; i++) { | |
printf("%lf\n", host_result_data[i]); | |
} | |
cudaFreeHostSafe(host_dataA); | |
cudaFreeHostSafe(host_dataB); | |
return 0; | |
} | |
void cudaSetDeviceSafe(int device_id) { | |
cudaError_t err = cudaSetDevice(device_id); | |
if (err) { | |
fprintf(stderr, "Cannot set a device (device: %d)\n", device_id); | |
exit(256); | |
} | |
} | |
void cudaMallocSafe(void* ptr, int data_size) { | |
cudaError_t err = cudaMalloc((void **)ptr, data_size); | |
if (err) { | |
fprintf(stderr, "Failed to allocate the memory on device\n"); | |
} | |
} | |
void cudaMallocHostSafe(void* ptr, int data_size) { | |
cudaError_t err = cudaMallocHost((void **)ptr, data_size); | |
if (err) { | |
fprintf(stderr, "Failed to allocate the memory on host\n"); | |
} | |
} | |
void cudaMemcpySafe(void* dst, const void* src, size_t data_size, cudaMemcpyKind kind) { | |
cudaError_t err = cudaMemcpy(dst, src, data_size, kind); | |
if (err) { | |
fprintf(stderr, "Failed to copy memory (direction: %s)\n", kind); | |
} | |
} | |
void cudaMemcpyPeerSafe(void* dst, int dst_device, const void* src, int src_device, size_t data_size) { | |
cudaError_t err = cudaMemcpyPeer(dst, dst_device, src, src_device, data_size); | |
if (err) { | |
fprintf(stderr, "Failed to copy memory via peer (direction: %d -> %d)\n", src_device, dst_device); | |
} | |
} | |
void cudaFreeSafe(void* ptr) { | |
cudaError_t err = cudaFree(ptr); | |
if (err) { | |
fprintf(stderr, "Failed to make device memory free\n"); | |
} | |
} | |
void cudaFreeHostSafe(void* ptr) { | |
cudaError_t err = cudaFreeHost(ptr); | |
if (err) { | |
fprintf(stderr, "Failed to make host memory free\n"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment