Skip to content

Instantly share code, notes, and snippets.

@moznion
Last active August 29, 2015 14:06
Show Gist options
  • Save moznion/ccbfc67a6679d09be43d to your computer and use it in GitHub Desktop.
Save moznion/ccbfc67a6679d09be43d to your computer and use it in GitHub Desktop.
#include <stdio.h>
#define DATA_NUM 10
#define DATA_SIZE sizeof(double) * DATA_NUM
#define TRIALS 50
void cudaSetDeviceSafe(int);
void cudaMallocSafe(void*, int);
void cudaMallocHostSafe(void*, int);
void cudaMemcpySafe(void*, const void*, size_t, cudaMemcpyKind);
void cudaMemcpyPeerSafe(void*, int, const void*, int, size_t);
void cudaFreeSafe(void*);
void cudaFreeHostSafe(void*);
int is_deeply(double*, double*, int);
int ping_pong(double* host_dataA, double* host_dataB, double* host_dataA_rt, double* host_dataB_rt) {
cudaError_t err;
int device_count;
// 有効なデバイス数をチェック
err = cudaGetDeviceCount(&device_count);
if (err) {
fprintf(stderr, "Err: %d\n", err);
return 1;
}
int device0 = 0;
int device1 = 2;
// Device 0とDevice 1間でpeer通信ができるかどうかチェック
int can_access_A2B;
int can_access_B2A;
err = cudaDeviceCanAccessPeer(&can_access_A2B, device0, device1);
err = cudaDeviceCanAccessPeer(&can_access_B2A, device1, device0);
if (can_access_A2B != 1 || can_access_B2A != 1) {
fprintf(stderr, "Cannot connect via peer between device0 and device1\n");
return 1;
}
// Device0とDevice1でUVAが有効かどうかチェック
cudaDeviceProp device_prop;
int uva0;
cudaSetDeviceSafe(device0);
err = cudaGetDeviceProperties(&device_prop, device0);
uva0 = device_prop.unifiedAddressing;
err = cudaDeviceEnablePeerAccess(device1, 0);
int uva1;
cudaSetDeviceSafe(device1);
err = cudaGetDeviceProperties(&device_prop, device1);
uva1 = device_prop.unifiedAddressing;
err = cudaDeviceEnablePeerAccess(device0, 0);
if (uva0 != 1 || uva1 != 1) {
fprintf(stderr, "Cannot use UVA on your device\n");
return 1;
}
// device0上に配列をALLOCATE
cudaSetDeviceSafe(device0);
double *dev_dataA_0, *dev_dataB_0;
cudaMallocSafe(&dev_dataA_0, DATA_SIZE);
cudaMallocSafe(&dev_dataB_0, DATA_SIZE);
// host_dataA => dev_dataA_0
cudaMemcpySafe(dev_dataA_0, host_dataA, DATA_SIZE, cudaMemcpyHostToDevice);
// device1上に配列をALLOCATE
cudaSetDeviceSafe(device1);
double *dev_dataA_1, *dev_dataB_1;
cudaMallocSafe(&dev_dataA_1, DATA_SIZE);
cudaMallocSafe(&dev_dataB_1, DATA_SIZE);
// host_dataB => dev_dataB_1
cudaMemcpySafe(dev_dataB_1, host_dataB, DATA_SIZE, cudaMemcpyHostToDevice);
// ping-pong
for (int i = 0; i < TRIALS; i++) {
if (i % 2 == 0) { // even
cudaMemcpyPeerSafe(dev_dataA_1, device1, dev_dataA_0, device0, DATA_SIZE);
continue;
}
cudaMemcpyPeerSafe(dev_dataB_0, device0, dev_dataB_1, device1, DATA_SIZE);
}
cudaMemcpySafe(host_dataA_rt, dev_dataA_0, DATA_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpySafe(host_dataB_rt, dev_dataB_1, DATA_SIZE, cudaMemcpyDeviceToHost);
// cleanup
cudaSetDeviceSafe(device0);
cudaFreeSafe(dev_dataA_0);
cudaFreeSafe(dev_dataB_0);
cudaDeviceDisablePeerAccess(device0);
cudaSetDeviceSafe(device1);
cudaFreeSafe(dev_dataA_1);
cudaFreeSafe(dev_dataB_1);
cudaDeviceDisablePeerAccess(device1);
return 0;
}
int main(int argc, char **argv) {
// Host上にデータ作る
double *host_dataA, *host_dataB;
cudaMallocHostSafe(&host_dataA, DATA_SIZE);
cudaMallocHostSafe(&host_dataB, DATA_SIZE);
for (int i = 0; i < DATA_NUM; i++) { // initialize
host_dataA[i] = 100.0;
host_dataB[i] = 200.0;
}
// for round trip
double *host_dataA_rt, *host_dataB_rt;
cudaMallocHostSafe(&host_dataA_rt, DATA_SIZE);
cudaMallocHostSafe(&host_dataB_rt, DATA_SIZE);
// この時点では違うはず
printf("Data A: %s\n", is_deeply(host_dataA_rt, host_dataA, DATA_NUM) ? "Same" : "Different");
printf("Data B: %s\n", is_deeply(host_dataB_rt, host_dataB, DATA_NUM) ? "Same" : "Different");
int err = ping_pong(host_dataA, host_dataB, host_dataA_rt, host_dataB_rt);
if (err) {
exit(err);
}
// ping_pongうまくいってたら一致する
printf("Data A: %s\n", is_deeply(host_dataA_rt, host_dataA, DATA_NUM) ? "Same" : "Different");
printf("Data B: %s\n", is_deeply(host_dataB_rt, host_dataB, DATA_NUM) ? "Same" : "Different");
cudaFreeHostSafe(host_dataA);
cudaFreeHostSafe(host_dataA_rt);
cudaFreeHostSafe(host_dataB);
cudaFreeHostSafe(host_dataB_rt);
return 0;
}
void cudaSetDeviceSafe(int device_id) {
cudaError_t err = cudaSetDevice(device_id);
if (err) {
fprintf(stderr, "Cannot set a device (device: %d)\n", device_id);
exit(1);
}
}
void cudaMallocSafe(void* ptr, int data_size) {
cudaError_t err = cudaMalloc((void **)ptr, data_size);
if (err) {
fprintf(stderr, "Failed to allocate the memory on device\n");
exit(1);
}
}
void cudaMallocHostSafe(void* ptr, int data_size) {
cudaError_t err = cudaMallocHost((void **)ptr, data_size);
if (err) {
fprintf(stderr, "Failed to allocate the memory on host\n");
exit(1);
}
}
void cudaMemcpySafe(void* dst, const void* src, size_t data_size, cudaMemcpyKind kind) {
cudaError_t err = cudaMemcpy(dst, src, data_size, kind);
if (err) {
fprintf(stderr, "Failed to copy memory (direction: %s)\n", kind);
exit(1);
}
}
void cudaMemcpyPeerSafe(void* dst, int dst_device, const void* src, int src_device, size_t data_size) {
cudaError_t err = cudaMemcpyPeer(dst, dst_device, src, src_device, data_size);
if (err) {
fprintf(stderr, "Failed to copy memory via peer (direction: %d -> %d)\n", src_device, dst_device);
exit(1);
}
}
void cudaFreeSafe(void* ptr) {
cudaError_t err = cudaFree(ptr);
if (err) {
fprintf(stderr, "Failed to make device memory free\n");
exit(1);
}
}
void cudaFreeHostSafe(void* ptr) {
cudaError_t err = cudaFreeHost(ptr);
if (err) {
fprintf(stderr, "Failed to make host memory free\n");
exit(1);
}
}
int is_deeply(double* got, double* expected, int data_num) {
for (int i = 0; i < data_num; i++) {
if (got[i] != expected[i]) {
return 0;
}
}
return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment