moznion · August 29, 2015 14:06
diff --git a/ping_pong.cu b/ping_pong.cu
 #include <stdio.h>

 #define DATA_NUM 10
 #define DATA_SIZE sizeof(double) * DATA_NUM
 #define TRIALS 50

 void cudaSetDeviceSafe(int);
 void cudaMallocSafe(void*, int);
 void cudaMallocHostSafe(void*, int);
 void cudaMemcpySafe(void*, const void*, size_t, cudaMemcpyKind);
 void cudaMemcpyPeerSafe(void*, int, const void*, int, size_t);
 void cudaFreeSafe(void*);
 void cudaFreeHostSafe(void*);
 int is_deeply(double*, double*, int);

 int ping_pong(double* host_dataA, double* host_dataB, double* host_dataA_rt, double* host_dataB_rt) {
    cudaError_t err;

    int device_count;

    // 有効なデバイス数をチェック
    err = cudaGetDeviceCount(&device_count);
    if (err) {
        fprintf(stderr, "Err: %d\n", err);
        return 1;
    }

    int device0 = 0;
    int device1 = 2;

    // Device 0とDevice 1間でpeer通信ができるかどうかチェック
    int can_access_A2B;
    int can_access_B2A;
    err = cudaDeviceCanAccessPeer(&can_access_A2B, device0, device1);
    err = cudaDeviceCanAccessPeer(&can_access_B2A, device1, device0);
    if (can_access_A2B != 1 || can_access_B2A != 1) {
        fprintf(stderr, "Cannot connect via peer between device0 and device1\n");
        return 1;
    }

    // Device0とDevice1でUVAが有効かどうかチェック
    cudaDeviceProp device_prop;

    int uva0;
    cudaSetDeviceSafe(device0);
    err = cudaGetDeviceProperties(&device_prop, device0);
    uva0 = device_prop.unifiedAddressing;
    err = cudaDeviceEnablePeerAccess(device1, 0);

    int uva1;
    cudaSetDeviceSafe(device1);
    err = cudaGetDeviceProperties(&device_prop, device1);
    uva1 = device_prop.unifiedAddressing;
    err = cudaDeviceEnablePeerAccess(device0, 0);

    if (uva0 != 1 || uva1 != 1) {
        fprintf(stderr, "Cannot use UVA on your device\n");
        return 1;
    }

    // device0上に配列をALLOCATE
    cudaSetDeviceSafe(device0);
    double *dev_dataA_0, *dev_dataB_0;
    cudaMallocSafe(&dev_dataA_0, DATA_SIZE);
    cudaMallocSafe(&dev_dataB_0, DATA_SIZE);
    // host_dataA => dev_dataA_0
    cudaMemcpySafe(dev_dataA_0, host_dataA, DATA_SIZE, cudaMemcpyHostToDevice);

    // device1上に配列をALLOCATE
    cudaSetDeviceSafe(device1);
    double *dev_dataA_1, *dev_dataB_1;
    cudaMallocSafe(&dev_dataA_1, DATA_SIZE);
    cudaMallocSafe(&dev_dataB_1, DATA_SIZE);
    // host_dataB => dev_dataB_1
    cudaMemcpySafe(dev_dataB_1, host_dataB, DATA_SIZE, cudaMemcpyHostToDevice);

    // ping-pong
    for (int i = 0; i < TRIALS; i++) {
        if (i % 2 == 0) { // even
            cudaMemcpyPeerSafe(dev_dataA_1, device1, dev_dataA_0, device0, DATA_SIZE);
            continue;
        }
        cudaMemcpyPeerSafe(dev_dataB_0, device0, dev_dataB_1, device1, DATA_SIZE);
    }

    cudaMemcpySafe(host_dataA_rt, dev_dataA_0, DATA_SIZE, cudaMemcpyDeviceToHost);
    cudaMemcpySafe(host_dataB_rt, dev_dataB_1, DATA_SIZE, cudaMemcpyDeviceToHost);

    // cleanup
    cudaSetDeviceSafe(device0);
    cudaFreeSafe(dev_dataA_0);
    cudaFreeSafe(dev_dataB_0);
    cudaDeviceDisablePeerAccess(device0);

    cudaSetDeviceSafe(device1);
    cudaFreeSafe(dev_dataA_1);
    cudaFreeSafe(dev_dataB_1);
    cudaDeviceDisablePeerAccess(device1);

    return 0;
 }

 int main(int argc, char **argv) {
    // Host上にデータ作る
    double *host_dataA, *host_dataB;
    cudaMallocHostSafe(&host_dataA, DATA_SIZE);
    cudaMallocHostSafe(&host_dataB, DATA_SIZE);
    for (int i = 0; i < DATA_NUM; i++) { // initialize
        host_dataA[i] = 100.0;
        host_dataB[i] = 200.0;
    }

    // for round trip
    double *host_dataA_rt, *host_dataB_rt;
    cudaMallocHostSafe(&host_dataA_rt, DATA_SIZE);
    cudaMallocHostSafe(&host_dataB_rt, DATA_SIZE);

    // この時点では違うはず
    printf("Data A: %s\n", is_deeply(host_dataA_rt, host_dataA, DATA_NUM) ? "Same" : "Different");
    printf("Data B: %s\n", is_deeply(host_dataB_rt, host_dataB, DATA_NUM) ? "Same" : "Different");

    int err = ping_pong(host_dataA, host_dataB, host_dataA_rt, host_dataB_rt);
    if (err) {
        exit(err);
    }

    // ping_pongうまくいってたら一致する
    printf("Data A: %s\n", is_deeply(host_dataA_rt, host_dataA, DATA_NUM) ? "Same" : "Different");
    printf("Data B: %s\n", is_deeply(host_dataB_rt, host_dataB, DATA_NUM) ? "Same" : "Different");

    cudaFreeHostSafe(host_dataA);
    cudaFreeHostSafe(host_dataA_rt);
    cudaFreeHostSafe(host_dataB);
    cudaFreeHostSafe(host_dataB_rt);
    return 0;
 }

 void cudaSetDeviceSafe(int device_id) {
    cudaError_t err = cudaSetDevice(device_id);

    if (err) {
        fprintf(stderr, "Cannot set a device (device: %d)\n", device_id);
        exit(1);
    }
 }

 void cudaMallocSafe(void* ptr, int data_size) {
    cudaError_t err = cudaMalloc((void **)ptr, data_size);

    if (err) {
        fprintf(stderr, "Failed to allocate the memory on device\n");
        exit(1);
    }
 }

 void cudaMallocHostSafe(void* ptr, int data_size) {
    cudaError_t err = cudaMallocHost((void **)ptr, data_size);

    if (err) {
        fprintf(stderr, "Failed to allocate the memory on host\n");
        exit(1);
    }
 }

 void cudaMemcpySafe(void* dst, const void* src, size_t data_size, cudaMemcpyKind kind) {
    cudaError_t err = cudaMemcpy(dst, src, data_size, kind);

    if (err) {
        fprintf(stderr, "Failed to copy memory (direction: %s)\n", kind);
        exit(1);
    }
 }

 void cudaMemcpyPeerSafe(void* dst, int dst_device, const void* src, int src_device, size_t data_size) {
    cudaError_t err = cudaMemcpyPeer(dst, dst_device, src, src_device, data_size);

    if (err) {
        fprintf(stderr, "Failed to copy memory via peer (direction: %d -> %d)\n", src_device, dst_device);
        exit(1);
    }
 }

 void cudaFreeSafe(void* ptr) {
    cudaError_t err = cudaFree(ptr);

    if (err) {
        fprintf(stderr, "Failed to make device memory free\n");
        exit(1);
    }
 }

 void cudaFreeHostSafe(void* ptr) {
    cudaError_t err = cudaFreeHost(ptr);

    if (err) {
        fprintf(stderr, "Failed to make host memory free\n");
        exit(1);
    }
 }

 int is_deeply(double* got, double* expected, int data_num) {
    for (int i = 0; i < data_num; i++) {
        if (got[i] != expected[i]) {
            return 0;
        }
    }

    return 1;
 }
	#include <stdio.h>

	#define DATA_NUM 10
	#define DATA_SIZE sizeof(double) * DATA_NUM
	#define TRIALS 50

	void cudaSetDeviceSafe(int);
	void cudaMallocSafe(void*, int);
	void cudaMallocHostSafe(void*, int);
	void cudaMemcpySafe(void, const void, size_t, cudaMemcpyKind);
	void cudaMemcpyPeerSafe(void, int, const void, int, size_t);
	void cudaFreeSafe(void*);
	void cudaFreeHostSafe(void*);
	int is_deeply(double, double, int);

	int ping_pong(double* host_dataA, double* host_dataB, double* host_dataA_rt, double* host_dataB_rt) {
	cudaError_t err;

	int device_count;

	// 有効なデバイス数をチェック
	err = cudaGetDeviceCount(&device_count);
	if (err) {
	fprintf(stderr, "Err: %d\n", err);
	return 1;
	}

	int device0 = 0;
	int device1 = 2;

	// Device 0とDevice 1間でpeer通信ができるかどうかチェック
	int can_access_A2B;
	int can_access_B2A;
	err = cudaDeviceCanAccessPeer(&can_access_A2B, device0, device1);
	err = cudaDeviceCanAccessPeer(&can_access_B2A, device1, device0);
	if (can_access_A2B != 1 \|\| can_access_B2A != 1) {
	fprintf(stderr, "Cannot connect via peer between device0 and device1\n");
	return 1;
	}

	// Device0とDevice1でUVAが有効かどうかチェック
	cudaDeviceProp device_prop;

	int uva0;
	cudaSetDeviceSafe(device0);
	err = cudaGetDeviceProperties(&device_prop, device0);
	uva0 = device_prop.unifiedAddressing;
	err = cudaDeviceEnablePeerAccess(device1, 0);

	int uva1;
	cudaSetDeviceSafe(device1);
	err = cudaGetDeviceProperties(&device_prop, device1);
	uva1 = device_prop.unifiedAddressing;
	err = cudaDeviceEnablePeerAccess(device0, 0);

	if (uva0 != 1 \|\| uva1 != 1) {
	fprintf(stderr, "Cannot use UVA on your device\n");
	return 1;
	}

	// device0上に配列をALLOCATE
	cudaSetDeviceSafe(device0);
	double dev_dataA_0, dev_dataB_0;
	cudaMallocSafe(&dev_dataA_0, DATA_SIZE);
	cudaMallocSafe(&dev_dataB_0, DATA_SIZE);
	// host_dataA => dev_dataA_0
	cudaMemcpySafe(dev_dataA_0, host_dataA, DATA_SIZE, cudaMemcpyHostToDevice);

	// device1上に配列をALLOCATE
	cudaSetDeviceSafe(device1);
	double dev_dataA_1, dev_dataB_1;
	cudaMallocSafe(&dev_dataA_1, DATA_SIZE);
	cudaMallocSafe(&dev_dataB_1, DATA_SIZE);
	// host_dataB => dev_dataB_1
	cudaMemcpySafe(dev_dataB_1, host_dataB, DATA_SIZE, cudaMemcpyHostToDevice);

	// ping-pong
	for (int i = 0; i < TRIALS; i++) {
	if (i % 2 == 0) { // even
	cudaMemcpyPeerSafe(dev_dataA_1, device1, dev_dataA_0, device0, DATA_SIZE);
	continue;
	}
	cudaMemcpyPeerSafe(dev_dataB_0, device0, dev_dataB_1, device1, DATA_SIZE);
	}

	cudaMemcpySafe(host_dataA_rt, dev_dataA_0, DATA_SIZE, cudaMemcpyDeviceToHost);
	cudaMemcpySafe(host_dataB_rt, dev_dataB_1, DATA_SIZE, cudaMemcpyDeviceToHost);

	// cleanup
	cudaSetDeviceSafe(device0);
	cudaFreeSafe(dev_dataA_0);
	cudaFreeSafe(dev_dataB_0);
	cudaDeviceDisablePeerAccess(device0);

	cudaSetDeviceSafe(device1);
	cudaFreeSafe(dev_dataA_1);
	cudaFreeSafe(dev_dataB_1);
	cudaDeviceDisablePeerAccess(device1);

	return 0;
	}

	int main(int argc, char **argv) {
	// Host上にデータ作る
	double host_dataA, host_dataB;
	cudaMallocHostSafe(&host_dataA, DATA_SIZE);
	cudaMallocHostSafe(&host_dataB, DATA_SIZE);
	for (int i = 0; i < DATA_NUM; i++) { // initialize
	host_dataA[i] = 100.0;
	host_dataB[i] = 200.0;
	}

	// for round trip
	double host_dataA_rt, host_dataB_rt;
	cudaMallocHostSafe(&host_dataA_rt, DATA_SIZE);
	cudaMallocHostSafe(&host_dataB_rt, DATA_SIZE);

	// この時点では違うはず
	printf("Data A: %s\n", is_deeply(host_dataA_rt, host_dataA, DATA_NUM) ? "Same" : "Different");
	printf("Data B: %s\n", is_deeply(host_dataB_rt, host_dataB, DATA_NUM) ? "Same" : "Different");

	int err = ping_pong(host_dataA, host_dataB, host_dataA_rt, host_dataB_rt);
	if (err) {
	exit(err);
	}

	// ping_pongうまくいってたら一致する
	printf("Data A: %s\n", is_deeply(host_dataA_rt, host_dataA, DATA_NUM) ? "Same" : "Different");
	printf("Data B: %s\n", is_deeply(host_dataB_rt, host_dataB, DATA_NUM) ? "Same" : "Different");

	cudaFreeHostSafe(host_dataA);
	cudaFreeHostSafe(host_dataA_rt);
	cudaFreeHostSafe(host_dataB);
	cudaFreeHostSafe(host_dataB_rt);
	return 0;
	}

	void cudaSetDeviceSafe(int device_id) {
	cudaError_t err = cudaSetDevice(device_id);

	if (err) {
	fprintf(stderr, "Cannot set a device (device: %d)\n", device_id);
	exit(1);
	}
	}

	void cudaMallocSafe(void* ptr, int data_size) {
	cudaError_t err = cudaMalloc((void **)ptr, data_size);

	if (err) {
	fprintf(stderr, "Failed to allocate the memory on device\n");
	exit(1);
	}
	}

	void cudaMallocHostSafe(void* ptr, int data_size) {
	cudaError_t err = cudaMallocHost((void **)ptr, data_size);

	if (err) {
	fprintf(stderr, "Failed to allocate the memory on host\n");
	exit(1);
	}
	}

	void cudaMemcpySafe(void* dst, const void* src, size_t data_size, cudaMemcpyKind kind) {
	cudaError_t err = cudaMemcpy(dst, src, data_size, kind);

	if (err) {
	fprintf(stderr, "Failed to copy memory (direction: %s)\n", kind);
	exit(1);
	}
	}

	void cudaMemcpyPeerSafe(void* dst, int dst_device, const void* src, int src_device, size_t data_size) {
	cudaError_t err = cudaMemcpyPeer(dst, dst_device, src, src_device, data_size);

	if (err) {
	fprintf(stderr, "Failed to copy memory via peer (direction: %d -> %d)\n", src_device, dst_device);
	exit(1);
	}
	}

	void cudaFreeSafe(void* ptr) {
	cudaError_t err = cudaFree(ptr);

	if (err) {
	fprintf(stderr, "Failed to make device memory free\n");
	exit(1);
	}
	}

	void cudaFreeHostSafe(void* ptr) {
	cudaError_t err = cudaFreeHost(ptr);

	if (err) {
	fprintf(stderr, "Failed to make host memory free\n");
	exit(1);
	}
	}

	int is_deeply(double* got, double* expected, int data_num) {
	for (int i = 0; i < data_num; i++) {
	if (got[i] != expected[i]) {
	return 0;
	}
	}

	return 1;
	}