takagi · January 30, 2023 01:12
diff --git a/test.cu b/test.cu
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <thread>

 /**
  * Adds 1.0f in place to each of the first n elements of a.
  *
  * Launch layout: 1-D grid of 1-D blocks. Written as a grid-stride loop, so
  * every element is visited even when fewer than n threads are launched.
  * Uses no shared memory.
  *
  * @param a  array of at least n floats, updated in place; must be
  *           dereferenceable from device code
  * @param n  number of elements to update; nothing is written when n <= 0
  */
 __global__ void vecAddOne(float *a, int n) {
    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x cannot wrap.
    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        a[i] += 1.0f;
    }
 }

 /**
  * Element-wise sum: c[i] = a[i] + b[i] for the first n elements.
  *
  * Launch layout: 1-D grid of 1-D blocks. Written as a grid-stride loop, so
  * every element is visited even when fewer than n threads are launched.
  * Uses no shared memory.
  *
  * @param a  first input array, at least n floats, read only
  * @param b  second input array, at least n floats, read only
  * @param c  output array, at least n floats; elements [0, n) are overwritten
  * @param n  number of elements to process; nothing is written when n <= 0
  */
 __global__ void vecAdd(const float *a, const float *b, float *c, int n) {
    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x cannot wrap.
    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        c[i] = a[i] + b[i];
    }
 }

 /**
  * Self-test driver: launches vecAddOne 1,000,000 times on each of two zeroed
  * device arrays, sums them with vecAdd, copies all three arrays back and
  * checks every element on the host.
  *
  * Prints "ok" and returns 0 on success. On any CUDA error or value mismatch a
  * diagnostic is written to stderr and the process ends with EXIT_FAILURE.
  *
  * @param argc  unused
  * @param argv  unused
  */
 int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    const int n = 100;
    // 1,000,000 and 2,000,000 are both below 2^24, so the float running sums
    // are exact and the equality checks at the end are valid.
    const int iterations = 1000000;
    const size_t bytes = n * sizeof(float);

    // Launch geometry is derived from n rather than a fixed single block, so
    // the test keeps covering every element if n is raised.
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;

    // Every runtime call goes through this check: assert() vanishes under
    // NDEBUG and never looked at the returned cudaError_t in the first place.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // calloc returns zero-filled buffers, replacing the malloc + memset pairs.
    float *h_a = static_cast<float*>(std::calloc(n, sizeof(float)));
    float *h_b = static_cast<float*>(std::calloc(n, sizeof(float)));
    float *h_c = static_cast<float*>(std::calloc(n, sizeof(float)));
    if (!h_a || !h_b || !h_c) {
        std::cerr << "host allocation failed" << std::endl;
        return EXIT_FAILURE;
    }

    // Start from nullptr so a failed cudaMalloc never leaves an indeterminate
    // pointer behind.
    float *d_a = nullptr;
    float *d_b = nullptr;
    float *d_c = nullptr;
    check(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    check(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    check(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    check(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)");
    check(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b <- h_b)");

    // Multi-threaded launch variant, kept disabled as in the original test.
    /*
    std::thread t1([&]() {
        for (int i = 0; i < 1000000; i++) {
            vecAddOne<<<1, 1024>>>(d_a, n);
            // std::cout << "t1" << std::endl;
        }
    });
    std::thread t2([&]() {
        for (int i = 0; i < 1000000; i++) {
            vecAddOne<<<1, 1024>>>(d_b, n);
            // std::cout << "t2" << std::endl;
        }
    });
    t1.join(); t2.join();
    */

    // Kernel launches report configuration errors only through
    // cudaGetLastError(), so it is queried after each launch.
    for (int i = 0; i < iterations; i++) {
        vecAddOne<<<blocks, threads>>>(d_a, n);
        check(cudaGetLastError(), "vecAddOne launch (d_a)");
    }
    for (int i = 0; i < iterations; i++) {
        vecAddOne<<<blocks, threads>>>(d_b, n);
        check(cudaGetLastError(), "vecAddOne launch (d_b)");
    }

    vecAdd<<<blocks, threads>>>(d_a, d_b, d_c, n);
    check(cudaGetLastError(), "vecAdd launch");

    // Surface any asynchronous execution fault here, attributed to the kernels
    // rather than to the copies below.
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    check(cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_a)");
    check(cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_b <- d_b)");
    check(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c <- d_c)");

    const float expectedEach = static_cast<float>(iterations);
    const float expectedSum = 2.0f * expectedEach;
    bool passed = true;
    for (int i = 0; i < n; i++) {
        if (h_a[i] != expectedEach || h_b[i] != expectedEach || h_c[i] != expectedSum) {
            std::cerr << "mismatch at index " << i
                      << ": a=" << h_a[i] << " b=" << h_b[i] << " c=" << h_c[i] << std::endl;
            passed = false;
            break;
        }
    }

    check(cudaFree(d_a), "cudaFree(d_a)");
    check(cudaFree(d_b), "cudaFree(d_b)");
    check(cudaFree(d_c), "cudaFree(d_c)");
    std::free(h_a);
    std::free(h_b);
    std::free(h_c);

    if (!passed)
        return EXIT_FAILURE;

    std::cout << "ok" << std::endl;

    return 0;
 }
	#include <cassert>
	#include <iostream>
	#include <thread>

	/**
	 * Adds 1.0f in place to each of the first n elements of a.
	 *
	 * Launch layout: 1-D grid of 1-D blocks. Written as a grid-stride loop, so
	 * every element is visited even when fewer than n threads are launched.
	 * Uses no shared memory.
	 *
	 * @param a  array of at least n floats, updated in place; must be
	 *           dereferenceable from device code
	 * @param n  number of elements to update; nothing is written when n <= 0
	 */
	__global__ void vecAddOne(float *a, int n) {
	    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x cannot wrap.
	    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
	    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
	    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	         i < count; i += stride) {
	        a[i] += 1.0f;
	    }
	}

	/**
	 * Element-wise sum: c[i] = a[i] + b[i] for the first n elements.
	 *
	 * Launch layout: 1-D grid of 1-D blocks. Written as a grid-stride loop, so
	 * every element is visited even when fewer than n threads are launched.
	 * Uses no shared memory.
	 *
	 * @param a  first input array, at least n floats, read only
	 * @param b  second input array, at least n floats, read only
	 * @param c  output array, at least n floats; elements [0, n) are overwritten
	 * @param n  number of elements to process; nothing is written when n <= 0
	 */
	// a and b must be pointers: the body indexes them, and the call site passes
	// device arrays.
	__global__ void vecAdd(const float *a, const float *b, float *c, int n) {
	    // Index arithmetic is done in size_t so blockIdx.x * blockDim.x cannot wrap.
	    const size_t count = n > 0 ? static_cast<size_t>(n) : 0;
	    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
	    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
	         i < count; i += stride) {
	        c[i] = a[i] + b[i];
	    }
	}

	/**
	 * Self-test driver: launches vecAddOne 1,000,000 times on each of two zeroed
	 * device arrays, sums them with vecAdd, copies all three arrays back and
	 * checks every element on the host.
	 *
	 * Prints "ok" and returns 0 on success. On any CUDA error or value mismatch a
	 * diagnostic is written to stderr and the process ends with EXIT_FAILURE.
	 *
	 * @param argc  unused
	 * @param argv  unused
	 */
	int main(int argc, char* argv[]) {
	    (void)argc;
	    (void)argv;

	    const int n = 100;
	    // 1,000,000 and 2,000,000 are both below 2^24, so the float running sums
	    // are exact and the equality checks at the end are valid.
	    const int iterations = 1000000;
	    const size_t bytes = n * sizeof(float);

	    // Launch geometry is derived from n rather than a fixed single block, so
	    // the test keeps covering every element if n is raised.
	    const int threads = 256;
	    const int blocks = (n + threads - 1) / threads;

	    // Every runtime call goes through this check: assert() vanishes under
	    // NDEBUG and never looked at the returned cudaError_t in the first place.
	    auto check = [](cudaError_t status, const char *what) {
	        if (status != cudaSuccess) {
	            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
	            std::exit(EXIT_FAILURE);
	        }
	    };

	    // All six buffers are pointers (host and device arrays of n floats).
	    // calloc returns zero-filled buffers, replacing the malloc + memset pairs.
	    float *h_a = static_cast<float*>(std::calloc(n, sizeof(float)));
	    float *h_b = static_cast<float*>(std::calloc(n, sizeof(float)));
	    float *h_c = static_cast<float*>(std::calloc(n, sizeof(float)));
	    if (!h_a || !h_b || !h_c) {
	        std::cerr << "host allocation failed" << std::endl;
	        return EXIT_FAILURE;
	    }

	    // Start from nullptr so a failed cudaMalloc never leaves an indeterminate
	    // pointer behind.
	    float *d_a = nullptr;
	    float *d_b = nullptr;
	    float *d_c = nullptr;
	    check(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
	    check(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
	    check(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

	    check(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a <- h_a)");
	    check(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b <- h_b)");

	    // Multi-threaded launch variant, kept disabled as in the original test.
	    /*
	    std::thread t1([&]() {
	        for (int i = 0; i < 1000000; i++) {
	            vecAddOne<<<1, 1024>>>(d_a, n);
	            // std::cout << "t1" << std::endl;
	        }
	    });
	    std::thread t2([&]() {
	        for (int i = 0; i < 1000000; i++) {
	            vecAddOne<<<1, 1024>>>(d_b, n);
	            // std::cout << "t2" << std::endl;
	        }
	    });
	    t1.join(); t2.join();
	    */

	    // Kernel launches report configuration errors only through
	    // cudaGetLastError(), so it is queried after each launch.
	    for (int i = 0; i < iterations; i++) {
	        vecAddOne<<<blocks, threads>>>(d_a, n);
	        check(cudaGetLastError(), "vecAddOne launch (d_a)");
	    }
	    for (int i = 0; i < iterations; i++) {
	        vecAddOne<<<blocks, threads>>>(d_b, n);
	        check(cudaGetLastError(), "vecAddOne launch (d_b)");
	    }

	    vecAdd<<<blocks, threads>>>(d_a, d_b, d_c, n);
	    check(cudaGetLastError(), "vecAdd launch");

	    // Surface any asynchronous execution fault here, attributed to the kernels
	    // rather than to the copies below.
	    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

	    check(cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_a <- d_a)");
	    check(cudaMemcpy(h_b, d_b, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_b <- d_b)");
	    check(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(h_c <- d_c)");

	    const float expectedEach = static_cast<float>(iterations);
	    const float expectedSum = 2.0f * expectedEach;
	    bool passed = true;
	    for (int i = 0; i < n; i++) {
	        if (h_a[i] != expectedEach || h_b[i] != expectedEach || h_c[i] != expectedSum) {
	            std::cerr << "mismatch at index " << i
	                      << ": a=" << h_a[i] << " b=" << h_b[i] << " c=" << h_c[i] << std::endl;
	            passed = false;
	            break;
	        }
	    }

	    check(cudaFree(d_a), "cudaFree(d_a)");
	    check(cudaFree(d_b), "cudaFree(d_b)");
	    check(cudaFree(d_c), "cudaFree(d_c)");
	    std::free(h_a);
	    std::free(h_b);
	    std::free(h_c);

	    if (!passed)
	        return EXIT_FAILURE;

	    std::cout << "ok" << std::endl;

	    return 0;
	}
No results found