A minimal CUDA program that naively performs vector addition, component-wise!
// Compile and run with:
// nvcc add1.cu -o add1 && ./add1
// This minimal CUDA program performs vector addition, component-wise!
#include <stdio.h>
typedef unsigned int uint;  // `uint` isn't standard C/C++; this typedef keeps the program self-contained!
#define N_ELEMENTS 8
// ------------------------------------------------------------------------------------------------
// __global__ functions run on the device, so their arguments must be pointers to device memory!
__global__ void cuda_vec_add(uint* a, uint* b, uint* c){
  uint i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < N_ELEMENTS)  // Bounds guard, in case more threads are launched than there are elements!
    c[i] = a[i] + b[i];
}
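// Worked example of the index above: this program launches the kernel as <<<1, N_ELEMENTS>>>,
// i.e. one block of 8 threads, so blockIdx.x is always 0, blockDim.x is 8, and threadIdx.x
// runs over 0..7, giving each thread a unique i in 0..7: one thread per vector component!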
// ------------------------------------------------------------------------------------------------
__host__ int main(){
  puts("MSG This minimal CUDA program performs vector addition, component-wise!");
  // ----------------------------
  uint *cpu_a, *cpu_b, *cpu_c;  // Host copies of a, b, c!
  uint *gpu_a, *gpu_b, *gpu_c;  // Device copies of a, b, c!
  // ----------------------------
  puts("MALLOC Data on host!");
  cpu_a = (uint*)malloc(N_ELEMENTS * sizeof(uint));
  cpu_b = (uint*)malloc(N_ELEMENTS * sizeof(uint));
  cpu_c = (uint*)malloc(N_ELEMENTS * sizeof(uint));
  puts("RUN cudaMalloc() Allocate memory for device copies of a, b, c!");
  cudaMalloc((void**)&gpu_a, N_ELEMENTS * sizeof(uint));  // We pass &gpu_a, not gpu_a: cudaMalloc() writes the device address INTO our pointer, so it needs the pointer's address!
  cudaMalloc((void**)&gpu_b, N_ELEMENTS * sizeof(uint));
  cudaMalloc((void**)&gpu_c, N_ELEMENTS * sizeof(uint));
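  // A minimal sanity-check sketch (optional): every CUDA runtime call returns a cudaError_t,
  // so a failed allocation can be caught right away!
  if(cudaGetLastError() != cudaSuccess)
    puts("ERROR A cudaMalloc() call failed!");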
  // ----------------------------
  puts("INIT Data on host!");
  for(uint i=0; i<N_ELEMENTS; ++i){
    cpu_a[i] = i;
    cpu_b[i] = 2 * i;
  }
  puts("RUN cudaMemcpy() Copy data from host to device!");
  cudaMemcpy(gpu_a, cpu_a, N_ELEMENTS * sizeof(uint), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_b, cpu_b, N_ELEMENTS * sizeof(uint), cudaMemcpyHostToDevice);
  // ----------------------------
  puts("RUN cuda_vec_add() Launch CUDA kernel on the device!");
  cuda_vec_add<<<1, N_ELEMENTS>>>(gpu_a, gpu_b, gpu_c);
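  // The <<<1, N_ELEMENTS>>> launch configuration means: 1 block of N_ELEMENTS threads!
  // In general, <<<n_blocks, threads_per_block>>> launches n_blocks * threads_per_block threads;
  // 8 elements fit easily in a single block (current GPUs allow up to 1024 threads per block)!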
puts("RUN cudaMemcpy() Copy results back to the host"); | |
cudaMemcpy(cpu_c, gpu_c, N_ELEMENTS * sizeof(uint), cudaMemcpyDeviceToHost); // Store resulting matrix `gpu_c` (GPU-side) in `cpu_c` (CPU-side) | |
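  // No explicit cudaDeviceSynchronize() is needed here: the kernel and this cudaMemcpy() run
  // on the default stream, which serializes them, and cudaMemcpy() blocks the host until the
  // copy (and therefore the kernel) has finished!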
  // ----------------------------
  puts("\nSHOW Data on host (regardless of where it was computed)!");
  printf("cpu_a\n ");
  for(uint i=0; i<N_ELEMENTS; ++i)
    printf("%u ", cpu_a[i]);
  puts("");
  printf("cpu_b\n ");
  for(uint i=0; i<N_ELEMENTS; ++i)
    printf("%u ", cpu_b[i]);
  puts("");
  printf("cpu_c (from gpu_c, computed on GPU)\n ");
  for(uint i=0; i<N_ELEMENTS; ++i)
    printf("%u ", cpu_c[i]);
  puts("");
  // ----------------------------
  puts("\nRUN cudaFree() Free device memory");
  cudaFree(gpu_a);
  cudaFree(gpu_b);
  cudaFree(gpu_c);
  puts("RUN free() Free host memory");
  free(cpu_a);
  free(cpu_b);
  free(cpu_c);
  // ----------------------------
puts("\nExit success!"); | |
} |
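For vectors with more elements than a single block allows (1024 threads per block on current GPUs), the standard pattern is to launch enough blocks to cover the whole vector and let each thread guard its own index. Below is a minimal sketch of that pattern, assuming a hypothetical kernel variant `cuda_vec_add_n` that takes the element count `n` as an argument; the name and the choice of 256 threads per block are illustrative, not part of the gist above.

// Kernel variant that takes the element count as an argument (hypothetical name)!
__global__ void cuda_vec_add_n(uint* a, uint* b, uint* c, uint n){
  uint i = blockIdx.x * blockDim.x + threadIdx.x;
  if(i < n)  // The last block may be partially full, so each thread guards its own index!
    c[i] = a[i] + b[i];
}

// Launch with enough blocks to cover all n elements, rounding up!
uint threads_per_block = 256;  // A common starting point; tune per GPU!
uint n_blocks = (n + threads_per_block - 1) / threads_per_block;
cuda_vec_add_n<<<n_blocks, threads_per_block>>>(gpu_a, gpu_b, gpu_c, n);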