It occurred when I did not wait GPU process finishes.
#include <stdio.h>
#include <cuda_runtime.h>
/*
 * Fill kernel: stores `val` into every element A[0] .. A[N-1].
 *
 * val - value written to each element.
 * A   - destination array of at least N ints; it is dereferenced on the
 *       device, so it must be device-accessible memory.
 * N   - number of elements to fill; N <= 0 writes nothing.
 *
 * Launch layout: 1-D grid of 1-D blocks, any size. Each thread starts at its
 * global index and advances by the total number of launched threads, so the
 * whole array is covered even when the grid is smaller than N. (Indexing by
 * threadIdx.x alone would make every block rewrite the same leading elements
 * and leave the rest untouched.)
 */
__global__
void my_kernel(int val, int *A, int N)
{
    // 64-bit arithmetic so blockIdx.x * blockDim.x cannot overflow an int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += stride) {
        A[i] = val;
    }
}
/* Reports a failed CUDA runtime call on stderr; returns true on success. */
static bool cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Allocates a managed array of N ints, fills it with 2 on the GPU, waits for
 * the GPU to finish, and prints the elements on one line.
 *
 * On any CUDA error a message is written to stderr, nothing is printed to
 * stdout, and the function returns after releasing the buffer.
 */
void
my_kernel_launch()
{
    const int N = 9;
    // Ceil-div launch sizing so the grid has at least N threads; the kernel
    // bounds-checks its index, so the surplus threads do nothing.
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;

    int *a = NULL;
    if (!cuda_ok(cudaMallocManaged((void**)&a, N * sizeof(int)),
                 "cudaMallocManaged")) {
        return;
    }

    my_kernel<<<blocks, threads>>>(2, a, N);

    // A kernel launch returns to the host immediately. The host must wait for
    // the GPU before touching managed memory; otherwise it races with the
    // kernel and may read stale data (or fault on devices without concurrent
    // managed access). cudaGetLastError catches a bad launch configuration,
    // cudaDeviceSynchronize catches faults raised while the kernel ran.
    const bool finished = cuda_ok(cudaGetLastError(), "my_kernel launch") &&
                          cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    if (finished) {
        for (int i = 0; i < N; i++) {
            printf("%d ", a[i]);
        }
        printf("\n");
    }

    // Release the managed allocation on every path.
    cuda_ok(cudaFree(a), "cudaFree");
}
/* Program entry point: runs the managed-memory kernel demo once. */
int main(int argc, char **argv)
{
    // Command-line arguments are not used by this demo.
    (void)argc;
    (void)argv;

    my_kernel_launch();
    return 0;
}
Thanks a lot! This saved me a bunch of time. However, I couldn't find a reason as to why I need the call. I am using Unified Memory too, but the behaviour is inconsistent. Is there a reason as to why the call is required explicitly after the Kernel launch, especially in the case of Unified Memory?