Skip to content

Instantly share code, notes, and snippets.

@manifoldhiker
Last active September 25, 2019 10:29
Show Gist options
  • Save manifoldhiker/6e3d5a72cbbd9eccb5b7217dc692fb68 to your computer and use it in GitHub Desktop.
%%cu
#include <stdio.h>
#include <iostream>
#include <time.h>
using namespace std;
#define N 1024
// Report a CUDA runtime status on stderr when it is not cudaSuccess.
// The status is returned unchanged so the call can be chained inline
// around any runtime API call; `msg` labels the failing operation.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
    if (err == cudaSuccess) {
        return err;
    }
    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
    return err;
}
/*
__global__ void scan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[]; // allocated on invocation
int thid = threadIdx.x;
int pout = 0, pin = 1;
// load input into shared memory.
// Exclusive scan: shift right by one and set first element to 0
temp[thid] = (thid > 0) ? g_idata[thid-1] : 0;
__syncthreads();
for( int offset = 1; offset < n; offset <<= 1 )
{
pout = 1 - pout; // swap double buffer indices
pin = 1 - pout;
if (thid >= offset)
temp[pout*n+thid] += temp[pin*n+thid - offset];
else
temp[pout*n+thid] = temp[pin*n+thid];
__syncthreads();
}
g_odata[thid] = temp[pout*n+thid]; // write output
}
*/
// Exclusive prefix sum (scan) of n floats within a single thread block,
// using the work-inefficient Hillis-Steele algorithm with a double-buffered
// shared-memory scratch area.
//
// Launch requirements (as used in main below):
//   - exactly one block of n threads (n <= max threads per block),
//   - dynamic shared memory of 2 * n * sizeof(float).
//
// BUG FIX vs. the original (GPU Gems 3, Listing 39-1 erratum): the original
// used `temp[pout*n+thid] += temp[pin*n+thid - offset]`, which accumulates
// into the OUTPUT buffer. After the ping-pong swap that buffer holds stale
// (first iteration: uninitialized) data, not the previous pass's partial
// sums, so the result was only correct when shared memory happened to be
// zeroed. Both operands must come from the input (`pin`) buffer.
__global__ void scan(float *g_odata, float *g_idata, int n)
{
    extern __shared__ float temp[]; // two n-element buffers, sized at launch
    int thid = threadIdx.x;
    int pout = 0, pin = 1;          // ping-pong buffer indices (write / read)

    // Exclusive scan: shift input right by one; element 0 becomes 0.
    temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin = 1 - pout;
        if (thid >= offset)
            // Read both operands from the buffer written last pass.
            temp[pout*n+thid] = temp[pin*n+thid] + temp[pin*n+thid - offset];
        else
            temp[pout*n+thid] = temp[pin*n+thid];
        __syncthreads(); // all writes done before the buffers swap again
    }

    g_odata[thid] = temp[pout*n+thid]; // write output
}
// Host driver: exclusive-scans the sequence 1..N on the GPU and prints
// both the input and the result.
int main()
{
    float *a, *b, *a_gpu, *b_gpu; // host / device buffers: input and result
    // The kernel scans exactly N floats, so allocate exactly that.
    // (The original used N * N * sizeof(int): wrong element type, and a
    // ~4 MB over-allocation per buffer whose memcpys shipped uninitialized
    // bytes.)
    size_t size = N * sizeof(float);

    a = (float*) malloc(size);
    b = (float*) malloc(size);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    checkCudaErr(cudaMalloc(&a_gpu, size), "cudaMalloc a_gpu");
    checkCudaErr(cudaMalloc(&b_gpu, size), "cudaMalloc b_gpu");

    // Input: 1, 2, ..., N; result buffer zeroed.
    for (int i = 0; i < N; i++){
        a[i] = i + 1;
        b[i] = 0;
    }

    checkCudaErr(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice), "cudaMemcpy a");
    checkCudaErr(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice), "cudaMemcpy b");

    // One block of N threads; the kernel needs two N-element shared buffers.
    scan<<<1, N, 2 * N * sizeof(float)>>>(b_gpu, a_gpu, N);
    checkCudaErr(cudaGetLastError(), "kernel launch");          // launch-config errors
    checkCudaErr(cudaDeviceSynchronize(), "kernel execution");  // async execution errors

    // Blocking copy: result is valid on the host once this returns.
    checkCudaErr(cudaMemcpy(b, b_gpu, size, cudaMemcpyDeviceToHost), "cudaMemcpy result");

    for (int i = 0; i < N; i++){
        cout << a[i] << " ";
    }
    cout << "\n";
    for (int i = 0; i < N; i++){
        cout << b[i] << " ";
    }
    cout << "\n";

    // Free all our allocated memory
    free(a); free(b);
    cudaFree( a_gpu ); cudaFree( b_gpu );
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment