Skip to content

Instantly share code, notes, and snippets.

@manifoldhiker
Last active September 25, 2019 10:29
Show Gist options
  • Save manifoldhiker/6e3d5a72cbbd9eccb5b7217dc692fb68 to your computer and use it in GitHub Desktop.
%%cu
#include <stdio.h>
#include <iostream>
#include <time.h>
using namespace std;
#define N 1024
// Report a CUDA runtime status on stderr when it is not cudaSuccess.
// The status is returned unchanged so the call can be chained inline
// around any runtime API call; `msg` labels the failing operation.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
    if (err == cudaSuccess) {
        return err;
    }
    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
    return err;
}
/*
__global__ void scan(float *g_odata, float *g_idata, int n)
{
extern __shared__ float temp[]; // allocated on invocation
int thid = threadIdx.x;
int pout = 0, pin = 1;
// load input into shared memory.
// Exclusive scan: shift right by one and set first element to 0
temp[thid] = (thid > 0) ? g_idata[thid-1] : 0;
__syncthreads();
for( int offset = 1; offset < n; offset <<= 1 )
{
pout = 1 - pout; // swap double buffer indices
pin = 1 - pout;
if (thid >= offset)
temp[pout*n+thid] += temp[pin*n+thid - offset];
else
temp[pout*n+thid] = temp[pin*n+thid];
__syncthreads();
}
g_odata[thid] = temp[pout*n+thid]; // write output
}
*/
// Exclusive prefix sum (scan) of n floats within a single thread block,
// using the work-inefficient Hillis-Steele algorithm with a double-buffered
// shared-memory scratch area.
//
// Launch requirements (as used in main below):
//   - exactly one block of n threads (n <= max threads per block),
//   - dynamic shared memory of 2 * n * sizeof(float).
//
// BUG FIX vs. the original (GPU Gems 3, Listing 39-1 erratum): the original
// used `temp[pout*n+thid] += temp[pin*n+thid - offset]`, which accumulates
// into the OUTPUT buffer. After the ping-pong swap that buffer holds stale
// (first iteration: uninitialized) data, not the previous pass's partial
// sums, so the result was only correct when shared memory happened to be
// zeroed. Both operands must come from the input (`pin`) buffer.
__global__ void scan(float *g_odata, float *g_idata, int n)
{
    extern __shared__ float temp[]; // two n-element buffers, sized at launch
    int thid = threadIdx.x;
    int pout = 0, pin = 1;          // ping-pong buffer indices (write / read)

    // Exclusive scan: shift input right by one; element 0 becomes 0.
    temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin = 1 - pout;
        if (thid >= offset)
            // Read both operands from the buffer written last pass.
            temp[pout*n+thid] = temp[pin*n+thid] + temp[pin*n+thid - offset];
        else
            temp[pout*n+thid] = temp[pin*n+thid];
        __syncthreads(); // all writes done before the buffers swap again
    }

    g_odata[thid] = temp[pout*n+thid]; // write output
}
// Host driver: exclusive-scans the sequence 1..N on the GPU and prints
// both the input and the result.
int main()
{
    float *a, *b, *a_gpu, *b_gpu; // host / device buffers: input and result
    // The kernel scans exactly N floats, so allocate exactly that.
    // (The original used N * N * sizeof(int): wrong element type, and a
    // ~4 MB over-allocation per buffer whose memcpys shipped uninitialized
    // bytes.)
    size_t size = N * sizeof(float);

    a = (float*) malloc(size);
    b = (float*) malloc(size);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }
    checkCudaErr(cudaMalloc(&a_gpu, size), "cudaMalloc a_gpu");
    checkCudaErr(cudaMalloc(&b_gpu, size), "cudaMalloc b_gpu");

    // Input: 1, 2, ..., N; result buffer zeroed.
    for (int i = 0; i < N; i++){
        a[i] = i + 1;
        b[i] = 0;
    }

    checkCudaErr(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice), "cudaMemcpy a");
    checkCudaErr(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice), "cudaMemcpy b");

    // One block of N threads; the kernel needs two N-element shared buffers.
    scan<<<1, N, 2 * N * sizeof(float)>>>(b_gpu, a_gpu, N);
    checkCudaErr(cudaGetLastError(), "kernel launch");          // launch-config errors
    checkCudaErr(cudaDeviceSynchronize(), "kernel execution");  // async execution errors

    // Blocking copy: result is valid on the host once this returns.
    checkCudaErr(cudaMemcpy(b, b_gpu, size, cudaMemcpyDeviceToHost), "cudaMemcpy result");

    for (int i = 0; i < N; i++){
        cout << a[i] << " ";
    }
    cout << "\n";
    for (int i = 0; i < N; i++){
        cout << b[i] << " ";
    }
    cout << "\n";

    // Free all our allocated memory
    free(a); free(b);
    cudaFree( a_gpu ); cudaFree( b_gpu );
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment