Last active
September 25, 2019 10:29
-
-
Save manifoldhiker/6e3d5a72cbbd9eccb5b7217dc692fb68 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%%cu | |
#include <stdio.h> | |
#include <iostream> | |
#include <time.h> | |
using namespace std; | |
#define N 1024 | |
// Pass-through status check for CUDA runtime calls.
//
// err: status code produced by a CUDA runtime call.
// msg: short label naming the call site; echoed in the diagnostic.
//
// Writes one diagnostic line to stderr when err is not cudaSuccess.
// Always returns err unchanged, so callers can still branch on it.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
    // Fast path: nothing to report.
    if (err == cudaSuccess) {
        return err;
    }

    const char* reason = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, reason);
    return err;
}
/* | |
__global__ void scan(float *g_odata, float *g_idata, int n) | |
{ | |
extern __shared__ float temp[]; // allocated on invocation | |
int thid = threadIdx.x; | |
int pout = 0, pin = 1; | |
// load input into shared memory. | |
// Exclusive scan: shift right by one and set first element to 0 | |
temp[thid] = (thid > 0) ? g_idata[thid-1] : 0; | |
__syncthreads(); | |
for( int offset = 1; offset < n; offset <<= 1 ) | |
{ | |
pout = 1 - pout; // swap double buffer indices | |
pin = 1 - pout; | |
if (thid >= offset) | |
temp[pout*n+thid] += temp[pin*n+thid - offset]; | |
else | |
temp[pout*n+thid] = temp[pin*n+thid]; | |
__syncthreads(); | |
} | |
g_odata[thid] = temp[pout*n+thid]; // write output | |
} | |
*/ | |
/**
 * Exclusive prefix sum of the first n elements of g_idata, written to g_odata.
 *
 * Single-block, double-buffered scan: on every pass each element adds the
 * value `offset` positions to its left, with `offset` doubling each pass.
 *
 * Launch requirements:
 *   - exactly one thread block, indexed by threadIdx.x only;
 *   - blockDim.x >= n (one thread per element; surplus threads do no work);
 *   - dynamic shared memory of at least 2 * n * sizeof(float) bytes
 *     (two ping-pong buffers of n floats each).
 *
 * @param g_odata  device pointer receiving n results;
 *                 g_odata[i] = g_idata[0] + ... + g_idata[i-1], g_odata[0] = 0
 * @param g_idata  device pointer to n input values
 * @param n        number of elements to scan
 */
__global__ void scan(float *g_odata, float *g_idata, int n)
{
    extern __shared__ float temp[]; // allocated on invocation: 2 * n floats
    int thid = threadIdx.x;
    int pout = 0, pin = 1;

    // Threads beyond n must not touch memory, but they still have to reach
    // every __syncthreads() below, so guard the work instead of returning.
    const bool active = thid < n;

    // Load input into shared memory.
    // This is an exclusive scan, so shift right by one
    // and set the first element to 0.
    if (active)
        temp[pout*n + thid] = (thid > 0) ? g_idata[thid - 1] : 0.0f;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin  = 1 - pout;
        if (active)
        {
            // Both operands come from the input buffer (pin). Accumulating
            // into temp[pout*n + thid] with += would mix in whatever that
            // buffer held before: uninitialised shared memory on the first
            // pass, stale values from two passes ago afterwards.
            if (thid >= offset)
                temp[pout*n + thid] = temp[pin*n + thid] + temp[pin*n + thid - offset];
            else
                temp[pout*n + thid] = temp[pin*n + thid];
        }
        // One barrier per pass is enough: the buffer written in the next
        // pass is the one that was only read in this pass.
        __syncthreads();
    }

    if (active)
        g_odata[thid] = temp[pout*n + thid]; // write output
}
/**
 * Demo driver for the single-block scan kernel.
 *
 * Fills a host array with 1..N, runs scan on it with one block of N threads
 * and 2 * N floats of dynamic shared memory, then prints the input on one
 * line and the scanned output on the next.
 *
 * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
 * Results are printed only when every step succeeded.
 */
int main()
{
    float *a = NULL, *b = NULL;         // host input / output
    float *a_gpu = NULL, *b_gpu = NULL; // device input / output

    // Number of bytes in one N-element float vector.
    const size_t size = N * sizeof(float);

    a = (float*) malloc(size);
    b = (float*) malloc(size);
    if (a == NULL || b == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(a); free(b);
        return 1;
    }

    for (int i = 0; i < N; i++){
        a[i] = i + 1;
        b[i] = 0;
    }

    // Each step runs only if all previous ones succeeded (short-circuit &&).
    bool ok =
        checkCudaErr(cudaMalloc(&a_gpu, size), "cudaMalloc(a_gpu)") == cudaSuccess &&
        checkCudaErr(cudaMalloc(&b_gpu, size), "cudaMalloc(b_gpu)") == cudaSuccess &&
        checkCudaErr(cudaMemcpy(a_gpu, a, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(a -> a_gpu)") == cudaSuccess &&
        checkCudaErr(cudaMemcpy(b_gpu, b, size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(b -> b_gpu)") == cudaSuccess;

    if (ok) {
        scan<<<1, N, 2 * N * sizeof(float)>>>(b_gpu, a_gpu, N);
        ok =
            // Launch-configuration errors are only visible via cudaGetLastError().
            checkCudaErr(cudaGetLastError(), "scan launch") == cudaSuccess &&
            // Faults raised while the kernel runs surface at the next sync.
            checkCudaErr(cudaDeviceSynchronize(), "Syncronization") == cudaSuccess &&
            checkCudaErr(cudaMemcpy(b, b_gpu, size, cudaMemcpyDeviceToHost),
                         "cudaMemcpy(b_gpu -> b)") == cudaSuccess;
    }

    if (ok) {
        for (int i = 0; i < N; i++){
            cout << a[i] << " ";
        }
        cout << "\n";
        for (int i = 0; i < N; i++){
            cout << b[i] << " ";
        }
    }

    // Free all our allocated memory (cudaFree accepts a null pointer).
    free(a); free(b);
    cudaFree( a_gpu ); cudaFree( b_gpu );

    return ok ? 0 : 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment