Last active
December 11, 2015 06:18
-
-
Save elfrank/4558072 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* The idea is to launch just enough threads to fill the machine; the
   CUDA occupancy calculator can tell the correct number of threads.
   Launching a few too many is not a problem, as the extra threads exit
   immediately. The following code assumes warp and block widths
   of 32. */
// global variables
const int B = 3*32;                    // example batch size; must be a multiple of 32
__constant__ int globalPoolRayCount;   // total number of rays; the host must set this
                                       // (e.g. cudaMemcpyToSymbol) before launching
__device__ int globalPoolNextRay = 0;  // index of the next unclaimed ray in the global
                                       // pool; advanced only through atomicAdd.
                                       // NOTE(review): must be reset to 0 by the host
                                       // before every relaunch.

/*
 * Persistent-threads work loop: every warp repeatedly claims a batch of B
 * rays from the global pool with a single atomicAdd, then consumes that
 * batch 32 rays at a time until the global pool is exhausted.
 *
 * Launch layout: blockDim.x must be 32 (one warp per threadIdx.y row) and
 * blockDim.y must not exceed BLOCKDIM_Y, because the per-warp shared slots
 * are indexed by threadIdx.y.
 * Shared memory: 2 * BLOCKDIM_Y ints, statically allocated.
 * Requires __syncwarp() (CUDA 9+).
 *
 * fetchAndInitRay() and trace() are defined elsewhere; they must not exit
 * the kernel, since every lane of the warp has to reach the __syncwarp()
 * barriers on each iteration.
 */
__global__ void kernel()
{
    // variables shared by entire warp, place to shared memory
    __shared__ volatile int nextRayArray[BLOCKDIM_Y];
    __shared__ volatile int rayCountArray[BLOCKDIM_Y];
    volatile int& localPoolNextRay  = nextRayArray[threadIdx.y];
    volatile int& localPoolRayCount = rayCountArray[threadIdx.y];

    // __shared__ variables cannot carry an initializer, so lane 0 of each
    // warp zeroes its own local pool before the loop starts.
    if (threadIdx.x == 0)
        localPoolRayCount = 0;
    __syncwarp();

    while (true) {
        // get rays from global to local pool
        // (only lane 0 reads and writes the count, so the test is race-free)
        if (threadIdx.x == 0 && localPoolRayCount == 0) {
            localPoolNextRay  = atomicAdd(&globalPoolNextRay, B);
            localPoolRayCount = B;
        }
        __syncwarp();   // publish lane 0's pool update to the whole warp

        // get rays from local pool
        const int warpBaseRay = localPoolNextRay;

        // Warp-uniform exit: all lanes read the same base index, so either the
        // whole warp leaves or none does. Global indices only grow, so once the
        // base is past the end there is nothing left for this warp.
        if (warpBaseRay >= globalPoolRayCount)
            return;

        const int myRayIndex = warpBaseRay + (int)threadIdx.x;
        __syncwarp();   // every lane has read the base before lane 0 advances it

        if (threadIdx.x == 0) {
            localPoolNextRay  += 32;
            localPoolRayCount -= 32;
        }

        // init and execute, these must not exit the kernel
        // Lanes past the end of the pool idle for this final iteration instead
        // of returning, so the barriers above always see the full warp.
        if (myRayIndex < globalPoolRayCount) {
            fetchAndInitRay(myRayIndex);
            trace();
        }
    }
}
// The use of a small local pool is beneficial because it reduces pressure on the atomic counter (globalPoolNextRay).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment