elfrank · December 11, 2015 06:18
diff --git a/PersistentThreads.h b/PersistentThreads.h
 /*The idea is to launch just enough threads to fill the machine;
 the CUDA occupancy calculator can tell the correct number of threads.
 Launching a few too many is not a problem, as the extra threads exit
 immediately. The following code assumes warp and block widths
 of 32.*/
 // Global work pool shared by every warp in the grid.
 // B is the number of rays a warp claims from the global pool per refill. It
 // must be a multiple of 32 because the local pool is drained 32 rays at a
 // time and is only refilled when its count reaches exactly 0.
 const int B = 3 * 32; // example batch size
 // Total number of rays. Lives in constant memory; the host must set it before
 // launch (e.g. with cudaMemcpyToSymbol).
 __constant__ int globalPoolRayCount;
 // Index of the next unclaimed ray. Device-resident so that atomicAdd can take
 // its address; the host must reset it to 0 before every launch.
 __device__ int globalPoolNextRay = 0;

 /*
  * Persistent-threads kernel: each warp loops, claiming ray indices from the
  * pool until the indices run past globalPoolRayCount, then exits.
  *
  * Launch layout: blockDim.x must be 32 so that threadIdx.y selects one warp
  * and threadIdx.x is the lane; blockDim.y must not exceed BLOCKDIM_Y.
  * Uses __syncwarp(), so it needs CUDA 9 or newer.
  */
 __global__ void kernel()
 {
     // Per-warp local pool state, one slot per warp of the block.
     __shared__ volatile int nextRayArray[BLOCKDIM_Y];
     __shared__ volatile int rayCountArray[BLOCKDIM_Y];
     volatile int& localPoolNextRay = nextRayArray[threadIdx.y];
     volatile int& localPoolRayCount = rayCountArray[threadIdx.y];
     // __shared__ variables cannot carry an initializer, so lane 0 (the only
     // lane that reads the count) starts its warp with an empty local pool.
     if (threadIdx.x == 0)
         localPoolRayCount = 0;
     while (true) {
         // get rays from global to local pool: one atomic per B rays
         if (threadIdx.x == 0 && localPoolRayCount == 0) {
             localPoolNextRay = atomicAdd(&globalPoolNextRay, B);
             localPoolRayCount = B;
         }
         // Make lane 0's refill visible before the other lanes read it.
         __syncwarp();
         // get rays from local pool: one ray per lane
         int myRayIndex = localPoolNextRay + threadIdx.x;
         if (myRayIndex >= globalPoolRayCount)
             return;
         // Every remaining lane has read the pool; only now may lane 0 advance it.
         __syncwarp();
         if (threadIdx.x == 0) {
             localPoolNextRay += 32;
             localPoolRayCount -= 32;
         }
         // init and execute, these must not exit the kernel
         fetchAndInitRay(myRayIndex);
         trace();
     }
 }

 // The use of a small local pool is beneficial because it reduces pressure on the atomic counter (globalPoolNextRay).
	/*The idea is to launch just enough threads to fill the machine;
	the CUDA occupancy calculator can tell the correct number of threads.
	Launching a few too many is not a problem, as the extra threads exit
	immediately. The following code assumes warp and block widths
	of 32.*/
	// Global work pool shared by every warp in the grid.
	// B is the number of rays a warp claims from the global pool per refill. It
	// must be a multiple of 32 because the local pool is drained 32 rays at a
	// time and is only refilled when its count reaches exactly 0.
	const int B = 3 * 32; // example batch size
	// Total number of rays. Lives in constant memory; the host must set it before
	// launch (e.g. with cudaMemcpyToSymbol).
	__constant__ int globalPoolRayCount;
	// Index of the next unclaimed ray. Device-resident so that atomicAdd can take
	// its address; the host must reset it to 0 before every launch.
	__device__ int globalPoolNextRay = 0;

	/*
	 * Persistent-threads kernel: each warp loops, claiming ray indices from the
	 * pool until the indices run past globalPoolRayCount, then exits.
	 *
	 * Launch layout: blockDim.x must be 32 so that threadIdx.y selects one warp
	 * and threadIdx.x is the lane; blockDim.y must not exceed BLOCKDIM_Y.
	 * Uses __syncwarp(), so it needs CUDA 9 or newer.
	 */
	__global__ void kernel()
	{
		// Per-warp local pool state, one slot per warp of the block.
		__shared__ volatile int nextRayArray[BLOCKDIM_Y];
		__shared__ volatile int rayCountArray[BLOCKDIM_Y];
		volatile int& localPoolNextRay = nextRayArray[threadIdx.y];
		volatile int& localPoolRayCount = rayCountArray[threadIdx.y];
		// __shared__ variables cannot carry an initializer, so lane 0 (the only
		// lane that reads the count) starts its warp with an empty local pool.
		if (threadIdx.x == 0)
			localPoolRayCount = 0;
		while (true) {
			// get rays from global to local pool: one atomic per B rays
			if (threadIdx.x == 0 && localPoolRayCount == 0) {
				localPoolNextRay = atomicAdd(&globalPoolNextRay, B);
				localPoolRayCount = B;
			}
			// Make lane 0's refill visible before the other lanes read it.
			__syncwarp();
			// get rays from local pool: one ray per lane
			int myRayIndex = localPoolNextRay + threadIdx.x;
			if (myRayIndex >= globalPoolRayCount)
				return;
			// Every remaining lane has read the pool; only now may lane 0 advance it.
			__syncwarp();
			if (threadIdx.x == 0) {
				localPoolNextRay += 32;
				localPoolRayCount -= 32;
			}
			// init and execute, these must not exit the kernel
			fetchAndInitRay(myRayIndex);
			trace();
		}
	}

	// The use of a small local pool is beneficial because it reduces pressure on the atomic counter (globalPoolNextRay).