Created
April 10, 2015 23:08
-
-
Save nathanjackson/09ce8db4d4b279f9273d to your computer and use it in GitHub Desktop.
Incomplete CUDA Statistics Kernel
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <math.h> | |
#include <stdio.h> | |
#include <stdint.h> | |
#include <stdlib.h> | |
#include <time.h> | |
/* Maximum threads per block assumed by every launch below; also sizes the
 * static __shared__ arrays in blocked_variance. */
static const size_t MAX_BLOCK_SIZE = 1024;
/*
 * Population variance of input[0..len-1] using Welford's one-pass,
 * numerically stable algorithm (CPU reference for the GPU path).
 *
 * input: samples (must be non-NULL when len > 0)
 * len:   number of samples
 * Returns m2 / len (population variance, matching combine_var's
 * population formula); 0.0 when len < 2.
 */
double cpu_var(double *input, size_t len)
{
    double mean = 0.0;
    double m2 = 0.0;

    if (len < 2) return 0.0;

    /* size_t index: the original used a signed int, which overflows for
     * len > INT_MAX and draws a signed/unsigned comparison warning. */
    for (size_t i = 0; i < len; ++i) {
        double delta = input[i] - mean;
        mean += delta / (double)(i + 1);
        m2 += delta * (input[i] - mean);
    }
    return m2 / (double)len;
}
/*
 * Fill data[0..count-1] with val, one element per thread.
 * Launch with any 1-D grid/block covering at least count threads;
 * surplus threads fall through the bounds guard.
 */
template<typename T>
__global__ void initKernel(T *data, const T val, const size_t count)
{
    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= count)
        return;
    data[idx] = val;
}
/*
 * Combined mean of two disjoint sample sets.
 * na/ua: count and mean of set A; nb/ub: count and mean of set B.
 * Precondition: na + nb > 0.
 *
 * nb is now unsigned (it is a sample count) for consistency with
 * combine_var's signature; all visible callers already pass unsigned
 * counts.  The counts are widened to double before summing so the
 * denominator cannot wrap for very large partitions.
 */
__forceinline__ __device__ double combine_mean(unsigned na, double ua,
        unsigned nb, double ub)
{
    const double n = (double)na + (double)nb;
    return (na * ua + nb * ub) / n;
}
/*
 * Combined population variance of two disjoint sample sets (the
 * pairwise update of Chan, Golub & LeVeque).
 * na/ua/va: count, mean, variance of set A; nb/ub/vb: same for set B.
 * Precondition: na + nb > 0.
 */
__forceinline__ __device__ double combine_var(unsigned na, double ua,
        double va, unsigned nb, double ub, double vb)
{
    const double n = (double)na + (double)nb;
    const double delta = (ub - ua) / n;
    /* (double)na * nb: the original multiplied the two unsigned counts
     * first, which wraps for na * nb >= 2^32 (counts beyond ~65k). */
    return (na * va + nb * vb) / n + (double)na * nb * delta * delta;
}
/*
 * One round of a tree reduction merging partial (variance, mean, count)
 * statistics.  Each block of blockDim.x threads consumes 2 * blockDim.x
 * consecutive entries of {varin, meanin, countin} and writes one
 * combined triple per block into {varout, meanout, countout}[blockIdx.x].
 *
 * Preconditions:
 *  - blockDim.x <= MAX_BLOCK_SIZE (static shared-array sizes);
 *  - len is a multiple of 2 * blockDim.x: every thread of every block
 *    must survive the bounds guard, because a divergent early return
 *    before __syncthreads() is undefined behaviour and surviving
 *    threads would read uninitialised shared slots.
 */
__global__ void blocked_variance(double *varout, double *meanout,
        unsigned *countout, double *varin, double *meanin, unsigned *countin,
        size_t len)
{
    __shared__ double mean[MAX_BLOCK_SIZE];
    __shared__ double var[MAX_BLOCK_SIZE];
    __shared__ unsigned counts[MAX_BLOCK_SIZE];

    unsigned tx = threadIdx.x;
    unsigned i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
    unsigned s = blockDim.x;

    if (i >= len)
        return;

    /* First combine straight from global memory: pair element i with
     * element i + s.  (The original indexed countin/meanin with tx
     * here, which made every block read block 0's counts and means.) */
    var[tx] = combine_var(countin[i], meanin[i], varin[i],
            countin[i + s], meanin[i + s], varin[i + s]);
    mean[tx] = combine_mean(countin[i], meanin[i],
            countin[i + s], meanin[i + s]);
    counts[tx] = countin[i] + countin[i + s];
    __syncthreads();

    /* Standard shared-memory tree reduction over the block's partials. */
    for (s >>= 1; s > 0; s >>= 1) {
        if (tx < s) {
            /* Order matters: var reads the old means and counts, mean
             * reads the old counts, counts updates last. */
            var[tx] = combine_var(counts[tx], mean[tx], var[tx],
                    counts[tx + s], mean[tx + s], var[tx + s]);
            mean[tx] = combine_mean(counts[tx], mean[tx],
                    counts[tx + s], mean[tx + s]);
            /* The original wrote counts[0] here — a data race among all
             * active threads that also corrupts every later iteration. */
            counts[tx] = counts[tx] + counts[tx + s];
        }
        __syncthreads();
    }

    /* Thread 0 publishes this block's combined statistics. */
    if (tx == 0) {
        meanout[blockIdx.x] = mean[0];
        varout[blockIdx.x] = var[0];
        countout[blockIdx.x] = counts[0];
    }
}
/*
 * Population variance of input[0..len-1], computed on the GPU by a
 * ping-pong tree reduction over per-block partial statistics.
 *
 * input: host buffer of len doubles (pinned by the caller via
 *        cudaHostAlloc, so the async copy below can actually overlap).
 * Returns the variance read back from element 0 of whichever buffer
 * received the final reduction round.
 *
 * NOTE(review): no CUDA API or kernel-launch error checking anywhere;
 * a failed allocation or launch silently yields garbage.
 * NOTE(review): each round launches blk.x / 2 threads per block, so
 * len looks like it must be a power of two (it is 2048 in main) —
 * confirm before calling with other lengths.
 */
double gpu_var(double *input, size_t len)
{
    double *dma, *dmb, *dva, *dvb;
    unsigned *dca, *dcb;
    /* Full-length working set: one mean, variance and count per sample. */
    cudaMalloc(&dma, sizeof(*dma) * len);
    cudaMalloc(&dva, sizeof(*dva) * len);
    cudaMalloc(&dca, sizeof(*dca) * len);
    cudaMemcpyAsync(dma, input, sizeof(*dma) * len, cudaMemcpyHostToDevice);
//	cudaMemsetAsync(dva, 0, sizeof(*dva) * len);
    dim3 blk(len <= MAX_BLOCK_SIZE ? len : MAX_BLOCK_SIZE);
    dim3 grid(1 + ((len - 1) / blk.x));
    /* Each sample starts as its own set: variance 0, count 1. */
    initKernel<double><<<grid, blk>>>(dva, 0.0, len);
    initKernel<unsigned><<<grid, blk>>>(dca, 1, len);
    /* Second buffer set holds one partial triple per block. */
    cudaMalloc(&dmb, sizeof(*dmb) * grid.x);
    cudaMalloc(&dvb, sizeof(*dvb) * grid.x);
    cudaMalloc(&dcb, sizeof(*dcb) * grid.x);
    bool swap = false;
    size_t sz = len;
    cudaDeviceSynchronize();
    /* Ping-pong between the A (dva/dma/dca) and B (dvb/dmb/dcb) buffers
     * until one block remains; each round folds blk.x entries per block
     * using blk.x / 2 threads. */
    while(grid.x > 1) {
        if(swap) blocked_variance<<<grid, blk.x / 2>>>(dva, dma, dca, dvb,
                dmb, dcb, sz);
        else blocked_variance<<<grid, blk.x / 2>>>(dvb, dmb, dcb, dva, dma,
                dca, sz);
        sz = grid.x;
        swap = !swap;
        blk.x = grid.x <= MAX_BLOCK_SIZE ? grid.x : MAX_BLOCK_SIZE;
        grid.x = 1 + ((grid.x - 1) / blk.x);
    }
    double result;
    /* Final single-grid round; the blocking cudaMemcpy also acts as the
     * synchronisation point for all preceding kernels. */
    if(swap) {
        blocked_variance<<<grid,blk.x / 2>>>(dva, dma, dca, dvb, dmb, dcb, sz);
        cudaMemcpy(&result, dva, sizeof(result), cudaMemcpyDeviceToHost);
    }
    else {
        blocked_variance<<<grid,blk.x / 2>>>(dvb, dmb, dcb, dva, dma, dca, sz);
        cudaMemcpy(&result, dvb, sizeof(result), cudaMemcpyDeviceToHost);
    }
    cudaFree(dma);
    cudaFree(dmb);
    cudaFree(dva);
    cudaFree(dvb);
    cudaFree(dca);
    cudaFree(dcb);
    return result;
}
/*
 * Demo driver: fills a pinned host buffer with 2048 pseudo-random
 * samples in [0, 5) and prints the CPU and GPU population variances
 * for side-by-side comparison.
 */
int main()
{
    const unsigned n = 2048;
    double *samples;

    srand(time(NULL));
    /* Pinned allocation so gpu_var's cudaMemcpyAsync can overlap. */
    cudaHostAlloc(&samples, n * sizeof(*samples), cudaHostAllocDefault);

    for (unsigned i = 0; i < n; ++i)
        samples[i] = fmod(rand(), 5);

    printf("%f\n", cpu_var(samples, n));
    printf("%f\n", gpu_var(samples, n));

    cudaFreeHost(samples);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment