Skip to content

Instantly share code, notes, and snippets.

@nathanjackson
Created April 10, 2015 23:08
Show Gist options
  • Save nathanjackson/09ce8db4d4b279f9273d to your computer and use it in GitHub Desktop.
Incomplete CUDA Statistics Kernel
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>
static const size_t MAX_BLOCK_SIZE = 1024;
/*
 * CPU reference: population variance of input[0..len-1], computed in a
 * single pass with Welford's online algorithm (numerically stable).
 * Returns 0.0 when fewer than two samples are supplied.
 */
double cpu_var(double *input, size_t len)
{
if(len < 2) return 0.0;
double mean = 0.0;
double m2 = 0.0;
for(size_t n = 0; n < len; ++n) {
double delta = input[n] - mean;
mean += delta / (double)(n + 1);
m2 += delta * (input[n] - mean);
}
/* Population variance: divide by the sample count. */
return m2 / (double)len;
}
/*
 * Fills data[0..count-1] with val, one element per thread.
 * Launch with gridDim.x * blockDim.x >= count; surplus threads exit.
 */
template<typename T>
__global__ void initKernel(T *data, const T val, const size_t count)
{
const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= count)
return;
data[idx] = val;
}
/*
 * Merges the means of two sample partitions A and B.
 * na/ua: count and mean of A; nb/ub: count and mean of B.
 * Precondition: na + nb > 0 (otherwise this divides by zero).
 * nb is now unsigned, matching combine_var(); every caller passes a
 * count.  The divisor is summed in double so large partition counts
 * cannot wrap 32-bit unsigned arithmetic.
 */
__forceinline__ __device__ double combine_mean(unsigned na, double ua,
unsigned nb, double ub)
{
return (na * ua + nb * ub) / ((double)na + (double)nb);
}
/*
 * Merges the (population) variances of two sample partitions A and B
 * using the pairwise combination formula (Chan et al.):
 *   var = (na*va + nb*vb)/n + na*nb*((ub-ua)/n)^2,  n = na + nb.
 * na/ua/va: count, mean, variance of A; nb/ub/vb: same for B.
 * Precondition: na + nb > 0.
 */
__forceinline__ __device__ double combine_var(unsigned na, double ua,
double va, unsigned nb, double ub, double vb)
{
/* Do the count arithmetic in double: the products na * nb and the sum
   na + nb can overflow 32-bit unsigned for large partitions. */
const double n = (double)na + (double)nb;
const double tmp = (ub - ua) / n;
return (na * va + nb * vb) / n + (double)na * (double)nb * tmp * tmp;
}
/*
 * One reduction pass: each block merges up to 2 * blockDim.x per-element
 * (variance, mean, count) triples from varin/meanin/countin into a single
 * triple written at index blockIdx.x of varout/meanout/countout.
 * Requires blockDim.x <= MAX_BLOCK_SIZE (shared-array capacity).
 */
__global__ void blocked_variance(double *varout, double *meanout,
unsigned *countout, double *varin, double *meanin, unsigned *countin,
size_t len)
{
__shared__ double mean[MAX_BLOCK_SIZE];
__shared__ double var[MAX_BLOCK_SIZE];
__shared__ unsigned counts[MAX_BLOCK_SIZE];
const unsigned tx = threadIdx.x;
const unsigned i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
unsigned s = blockDim.x;
/* First combine: merge elements i and i + s into shared slot tx.
   Indexing uses i (global), not tx (local) -- the previous revision read
   countin[tx], pairing the wrong counts with the means/variances.
   Out-of-range elements are treated as empty partitions (count 0) so that
   every thread reaches the barriers below: an early return here would
   make __syncthreads() divergent and leave shared memory uninitialized
   for partial tail blocks. */
const unsigned ca = (i < len) ? countin[i] : 0u;
const unsigned cb = (i + s < len) ? countin[i + s] : 0u;
const double ua = (i < len) ? meanin[i] : 0.0;
const double ub = (i + s < len) ? meanin[i + s] : 0.0;
const double va = (i < len) ? varin[i] : 0.0;
const double vb = (i + s < len) ? varin[i + s] : 0.0;
if(ca + cb > 0) {
var[tx] = combine_var(ca, ua, va, cb, ub, vb);
mean[tx] = combine_mean(ca, ua, cb, ub);
} else {
/* Both partitions empty: neutral element for later merges. */
var[tx] = 0.0;
mean[tx] = 0.0;
}
counts[tx] = ca + cb;
__syncthreads();
/* Standard shared-memory tree reduction over the blockDim.x slots. */
for(s >>= 1; s > 0; s >>= 1) {
if(tx < s) {
const unsigned cl = counts[tx];
const unsigned cr = counts[tx + s];
if(cl + cr > 0) {
var[tx] = combine_var(cl, mean[tx], var[tx], cr, mean[tx + s],
var[tx + s]);
mean[tx] = combine_mean(cl, mean[tx], cr, mean[tx + s]);
}
counts[tx] = cl + cr; /* was counts[0]: clobbered the running result */
}
__syncthreads();
}
/* Thread 0 publishes the block's combined triple. */
if(tx == 0) {
meanout[blockIdx.x] = mean[0];
varout[blockIdx.x] = var[0];
countout[blockIdx.x] = counts[0];
}
}
/* Abort helper: report a failed CUDA call and exit.  Kernel errors are
   asynchronous and sticky, so an unchecked failure would otherwise
   silently corrupt every later call and the final result. */
static void gpu_check(cudaError_t err, const char *what)
{
if(err != cudaSuccess) {
fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}

/*
 * GPU population variance of input[0..len-1]: seeds one (variance=0,
 * mean=value, count=1) triple per element, then repeatedly folds pairs
 * with blocked_variance, ping-ponging between the (a) and (b) buffer
 * sets, until one triple remains.  Returns its variance.
 *
 * NOTE(review): each pass pairs element i with i + blockDim.x, so
 * lengths that are not powers of two can drop tail elements -- confirm
 * before calling with arbitrary len.
 */
double gpu_var(double *input, size_t len)
{
double *dma, *dmb, *dva, *dvb;
unsigned *dca, *dcb;
/* Fewer than two samples: variance is 0 (matches cpu_var) and would
   otherwise produce an invalid zero-thread launch below. */
if(len < 2) return 0.0;
gpu_check(cudaMalloc(&dma, sizeof(*dma) * len), "cudaMalloc dma");
gpu_check(cudaMalloc(&dva, sizeof(*dva) * len), "cudaMalloc dva");
gpu_check(cudaMalloc(&dca, sizeof(*dca) * len), "cudaMalloc dca");
gpu_check(cudaMemcpyAsync(dma, input, sizeof(*dma) * len,
cudaMemcpyHostToDevice), "copy input");
dim3 blk(len <= MAX_BLOCK_SIZE ? len : MAX_BLOCK_SIZE);
dim3 grid(1 + ((len - 1) / blk.x));
/* Seed per-element statistics: each value is a partition of count 1
   with variance 0; its mean is the value itself (already in dma). */
initKernel<double><<<grid, blk>>>(dva, 0.0, len);
initKernel<unsigned><<<grid, blk>>>(dca, 1, len);
gpu_check(cudaGetLastError(), "initKernel launch");
gpu_check(cudaMalloc(&dmb, sizeof(*dmb) * grid.x), "cudaMalloc dmb");
gpu_check(cudaMalloc(&dvb, sizeof(*dvb) * grid.x), "cudaMalloc dvb");
gpu_check(cudaMalloc(&dcb, sizeof(*dcb) * grid.x), "cudaMalloc dcb");
/* swap records which buffer set feeds the next pass. */
bool swap = false;
size_t sz = len;
gpu_check(cudaDeviceSynchronize(), "init sync");
while(grid.x > 1) {
/* Each block of blk.x / 2 threads folds blk.x elements into one. */
if(swap) blocked_variance<<<grid, blk.x / 2>>>(dva, dma, dca, dvb,
dmb, dcb, sz);
else blocked_variance<<<grid, blk.x / 2>>>(dvb, dmb, dcb, dva, dma,
dca, sz);
gpu_check(cudaGetLastError(), "blocked_variance launch");
sz = grid.x;
swap = !swap;
blk.x = grid.x <= MAX_BLOCK_SIZE ? grid.x : MAX_BLOCK_SIZE;
grid.x = 1 + ((grid.x - 1) / blk.x);
}
/* Final pass folds the remaining sz elements into slot 0; the blocking
   cudaMemcpy also synchronizes with the launch. */
double result;
if(swap) {
blocked_variance<<<grid,blk.x / 2>>>(dva, dma, dca, dvb, dmb, dcb, sz);
gpu_check(cudaGetLastError(), "final launch");
gpu_check(cudaMemcpy(&result, dva, sizeof(result),
cudaMemcpyDeviceToHost), "copy result");
}
else {
blocked_variance<<<grid,blk.x / 2>>>(dvb, dmb, dcb, dva, dma, dca, sz);
gpu_check(cudaGetLastError(), "final launch");
gpu_check(cudaMemcpy(&result, dvb, sizeof(result),
cudaMemcpyDeviceToHost), "copy result");
}
cudaFree(dma);
cudaFree(dmb);
cudaFree(dva);
cudaFree(dvb);
cudaFree(dca);
cudaFree(dcb);
return result;
}
/*
 * Demo driver: fills a pinned buffer with 2048 pseudo-random samples and
 * prints the CPU and GPU variance, one per line, for eyeball comparison.
 */
int main()
{
srand(time(NULL));
double *foo;
/* Pinned host memory so gpu_var's cudaMemcpyAsync can run truly async.
   Check the allocation: a failure here would otherwise segfault in the
   fill loop below. */
cudaError_t err = cudaHostAlloc(&foo, 2048 * sizeof(*foo),
cudaHostAllocDefault);
if(err != cudaSuccess) {
fprintf(stderr, "cudaHostAlloc: %s\n", cudaGetErrorString(err));
return EXIT_FAILURE;
}
/* rand() reduced mod 5: sample values land in {0, 1, 2, 3, 4}. */
for(unsigned i = 0; i < 2048; ++i) {
foo[i] = fmod(rand(), 5);
}
printf("%f\n", cpu_var(foo, 2048));
printf("%f\n", gpu_var(foo, 2048));
cudaFreeHost(foo);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment