// CUDA CUB implementation of
// ```python
// x = np.random.uniform(0, 1, (1000, 10))
// x -= x.max(axis=0, keepdims=True)  # subtract column-wise max
// x = np.exp(x)                      # exponentiate all values
// x /= x.sum(axis=0, keepdims=True)  # divide by column sums
// ```
// Note: the final kernel below works in log space, so d_NormProb actually
// leaves log(x / x.sum(axis=0)) in d_x, i.e. a column-wise log-softmax
// rather than the probabilities themselves.

#include <cstdlib>
#include <cub/cub.cuh>
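// A minimal error-checking helper (an addition, not part of the original
// gist): wrapping the cudaMalloc / cudaMemcpy calls below in CUDA_CHECK(...)
// makes failures visible instead of silently ignored.
#include <cstdio>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t _err = (call);                                    \
        if(_err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(_err), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while(0)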
// Transpose a row-major (num_rows x num_cols) matrix d_x into d_xt, so each
// column of d_x becomes a contiguous segment of d_xt.
__global__ void __transpose(double *d_xt, double *d_x, int num_rows, int num_cols) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {  // guard against over-launched threads
        int row = offset / num_cols;
        int col = offset % num_cols;
        d_xt[col * num_rows + row] = d_x[offset];
    }
}
// Subtract c[col] from every element of column col (d_x is in the
// transposed, column-contiguous layout here).
__global__ void __columnSub(double* d_x, int num_rows, int num_cols, double* c) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        int col = offset / num_rows;
        d_x[offset] -= c[col];
    }
}
// Replace each element with log(x) - log(c[col]) = log(x / c[col]), the
// log-space equivalent of dividing each column by its sum.
__global__ void __logColumnSub(double* d_x, int num_rows, int num_cols, double* c) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        int col = offset / num_rows;
        d_x[offset] = log(d_x[offset]) - log(c[col]);
    }
}
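// Worked check of the log-space step (not in the original gist): for a
// column x with m = max(x) and s = sum(exp(x - m)), the kernel computes
//   log(exp(x - m) / s) = (x - m) - log(s)
// which is the standard numerically stable log-softmax identity; taking
// exp of the result recovers the softmax probabilities of the numpy snippet.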
// Exponentiate every element in place.
__global__ void __vectorExp(double* d_x, int num_rows, int num_cols) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        d_x[offset] = exp(d_x[offset]);
    }
}
void d_NormProb(int num_rows, int num_cols, double *d_x) {
    int thread = 1024;
    // Round up so every element gets a thread; the original
    // (int)ceil((num_rows * num_cols) / thread) was a no-op ceil over
    // integer division and could drop the last partial block.
    int block  = (num_rows * num_cols + thread - 1) / thread;
    // ----------------------------
    // Compute max of columns

    // Transpose so each column is a contiguous segment
    double *d_xt;
    cudaMalloc((void**)&d_xt, num_rows * num_cols * sizeof(double));
    __transpose<<<block, thread>>>(d_xt, d_x, num_rows, num_cols);

    // Segment offsets: column i spans [i * num_rows, (i + 1) * num_rows)
    // (heap allocation instead of the original variable-length array,
    // which is a compiler extension rather than standard C++)
    int *h_offsets = (int*)malloc((num_cols + 1) * sizeof(int));
    int *d_offsets;
    for(int i = 0; i < num_cols + 1; i++) {
        h_offsets[i] = i * num_rows;
    }
    cudaMalloc((void**)&d_offsets, (num_cols + 1) * sizeof(int));
    cudaMemcpy(d_offsets, h_offsets, (num_cols + 1) * sizeof(int), cudaMemcpyHostToDevice);
    // Per-column reduction output
    double *d_col_storage;
    cudaMalloc((void**)&d_col_storage, num_cols * sizeof(double));
    cudaMemset(d_col_storage, 0, num_cols * sizeof(double));
    // Compute column maxes. CUB reductions follow a two-phase pattern:
    // the first call (with a NULL workspace) only reports the required
    // temp storage size, the second call does the actual reduction.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Max(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Max(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    // --------------------------------
    // Subtract max from columns
    __columnSub<<<block, thread>>>(d_xt, num_rows, num_cols, d_col_storage);

    // --------------------------------
    // Exponentiate, then sum each column
    __vectorExp<<<block, thread>>>(d_xt, num_rows, num_cols);
    cudaMemset(d_col_storage, 0, num_cols * sizeof(double));
    cudaFree(d_temp_storage);  // release the Max workspace before resizing (leaked in the original)
    d_temp_storage = NULL;
    temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Sum(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Sum(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    // ---------------------------------
    // Subtract log-sum from columns (leaves column-wise log-probabilities)
    __logColumnSub<<<block, thread>>>(d_xt, num_rows, num_cols, d_col_storage);

    // ---------------------------------
    // Transpose back to the original row-major shape
    __transpose<<<block, thread>>>(d_x, d_xt, num_cols, num_rows);
    cudaDeviceSynchronize();

    // Release scratch buffers (the original version leaked these)
    cudaFree(d_xt);
    cudaFree(d_offsets);
    cudaFree(d_col_storage);
    cudaFree(d_temp_storage);
    free(h_offsets);
}
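// Hedged usage sketch (not part of the original gist): fills a row-major
// num_rows x num_cols matrix with uniform values on the host, runs
// d_NormProb, and checks that exp of one column's log-probabilities sums
// to ~1. Dimensions mirror the numpy snippet at the top; rand() is a
// stand-in for np.random.uniform.
#include <cmath>

int main() {
    const int num_rows = 1000, num_cols = 10;
    const int n = num_rows * num_cols;

    double *h_x = (double*)malloc(n * sizeof(double));
    for(int i = 0; i < n; i++) {
        h_x[i] = (double)rand() / RAND_MAX;  // uniform values in [0, 1]
    }

    double *d_x;
    cudaMalloc((void**)&d_x, n * sizeof(double));
    cudaMemcpy(d_x, h_x, n * sizeof(double), cudaMemcpyHostToDevice);

    d_NormProb(num_rows, num_cols, d_x);

    cudaMemcpy(h_x, d_x, n * sizeof(double), cudaMemcpyDeviceToHost);

    // exp of each column's log-probabilities should sum to ~1.0
    double s = 0.0;
    for(int r = 0; r < num_rows; r++) {
        s += exp(h_x[r * num_cols + 0]);  // column 0 in row-major layout
    }
    printf("exp-sum of column 0: %f (expect ~1.0)\n", s);

    cudaFree(d_x);
    free(h_x);
    return 0;
}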