// CUDA CUB implementation of
// ```python
// x = np.random.uniform(0, 1, (1000, 10))
// x -= x.max(axis=0, keepdims=True)  # subtract max element column-wise
// x = np.exp(x)                      # exponentiate all values
// x /= x.sum(axis=0, keepdims=True)  # divide by column sums
// ```
// Note: unlike the final numpy line, the pipeline below leaves the result in
// log-space, i.e. each entry ends up as log of the column-normalized value.
// Transpose a row-major (num_rows x num_cols) matrix d_x into d_xt
__global__ void __transpose(double *d_xt, double *d_x, int num_rows, int num_cols) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        int row = offset / num_cols;
        int col = offset % num_cols;
        d_xt[col * num_rows + row] = d_x[offset];
    }
}
// Subtract c[col] from every entry of column `col`; d_x is the transposed
// matrix, so each original column is a contiguous run of num_rows entries
__global__ void __columnSub(double* d_x, int num_rows, int num_cols, double* c) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    int col = offset / num_rows;
    if(offset < num_rows * num_cols) {
        d_x[offset] -= c[col];
    }
}
// Replace every entry with log(entry) - log(c[col]), i.e. the log of the
// column-normalized value
__global__ void __logColumnSub(double* d_x, int num_rows, int num_cols, double* c) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    int col = offset / num_rows;
    if(offset < num_rows * num_cols) {
        d_x[offset] = log(d_x[offset]) - log(c[col]);
    }
}
// Exponentiate every entry in place
__global__ void __vectorExp(double* d_x, int num_rows, int num_cols) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        d_x[offset] = exp(d_x[offset]);
    }
}
void d_NormProb(int num_rows, int num_cols, double *d_x) {
    int thread = 1024;
    int block = (num_rows * num_cols + thread - 1) / thread;  // round up so every element is covered

    // ----------------------------
    // Compute max of columns

    // Transpose matrix so each column becomes a contiguous segment
    double *d_xt;
    cudaMalloc((void**)&d_xt, num_rows * num_cols * sizeof(double));
    __transpose<<<block, thread>>>(d_xt, d_x, num_rows, num_cols);
    // Compute segment offsets: column i of the transposed matrix occupies
    // [i * num_rows, (i + 1) * num_rows)
    int *h_offsets = (int*)malloc((num_cols + 1) * sizeof(int));
    int *d_offsets;
    for(int i = 0; i < num_cols + 1; i++) {
        h_offsets[i] = i * num_rows;
    }
    cudaMalloc((void**)&d_offsets, (num_cols + 1) * sizeof(int));
    cudaMemcpy(d_offsets, h_offsets, (num_cols + 1) * sizeof(int), cudaMemcpyHostToDevice);
    // Initialize per-column output buffer
    double *d_col_storage;
    cudaMalloc((void**)&d_col_storage, num_cols * sizeof(double));
    cudaMemset(d_col_storage, 0, num_cols * sizeof(double));
    // Compute column maxes: the first call only sizes the temp storage,
    // the second call performs the segmented reduction
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Max(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Max(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    // --------------------------------
    // Subtract column max from each entry

    __columnSub<<<block, thread>>>(d_xt, num_rows, num_cols, d_col_storage);

    // --------------------------------
    // Exponentiate, then sum each column

    __vectorExp<<<block, thread>>>(d_xt, num_rows, num_cols);

    cudaMemset(d_col_storage, 0, num_cols * sizeof(double));
    // Free the Max temp storage before sizing the Sum temp storage
    cudaFree(d_temp_storage);
    d_temp_storage = NULL;
    temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Sum(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Sum(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    // ---------------------------------
    // Subtract log of the column sums (result is in log-space)

    __logColumnSub<<<block, thread>>>(d_xt, num_rows, num_cols, d_col_storage);

    // ---------------------------------
    // Transpose back to the original shape

    __transpose<<<block, thread>>>(d_x, d_xt, num_cols, num_rows);
    cudaDeviceSynchronize();

    // Clean up temporary buffers
    cudaFree(d_xt);
    cudaFree(d_offsets);
    cudaFree(d_col_storage);
    cudaFree(d_temp_storage);
    free(h_offsets);
}
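
// --------------------------------------------------------------------
// Usage sketch (an assumption, not part of the original gist): one way to
// drive d_NormProb from the host. The matrix is stored row-major as
// num_rows x num_cols, matching the numpy reference above. Assumes the file
// is compiled with nvcc and includes <cub/cub.cuh>, <cstdio>, <cstdlib>,
// and <cmath>; CUDA error checking is omitted for brevity.
int main() {
    const int num_rows = 1000;
    const int num_cols = 10;
    const int N = num_rows * num_cols;

    // Fill a host matrix with uniform random values in [0, 1]
    double *h_x = (double*)malloc(N * sizeof(double));
    for(int i = 0; i < N; i++) {
        h_x[i] = (double)rand() / RAND_MAX;
    }

    // Copy to device, normalize column-wise, copy back
    double *d_x;
    cudaMalloc((void**)&d_x, N * sizeof(double));
    cudaMemcpy(d_x, h_x, N * sizeof(double), cudaMemcpyHostToDevice);

    d_NormProb(num_rows, num_cols, d_x);

    cudaMemcpy(h_x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost);

    // d_NormProb returns log-probabilities, so exponentiate before summing;
    // each column should sum to ~1.0
    for(int col = 0; col < num_cols; col++) {
        double colsum = 0.0;
        for(int row = 0; row < num_rows; row++) {
            colsum += exp(h_x[row * num_cols + col]);
        }
        printf("col %d sum = %f\n", col, colsum);
    }

    cudaFree(d_x);
    free(h_x);
    return 0;
}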