// CUDA CUB implementation of
// ```python
// x = np.random.uniform(0, 1, (1000, 10))
// x -= x.max(axis=0, keepdims=True)  # subtract column-wise max
// x = np.exp(x)                      # exponentiate all values
// x /= x.sum(axis=0, keepdims=True)  # divide by column sums
// ```
// Note: the final kernel below works in log space, so d_NormProb actually
// leaves log(x / x.sum(axis=0)) in d_x, i.e. a column-wise log-softmax
// rather than the probabilities themselves.

#include <cstdlib>
#include <cub/cub.cuh>
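// A minimal error-checking helper (an addition, not part of the original
// gist): wrapping the cudaMalloc / cudaMemcpy calls below in CUDA_CHECK(...)
// makes failures visible instead of silently ignored.
#include <cstdio>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t _err = (call);                                    \
        if(_err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(_err), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while(0)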
// Transpose a row-major (num_rows x num_cols) matrix d_x into d_xt, so each
// column of d_x becomes a contiguous segment of d_xt.
__global__ void __transpose(double *d_xt, double *d_x, int num_rows, int num_cols) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {  // guard against over-launched threads
        int row = offset / num_cols;
        int col = offset % num_cols;
        d_xt[col * num_rows + row] = d_x[offset];
    }
}
// Subtract c[col] from every element of column col (d_x is in the
// transposed, column-contiguous layout here).
__global__ void __columnSub(double* d_x, int num_rows, int num_cols, double* c) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        int col = offset / num_rows;
        d_x[offset] -= c[col];
    }
}
// Replace each element with log(x) - log(c[col]) = log(x / c[col]), the
// log-space equivalent of dividing each column by its sum.
__global__ void __logColumnSub(double* d_x, int num_rows, int num_cols, double* c) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        int col = offset / num_rows;
        d_x[offset] = log(d_x[offset]) - log(c[col]);
    }
}
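// Worked check of the log-space step (not in the original gist): for a
// column x with m = max(x) and s = sum(exp(x - m)), the kernel computes
//   log(exp(x - m) / s) = (x - m) - log(s)
// which is the standard numerically stable log-softmax identity; taking
// exp of the result recovers the softmax probabilities of the numpy snippet.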
// Exponentiate every element in place.
__global__ void __vectorExp(double* d_x, int num_rows, int num_cols) {
    int offset = threadIdx.x + blockDim.x * blockIdx.x;
    if(offset < num_rows * num_cols) {
        d_x[offset] = exp(d_x[offset]);
    }
}
void d_NormProb(int num_rows, int num_cols, double *d_x) {
    int thread = 1024;
    // Round up so every element gets a thread; the original
    // (int)ceil((num_rows * num_cols) / thread) was a no-op ceil over
    // integer division and could drop the last partial block.
    int block  = (num_rows * num_cols + thread - 1) / thread;
    // ----------------------------
    // Compute max of columns

    // Transpose so each column is a contiguous segment
    double *d_xt;
    cudaMalloc((void**)&d_xt, num_rows * num_cols * sizeof(double));
    __transpose<<<block, thread>>>(d_xt, d_x, num_rows, num_cols);

    // Segment offsets: column i spans [i * num_rows, (i + 1) * num_rows)
    // (heap allocation instead of the original variable-length array,
    // which is a compiler extension rather than standard C++)
    int *h_offsets = (int*)malloc((num_cols + 1) * sizeof(int));
    int *d_offsets;
    for(int i = 0; i < num_cols + 1; i++) {
        h_offsets[i] = i * num_rows;
    }
    cudaMalloc((void**)&d_offsets, (num_cols + 1) * sizeof(int));
    cudaMemcpy(d_offsets, h_offsets, (num_cols + 1) * sizeof(int), cudaMemcpyHostToDevice);
    // Per-column reduction output
    double *d_col_storage;
    cudaMalloc((void**)&d_col_storage, num_cols * sizeof(double));
    cudaMemset(d_col_storage, 0, num_cols * sizeof(double));
    // Compute column maxes. CUB reductions follow a two-phase pattern:
    // the first call (with a NULL workspace) only reports the required
    // temp storage size, the second call does the actual reduction.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Max(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Max(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    // --------------------------------
    // Subtract max from columns
    __columnSub<<<block, thread>>>(d_xt, num_rows, num_cols, d_col_storage);

    // --------------------------------
    // Exponentiate, then sum each column
    __vectorExp<<<block, thread>>>(d_xt, num_rows, num_cols);
    cudaMemset(d_col_storage, 0, num_cols * sizeof(double));
    cudaFree(d_temp_storage);  // release the Max workspace before resizing (leaked in the original)
    d_temp_storage = NULL;
    temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::Sum(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::Sum(
        d_temp_storage,
        temp_storage_bytes,
        d_xt,
        d_col_storage,
        num_cols,
        d_offsets,
        d_offsets + 1
    );
    // ---------------------------------
    // Subtract log-sum from columns (leaves column-wise log-probabilities)
    __logColumnSub<<<block, thread>>>(d_xt, num_rows, num_cols, d_col_storage);

    // ---------------------------------
    // Transpose back to the original row-major shape
    __transpose<<<block, thread>>>(d_x, d_xt, num_cols, num_rows);
    cudaDeviceSynchronize();

    // Release scratch buffers (the original version leaked these)
    cudaFree(d_xt);
    cudaFree(d_offsets);
    cudaFree(d_col_storage);
    cudaFree(d_temp_storage);
    free(h_offsets);
}
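// Hedged usage sketch (not part of the original gist): fills a row-major
// num_rows x num_cols matrix with uniform values on the host, runs
// d_NormProb, and checks that exp of one column's log-probabilities sums
// to ~1. Dimensions mirror the numpy snippet at the top; rand() is a
// stand-in for np.random.uniform.
#include <cmath>

int main() {
    const int num_rows = 1000, num_cols = 10;
    const int n = num_rows * num_cols;

    double *h_x = (double*)malloc(n * sizeof(double));
    for(int i = 0; i < n; i++) {
        h_x[i] = (double)rand() / RAND_MAX;  // uniform values in [0, 1]
    }

    double *d_x;
    cudaMalloc((void**)&d_x, n * sizeof(double));
    cudaMemcpy(d_x, h_x, n * sizeof(double), cudaMemcpyHostToDevice);

    d_NormProb(num_rows, num_cols, d_x);

    cudaMemcpy(h_x, d_x, n * sizeof(double), cudaMemcpyDeviceToHost);

    // exp of each column's log-probabilities should sum to ~1.0
    double s = 0.0;
    for(int r = 0; r < num_rows; r++) {
        s += exp(h_x[r * num_cols + 0]);  // column 0 in row-major layout
    }
    printf("exp-sum of column 0: %f (expect ~1.0)\n", s);

    cudaFree(d_x);
    free(h_x);
    return 0;
}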