Skip to content

Instantly share code, notes, and snippets.

@ShigekiKarita
Created January 20, 2018 14:05
Show Gist options
  • Select an option

  • Save ShigekiKarita/d2956956720b675a32d97379a557e5d1 to your computer and use it in GitHub Desktop.

Select an option

Save ShigekiKarita/d2956956720b675a32d97379a557e5d1 to your computer and use it in GitHub Desktop.
#include <iostream>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Element type used throughout this example. Swap the two active aliases for
// the commented pair to run the same code on float data; DataType4 must stay
// the matching 4-wide vector type (used for vectorized loads in thread_sum).
// using DataType = float;
// using DataType4 = float4;
using DataType = int;
using DataType4 = int4;
// Fills data with the identity sequence: each launched thread writes its own
// flat global index into its slot (data[k] = k).
// Launch contract: there is NO bounds guard, so the caller must launch exactly
// gridDim.x * blockDim.x == number-of-elements threads — confirm at call site.
__global__ void iota(DataType* data) {
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
    data[idx] = idx;
}
// Tree reduction across the thread group `g` using the shared scratch buffer
// `temp` (one DataType slot per thread of g). Every thread passes in its
// partial value `acc`; after log2(g.size()) halving rounds, thread 0's return
// value is the sum over the whole group. Other threads return partial sums
// over [rank + multiples of the current stride].
// Requires: g.size() is a power of two, and temp holds >= g.size() elements.
__device__ DataType reduce_sum(cg::thread_group g, DataType* temp, DataType acc) {
    const auto rank = g.thread_rank();
    for (auto stride = g.size() / 2; stride > 0; stride /= 2) {
        temp[rank] = acc;
        g.sync(); // wait for every thread's store before anyone reads
        if (rank < stride) {
            acc += temp[rank + stride];
        }
        g.sync(); // wait for every read before temp is overwritten next round
    }
    return acc; // complete total only on thread 0
}
// Emulated atomic add for float built on integer atomicCAS.
// Adapted from http://www.slis.tsukuba.ac.jp/~fujisawa.makoto.fu/cgi-bin/wiki/index.php?CUDAアトミック関数
//
// Fix: the original declared the parameters as DataType (= int in this
// build), so __float_as_int(val) implicitly converted the int to float and
// then reinterpreted that float's bits — storing garbage into the
// accumulator. The operation is inherently a float add, so it is declared
// with float regardless of the DataType alias (identical to the original
// when DataType = float; no callers in this file).
//
// NOTE(review): native atomicAdd(float*, float) exists on all supported GPUs;
// this CAS loop is kept only as a reference implementation.
__device__ void atomicFloatAdd(float *address, float val) {
    int expected = 0;                    // optimistic first guess: bits of 0.0f
    int desired = __float_as_int(val);   // word to install if the guess holds
    int observed;
    // atomicCAS returns the word actually present at *address. Loop until our
    // guess matches reality, each time recomputing the sum against the value
    // we actually observed.
    while ((observed = atomicCAS(reinterpret_cast<int *>(address),
                                 expected, desired)) != expected) {
        expected = observed;
        desired = __float_as_int(val + __int_as_float(observed));
    }
}
// Grid-stride partial sum: each thread accumulates a disjoint subset of
// input[0..n) and returns its private partial (combine across threads with
// reduce_sum / atomics).
// Main loop reads 4 elements at a time through DataType4 for bandwidth:
// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access
// Precondition: input must be 16-byte aligned for the DataType4 loads
// (cudaMalloc / thrust allocations satisfy this).
__device__ DataType thread_sum(DataType *input, int n) {
    DataType sum = 0;
    const auto tid = blockIdx.x * blockDim.x + threadIdx.x;
    const auto stride = blockDim.x * gridDim.x;
    // Vectorized 4-wide main loop over the first n/4*4 elements.
    for (auto i = tid; i < n / 4; i += stride) {
        auto v = reinterpret_cast<DataType4*>(input)[i];
        sum += v.x + v.y + v.z + v.w;
    }
    // Fix: the original dropped the last n % 4 elements whenever n was not a
    // multiple of 4. Sweep the tail with plain scalar loads.
    for (auto i = n / 4 * 4 + tid; i < n; i += stride) {
        sum += input[i];
    }
    return sum;
}
// Sums input[0..n) into *sum: each thread folds its grid-stride slice, the
// block combines those partials in dynamic shared memory, and thread 0 adds
// the block total into *sum with one atomic.
// Launch contract: dynamic shared memory of blockDim.x * sizeof(DataType)
// bytes; *sum must be zero-initialized by the caller; blockDim.x must satisfy
// reduce_sum's power-of-two requirement.
__global__ void sum_kernel_block(DataType* sum, DataType* input, int n) {
    extern __shared__ DataType temp[];
    const auto partial = thread_sum(input, n);
    const auto block = cg::this_thread_block();
    const auto block_total = reduce_sum(block, temp, partial);
    if (block.thread_rank() == 0) {
        atomicAdd(sum, block_total);
    }
}
// Demo driver: fill a device vector with 0..15 via iota, then reduce it.
// Constraints inherited from the kernels above:
//  - iota has no bounds guard, so grid*block threads must equal data.size()
//  - reduce_sum requires blockDim.x to be a power of two
//  - thread_sum's vectorized loop wants data.size() % 4 == 0
int main() {
    thrust::device_vector<DataType> data(16);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    int max_block_size = prop.maxThreadsPerBlock;
    std::cout << max_block_size << std::endl;

    // Fix: the original used block_size = 32 for 16 elements, so the launch
    // spawned 32 threads and iota wrote past the end of `data`. 16 threads
    // per block exactly covers the buffer and is a power of two.
    const int block_size = 16;
    const int grid_size =
        static_cast<int>((data.size() + block_size - 1) / block_size); // ceil-div
    const auto shared_size = block_size * sizeof(DataType);

    // Fix: the original launches were written <<<block_size, grid_size>>> —
    // grid and block were swapped. The grid dimension comes first.
    iota<<<grid_size, block_size>>>(thrust::raw_pointer_cast(data.data()));
    if (cudaGetLastError() != cudaSuccess) {
        std::cerr << "iota launch failed" << std::endl;
        return 1;
    }

    for (auto d : data) { // thrust element access synchronizes and copies
        std::cout << d << std::endl;
    }

    thrust::device_vector<DataType> s(1); // zero-initialized accumulator
    sum_kernel_block<<<grid_size, block_size, shared_size>>>(
        thrust::raw_pointer_cast(s.data()),
        thrust::raw_pointer_cast(data.data()),
        static_cast<int>(data.size()));
    if (cudaGetLastError() != cudaSuccess) {
        std::cerr << "sum_kernel_block launch failed" << std::endl;
        return 1;
    }
    std::cout << s[0] << std::endl; // expected: 0 + 1 + ... + 15 = 120
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment