Created January 20, 2018 14:05
#include <iostream>
#include <thrust/device_vector.h>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// using DataType = float;
// using DataType4 = float4;
using DataType = int;
using DataType4 = int4;

// Fill data[i] = i; the bounds check guards against surplus threads
// when the element count is not a multiple of the block size.
__global__ void iota(DataType* data, int n) {
    auto i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) data[i] = i;
}

// Tree reduction over shared memory: the stride halves every iteration,
// so after log2(group size) steps thread 0 holds the sum of all inputs.
__device__ DataType reduce_sum(cg::thread_group g, DataType* temp, DataType acc) {
    auto lane = g.thread_rank();
    for (auto i = g.size() / 2; i > 0; i /= 2) {
        temp[lane] = acc;
        g.sync(); // wait until every thread has finished its store
        if (lane < i) {
            acc += temp[lane + i];
        }
        g.sync(); // wait until every thread has finished its load
    }
    return acc; // only thread 0's return value is the complete sum
}
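// A possible warp-level alternative (a sketch, not part of the original gist):
// for a 32-thread tile the same reduction can be done entirely in registers
// with shuffles, so no shared memory or explicit sync is needed. Assumes the
// CUDA 9+ cooperative-groups thread_block_tile<32>::shfl_down API.
__device__ DataType reduce_sum_tile32(cg::thread_block_tile<32> tile, DataType acc) {
    for (auto i = tile.size() / 2; i > 0; i /= 2) {
        acc += tile.shfl_down(acc, i); // pull the partial sum held by lane + i
    }
    return acc; // lane 0 of the tile holds the complete sum
}
// Usage sketch: auto tile = cg::tiled_partition<32>(cg::this_thread_block());
//               auto warp_sum = reduce_sum_tile32(tile, my_sum);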
// CAS-loop atomic add for float, from
// http://www.slis.tsukuba.ac.jp/~fujisawa.makoto.fu/cgi-bin/wiki/index.php?CUDAアトミック関数
// Only relevant to the float configuration above; it is unused when DataType is
// int, since atomicAdd handles int (and float on compute capability >= 2.0) natively.
__device__ void atomicFloatAdd(float *address, float val) {
    int i_val = __float_as_int(val);
    int tmp0 = 0;
    int tmp1;
    // retry the compare-and-swap until it succeeds against the latest value
    while ((tmp1 = atomicCAS(reinterpret_cast<int *>(address), tmp0, i_val)) != tmp0) {
        tmp0 = tmp1;
        i_val = __float_as_int(val + __int_as_float(tmp1));
    }
}
// Per-thread partial sum over a grid-stride loop, sped up with 4-element
// vectorized loads:
// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access
__device__ DataType thread_sum(DataType *input, int n) {
    DataType sum = 0;
    for (auto i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n / 4;
         i += blockDim.x * gridDim.x) {
        auto in = reinterpret_cast<DataType4*>(input)[i];
        sum += in.x + in.y + in.z + in.w;
    }
    return sum;
}
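// Note (a sketch, not in the original gist): the vectorized loop above only
// covers the first n / 4 * 4 elements, which is enough here because n == 16.
// For a general n, a single thread could pick up the tail, e.g.:
__device__ DataType thread_sum_tail(DataType *input, int n) {
    DataType sum = 0;
    if (blockIdx.x * blockDim.x + threadIdx.x == 0) {
        for (auto i = n / 4 * 4; i < n; ++i) {
            sum += input[i]; // leftover elements not covered by the DataType4 loads
        }
    }
    return sum;
}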
__global__ void sum_kernel_block(DataType* sum, DataType* input, int n) {
    auto my_sum = thread_sum(input, n);

    extern __shared__ DataType temp[];
    auto g = cg::this_thread_block();
    auto block_sum = reduce_sum(g, temp, my_sum);

    // only one thread per block adds its partial sum to the global total
    if (g.thread_rank() == 0) atomicAdd(sum, block_sum);
}
int main() {
    thrust::device_vector<DataType> data(16);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query properties of device 0
    int max_block_size = prop.maxThreadsPerBlock;
    std::cout << max_block_size << std::endl;

    auto block_size = 32;
    auto grid_size = (data.size() + block_size - 1) / block_size;
    auto shared_size = block_size * sizeof(DataType);

    // launch configuration is <<<grid, block>>>, not <<<block, grid>>>
    iota<<<grid_size, block_size>>>(thrust::raw_pointer_cast(data.data()), data.size());
    for (auto d : data) {
        std::cout << d << std::endl;
    }

    thrust::device_vector<DataType> s(1);
    sum_kernel_block<<<grid_size, block_size, shared_size>>>(
        thrust::raw_pointer_cast(s.data()),
        thrust::raw_pointer_cast(data.data()),
        data.size());
    std::cout << s[0] << std::endl;
}
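// Expected behaviour (assuming device 0 is used and the launch configuration
// above): the program prints maxThreadsPerBlock (device dependent, e.g. 1024),
// then the values 0..15 one per line, and finally 120, the sum of 0..15.
// A host-side cross-check could reuse Thrust, for example:
//   assert(s[0] == thrust::reduce(data.begin(), data.end())); // needs <cassert>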