Skip to content

Instantly share code, notes, and snippets.

@neoblizz
Last active October 23, 2022 03:21
Show Gist options
  • Save neoblizz/12b4934543cb7e4b47611aaad8156afd to your computer and use it in GitHub Desktop.
C++ wrapper around cooperative groups launch API.
#pragma once
// Includes CUDA
#include <cuda_runtime.h>
#include <cooperative_groups.h>
#include <utility>
namespace cg = cooperative_groups;
// Helper functions.
// Recursively record the address of each argument into the output array.
// Base case: the pack is empty, nothing left to record.
inline void for_each_argument_address(void**) {}

// Stores the address of `first` in the current slot, then recurses on the
// remaining pack with the slot pointer advanced by one. The const_cast is
// required because the C launch API takes non-const void* even for
// arguments bound to const references.
template <typename first_t, typename... rest_t>
inline void for_each_argument_address(void** slot,
                                      first_t&& first,
                                      rest_t&&... rest) {
  *slot = const_cast<void*>(static_cast<const void*>(&first));
  for_each_argument_address(slot + 1, ::std::forward<rest_t>(rest)...);
}
/**
 * @brief Launch a kernel through the cudaLaunchCooperativeKernel API so the
 * grid may use cooperative-groups grid-wide synchronization
 * (cg::this_grid().sync()). This C++ wrapper packs the variadic arguments
 * into the void*[] form the C API requires.
 *
 * @note The device must support cooperative launch
 * (cudaDevAttrCooperativeLaunch) and every block of the grid must be able to
 * be resident on the device simultaneously, otherwise the launch fails —
 * which is why the status is now returned instead of discarded.
 * @note GodBolt.org link: https://tinyurl.com/cgeg289hw
 *
 * @tparam func_t type of kernel
 * @tparam args_t types of the arguments (variadic)
 * @param stream cuda stream
 * @param f function name/identifier
 * @param block_dimensions block dimension (dim3)
 * @param grid_dimensions grid dimension (dim3)
 * @param shared_memory_bytes allocated dynamic shared memory in bytes
 * @param args all the arguments to the function f
 * @return cudaError_t launch status (cudaSuccess on success). Previously the
 *         status was silently dropped; returning it is backward-compatible
 *         since callers may still ignore it.
 */
template <typename func_t, typename... args_t>
cudaError_t launch_cooperative(cudaStream_t& stream,
                               const func_t& f,
                               dim3 block_dimensions,
                               dim3 grid_dimensions,
                               std::size_t shared_memory_bytes,
                               args_t&&... args) {
  // Size the array to at least 1 so a zero-argument kernel does not produce
  // a zero-length array, which is ill-formed in standard C++.
  constexpr const auto non_zero_num_params =
      sizeof...(args_t) == 0 ? 1 : sizeof...(args_t);
  void* argument_ptrs[non_zero_num_params];
  for_each_argument_address(argument_ptrs, ::std::forward<args_t>(args)...);
  // Bug fix: surface the launch status to the caller instead of ignoring it.
  return cudaLaunchCooperativeKernel<func_t>(
      &f, grid_dimensions, block_dimensions,
      argument_ptrs, shared_memory_bytes, stream);
}
// EOF
@neoblizz
Copy link
Author

neoblizz commented Oct 23, 2022

Example code (shown uncommented here) that uses the functions above to launch a simple kernel: https://tinyurl.com/cgeg289hw

#include <thrust/device_vector.h>
#include "launch.cuh" // include the file in this gist.

// Simple elementwise kernel: computes y[i] = x[i] * i + i for every i < N.
// Expects a 1-D launch; the guard handles the grid tail when N is not a
// multiple of the block size.
__global__ void dummy_kernel(int* x, int* y, int N) {
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (idx >= N)
         return;  // out-of-range thread: nothing to do
     y[idx] = x[idx] * idx + idx;
}

// Host driver: allocates device data, launches dummy_kernel cooperatively,
// then waits for completion and releases the stream.
int main(int argc, char const *argv[]) {
     // Some problem to use for the kernel.
     constexpr int N = 1 << 20;
     thrust::device_vector<int> x(N, 1);
     thrust::device_vector<int> y(N);

     // Set-up Block & Grid dimensions.
     // Ideally, you want Grid dimension = to number of SM (or 2*SM),
     // and have them always be resident (persistent-kernel).
     dim3 blockDims(128);
     // Bug fix: `ceil(N / blockDims.x)` divides in integer arithmetic first,
     // so ceil() was a no-op and any N not divisible by the block size would
     // silently lose its tail elements. Use integer ceiling division instead.
     dim3 gridDims((N + blockDims.x - 1) / blockDims.x);

     // Create CUDA stream for the kernel.
     cudaStream_t stream;
     cudaStreamCreate(&stream);

     // Launch the kernel using cooperative launch:
     launch_cooperative(stream,   // cuda stream
         dummy_kernel,            // kernel's function name
         blockDims,               // block dimension
         gridDims,                // grid dimension
         0,                       // shared memory in bytes
         // arguments to the kernel function (in order)
         x.data().get(), y.data().get(), N);

     // Bug fix: wait for the asynchronous launch to finish and destroy the
     // stream (it was previously leaked and never synchronized).
     cudaStreamSynchronize(stream);
     cudaStreamDestroy(stream);
     return 0;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment