The first round of reviews of submissions to technical conferences should be double-blind (i.e. reviewers don't know who the submitter is).
Non-double-blind submissions:
- Contribute to Hero Culture: Hero culture is the tendency within technical
| #include <cassert> | |
| int current_device() | |
| { | |
| int device = 0; | |
| cudaError_t const error = cudaGetDevice(&device); | |
| assert(cudaSuccess == error); | |
| return device; | |
| } |
| /****************************************************************************** | |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. | |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the |
// This is how we run libc++ tests on the GPU without modification.
// We force include this header into each test with `-include`.
// NOTE(review): presumably the forced-include header renames the test's
// `main` to `fake_main` — confirm against that header.

// Declared callable from both host and device so the same test body can run
// in either context.
__host__ __device__
int fake_main(int, char**);

// Device-side entry point: invokes the test with no arguments and writes its
// exit code through `ret` so the host can read the result back.
// (Definition is truncated in this view — closing brace not visible.)
__global__
void fake_main_kernel(int * ret)
{
  *ret = fake_main(0, NULL);
// I have this code:

// Owns a collection of threads. Copying is disabled below; std::thread is
// itself move-only, so a copyable thread_group could not be implemented
// anyway. (Definition is truncated in this view — remaining members and the
// closing brace are not visible.)
struct thread_group {
private:
  std::vector<std::thread> members;
public:
  // Non-copyable: duplicating ownership of the underlying threads is
  // deliberately forbidden.
  thread_group(thread_group const&) = delete;
  thread_group& operator=(thread_group const&) = delete;
// Sort the sequence of integers by the Nth bit.
// Asynchronously performs one pass of a radix sort over `input`, keyed on
// `bit`, writing to `output`; returns a future. (Definition is truncated in
// this view — body beyond the first statement is not visible.)
//
// NOTE(review): the signature names `OutputRange` but the template parameter
// list declares `OutputIt` — one of the two is wrong and this will not
// compile as written.
// NOTE(review): `std::ranges_value_t` should be `std::ranges::range_value_t`,
// and `random_access_iterator` is missing its `std::` qualification.
// NOTE(review): `std::distance(input)` takes two iterators — presumably
// `std::ranges::distance(input)` was intended.
template <typename ExecutionPolicy,
          std::ranges::random_access_range InputRange, random_access_iterator OutputIt>
  requires std::integral<typename std::ranges_value_t<InputRange>>
unique_future<std::uint64_t> async_radix_sort_pass(ExecutionPolicy&& exec,
                                                   InputRange input, OutputRange output,
                                                   std::uint64_t bit)
{
  auto const elements = std::distance(input);
// One split step of a radix sort: stably partitions [first, last) into
// `output` by whether bit `bit` of each key is clear. The vector `e` records,
// per element, a 1 for each key whose selected bit is 0. (Definition is
// truncated in this view — the scatter after the transform is not visible.)
template <typename InputIt, typename OutputIt>
OutputIt
radix_sort_split(InputIt first, InputIt last, OutputIt output, std::uint64_t bit)
{
  std::vector<std::uint64_t> e(std::distance(first, last));
  // Count 0s.
  // NOTE(review): `1 << bit` is `int` arithmetic — undefined behavior for
  // bit >= 31. For 64-bit keys this should be `std::uint64_t(1) << bit`.
  std::transform(first, last, e.begin(),
                 [=] (auto t) { return !(t & (1 << bit)); });
// Serial exclusive scan: for each input element, writes the running fold of
// `init` with all *prior* elements (the first output is `init` itself, saved
// before the element is folded in). `op` combines values; the input value is
// accumulated into `init` before the previously saved prefix is written.
// (Definition is truncated in this view — loop close and return statement are
// not visible.)
template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOp>
OutputIterator exclusive_scan(InputIterator first, InputIterator last,
                              OutputIterator result, T init, BinaryOp op)
{
  if (first != last) {
    T saved = init;
    do {
      init = op(init, *first);   // Fold the current element into the prefix.
      *result = saved;           // Emit the prefix *excluding* this element.
      saved = init;
// Asynchronous chunked inclusive scan: splits [first, last) into
// ceil(elements / chunk_size) chunks and collects one future per chunk in
// `sweep`. (Definition is truncated in this view — the chunk launches and the
// downsweep are not visible.)
//
// NOTE(review): the round-up `1 + ((elements - 1) / chunk_size)` is wrong for
// an empty input — if `Size` is unsigned, `elements - 1` wraps around.
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output, BinaryOp op, T init, Size chunk_size)
{
  Size const elements = std::distance(first, last);
  Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.
  std::vector<unique_future<T>> sweep; // One partial-result future per chunk.
  sweep.reserve(chunks);
// NOTE(review): this snippet is an exact duplicate of the preceding
// `async_inclusive_scan` fragment.
//
// Asynchronous chunked inclusive scan: splits [first, last) into
// ceil(elements / chunk_size) chunks and collects one future per chunk in
// `sweep`. (Definition is truncated in this view — the chunk launches and the
// downsweep are not visible.)
//
// NOTE(review): the round-up `1 + ((elements - 1) / chunk_size)` is wrong for
// an empty input — if `Size` is unsigned, `elements - 1` wraps around.
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output, BinaryOp op, T init, Size chunk_size)
{
  Size const elements = std::distance(first, last);
  Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.
  std::vector<unique_future<T>> sweep; // One partial-result future per chunk.
  sweep.reserve(chunks);