The first round of reviews of submissions to technical conferences should be double-blind (i.e. reviewers don't know who the submitter is).
Non-double-blind submissions:
- Contribute to Hero Culture: Hero culture is the tendency within technical
| #include <cassert> | |
| int current_device() | |
| { | |
| int device = 0; | |
| cudaError_t const error = cudaGetDevice(&device); | |
| assert(cudaSuccess == error); | |
| return device; | |
| } |
| /****************************************************************************** | |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. | |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the |
// This is how we run libc++ tests on the GPU without modification.
// We force include this header into each test with `-include`.
// NOTE(review): presumably the forced-include header renames the test's
// `main` to `fake_main` — confirm against that header.

// Declared callable from both host and device so the same test body can run
// in either context.
__host__ __device__
int fake_main(int, char**);

// Device-side entry point: invokes the test with no arguments and writes its
// exit code through `ret` so the host can read the result back.
// (Definition is truncated in this view — closing brace not visible.)
__global__
void fake_main_kernel(int * ret)
{
  *ret = fake_main(0, NULL);
// I have this code:

// Owns a collection of threads. Copying is disabled below; std::thread is
// itself move-only, so a copyable thread_group could not be implemented
// anyway. (Definition is truncated in this view — remaining members and the
// closing brace are not visible.)
struct thread_group {
private:
  std::vector<std::thread> members;
public:
  // Non-copyable: duplicating ownership of the underlying threads is
  // deliberately forbidden.
  thread_group(thread_group const&) = delete;
  thread_group& operator=(thread_group const&) = delete;
// Sort the sequence of integers by the Nth bit.
// Asynchronously performs one pass of a radix sort over `input`, keyed on
// `bit`, writing to `output`; returns a future. (Definition is truncated in
// this view — body beyond the first statement is not visible.)
//
// NOTE(review): the signature names `OutputRange` but the template parameter
// list declares `OutputIt` — one of the two is wrong and this will not
// compile as written.
// NOTE(review): `std::ranges_value_t` should be `std::ranges::range_value_t`,
// and `random_access_iterator` is missing its `std::` qualification.
// NOTE(review): `std::distance(input)` takes two iterators — presumably
// `std::ranges::distance(input)` was intended.
template <typename ExecutionPolicy,
          std::ranges::random_access_range InputRange, random_access_iterator OutputIt>
  requires std::integral<typename std::ranges_value_t<InputRange>>
unique_future<std::uint64_t> async_radix_sort_pass(ExecutionPolicy&& exec,
                                                   InputRange input, OutputRange output,
                                                   std::uint64_t bit)
{
  auto const elements = std::distance(input);
// One split step of a radix sort: stably partitions [first, last) into
// `output` by whether bit `bit` of each key is clear. The vector `e` records,
// per element, a 1 for each key whose selected bit is 0. (Definition is
// truncated in this view — the scatter after the transform is not visible.)
template <typename InputIt, typename OutputIt>
OutputIt
radix_sort_split(InputIt first, InputIt last, OutputIt output, std::uint64_t bit)
{
  std::vector<std::uint64_t> e(std::distance(first, last));
  // Count 0s.
  // NOTE(review): `1 << bit` is `int` arithmetic — undefined behavior for
  // bit >= 31. For 64-bit keys this should be `std::uint64_t(1) << bit`.
  std::transform(first, last, e.begin(),
                 [=] (auto t) { return !(t & (1 << bit)); });
// Serial exclusive scan: for each input element, writes the running fold of
// `init` with all *prior* elements (the first output is `init` itself, saved
// before the element is folded in). `op` combines values; the input value is
// accumulated into `init` before the previously saved prefix is written.
// (Definition is truncated in this view — loop close and return statement are
// not visible.)
template <typename InputIterator, typename OutputIterator, typename T, typename BinaryOp>
OutputIterator exclusive_scan(InputIterator first, InputIterator last,
                              OutputIterator result, T init, BinaryOp op)
{
  if (first != last) {
    T saved = init;
    do {
      init = op(init, *first);   // Fold the current element into the prefix.
      *result = saved;           // Emit the prefix *excluding* this element.
      saved = init;
// Asynchronous chunked inclusive scan: splits [first, last) into
// ceil(elements / chunk_size) chunks and collects one future per chunk in
// `sweep`. (Definition is truncated in this view — the chunk launches and the
// downsweep are not visible.)
//
// NOTE(review): the round-up `1 + ((elements - 1) / chunk_size)` is wrong for
// an empty input — if `Size` is unsigned, `elements - 1` wraps around.
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output, BinaryOp op, T init, Size chunk_size)
{
  Size const elements = std::distance(first, last);
  Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.
  std::vector<unique_future<T>> sweep; // One partial-result future per chunk.
  sweep.reserve(chunks);
// NOTE(review): this snippet is an exact duplicate of the preceding
// `async_inclusive_scan` fragment.
//
// Asynchronous chunked inclusive scan: splits [first, last) into
// ceil(elements / chunk_size) chunks and collects one future per chunk in
// `sweep`. (Definition is truncated in this view — the chunk launches and the
// downsweep are not visible.)
//
// NOTE(review): the round-up `1 + ((elements - 1) / chunk_size)` is wrong for
// an empty input — if `Size` is unsigned, `elements - 1` wraps around.
template <typename InputIt, typename OutputIt, typename BinaryOp, typename T, typename Size>
unique_future<OutputIt>
async_inclusive_scan(InputIt first, InputIt last, OutputIt output, BinaryOp op, T init, Size chunk_size)
{
  Size const elements = std::distance(first, last);
  Size const chunks = (1 + ((elements - 1) / chunk_size)); // Round up.
  std::vector<unique_future<T>> sweep; // One partial-result future per chunk.
  sweep.reserve(chunks);