#pragma once | |
// ============================================================================ | |
// == ./Meta.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
#include <boost/mp11.hpp> | |
#if BOOST_MP11_VERSION < 107300 | |
// Copyright 2015 Peter Dimov. | |
// | |
// Distributed under the Boost Software License, Version 1.0. | |
// | |
// Boost Software License - Version 1.0 - August 17th, 2003 | |
// | |
// Permission is hereby granted, free of charge, to any person or organization | |
// obtaining a copy of the software and accompanying documentation covered by | |
// this license (the "Software") to use, reproduce, display, distribute, | |
// execute, and transmit the Software, and to prepare derivative works of the | |
// Software, and to permit third-parties to whom the Software is furnished to | |
// do so, all subject to the following: | |
// | |
// The copyright notices in the Software and this entire statement, including | |
// the above license grant, this restriction and the following disclaimer, | |
// must be included in all copies of the Software, in whole or in part, and | |
// all derivative works of the Software, unless such copies or derivative | |
// works are solely in the form of machine-executable object code generated by | |
// a source language processor. | |
// | |
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | |
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | |
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | |
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
// DEALINGS IN THE SOFTWARE. | |
namespace boost::mp11 | |
{ | |
namespace detail | |
{ | |
template<class L2> | |
struct mp_flatten_impl | |
{ | |
template<class T> | |
using fn = mp_if<mp_similar<L2, T>, T, mp_list<T>>; | |
}; | |
} // namespace detail | |
template<class L, class L2 = mp_clear<L>> | |
using mp_flatten = mp_apply<mp_append, mp_push_front<mp_transform_q<detail::mp_flatten_impl<L2>, L>, mp_clear<L>>>; | |
} // namespace boost::mp11 | |
#endif | |
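// Illustrative sketch (not exercised by this header): mp_flatten splices nested
// lists of the same kind into the outer list, one level deep:
//   using L = boost::mp11::mp_list<int, boost::mp11::mp_list<float, double>, char>;
//   static_assert(std::is_same_v<
//       boost::mp11::mp_flatten<L>,
//       boost::mp11::mp_list<int, float, double, char>>);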
namespace llama | |
{ | |
namespace internal | |
{ | |
template<typename FromList, template<auto...> class ToList> | |
struct mp_unwrap_values_into_impl; | |
template<template<class...> class FromList, typename... Values, template<auto...> class ToList> | |
struct mp_unwrap_values_into_impl<FromList<Values...>, ToList> | |
{ | |
using type = ToList<Values::value...>; | |
}; | |
template<typename FromList, template<auto...> class ToList> | |
using mp_unwrap_values_into = typename mp_unwrap_values_into_impl<FromList, ToList>::type; | |
} // namespace internal | |
} // namespace llama | |
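// Illustrative sketch: mp_unwrap_values_into moves the values of a list of
// integral constants into a template taking a non-type parameter pack. It is
// used further below for RecordCoordFromList and ArrayExtentsDynamic, e.g.
//   llama::internal::mp_unwrap_values_into<boost::mp11::mp_list_c<std::size_t, 0, 1>, llama::RecordCoord>
// is llama::RecordCoord<0, 1>.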
// == | |
// == ./Meta.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./macros.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
#ifdef __INTEL_COMPILER | |
# error LLAMA has stopped supporting the Intel Classic Compiler after Intel announced its planned deprecation and \ | |
replacement by the Intel LLVM-based compiler. Please migrate to the Intel LLVM-based compiler. | |
#endif | |
#if defined(__INTEL_LLVM_COMPILER) | |
# define LLAMA_INDEPENDENT_DATA _Pragma("ivdep") | |
#elif defined(__clang__) | |
# define LLAMA_INDEPENDENT_DATA _Pragma("clang loop vectorize(assume_safety) interleave(assume_safety)") | |
#elif defined(__GNUC__) | |
# define LLAMA_INDEPENDENT_DATA _Pragma("GCC ivdep") | |
#elif defined(_MSC_VER) | |
# define LLAMA_INDEPENDENT_DATA __pragma(loop(ivdep)) | |
#else | |
/// May be put in front of a loop statement. Indicates that all (!) data accesses inside the loop are independent, so
/// the loop can be safely vectorized. Example: \code{.cpp}
/// LLAMA_INDEPENDENT_DATA
/// for(int i = 0; i < N; ++i)
/// // because of LLAMA_INDEPENDENT_DATA the compiler knows that a and b
/// // do not overlap and the operation can safely be vectorized
/// a[i] += b[i];
/// \endcode
# define LLAMA_INDEPENDENT_DATA | |
#endif | |
#ifndef LLAMA_FORCE_INLINE | |
# if defined(__NVCC__) | |
# define LLAMA_FORCE_INLINE __forceinline__ | |
# elif defined(__GNUC__) || defined(__clang__) | |
# define LLAMA_FORCE_INLINE inline __attribute__((always_inline)) | |
# elif defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) | |
# define LLAMA_FORCE_INLINE __forceinline | |
# else | |
/// Forces the compiler to inline a function annotated with this macro | |
# define LLAMA_FORCE_INLINE inline | |
# warning LLAMA_FORCE_INLINE is only defined to "inline" for this compiler | |
# endif | |
#endif | |
#ifndef LLAMA_PRAGMA | |
# define LLAMA_PRAGMA(tokens) _Pragma(# tokens) | |
#endif | |
#ifndef LLAMA_UNROLL | |
# if defined(__NVCC__) || defined(__clang__) || defined(__INTEL_LLVM_COMPILER) | |
# define LLAMA_UNROLL(...) LLAMA_PRAGMA(unroll __VA_ARGS__) | |
# elif defined(__GNUG__) | |
# define LLAMA_UNROLL(...) LLAMA_PRAGMA(GCC unroll __VA_ARGS__) | |
# elif defined(_MSC_VER) | |
// MSVC does not support a pragma for unrolling | |
# define LLAMA_UNROLL(...) | |
# else | |
/// Requests the compiler to unroll the loop following this directive. An optional unrolling count may be provided as | |
/// argument, which must be a constant expression. | |
# define LLAMA_UNROLL(...) | |
# warning LLAMA_UNROLL is not implemented for your compiler | |
# endif | |
#endif | |
#ifndef LLAMA_HOST_ACC | |
# if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) | |
# define LLAMA_HOST_ACC __host__ __device__ | |
# elif defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) | |
# define LLAMA_HOST_ACC | |
# else | |
/// Some offloading parallelization language extensions such as CUDA, OpenACC or OpenMP 4.5 need to specify whether a
/// class, struct, function or method "resides" on the host, the accelerator (the offloading device) or both. LLAMA
/// supports this by marking every function needed on an accelerator with `LLAMA_HOST_ACC`.
# define LLAMA_HOST_ACC | |
# warning LLAMA_HOST_ACC is only defined empty for this compiler | |
# endif | |
#endif | |
#define LLAMA_FN_HOST_ACC_INLINE LLAMA_FORCE_INLINE LLAMA_HOST_ACC | |
#ifndef LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS | |
# if defined(__clang__) || defined(__INTEL_LLVM_COMPILER) | |
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __attribute__((always_inline)) __VA_ARGS__ | |
# elif defined(__GNUC__) || (defined(__NVCC__) && !defined(_MSC_VER)) | |
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__ __attribute__((always_inline)) | |
# elif defined(_MSC_VER) | |
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) \ | |
__VA_ARGS__ /* FIXME: MSVC cannot combine constexpr and [[msvc::forceinline]] */ | |
# else | |
# define LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(...) __VA_ARGS__ | |
# warning LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS not defined for this compiler | |
# endif | |
#endif | |
#ifndef LLAMA_LAMBDA_INLINE | |
/// Gives a strong indication to the compiler to inline the attributed lambda.
# define LLAMA_LAMBDA_INLINE LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS() | |
#endif | |
/// Suppresses nvcc warning: 'calling a __host__ function from __host__ __device__ function.' | |
#if defined(__NVCC__) && !defined(__clang__) | |
# define LLAMA_SUPPRESS_HOST_DEVICE_WARNING _Pragma("nv_exec_check_disable") | |
#else | |
# define LLAMA_SUPPRESS_HOST_DEVICE_WARNING | |
#endif | |
#if defined(_MSC_VER) | |
# define LLAMA_FORCE_INLINE_RECURSIVE __pragma(inline_depth(255)) | |
#else | |
/// Forces the compiler to recursively inline the call hierarchy started by the subsequent function call.
# define LLAMA_FORCE_INLINE_RECURSIVE | |
#endif | |
/// Forces a copy of a value. This is useful to prevent the odr-usage of constants when compiling for GPU targets.
#define LLAMA_COPY(x) decltype(x)(x) | |
// TODO(bgruber): clang 10 and 11 fail to compile this currently with the issue described here: | |
// https://stackoverflow.com/questions/64300832/why-does-clang-think-gccs-subrange-does-not-satisfy-gccs-ranges-begin-functi | |
// let's try again with clang 12 | |
// Intel LLVM compiler is also using the clang frontend | |
#if(__has_include(<ranges>) && defined(__cpp_concepts) && !defined(__clang__) && !defined(__INTEL_LLVM_COMPILER)) | |
# define CAN_USE_RANGES 1 | |
#else | |
# define CAN_USE_RANGES 0 | |
#endif | |
// == | |
// == ./macros.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./Proofs.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// ============================================================================ | |
// == ./ArrayIndexRange.hpp == | |
// == | |
// #pragma once | |
// ============================================================================ | |
// == ./ArrayExtents.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// ============================================================================ | |
// == ./Array.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "macros.hpp" // amalgamate: file already expanded | |
#include <ostream> | |
#include <tuple> | |
namespace llama | |
{ | |
/// Array class like `std::array` but suitable for use with offloading devices like GPUs. | |
/// \tparam T Type of the array elements.
/// \tparam N Number of elements in the array.
template<typename T, std::size_t N> | |
struct Array | |
{ | |
using value_type = T; | |
T element[N > 0 ? N : 1]; | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto size() const | |
{ | |
return N; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T* | |
{ | |
return &element[0]; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const -> const T* | |
{ | |
return &element[0]; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() -> T* | |
{ | |
return &element[N]; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const -> const T* | |
{ | |
return &element[N]; | |
} | |
template<typename IndexType> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) -> T& | |
{ | |
return element[idx]; | |
} | |
template<typename IndexType> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) const -> T const& | |
{ | |
return element[idx]; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array& a, const Array& b) -> bool | |
{ | |
for(std::size_t i = 0; i < N; ++i) | |
if(a.element[i] != b.element[i]) | |
return false; | |
return true; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator!=(const Array& a, const Array& b) -> bool | |
{ | |
return !(a == b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array& a, const Array& b) -> Array | |
{ | |
Array temp{}; | |
for(std::size_t i = 0; i < N; ++i) | |
temp[i] = a[i] + b[i]; | |
return temp; | |
} | |
template<std::size_t I> | |
constexpr auto get() -> T& | |
{ | |
return element[I]; | |
} | |
template<std::size_t I> | |
constexpr auto get() const -> const T& | |
{ | |
return element[I]; | |
} | |
}; | |
template<typename T> | |
struct Array<T, 0> | |
{ | |
using value_type = T; | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto size() const | |
{ | |
return 0; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() -> T* | |
{ | |
return nullptr; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto begin() const -> const T* | |
{ | |
return nullptr; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() -> T* | |
{ | |
return nullptr; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto end() const -> const T* | |
{ | |
return nullptr; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array&, const Array&) -> bool | |
{ | |
return true; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator!=(const Array&, const Array&) -> bool | |
{ | |
return false; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array&, const Array&) -> Array | |
{ | |
return {}; | |
} | |
}; | |
template<typename First, typename... Args> | |
Array(First, Args... args) -> Array<First, sizeof...(Args) + 1>; | |
template<typename T, std::size_t N> | |
auto operator<<(std::ostream& os, const Array<T, N>& a) -> std::ostream& | |
{ | |
os << "Array{"; | |
bool first = true; | |
for(auto e : a) | |
{ | |
if(first) | |
first = false; | |
else | |
os << ", "; | |
os << e; | |
} | |
os << "}"; | |
return os; | |
} | |
template<typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto push_front([[maybe_unused]] Array<T, N> a, T v) -> Array<T, N + 1> | |
{ | |
Array<T, N + 1> r{}; | |
r[0] = v; | |
if constexpr(N > 0) | |
for(std::size_t i = 0; i < N; i++) | |
r[i + 1] = a[i]; | |
return r; | |
} | |
template<typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto push_back([[maybe_unused]] Array<T, N> a, T v) -> Array<T, N + 1> | |
{ | |
Array<T, N + 1> r{}; | |
if constexpr(N > 0) | |
for(std::size_t i = 0; i < N; i++) | |
r[i] = a[i]; | |
r[N] = v; | |
return r; | |
} | |
template<typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto pop_back([[maybe_unused]] Array<T, N> a) | |
{ | |
static_assert(N > 0); | |
Array<T, N - 1> r{}; | |
if constexpr(N > 1) | |
for(std::size_t i = 0; i < N - 1; i++) | |
r[i] = a[i]; | |
return r; | |
} | |
template<typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto pop_front([[maybe_unused]] Array<T, N> a) | |
{ | |
static_assert(N > 0); | |
Array<T, N - 1> r{}; | |
if constexpr(N > 1) | |
for(std::size_t i = 0; i < N - 1; i++) | |
r[i] = a[i + 1]; | |
return r; | |
} | |
template<typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto product(Array<T, N> a) -> T | |
{ | |
T prod = 1; | |
for(auto s : a) | |
prod *= s; | |
return prod; | |
} | |
} // namespace llama | |
namespace std | |
{ | |
template<typename T, size_t N> | |
struct tuple_size<llama::Array<T, N>> : integral_constant<size_t, N> | |
{ | |
}; | |
template<size_t I, typename T, size_t N> | |
struct tuple_element<I, llama::Array<T, N>> | |
{ | |
using type = T; | |
}; | |
} // namespace std | |
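// Usage sketch (illustrative): llama::Array behaves like std::array, including
// CTAD, structured bindings (via the tuple protocol above) and constexpr helpers:
//   constexpr llama::Array a{2, 3, 4}; // deduced as llama::Array<int, 3>
//   auto [x, y, z] = a;
//   static_assert(llama::product(llama::Array<int, 3>{2, 3, 4}) == 24);
//   static_assert(llama::push_back(llama::Array<int, 2>{1, 2}, 3) == llama::Array<int, 3>{1, 2, 3});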
// == | |
// == ./Array.hpp == | |
// ============================================================================ | |
// #include "Meta.hpp" // amalgamate: file already expanded | |
#include <limits> | |
#include <type_traits> | |
namespace llama | |
{ | |
// TODO(bgruber): make this an alias in C++20, when we have CTAD for aliases | |
/// Represents a run-time index into the array dimensions. | |
/// \tparam Dim Compile-time number of dimensions. | |
template<std::size_t Dim> | |
struct ArrayIndex : Array<std::size_t, Dim> | |
{ | |
static constexpr std::size_t rank = Dim; | |
}; | |
static_assert( | |
std::is_trivially_default_constructible_v<ArrayIndex<1>>); // so ArrayIndex<1>{} will produce a zeroed | |
// index. Should hold for all dimensions, | |
// but just checking for <1> here. | |
static_assert(std::is_trivially_copy_constructible_v<ArrayIndex<1>>); | |
static_assert(std::is_trivially_move_constructible_v<ArrayIndex<1>>); | |
static_assert(std::is_trivially_copy_assignable_v<ArrayIndex<1>>); | |
static_assert(std::is_trivially_move_assignable_v<ArrayIndex<1>>); | |
template<typename... Args> | |
ArrayIndex(Args...) -> ArrayIndex<sizeof...(Args)>; | |
} // namespace llama | |
template<size_t N> | |
struct std::tuple_size<llama::ArrayIndex<N>> : std::integral_constant<size_t, N> | |
{ | |
}; | |
template<size_t I, size_t N> | |
struct std::tuple_element<I, llama::ArrayIndex<N>> | |
{ | |
using type = size_t; | |
}; | |
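// Sketch: ArrayIndex is an Array of std::size_t with CTAD and the tuple protocol:
//   llama::ArrayIndex ai{3, 4}; // llama::ArrayIndex<2>; ai[0] == 3, ai[1] == 4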
namespace llama | |
{ | |
/// Used as a template argument to \ref ArrayExtents to mark a dynamic extent. | |
inline constexpr std::size_t dyn = std::numeric_limits<std::size_t>::max(); | |
/// ArrayExtents holding compile and runtime indices. This is conceptually equivalent to the std::extents of
/// std::mdspan. See: https://wg21.link/P0009
template<std::size_t... Sizes> | |
struct ArrayExtents : Array<typename ArrayIndex<sizeof...(Sizes)>::value_type, ((Sizes == dyn) + ... + 0)> | |
{ | |
static constexpr std::size_t rank = sizeof...(Sizes); | |
static constexpr auto rank_dynamic = ((Sizes == dyn) + ... + 0); | |
static constexpr auto rank_static = rank - rank_dynamic; | |
using Index = ArrayIndex<rank>; | |
using value_type = typename Index::value_type; | |
template<std::size_t I> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto get() const | |
{ | |
using namespace boost::mp11; | |
using TypeList = mp_list_c<std::size_t, Sizes...>; | |
constexpr auto extent = mp_at_c<TypeList, I>::value; | |
if constexpr(extent != dyn) | |
return extent; | |
else | |
return static_cast<const Array<value_type, rank_dynamic>&>( | |
*this)[+mp_count<mp_take_c<TypeList, I>, mp_size_t<dyn>>::value]; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](std::size_t i) const | |
{ | |
return boost::mp11::mp_with_index<rank>(i, [&](auto ic) { return get<decltype(ic)::value>(); }); | |
} | |
private: | |
template<std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray(std::index_sequence<Is...>) const -> Index | |
{ | |
return {get<Is>()...}; | |
} | |
public: | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray() const -> Index | |
{ | |
return toArray(std::make_index_sequence<rank>{}); | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr operator Index() const | |
{ | |
return toArray(); | |
} | |
}; | |
template<> | |
struct ArrayExtents<> | |
{ | |
static constexpr std::size_t rank = 0; | |
static constexpr auto rank_dynamic = 0; | |
static constexpr auto rank_static = 0; | |
using Index = ArrayIndex<rank>; | |
using value_type = typename Index::value_type; | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto toArray() const -> Index | |
{ | |
return {}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr operator Index() const | |
{ | |
return toArray(); | |
} | |
}; | |
template<typename... Args> | |
ArrayExtents(Args... args) -> ArrayExtents<(Args{}, dyn)...>; | |
static_assert(std::is_trivially_default_constructible_v<ArrayExtents<1>>); | |
static_assert(std::is_trivially_copy_constructible_v<ArrayExtents<1>>); | |
static_assert(std::is_trivially_move_constructible_v<ArrayExtents<1>>); | |
static_assert(std::is_trivially_copy_assignable_v<ArrayExtents<1>>); | |
static_assert(std::is_trivially_move_assignable_v<ArrayExtents<1>>); | |
template<std::size_t... SizesA, std::size_t... SizesB> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(ArrayExtents<SizesA...> a, ArrayExtents<SizesB...> b) -> bool | |
{ | |
return a.toArray() == b.toArray(); | |
} | |
template<std::size_t... SizesA, std::size_t... SizesB> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(ArrayExtents<SizesA...> a, ArrayExtents<SizesB...> b) -> bool | |
{ | |
return !(a == b); | |
} | |
template<std::size_t... Sizes> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto product(ArrayExtents<Sizes...> e) -> | |
typename ArrayExtents<Sizes...>::value_type | |
{ | |
return product(e.toArray()); | |
} | |
/// N-dimensional ArrayExtents where all values are dynamic. | |
template<std::size_t N> | |
using ArrayExtentsDynamic = internal:: | |
mp_unwrap_values_into<boost::mp11::mp_repeat_c<boost::mp11::mp_list_c<std::size_t, dyn>, N>, ArrayExtents>; | |
/// N-dimensional ArrayExtents where all values are Extent. | |
template<std::size_t N, std::size_t Extent> | |
using ArrayExtentsStatic = internal:: | |
mp_unwrap_values_into<boost::mp11::mp_repeat_c<boost::mp11::mp_list_c<std::size_t, Extent>, N>, ArrayExtents>; | |
template<std::size_t Dim, typename Func, typename... OuterIndices> | |
LLAMA_FN_HOST_ACC_INLINE void forEachADCoord( | |
[[maybe_unused]] ArrayIndex<Dim> adSize, | |
Func&& func, | |
OuterIndices... outerIndices) | |
{ | |
if constexpr(Dim > 0) | |
for(std::size_t i = 0; i < adSize[0]; i++) | |
forEachADCoord(ArrayIndex<Dim - 1>{pop_front(adSize)}, std::forward<Func>(func), outerIndices..., i); | |
else | |
std::forward<Func>(func)(ArrayIndex<sizeof...(outerIndices)>{outerIndices...}); | |
} | |
template<std::size_t... Sizes, typename Func> | |
LLAMA_FN_HOST_ACC_INLINE void forEachADCoord(ArrayExtents<Sizes...> extents, Func&& func) | |
{ | |
forEachADCoord(extents.toArray(), std::forward<Func>(func)); | |
} | |
} // namespace llama | |
template<std::size_t... Sizes> | |
struct std::tuple_size<llama::ArrayExtents<Sizes...>> : std::integral_constant<std::size_t, sizeof...(Sizes)> | |
{ | |
}; | |
template<std::size_t I, std::size_t... Sizes> | |
struct std::tuple_element<I, llama::ArrayExtents<Sizes...>> | |
{ | |
using type = typename llama::ArrayExtents<Sizes...>::value_type; | |
}; | |
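// Usage sketch (illustrative): extents mix compile-time sizes and llama::dyn
// runtime sizes; only the dynamic extents are stored:
//   llama::ArrayExtents<3, llama::dyn> e{16}; // rank 2, rank_dynamic 1
//   // e[0] == 3 (static), e[1] == 16 (runtime), product(e) == 48
//   llama::forEachADCoord(e, [](llama::ArrayIndex<2> ai) { /* visits all {i, j} */ });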
// == | |
// == ./ArrayExtents.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./Core.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "ArrayExtents.hpp" // amalgamate: file already expanded | |
// #include "Meta.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./RecordCoord.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Meta.hpp" // amalgamate: file already expanded | |
#include <array> | |
// #include <ostream> // amalgamate: file already included | |
// #include <type_traits> // amalgamate: file already included | |
namespace llama | |
{ | |
/// Represents a coordinate for a record inside the record dimension tree. | |
/// \tparam Coords... The compile-time coordinates.
template<std::size_t... Coords> | |
struct RecordCoord | |
{ | |
/// The list of integral coordinates as `boost::mp11::mp_list`. | |
using List = boost::mp11::mp_list_c<std::size_t, Coords...>; | |
static constexpr std::size_t front = boost::mp11::mp_front<List>::value; | |
static constexpr std::size_t back = boost::mp11::mp_back<List>::value; | |
static constexpr std::size_t size = sizeof...(Coords); | |
}; | |
template<> | |
struct RecordCoord<> | |
{ | |
using List = boost::mp11::mp_list_c<std::size_t>; | |
static constexpr std::size_t size = 0; | |
}; | |
template<std::size_t... CoordsA, std::size_t... CoordsB> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(RecordCoord<CoordsA...>, RecordCoord<CoordsB...>) | |
{ | |
return false; | |
} | |
template<std::size_t... Coords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(RecordCoord<Coords...>, RecordCoord<Coords...>) | |
{ | |
return true; | |
} | |
template<std::size_t... CoordsA, std::size_t... CoordsB> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(RecordCoord<CoordsA...> a, RecordCoord<CoordsB...> b) | |
{ | |
return !(a == b); | |
} | |
template<typename T> | |
inline constexpr bool isRecordCoord = false; | |
template<std::size_t... Coords> | |
inline constexpr bool isRecordCoord<RecordCoord<Coords...>> = true; | |
template<std::size_t... RCs> | |
auto operator<<(std::ostream& os, RecordCoord<RCs...>) -> std::ostream& | |
{ | |
os << "RecordCoord<"; | |
bool first = true; | |
for(auto rc : std::array<std::size_t, sizeof...(RCs)>{RCs...}) | |
{ | |
if(first) | |
first = false; | |
else | |
os << ", "; | |
os << rc; | |
} | |
os << ">"; | |
return os; | |
} | |
inline namespace literals | |
{ | |
/// Literal operator for converting a numeric literal into a \ref RecordCoord. | |
template<char... Digits> | |
constexpr auto operator"" _RC() | |
{ | |
constexpr auto coord = []() constexpr | |
{ | |
char digits[] = {(Digits - 48)...}; | |
std::size_t acc = 0; | |
std::size_t powerOf10 = 1;
for(int i = sizeof...(Digits) - 1; i >= 0; i--) | |
{ | |
acc += digits[i] * powerOf10; | |
powerOf10 *= 10; | |
} | |
return acc; | |
} | |
(); | |
return RecordCoord<coord>{}; | |
} | |
} // namespace literals | |
/// Converts a type list of integral constants into a \ref RecordCoord. | |
template<typename L> | |
using RecordCoordFromList = internal::mp_unwrap_values_into<L, RecordCoord>; | |
/// Concatenate a set of \ref RecordCoord%s. | |
template<typename... RecordCoords> | |
using Cat = RecordCoordFromList<boost::mp11::mp_append<typename RecordCoords::List...>>; | |
/// Concatenate a set of \ref RecordCoord%s instances. | |
template<typename... RecordCoords> | |
constexpr auto cat(RecordCoords...) | |
{ | |
return Cat<RecordCoords...>{}; | |
} | |
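// Sketch: with `using namespace llama::literals;` the literal 21_RC denotes
// llama::RecordCoord<21>, and concatenation works via cat:
//   static_assert(std::is_same_v<
//       decltype(cat(RecordCoord<0>{}, RecordCoord<2, 1>{})),
//       RecordCoord<0, 2, 1>>);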
/// RecordCoord without first coordinate component. | |
template<typename RecordCoord> | |
using PopFront = RecordCoordFromList<boost::mp11::mp_pop_front<typename RecordCoord::List>>; | |
namespace internal | |
{ | |
template<typename First, typename Second> | |
struct RecordCoordCommonPrefixIsBiggerImpl; | |
template<std::size_t... Coords1, std::size_t... Coords2> | |
struct RecordCoordCommonPrefixIsBiggerImpl<RecordCoord<Coords1...>, RecordCoord<Coords2...>> | |
{ | |
static constexpr auto value = []() constexpr | |
{ | |
// CTAD does not work if Coords1/2 is an empty pack | |
std::array<std::size_t, sizeof...(Coords1)> a1{Coords1...}; | |
std::array<std::size_t, sizeof...(Coords2)> a2{Coords2...}; | |
for(std::size_t i = 0; i < std::min(a1.size(), a2.size()); i++) | |
{ | |
if(a1[i] > a2[i]) | |
return true; | |
if(a1[i] < a2[i]) | |
return false; | |
} | |
return false; | |
} | |
(); | |
}; | |
} // namespace internal | |
/// Checks whether the first RecordCoord is bigger than the second.
template<typename First, typename Second> | |
inline constexpr auto RecordCoordCommonPrefixIsBigger | |
= internal::RecordCoordCommonPrefixIsBiggerImpl<First, Second>::value; | |
namespace internal | |
{ | |
template<typename First, typename Second> | |
struct RecordCoordCommonPrefixIsSameImpl; | |
template<std::size_t... Coords1, std::size_t... Coords2> | |
struct RecordCoordCommonPrefixIsSameImpl<RecordCoord<Coords1...>, RecordCoord<Coords2...>> | |
{ | |
static constexpr auto value = []() constexpr | |
{ | |
// CTAD does not work if Coords1/2 is an empty pack | |
std::array<std::size_t, sizeof...(Coords1)> a1{Coords1...}; | |
std::array<std::size_t, sizeof...(Coords2)> a2{Coords2...}; | |
for(std::size_t i = 0; i < std::min(a1.size(), a2.size()); i++) | |
if(a1[i] != a2[i]) | |
return false; | |
return true; | |
} | |
(); | |
}; | |
} // namespace internal | |
/// Checks whether two \ref RecordCoord%s are the same or one is the prefix of the other. | |
template<typename First, typename Second> | |
inline constexpr auto RecordCoordCommonPrefixIsSame | |
= internal::RecordCoordCommonPrefixIsSameImpl<First, Second>::value; | |
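// Sketch: both traits only look at the common prefix of the two coordinates:
//   RecordCoordCommonPrefixIsBigger<RecordCoord<1, 0>, RecordCoord<0, 5>> == true
//   RecordCoordCommonPrefixIsSame<RecordCoord<0>, RecordCoord<0, 3>> == true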
} // namespace llama | |
// == | |
// == ./RecordCoord.hpp == | |
// ============================================================================ | |
#include <boost/core/demangle.hpp> | |
#include <iostream> | |
#include <string> | |
// #include <type_traits> // amalgamate: file already included | |
namespace llama | |
{ | |
/// Anonymous naming for a \ref Field. | |
struct NoName | |
{ | |
}; | |
/// A type list of \ref Field%s which may be used to define a record dimension. | |
template<typename... Fields> | |
struct Record | |
{ | |
}; | |
/// @brief Tells whether the given type is allowed as a field type in LLAMA. Such types need to be trivially
/// destructible.
template<typename T> | |
inline constexpr bool isAllowedFieldType = std::is_trivially_destructible_v<T>; | |
/// Record dimension tree node which may either be a leaf or refer to a child tree presented as another \ref | |
/// Record. | |
/// \tparam Tag Name of the node. May be any type (struct, class). | |
/// \tparam Type Type of the node. May be one of three cases: 1. another subtree consisting of a nested \ref
/// Record. 2. an array of static size of any type, in which case a Record with as many \ref Field%s as the array
/// size is created, each named by a \ref RecordCoord specialized on the consecutive index I. 3. a scalar type
/// different from \ref Record, making this node a leaf of this type.
template<typename Tag, typename Type> | |
struct Field | |
{ | |
static_assert(isAllowedFieldType<Type>, "This field's type is not allowed"); | |
}; | |
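// Example record dimension (illustrative sketch; X, Y and Color are example tag
// types, not part of LLAMA). Later examples in this header refer to it:
//   struct X{}; struct Y{}; struct Color{};
//   using Pixel = llama::Record<
//       llama::Field<X, float>,
//       llama::Field<Y, float>,
//       llama::Field<Color, unsigned char[3]>>;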
struct NrAndOffset | |
{ | |
std::size_t nr; | |
std::size_t offset; | |
friend auto operator==(const NrAndOffset& a, const NrAndOffset& b) -> bool | |
{ | |
return a.nr == b.nr && a.offset == b.offset; | |
} | |
friend auto operator!=(const NrAndOffset& a, const NrAndOffset& b) -> bool | |
{ | |
return !(a == b); | |
} | |
friend auto operator<<(std::ostream& os, const NrAndOffset& value) -> std::ostream& | |
{ | |
return os << "NrAndOffset{" << value.nr << ", " << value.offset << "}"; | |
} | |
}; | |
/// Get the tag from a \ref Field. | |
template<typename Field> | |
using GetFieldTag = boost::mp11::mp_first<Field>; | |
/// Get the type from a \ref Field. | |
template<typename Field> | |
using GetFieldType = boost::mp11::mp_second<Field>; | |
template<typename T> | |
inline constexpr auto isRecord = false; | |
template<typename... Fields> | |
inline constexpr auto isRecord<Record<Fields...>> = true; | |
namespace internal | |
{ | |
template<typename RecordDim, typename RecordCoord> | |
struct GetTagsImpl; | |
template<typename... Fields, std::size_t FirstCoord, std::size_t... Coords> | |
struct GetTagsImpl<Record<Fields...>, RecordCoord<FirstCoord, Coords...>> | |
{ | |
using Field = boost::mp11::mp_at_c<boost::mp11::mp_list<Fields...>, FirstCoord>; | |
using ChildTag = GetFieldTag<Field>; | |
using ChildType = GetFieldType<Field>; | |
using type | |
= boost::mp11::mp_push_front<typename GetTagsImpl<ChildType, RecordCoord<Coords...>>::type, ChildTag>; | |
}; | |
template<typename ChildType, std::size_t Count, std::size_t FirstCoord, std::size_t... Coords> | |
struct GetTagsImpl<ChildType[Count], RecordCoord<FirstCoord, Coords...>> | |
{ | |
using ChildTag = RecordCoord<FirstCoord>; | |
using type | |
= boost::mp11::mp_push_front<typename GetTagsImpl<ChildType, RecordCoord<Coords...>>::type, ChildTag>; | |
}; | |
template<typename T> | |
struct GetTagsImpl<T, RecordCoord<>> | |
{ | |
using type = boost::mp11::mp_list<>; | |
}; | |
} // namespace internal | |
/// Get the tags of all \ref Field%s from the root of the record dimension tree down to the node identified by
/// \ref RecordCoord.
template<typename RecordDim, typename RecordCoord> | |
using GetTags = typename internal::GetTagsImpl<RecordDim, RecordCoord>::type; | |
namespace internal | |
{ | |
template<typename RecordDim, typename RecordCoord> | |
struct GetTagImpl | |
{ | |
using type = boost::mp11::mp_back<GetTags<RecordDim, RecordCoord>>; | |
}; | |
template<typename RecordDim> | |
struct GetTagImpl<RecordDim, RecordCoord<>> | |
{ | |
using type = NoName; | |
}; | |
} // namespace internal | |
/// Get the tag of the \ref Field at a \ref RecordCoord inside the record dimension tree. | |
template<typename RecordDim, typename RecordCoord> | |
using GetTag = typename internal::GetTagImpl<RecordDim, RecordCoord>::type; | |
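// Sketch (Pixel example from above): GetTags<Pixel, RecordCoord<2, 1>> is
// boost::mp11::mp_list<Color, RecordCoord<1>> (array elements are tagged by their
// index as RecordCoord<I>), and GetTag<Pixel, RecordCoord<0>> is X.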
/// Is true if, starting at two coordinates in two record dimensions, all subsequent nodes in the record dimension | |
/// tree have the same tag. | |
/// \tparam RecordDimA First record dimension.
/// \tparam LocalA \ref RecordCoord into RecordDimA along which the tags are compared.
/// \tparam RecordDimB Second record dimension.
/// \tparam LocalB \ref RecordCoord into RecordDimB along which the tags are compared.
template<typename RecordDimA, typename LocalA, typename RecordDimB, typename LocalB> | |
inline constexpr auto hasSameTags = []() constexpr | |
{ | |
if constexpr(LocalA::size != LocalB::size) | |
return false; | |
else if constexpr(LocalA::size == 0 && LocalB::size == 0) | |
return true; | |
else | |
return std::is_same_v<GetTags<RecordDimA, LocalA>, GetTags<RecordDimB, LocalB>>; | |
} | |
(); | |
namespace internal | |
{ | |
template<typename FieldList, typename Tag> | |
struct FindFieldByTag | |
{ | |
template<typename Field> | |
using HasTag = std::is_same<GetFieldTag<Field>, Tag>; | |
static constexpr auto value = boost::mp11::mp_find_if<FieldList, HasTag>::value; | |
}; | |
template<typename RecordDim, typename RecordCoord, typename... Tags> | |
struct GetCoordFromTagsImpl | |
{ | |
static_assert(boost::mp11::mp_size<RecordDim>::value != 0, "Tag combination is not valid"); | |
}; | |
template<typename... Fields, std::size_t... ResultCoords, typename FirstTag, typename... Tags> | |
struct GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<ResultCoords...>, FirstTag, Tags...> | |
{ | |
static constexpr auto tagIndex = FindFieldByTag<boost::mp11::mp_list<Fields...>, FirstTag>::value; | |
static_assert( | |
tagIndex < sizeof...(Fields), | |
"FirstTag was not found inside this Record. Does your record dimension contain the tag you access " | |
"with?"); | |
using ChildType = GetFieldType<boost::mp11::mp_at_c<Record<Fields...>, tagIndex>>; | |
using type = | |
typename GetCoordFromTagsImpl<ChildType, RecordCoord<ResultCoords..., tagIndex>, Tags...>::type; | |
}; | |
template< | |
typename ChildType, | |
std::size_t Count, | |
std::size_t... ResultCoords, | |
typename FirstTag, | |
typename... Tags> | |
struct GetCoordFromTagsImpl<ChildType[Count], RecordCoord<ResultCoords...>, FirstTag, Tags...> | |
{ | |
static_assert(isRecordCoord<FirstTag>, "Please use a RecordCoord<I> to index into static arrays"); | |
static_assert(FirstTag::size == 1, "Expected RecordCoord with 1 coordinate"); | |
static_assert(FirstTag::front < Count, "Index out of bounds"); | |
using type = | |
typename GetCoordFromTagsImpl<ChildType, RecordCoord<ResultCoords..., FirstTag::front>, Tags...>::type; | |
}; | |
template<typename RecordDim, typename RecordCoord> | |
struct GetCoordFromTagsImpl<RecordDim, RecordCoord> | |
{ | |
using type = RecordCoord; | |
}; | |
// unpack a list of tags | |
template<typename... Fields, typename... Tags> | |
struct GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<>, boost::mp11::mp_list<Tags...>> | |
: GetCoordFromTagsImpl<Record<Fields...>, RecordCoord<>, Tags...> | |
{ | |
}; | |
template<typename ChildType, std::size_t Count, typename... Tags> | |
struct GetCoordFromTagsImpl<ChildType[Count], RecordCoord<>, boost::mp11::mp_list<Tags...>> | |
: GetCoordFromTagsImpl<ChildType[Count], RecordCoord<>, Tags...> | |
{ | |
}; | |
} // namespace internal | |
/// Converts a series of tags, or a list of tags, navigating down a record dimension into a \ref RecordCoord. | |
template<typename RecordDim, typename... Tags> | |
using GetCoordFromTags = typename internal::GetCoordFromTagsImpl<RecordDim, RecordCoord<>, Tags...>::type; | |
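// Sketch (Pixel example): GetCoordFromTags<Pixel, Color, RecordCoord<1>> is
// RecordCoord<2, 1>; elements of static arrays are addressed with RecordCoord<I>.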
namespace internal | |
{ | |
template<typename RecordDim, typename... RecordCoordOrTags> | |
struct GetTypeImpl | |
{ | |
using type = typename GetTypeImpl<RecordDim, GetCoordFromTags<RecordDim, RecordCoordOrTags...>>::type; | |
}; | |
template<typename... Children, std::size_t HeadCoord, std::size_t... TailCoords> | |
struct GetTypeImpl<Record<Children...>, RecordCoord<HeadCoord, TailCoords...>> | |
{ | |
using ChildType = GetFieldType<boost::mp11::mp_at_c<Record<Children...>, HeadCoord>>; | |
using type = typename GetTypeImpl<ChildType, RecordCoord<TailCoords...>>::type; | |
}; | |
template<typename ChildType, std::size_t N, std::size_t HeadCoord, std::size_t... TailCoords> | |
struct GetTypeImpl<ChildType[N], RecordCoord<HeadCoord, TailCoords...>> | |
{ | |
using type = typename GetTypeImpl<ChildType, RecordCoord<TailCoords...>>::type; | |
}; | |
template<typename T> | |
struct GetTypeImpl<T, RecordCoord<>> | |
{ | |
static_assert(isAllowedFieldType<T>); | |
using type = T; | |
}; | |
} // namespace internal | |
/// Returns the type of a node in a record dimension tree identified by a given \ref RecordCoord or a series of | |
/// tags. | |
template<typename RecordDim, typename... RecordCoordOrTags> | |
using GetType = typename internal::GetTypeImpl<RecordDim, RecordCoordOrTags...>::type; | |
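// Sketch (Pixel example): GetType<Pixel, RecordCoord<2, 1>> and
// GetType<Pixel, Color, RecordCoord<1>> are both unsigned char.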
namespace internal | |
{ | |
template<typename RecordDim, typename RecordCoord> | |
struct LeafRecordCoordsImpl; | |
template<typename T, std::size_t... RCs> | |
struct LeafRecordCoordsImpl<T, RecordCoord<RCs...>> | |
{ | |
using type = boost::mp11::mp_list<RecordCoord<RCs...>>; | |
}; | |
template<typename... Fields, std::size_t... RCs> | |
struct LeafRecordCoordsImpl<Record<Fields...>, RecordCoord<RCs...>> | |
{ | |
template<std::size_t... Is> | |
static auto help(std::index_sequence<Is...>) | |
{ | |
return boost::mp11::mp_append< | |
typename LeafRecordCoordsImpl<GetFieldType<Fields>, RecordCoord<RCs..., Is>>::type...>{}; | |
} | |
using type = decltype(help(std::make_index_sequence<sizeof...(Fields)>{})); | |
}; | |
template<typename Child, std::size_t N, std::size_t... RCs> | |
struct LeafRecordCoordsImpl<Child[N], RecordCoord<RCs...>> | |
{ | |
template<std::size_t... Is> | |
static auto help(std::index_sequence<Is...>) | |
{ | |
return boost::mp11::mp_append< | |
typename LeafRecordCoordsImpl<Child, RecordCoord<RCs..., Is>>::type...>{}; | |
} | |
using type = decltype(help(std::make_index_sequence<N>{})); | |
}; | |
} // namespace internal | |
/// Returns a flat type list containing all record coordinates to all leaves of the given record dimension. | |
template<typename RecordDim> | |
using LeafRecordCoords = typename internal::LeafRecordCoordsImpl<RecordDim, RecordCoord<>>::type; | |
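// Sketch (Pixel example): LeafRecordCoords<Pixel> is
//   mp_list<RecordCoord<0>, RecordCoord<1>,
//           RecordCoord<2, 0>, RecordCoord<2, 1>, RecordCoord<2, 2>>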
namespace internal | |
{ | |
// adapted from boost::mp11, but with LLAMA_FN_HOST_ACC_INLINE | |
template<template<typename...> typename L, typename... T, typename F> | |
LLAMA_FN_HOST_ACC_INLINE constexpr void mp_for_each_inlined(L<T...>, F&& f) | |
{ | |
using A = int[sizeof...(T)]; | |
(void) A{((void) f(T{}), 0)...}; | |
} | |
} // namespace internal | |
/// Iterates over the record dimension tree and calls a functor on each element. | |
/// \param functor Functor to execute at each leaf of the record dimension tree. Needs to have `operator()` with a
/// template parameter for the \ref RecordCoord in the record dimension tree.
/// \param baseCoord \ref RecordCoord at which the iteration should be started. The functor is called on elements | |
/// beneath this coordinate. | |
template<typename RecordDim, typename Functor, std::size_t... Coords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeafCoord(Functor&& functor, RecordCoord<Coords...> baseCoord) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
internal::mp_for_each_inlined( | |
LeafRecordCoords<GetType<RecordDim, RecordCoord<Coords...>>>{}, | |
[&](auto innerCoord) LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr) | |
{ std::forward<Functor>(functor)(cat(baseCoord, innerCoord)); }); | |
} | |
/// Iterates over the record dimension tree and calls a functor on each element. | |
/// \param functor Functor to execute at each leaf of the record dimension tree. Needs to have `operator()` with a
/// template parameter for the \ref RecordCoord in the record dimension tree.
/// \param baseTags Tags used to define where the iteration should be started. The functor is called on elements | |
/// beneath this coordinate. | |
template<typename RecordDim, typename Functor, typename... Tags> | |
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeafCoord(Functor&& functor, Tags... /*baseTags*/) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
forEachLeafCoord<RecordDim>(std::forward<Functor>(functor), GetCoordFromTags<RecordDim, Tags...>{}); | |
} | |
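// Usage sketch (Pixel example): visit all leaves, or only the leaves below a tag:
//   llama::forEachLeafCoord<Pixel>([](auto rc) {
//       // rc is RecordCoord<0>, RecordCoord<1>, RecordCoord<2, 0>, ...
//   });
//   llama::forEachLeafCoord<Pixel>([](auto rc) { /* Color leaves only */ }, Color{});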
namespace internal | |
{ | |
template<typename T> | |
struct FlattenRecordDimImpl | |
{ | |
using type = boost::mp11::mp_list<T>; | |
}; | |
template<typename... Fields> | |
struct FlattenRecordDimImpl<Record<Fields...>> | |
{ | |
using type = boost::mp11::mp_append<typename FlattenRecordDimImpl<GetFieldType<Fields>>::type...>; | |
}; | |
template<typename Child, std::size_t N> | |
struct FlattenRecordDimImpl<Child[N]> | |
{ | |
using type = boost::mp11::mp_repeat_c<typename FlattenRecordDimImpl<Child>::type, N>; | |
}; | |
} // namespace internal | |
/// Returns a flat type list containing all leaf field types of the given record dimension. | |
template<typename RecordDim> | |
using FlatRecordDim = typename internal::FlattenRecordDimImpl<RecordDim>::type; | |
/// The total number of fields in the recursively expanded record dimension. | |
template<typename RecordDim> | |
inline constexpr std::size_t flatFieldCount = 1; | |
template<typename... Children> | |
inline constexpr std::size_t flatFieldCount< | |
Record<Children...>> = (flatFieldCount<GetFieldType<Children>> + ... + 0); | |
template<typename Child, std::size_t N> | |
inline constexpr std::size_t flatFieldCount<Child[N]> = flatFieldCount<Child>* N; | |
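// Sketch (Pixel example): FlatRecordDim<Pixel> is
// mp_list<float, float, unsigned char, unsigned char, unsigned char>, and
// flatFieldCount<Pixel> == 5.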
namespace internal | |
{ | |
template<std::size_t I, typename RecordDim> | |
inline constexpr std::size_t flatFieldCountBefore = 0; | |
template<typename... Children> | |
inline constexpr std::size_t flatFieldCountBefore<0, Record<Children...>> = 0; | |
// recursive formulation to benefit from template instantiation memoization | |
// this massively improves compilation time when this template is instantiated with a lot of different I | |
template<std::size_t I, typename... Children> | |
inline constexpr std::size_t flatFieldCountBefore< | |
I, | |
Record< | |
Children...>> = flatFieldCountBefore<I - 1, Record<Children...>> + flatFieldCount<GetFieldType<boost::mp11::mp_at_c<Record<Children...>, I - 1>>>; | |
} // namespace internal | |
/// The equivalent zero-based index into a flat record dimension (\ref FlatRecordDim) of the given hierarchical
/// record coordinate.
template<typename RecordDim, typename RecordCoord> | |
inline constexpr std::size_t flatRecordCoord = 0; | |
template<typename T> | |
inline constexpr std::size_t flatRecordCoord<T, RecordCoord<>> = 0; | |
template<typename... Children, std::size_t I, std::size_t... Is> | |
inline constexpr std::size_t flatRecordCoord< | |
Record<Children...>, | |
RecordCoord< | |
I, | |
Is...>> = internal:: | |
flatFieldCountBefore< | |
I, | |
Record< | |
Children...>> + flatRecordCoord<GetFieldType<boost::mp11::mp_at_c<Record<Children...>, I>>, RecordCoord<Is...>>; | |
template<typename Child, std::size_t N, std::size_t I, std::size_t... Is> | |
inline constexpr std::size_t flatRecordCoord<Child[N], RecordCoord<I, Is...>> = flatFieldCount<Child>* I | |
+ flatRecordCoord<Child, RecordCoord<Is...>>; | |
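// Sketch (Pixel example): flatRecordCoord<Pixel, RecordCoord<2, 1>> == 3, i.e. the
// second Color component is the fourth leaf of FlatRecordDim<Pixel>.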
namespace internal | |
{ | |
template<typename TypeList> | |
constexpr auto flatAlignOfImpl() | |
{ | |
using namespace boost::mp11; | |
std::size_t maxAlign = 0; | |
mp_for_each<mp_transform<mp_identity, TypeList>>([&](auto e) constexpr | |
{ | |
using T = typename decltype(e)::type; | |
maxAlign = std::max(maxAlign, alignof(T)); | |
}); | |
return maxAlign; | |
} | |
} // namespace internal | |
/// The alignment of a type list if its elements would be in a normal struct. | |
template<typename TypeList> | |
inline constexpr std::size_t flatAlignOf = internal::flatAlignOfImpl<TypeList>(); | |
/// The alignment of a type T. | |
template<typename T> | |
inline constexpr std::size_t alignOf = alignof(T); | |
/// The alignment of a record dimension if its fields would be in a normal struct. | |
template<typename... Fields> | |
inline constexpr std::size_t alignOf<Record<Fields...>> = flatAlignOf<FlatRecordDim<Record<Fields...>>>; | |
namespace internal | |
{ | |
constexpr void roundUpToMultiple(std::size_t& value, std::size_t multiple) | |
{ | |
value = ((value + multiple - 1) / multiple) * multiple; | |
} | |
template<typename TypeList, bool Align, bool IncludeTailPadding> | |
constexpr auto sizeOfImpl() -> std::size_t | |
{ | |
using namespace boost::mp11; | |
std::size_t size = 0; | |
std::size_t maxAlign = 0; | |
mp_for_each<mp_transform<mp_identity, TypeList>>([&](auto e) constexpr | |
{ | |
using T = typename decltype(e)::type; | |
if constexpr(Align) | |
{ | |
roundUpToMultiple(size, alignof(T)); | |
maxAlign = std::max(maxAlign, alignof(T)); | |
} | |
// NOLINTNEXTLINE(readability-misleading-indentation) | |
size += sizeof(T); | |
}); | |
// final padding, so next struct can start right away | |
if constexpr(Align && IncludeTailPadding) | |
roundUpToMultiple(size, maxAlign); // TODO(bgruber): we could use flatAlignOf<TypeList> here, at the | |
// cost of more template instantiations | |
return size; | |
} | |
template<bool Align, typename TypeList, std::size_t I> | |
constexpr auto offsetOfImplWorkaround() -> std::size_t; | |
// recursive formulation to benefit from template instantiation memoization | |
// this massively improves compilation time when this template is instantiated with a lot of different I | |
template<bool Align, typename TypeList, std::size_t I> | |
inline constexpr std::size_t offsetOfImpl | |
= offsetOfImplWorkaround<Align, TypeList, I>(); // FIXME: MSVC fails to compile an IILE here. | |
template<bool Align, typename TypeList> | |
inline constexpr std::size_t offsetOfImpl<Align, TypeList, 0> = 0; | |
template<bool Align, typename TypeList, std::size_t I> | |
constexpr auto offsetOfImplWorkaround() -> std::size_t | |
{ | |
std::size_t offset = offsetOfImpl<Align, TypeList, I - 1> + sizeof(boost::mp11::mp_at_c<TypeList, I - 1>); | |
if constexpr(Align) | |
roundUpToMultiple(offset, alignof(boost::mp11::mp_at_c<TypeList, I>)); | |
return offset; | |
} | |
} // namespace internal | |
/// The size of a type list if its elements would be in a normal struct. | |
template<typename TypeList, bool Align, bool IncludeTailPadding = true> | |
inline constexpr std::size_t flatSizeOf = internal::sizeOfImpl<TypeList, Align, IncludeTailPadding>(); | |
/// The size of a type T. | |
template<typename T, bool Align = false, bool IncludeTailPadding = true> | |
inline constexpr std::size_t sizeOf = sizeof(T); | |
/// The size of a record dimension if its fields would be in a normal struct. | |
template<typename... Fields, bool Align, bool IncludeTailPadding> | |
inline constexpr std::size_t sizeOf<Record<Fields...>, Align, IncludeTailPadding> = flatSizeOf< | |
FlatRecordDim<Record<Fields...>>, | |
Align, | |
IncludeTailPadding>; | |
/// The byte offset of an element in a type list if its elements would be in a normal struct.
template<typename TypeList, std::size_t I, bool Align> | |
inline constexpr std::size_t flatOffsetOf = internal::offsetOfImpl<Align, TypeList, I>; | |
/// The byte offset of an element in a record dimension if it would be a normal struct.
/// \tparam RecordDim Record dimension tree.
/// \tparam RecordCoord Record coordinate of an element in the record dimension tree.
template<typename RecordDim, typename RecordCoord, bool Align = false> | |
inline constexpr std::size_t offsetOf | |
= flatOffsetOf<FlatRecordDim<RecordDim>, flatRecordCoord<RecordDim, RecordCoord>, Align>; | |
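// Sketch (Pixel example, assuming 4-byte floats): sizeOf<Pixel> == 11 (packed),
// sizeOf<Pixel, true> == 12 (aligned, including tail padding), and
// offsetOf<Pixel, RecordCoord<2, 1>, true> == 9 (after two floats and one unsigned char).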
template<typename S> | |
auto structName(S = {}) -> std::string | |
{ | |
auto s = boost::core::demangle(typeid(S).name()); | |
if(const auto pos = s.rfind(':'); pos != std::string::npos) | |
s = s.substr(pos + 1); | |
return s; | |
} | |
namespace internal | |
{ | |
template<typename T> | |
struct IndirectValue | |
{ | |
T value; | |
auto operator->() -> T* | |
{ | |
return &value; | |
} | |
auto operator->() const -> const T* | |
{ | |
return &value; | |
} | |
}; | |
// TODO(bgruber): replace in C++20 | |
template<class T> | |
struct IsBoundedArray : std::false_type | |
{ | |
}; | |
template<class T, std::size_t N> | |
struct IsBoundedArray<T[N]> : std::true_type | |
{ | |
}; | |
} // namespace internal | |
/// Returns the integral n rounded up to be a multiple of mult. | |
template<typename Integral> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto roundUpToMultiple(Integral n, Integral mult) -> Integral | |
{ | |
return (n + mult - 1) / mult * mult; | |
} | |
namespace internal | |
{ | |
template<typename T, template<typename> typename TypeFunctor> | |
struct TransformLeavesImpl | |
{ | |
using type = TypeFunctor<T>; | |
}; | |
template<typename... Fields, template<typename> typename TypeFunctor> | |
struct TransformLeavesImpl<Record<Fields...>, TypeFunctor> | |
{ | |
using type = Record< | |
Field<GetFieldTag<Fields>, typename TransformLeavesImpl<GetFieldType<Fields>, TypeFunctor>::type>...>; | |
}; | |
template<typename Child, std::size_t N, template<typename> typename TypeFunctor> | |
struct TransformLeavesImpl<Child[N], TypeFunctor> | |
{ | |
using type = typename TransformLeavesImpl<Child, TypeFunctor>::type[N]; | |
}; | |
} // namespace internal | |
/// Creates a new record dimension where each new leaf field's type is the result of applying FieldTypeFunctor to | |
/// the original leaf field's type. | |
template<typename RecordDim, template<typename> typename FieldTypeFunctor> | |
using TransformLeaves = typename internal::TransformLeavesImpl<RecordDim, FieldTypeFunctor>::type; | |
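// Sketch (Pixel example): TransformLeaves<Pixel, std::add_pointer_t> keeps the
// tags and tree structure but turns every leaf type T into T*.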
namespace internal | |
{ | |
// TODO: we might implement this better by expanding a record dim into a list of tag lists and then computing a | |
// real set union of the two tag list lists | |
template<typename A, typename B> | |
auto mergeRecordDimsImpl(boost::mp11::mp_identity<A> a, boost::mp11::mp_identity<B>) | |
{ | |
static_assert(std::is_same_v<A, B>, "Cannot merge record and non-record or fields with different types"); | |
return a; | |
} | |
template<typename A, std::size_t NA, typename B, std::size_t NB> | |
auto mergeRecordDimsImpl( | |
[[maybe_unused]] boost::mp11::mp_identity<A[NA]> a, | |
[[maybe_unused]] boost::mp11::mp_identity<B[NB]> b) | |
{ | |
static_assert(std::is_same_v<A, B>, "Cannot merge arrays of different type"); | |
if constexpr(NA < NB) | |
return b; | |
else | |
return a; | |
} | |
template<typename... FieldsA> | |
auto mergeRecordDimsImpl(boost::mp11::mp_identity<Record<FieldsA...>> a, boost::mp11::mp_identity<Record<>>) | |
{ | |
return a; | |
} | |
template< | |
typename... FieldsA, | |
typename FieldB, | |
typename... FieldsB, | |
auto pos = FindFieldByTag<Record<FieldsA...>, GetFieldTag<FieldB>>::value> | |
auto mergeRecordDimsImpl( | |
boost::mp11::mp_identity<Record<FieldsA...>>, | |
boost::mp11::mp_identity<Record<FieldB, FieldsB...>>) | |
{ | |
using namespace boost::mp11; | |
if constexpr(pos == sizeof...(FieldsA)) | |
{ | |
return mergeRecordDimsImpl( | |
mp_identity<Record<FieldsA..., FieldB>>{}, | |
mp_identity<Record<FieldsB...>>{}); | |
} | |
else | |
{ | |
using OldFieldA = mp_at_c<Record<FieldsA...>, pos>; | |
using NewFieldA = Field< | |
GetFieldTag<OldFieldA>, | |
typename decltype(mergeRecordDimsImpl( | |
mp_identity<GetFieldType<OldFieldA>>{}, | |
mp_identity<GetFieldType<FieldB>>{}))::type>; | |
using NewRecordA = mp_replace_at_c<Record<FieldsA...>, pos, NewFieldA>; | |
return mergeRecordDimsImpl(mp_identity<NewRecordA>{}, mp_identity<Record<FieldsB...>>{}); | |
} | |
} | |
} // namespace internal | |
/// Creates a merged record dimension, where duplicated, nested fields are unified. | |
template<typename RecordDimA, typename RecordDimB> | |
using MergedRecordDims = typename decltype(internal::mergeRecordDimsImpl( | |
boost::mp11::mp_identity<RecordDimA>{}, | |
boost::mp11::mp_identity<RecordDimB>{}))::type; | |
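// Sketch (example tags from above): fields with the same tag are unified and
// nested records are merged recursively:
//   MergedRecordDims<
//       llama::Record<llama::Field<X, float>>,
//       llama::Record<llama::Field<X, float>, llama::Field<Y, float>>>
// is llama::Record<llama::Field<X, float>, llama::Field<Y, float>>.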
/// Returns the tags interspersed by '.' represented by the given record coord in the given record dimension. | |
template<typename RecordDim, std::size_t... Coords> | |
auto recordCoordTags(RecordCoord<Coords...>) -> std::string | |
{ | |
using Tags = GetTags<RecordDim, RecordCoord<Coords...>>; | |
std::string r; | |
boost::mp11::mp_for_each<Tags>( | |
[&](auto tag) | |
{ | |
using Tag = decltype(tag); | |
if(!r.empty()) | |
r += '.'; | |
if constexpr(isRecordCoord<Tag>) | |
{ | |
static_assert(Tag::size == 1); | |
r += std::to_string(Tag::front); // handle array indices | |
} | |
else | |
r += structName(tag); | |
}); | |
return r; | |
} | |
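// Sketch (Pixel example): recordCoordTags<Pixel>(RecordCoord<2, 1>{}) returns
// "Color.1".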
} // namespace llama | |
// == | |
// == ./Core.hpp == | |
// ============================================================================ | |
#include <algorithm> | |
#include <iterator> | |
// #include <limits> // amalgamate: file already included | |
#if CAN_USE_RANGES | |
# include <ranges> | |
#endif | |
namespace llama | |
{ | |
/// Iterator supporting \ref ArrayIndexRange. | |
template<typename ArrayExtents> | |
struct ArrayIndexIterator | |
{ | |
static_assert(!std::is_const_v<ArrayExtents>); | |
using value_type = typename ArrayExtents::Index; | |
using difference_type = std::ptrdiff_t; | |
using reference = value_type; | |
using pointer = internal::IndirectValue<value_type>; | |
using iterator_category = std::random_access_iterator_tag; | |
static constexpr std::size_t rank = ArrayExtents::rank; | |
constexpr ArrayIndexIterator() noexcept = default; | |
LLAMA_FN_HOST_ACC_INLINE constexpr ArrayIndexIterator(ArrayExtents extents, value_type current) noexcept | |
: extents(extents) | |
, current(current) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator*() const noexcept -> value_type | |
{ | |
return current; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator->() const noexcept -> pointer | |
{ | |
return {**this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator++() noexcept -> ArrayIndexIterator& | |
{ | |
current[rank - 1]++; | |
for(auto i = static_cast<int>(rank) - 2; i >= 0; i--) | |
{ | |
if(current[i + 1] != extents[i + 1]) | |
return *this; | |
current[i + 1] = 0; | |
current[i]++; | |
} | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator++(int) noexcept -> ArrayIndexIterator | |
{ | |
auto tmp = *this; | |
++*this; | |
return tmp; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator--() noexcept -> ArrayIndexIterator& | |
{ | |
current[rank - 1]--; | |
for(auto i = static_cast<int>(rank) - 2; i >= 0; i--) | |
{ | |
if(current[i + 1] != std::numeric_limits<std::size_t>::max()) | |
return *this; | |
current[i + 1] = extents[i] - 1; | |
current[i]--; | |
} | |
// decrementing beyond [0, 0, ..., 0] is UB | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator--(int) noexcept -> ArrayIndexIterator | |
{ | |
auto tmp = *this; | |
--*this; | |
return tmp; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator[](difference_type i) const noexcept -> reference | |
{ | |
return *(*this + i); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator+=(difference_type n) noexcept -> ArrayIndexIterator& | |
{ | |
// add n to all lower dimensions with carry | |
for(auto i = static_cast<int>(rank) - 1; i > 0 && n != 0; i--) | |
{ | |
n += static_cast<difference_type>(current[i]); | |
const auto s = static_cast<difference_type>(extents[i]); | |
auto mod = n % s; | |
n /= s; | |
if(mod < 0) | |
{ | |
mod += s; | |
n--; | |
} | |
current[i] = mod; | |
assert(current[i] < extents[i]); | |
} | |
current[0] = static_cast<difference_type>(current[0]) + n; | |
// current is either within bounds or at the end ([last + 1, 0, 0, ..., 0]) | |
assert( | |
(current[0] < extents[0] | |
|| (current[0] == extents[0] | |
&& std::all_of(std::begin(current) + 1, std::end(current), [](auto c) { return c == 0; }))) | |
&& "Iterator was moved past the end"); | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator+(ArrayIndexIterator it, difference_type n) noexcept -> ArrayIndexIterator | |
{ | |
it += n; | |
return it; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator+(difference_type n, ArrayIndexIterator it) noexcept -> ArrayIndexIterator | |
{ | |
return it + n; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator-=(difference_type n) noexcept -> ArrayIndexIterator& | |
{ | |
return operator+=(-n); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator-(ArrayIndexIterator it, difference_type n) noexcept -> ArrayIndexIterator | |
{ | |
it -= n; | |
return it; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator-(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept | |
-> difference_type | |
{ | |
assert(a.extents == b.extents); | |
// compute the distance in signed arithmetic to avoid unsigned wraparound when a < b | |
auto n = static_cast<difference_type>(a.current[rank - 1]) - static_cast<difference_type>(b.current[rank - 1]); | |
auto size = static_cast<difference_type>(a.extents[rank - 1]); | |
for(auto i = static_cast<int>(rank) - 2; i >= 0; i--) | |
{ | |
n += (static_cast<difference_type>(a.current[i]) - static_cast<difference_type>(b.current[i])) * size; | |
size *= static_cast<difference_type>(a.extents[i]); | |
} | |
return n; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator==( | |
const ArrayIndexIterator<ArrayExtents>& a, | |
const ArrayIndexIterator<ArrayExtents>& b) noexcept -> bool | |
{ | |
assert(a.extents == b.extents); | |
return a.current == b.current; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator!=( | |
const ArrayIndexIterator<ArrayExtents>& a, | |
const ArrayIndexIterator<ArrayExtents>& b) noexcept -> bool | |
{ | |
return !(a == b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator<(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool | |
{ | |
assert(a.extents == b.extents); | |
return std::lexicographical_compare( | |
std::begin(a.current), | |
std::end(a.current), | |
std::begin(b.current), | |
std::end(b.current)); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator>(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool | |
{ | |
return b < a; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator<=(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool | |
{ | |
return !(a > b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator>=(const ArrayIndexIterator& a, const ArrayIndexIterator& b) noexcept -> bool | |
{ | |
return !(a < b); | |
} | |
private: | |
ArrayExtents extents; // TODO(bgruber): we only need to store rank - 1 sizes | |
value_type current; | |
}; | |
/// Range allowing to iterate over all indices in an \ref ArrayExtents. | |
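/// A minimal usage sketch (extent values are illustrative; llama::ArrayExtentsDynamic is assumed to be the | |
/// all-runtime-extents type defined earlier in this header): | |
/// \code | |
/// llama::ArrayExtentsDynamic<2> extents{3, 4}; | |
/// for(auto ai : llama::ArrayIndexRange{extents}) | |
///     ; // ai visits {0, 0}, {0, 1}, ..., {2, 3} in C++ (row-major) order | |
/// \endcode | |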
template<typename ArrayExtents> | |
struct ArrayIndexRange | |
: private ArrayExtents | |
#if CAN_USE_RANGES | |
, std::ranges::view_base | |
#endif | |
{ | |
static_assert(!std::is_const_v<ArrayExtents>); | |
constexpr ArrayIndexRange() noexcept = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr explicit ArrayIndexRange(ArrayExtents extents) noexcept : ArrayExtents(extents) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto begin() const noexcept -> ArrayIndexIterator<ArrayExtents> | |
{ | |
return {*this, typename ArrayExtents::Index{}}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto end() const noexcept -> ArrayIndexIterator<ArrayExtents> | |
{ | |
auto endPos = typename ArrayExtents::Index{}; | |
endPos[0] = this->toArray()[0]; | |
return {*this, endPos}; | |
} | |
}; | |
} // namespace llama | |
// == | |
// == ./ArrayIndexRange.hpp == | |
// ============================================================================ | |
// #include "Core.hpp" // amalgamate: file already expanded | |
namespace llama | |
{ | |
namespace internal | |
{ | |
constexpr auto divRoundUp(std::size_t dividend, std::size_t divisor) -> std::size_t | |
{ | |
return (dividend + divisor - 1) / divisor; | |
} | |
} // namespace internal | |
// FIXME: this test is actually not correct, because __cpp_constexpr_dynamic_alloc only guarantees constexpr | |
// std::allocator | |
#ifdef __cpp_constexpr_dynamic_alloc | |
namespace internal | |
{ | |
template<typename T> | |
struct DynArray | |
{ | |
constexpr DynArray() = default; | |
constexpr DynArray(std::size_t n) | |
{ | |
data = new T[n]{}; | |
} | |
constexpr ~DynArray() | |
{ | |
delete[] data; | |
} | |
constexpr void resize(std::size_t n) | |
{ | |
delete[] data; | |
data = new T[n]{}; | |
} | |
T* data = nullptr; | |
}; | |
} // namespace internal | |
/// Proves by exhaustion of the array and record dimensions that no two values are mapped to overlapping memory. | |
// Unfortunately, this only works for smallish array dimensions, because of compiler limits on constexpr evaluation | |
// depth. | |
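// Usage sketch (Vec is a hypothetical record dimension; single-element extents keep the exhaustion cheap): | |
// constexpr auto m = llama::mapping::PackedOne<llama::ArrayExtentsStatic<1, 1>, Vec>{}; | |
// static_assert(llama::mapsNonOverlappingly(m)); | |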
template<typename Mapping> | |
constexpr auto mapsNonOverlappingly(const Mapping& m) -> bool | |
{ | |
internal::DynArray<internal::DynArray<std::uint64_t>> blobByteMapped(m.blobCount); | |
for(std::size_t i = 0; i < m.blobCount; i++) | |
blobByteMapped.data[i].resize(internal::divRoundUp(m.blobSize(i), 64)); | |
auto testAndSet = [&](auto blob, auto offset) constexpr | |
{ | |
const auto bit = std::uint64_t{1} << (offset % 64); | |
if(blobByteMapped.data[blob].data[offset / 64] & bit) | |
return true; | |
blobByteMapped.data[blob].data[offset / 64] |= bit; | |
return false; | |
}; | |
bool collision = false; | |
forEachLeafCoord<typename Mapping::RecordDim>([&](auto rc) constexpr | |
{ | |
if(collision) | |
return; | |
for(auto ai : ArrayIndexRange{m.extents()}) | |
{ | |
using Type | |
= GetType<typename Mapping::RecordDim, decltype(rc)>; | |
const auto [blob, offset] = m.blobNrAndOffset(ai, rc); | |
for(std::size_t b = 0; b < sizeof(Type); b++) | |
if(testAndSet(blob, offset + b)) | |
{ | |
collision = true; | |
break; | |
} | |
} | |
}); | |
return !collision; | |
} | |
#endif | |
/// Proves by exhaustion of the array and record dimensions that at least PieceLength elements are always stored | |
/// contiguously. | |
// Unfortunately, this only works for smallish array dimensions, because of compiler limits on constexpr evaluation | |
// depth. | |
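// Usage sketch (hypothetical: soa is an SoA-like mapping over 8 array elements, storing each field contiguously): | |
// static_assert(llama::mapsPiecewiseContiguous<8>(soa)); | |
// With PieceLength == 1 the proof holds trivially for any mapping. | |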
template<std::size_t PieceLength, typename Mapping> | |
constexpr auto mapsPiecewiseContiguous(const Mapping& m) -> bool | |
{ | |
bool collision = false; | |
forEachLeafCoord<typename Mapping::RecordDim>([&](auto rc) constexpr | |
{ | |
std::size_t flatIndex = 0; | |
std::size_t lastBlob | |
= std::numeric_limits<std::size_t>::max(); | |
std::size_t lastOffset | |
= std::numeric_limits<std::size_t>::max(); | |
for(auto ai : ArrayIndexRange{m.extents()}) | |
{ | |
using Type | |
= GetType<typename Mapping::RecordDim, decltype(rc)>; | |
const auto [blob, offset] = m.blobNrAndOffset(ai, rc); | |
if(flatIndex % PieceLength != 0 | |
&& (lastBlob != blob | |
|| lastOffset + sizeof(Type) != offset)) | |
{ | |
collision = true; | |
break; | |
} | |
lastBlob = blob; | |
lastOffset = offset; | |
flatIndex++; | |
} | |
}); | |
return !collision; | |
} | |
} // namespace llama | |
// == | |
// == ./Proofs.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./Vector.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// ============================================================================ | |
// == ./View.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Array.hpp" // amalgamate: file already expanded | |
// #include "ArrayIndexRange.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./BlobAllocators.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Array.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./Concepts.hpp == | |
// == | |
// #pragma once | |
// #include "Array.hpp" // amalgamate: file already expanded | |
// #include "Core.hpp" // amalgamate: file already expanded | |
// #include <type_traits> // amalgamate: file already included | |
#if __has_include(<concepts>) | |
# include <concepts> | |
#endif | |
#ifdef __cpp_lib_concepts | |
namespace llama | |
{ | |
// clang-format off | |
template <typename M> | |
concept Mapping = requires(M m) { | |
typename M::ArrayExtents; | |
typename M::ArrayIndex; | |
typename M::RecordDim; | |
{ m.extents() } -> std::same_as<typename M::ArrayExtents>; | |
{ M::blobCount } -> std::convertible_to<std::size_t>; | |
Array<int, M::blobCount>{}; // validates constexpr-ness | |
{ m.blobSize(std::size_t{}) } -> std::same_as<std::size_t>; | |
{ m.blobNrAndOffset(typename M::ArrayIndex{}) } -> std::same_as<NrAndOffset>; | |
{ m.template blobNrAndOffset<0>(typename M::ArrayIndex{}) } -> std::same_as<NrAndOffset>; | |
{ m.blobNrAndOffset(typename M::ArrayIndex{}, llama::RecordCoord<0>{}) } -> std::same_as<NrAndOffset>; | |
}; | |
// clang-format on | |
template<typename B> | |
concept Blob = requires(B b, std::size_t i) | |
{ | |
// according to http://eel.is/c++draft/intro.object#3 only std::byte and unsigned char can provide storage for | |
// other types | |
std::is_same_v<decltype(b[i]), std::byte&> || std::is_same_v<decltype(b[i]), unsigned char&>; | |
}; | |
// clang-format off | |
template <typename BA> | |
concept BlobAllocator = requires(BA ba, std::integral_constant<std::size_t, 16> alignment, std::size_t size) { | |
{ ba(alignment, size) } -> Blob; | |
}; | |
// clang-format on | |
} // namespace llama | |
#endif | |
// == | |
// == ./Concepts.hpp == | |
// ============================================================================ | |
// #include "macros.hpp" // amalgamate: file already expanded | |
#include <cstddef> | |
#include <memory> | |
#include <vector> | |
#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 11000 | |
# include <boost/shared_ptr.hpp> | |
#endif | |
namespace llama::bloballoc | |
{ | |
/// Allocates stack memory for a \ref View, which is copied each time a \ref View is copied. | |
/// \tparam BytesToReserve the amount of memory to reserve. | |
template<std::size_t BytesToReserve> | |
struct Stack | |
{ | |
template<std::size_t Alignment> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t) const | |
{ | |
struct alignas(Alignment) AlignedArray : Array<std::byte, BytesToReserve> | |
{ | |
}; | |
return AlignedArray{}; | |
} | |
}; | |
#ifdef __cpp_lib_concepts | |
static_assert(BlobAllocator<Stack<64>>); | |
#endif | |
/// Allocates heap memory managed by a `std::shared_ptr` for a \ref View. This memory is shared between all copies | |
/// of a \ref View. | |
struct SharedPtr | |
{ | |
// libc++ below 11.0.0 does not yet support shared_ptr with arrays | |
template<typename T> | |
using shared_ptr = | |
#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 11000 | |
boost::shared_ptr<T>; | |
#else | |
std::shared_ptr<T>; | |
#endif | |
template<std::size_t Alignment> | |
auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t count) const | |
-> shared_ptr<std::byte[]> | |
{ | |
auto* ptr | |
= static_cast<std::byte*>(::operator new[](count * sizeof(std::byte), std::align_val_t{Alignment})); | |
auto deleter = [=](std::byte* ptr) { ::operator delete[](ptr, std::align_val_t{Alignment}); }; | |
return shared_ptr<std::byte[]>{ptr, deleter}; | |
} | |
}; | |
#ifdef __cpp_lib_concepts | |
static_assert(BlobAllocator<SharedPtr>); | |
#endif | |
/// An STL-compatible allocator that allows specifying the allocation alignment. | |
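/// E.g. (usage sketch): std::vector<float, llama::bloballoc::AlignedAllocator<float, 64>> v(100); // 64-byte aligned | |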
template<typename T, std::size_t Alignment> | |
struct AlignedAllocator | |
{ | |
using value_type = T; | |
inline AlignedAllocator() noexcept = default; | |
template<typename T2> | |
inline explicit AlignedAllocator(AlignedAllocator<T2, Alignment> const&) noexcept | |
{ | |
} | |
inline auto allocate(std::size_t n) -> T* | |
{ | |
return static_cast<T*>(::operator new[](n * sizeof(T), std::align_val_t{Alignment})); | |
} | |
inline void deallocate(T* p, std::size_t) | |
{ | |
::operator delete[](p, std::align_val_t{Alignment}); | |
} | |
template<typename T2> | |
struct rebind // NOLINT(readability-identifier-naming) | |
{ | |
using other = AlignedAllocator<T2, Alignment>; | |
}; | |
auto operator!=(const AlignedAllocator<T, Alignment>& other) const -> bool | |
{ | |
return !(*this == other); | |
} | |
auto operator==(const AlignedAllocator<T, Alignment>&) const -> bool | |
{ | |
return true; | |
} | |
}; | |
/// Allocates heap memory managed by a `std::vector` for a \ref View, which is copied each time a \ref View is | |
/// copied. | |
struct Vector | |
{ | |
template<std::size_t Alignment> | |
inline auto operator()(std::integral_constant<std::size_t, Alignment>, std::size_t count) const | |
{ | |
return std::vector<std::byte, AlignedAllocator<std::byte, Alignment>>(count); | |
} | |
}; | |
#ifdef __cpp_lib_concepts | |
static_assert(BlobAllocator<Vector>); | |
#endif | |
} // namespace llama::bloballoc | |
// == | |
// == ./BlobAllocators.hpp == | |
// ============================================================================ | |
// #include "Concepts.hpp" // amalgamate: file already expanded | |
// #include "Core.hpp" // amalgamate: file already expanded | |
// #include "macros.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/One.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "../Core.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/Common.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "../Core.hpp" // amalgamate: file already expanded | |
#include <climits> | |
namespace llama::mapping | |
{ | |
/// Functor that maps an \ref ArrayIndex into linear numbers the way C++ arrays work. The fast-moving index of the | |
/// ArrayIndex object should be the last one. E.g. an ArrayIndex<3> a stores 3 indices, of which a[2] should be | |
/// incremented in the innermost loop. | |
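/// For example (illustrative values): extents {4, 5} and index {2, 3} linearize to 2 * 5 + 3 = 13. | |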
struct LinearizeArrayDimsCpp | |
{ | |
template<typename ArrayExtents> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) -> std::size_t | |
{ | |
return product(extents); | |
} | |
/// \param ai Index in the array dimensions. | |
/// \param extents Total size of the array dimensions. | |
/// \return Linearized index. | |
template<typename ArrayExtents> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()( | |
const typename ArrayExtents::Index& ai, | |
const ArrayExtents& extents) const -> std::size_t | |
{ | |
if constexpr(ArrayExtents::rank == 0) | |
return 0; | |
else | |
{ | |
std::size_t address = ai[0]; | |
for(std::size_t i = 1; i < ArrayExtents::rank; i++) | |
{ | |
address *= extents[i]; | |
address += ai[i]; | |
} | |
return address; | |
} | |
} | |
}; | |
/// Functor that maps an \ref ArrayIndex into linear numbers the way Fortran arrays work. The fast-moving index of | |
/// the ArrayIndex object should be the last one. E.g. an ArrayIndex<3> a stores 3 indices, of which a[2] should be | |
/// incremented in the innermost loop. | |
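/// For example (illustrative values): extents {4, 5} and index {2, 3} linearize to 3 * 4 + 2 = 14. | |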
struct LinearizeArrayDimsFortran | |
{ | |
template<typename ArrayExtents> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) -> std::size_t | |
{ | |
return product(extents); | |
} | |
/// \param ai Index in the array dimensions. | |
/// \param extents Total size of the array dimensions. | |
/// \return Linearized index. | |
template<typename ArrayExtents> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()( | |
const typename ArrayExtents::Index& ai, | |
const ArrayExtents& extents) const -> std::size_t | |
{ | |
if constexpr(ArrayExtents::rank == 0) | |
return 0; | |
else | |
{ | |
std::size_t address = ai[ArrayExtents::rank - 1]; | |
for(int i = static_cast<int>(ArrayExtents::rank) - 2; i >= 0; i--) | |
{ | |
address *= extents[i]; | |
address += ai[i]; | |
} | |
return address; | |
} | |
} | |
}; | |
/// Functor that maps an \ref ArrayIndex into linear numbers using the Z-order space filling curve (Morton codes). | |
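/// For example (illustrative values): the rank-2 index {1, 2}, binary {01, 10}, interleaves to binary 0110 = 6. | |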
struct LinearizeArrayDimsMorton | |
{ | |
template<typename ArrayExtents> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto size(const ArrayExtents& extents) const -> std::size_t | |
{ | |
if constexpr(ArrayExtents::rank == 0) | |
return 0; | |
else | |
{ | |
std::size_t longest = extents[0]; | |
for(std::size_t i = 1; i < ArrayExtents::rank; i++) | |
longest = std::max(longest, extents[i]); | |
const auto longestPO2 = bit_ceil(longest); | |
return intPow(longestPO2, ArrayExtents::rank); | |
} | |
} | |
/// \param ai Coordinate in the array dimensions. | |
/// \param extents Total size of the array dimensions. | |
/// \return Linearized index. | |
template<typename ArrayExtents> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator()( | |
const typename ArrayExtents::Index& ai, | |
[[maybe_unused]] const ArrayExtents& extents) const -> std::size_t | |
{ | |
std::size_t r = 0; | |
for(std::size_t bit = 0; bit < (sizeof(std::size_t) * CHAR_BIT) / ArrayExtents::rank; bit++) | |
for(std::size_t i = 0; i < ArrayExtents::rank; i++) | |
r |= (ai[i] & (std::size_t{1} << bit)) << ((bit + 1) * (ArrayExtents::rank - 1) - i); | |
return r; | |
} | |
private: | |
LLAMA_FN_HOST_ACC_INLINE static constexpr auto bit_ceil(std::size_t n) -> std::size_t | |
{ | |
std::size_t r = 1; | |
while(r < n) | |
r <<= 1u; | |
return r; | |
} | |
LLAMA_FN_HOST_ACC_INLINE static constexpr auto intPow(std::size_t b, std::size_t e) -> std::size_t | |
{ | |
e--; | |
auto r = b; | |
while(e != 0u) | |
{ | |
r *= b; | |
e--; | |
} | |
return r; | |
} | |
}; | |
/// Flattens the record dimension in the order fields are written. | |
template<typename RecordDim> | |
struct FlattenRecordDimInOrder | |
{ | |
using FlatRecordDim = llama::FlatRecordDim<RecordDim>; | |
template<std::size_t... RecordCoords> | |
static constexpr std::size_t flatIndex = flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>; | |
}; | |
/// Flattens the record dimension by sorting the fields according to a given predicate on the field types. | |
/// \tparam Less A binary predicate accepting two field types and exposing a boolean member value, which is true if | |
/// the first field type is less than the second one, and false otherwise. | |
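/// For example (hypothetical field types): flattened fields [double, char, float] sorted by increasing alignment | |
/// become [char, float, double]; flatIndex then translates coordinates from the original into the sorted order. | |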
template<typename RecordDim, template<typename, typename> typename Less> | |
struct FlattenRecordDimSorted | |
{ | |
private: | |
using FlatOrigRecordDim = llama::FlatRecordDim<RecordDim>; | |
using FlatSortedRecordDim = boost::mp11::mp_sort<FlatOrigRecordDim, Less>; | |
template<typename A, typename B> | |
using LessWithIndices | |
= Less<boost::mp11::mp_at<FlatOrigRecordDim, A>, boost::mp11::mp_at<FlatOrigRecordDim, B>>; | |
// A permutation from new FlatSortedRecordDim index to old FlatOrigRecordDim index | |
using PermutedIndices | |
= boost::mp11::mp_sort<boost::mp11::mp_iota<boost::mp11::mp_size<FlatOrigRecordDim>>, LessWithIndices>; | |
template<typename A, typename B> | |
using LessInvertPermutation = std::bool_constant<( | |
boost::mp11::mp_at<PermutedIndices, A>::value < boost::mp11::mp_at<PermutedIndices, B>::value)>; | |
// A permutation from old FlatOrigRecordDim index to new FlatSortedRecordDim index | |
using InversePermutedIndices = boost::mp11:: | |
mp_sort<boost::mp11::mp_iota<boost::mp11::mp_size<FlatOrigRecordDim>>, LessInvertPermutation>; | |
public: | |
using FlatRecordDim = FlatSortedRecordDim; | |
template<std::size_t... RecordCoords> | |
static constexpr std::size_t flatIndex = []() constexpr | |
{ | |
constexpr auto indexBefore = flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>; | |
constexpr auto indexAfter = boost::mp11::mp_at_c<InversePermutedIndices, indexBefore>::value; | |
return indexAfter; | |
} | |
(); | |
}; | |
namespace internal | |
{ | |
template<typename A, typename B> | |
using LessAlignment = std::bool_constant<alignof(A) < alignof(B)>; | |
template<typename A, typename B> | |
using MoreAlignment = std::bool_constant<(alignof(A) > alignof(B))>; | |
} // namespace internal | |
/// Flattens and sorts the record dimension by increasing alignment of its fields. | |
template<typename RecordDim> | |
using FlattenRecordDimIncreasingAlignment = FlattenRecordDimSorted<RecordDim, internal::LessAlignment>; | |
/// Flattens and sorts the record dimension by decreasing alignment of its fields. | |
template<typename RecordDim> | |
using FlattenRecordDimDecreasingAlignment = FlattenRecordDimSorted<RecordDim, internal::MoreAlignment>; | |
/// Flattens and sorts the record dimension by the alignment of its fields to minimize padding. | |
template<typename RecordDim> | |
using FlattenRecordDimMinimizePadding = FlattenRecordDimIncreasingAlignment<RecordDim>; | |
} // namespace llama::mapping | |
// == | |
// == ./mapping/Common.hpp == | |
// ============================================================================ | |
namespace llama::mapping | |
{ | |
/// Maps all array dimension indices to the same location and lays out the struct members consecutively. This | |
/// mapping is used for temporary, single-element views. | |
/// \tparam AlignAndPad If true, padding bytes are inserted to guarantee that struct members are properly aligned. | |
/// If false, struct members are tightly packed. | |
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref | |
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and | |
/// \ref FlattenRecordDimMinimizePadding. | |
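/// A usage sketch (Vec is a hypothetical record dimension whose first field is a leaf): | |
/// \code | |
/// llama::mapping::MinAlignedOne<llama::ArrayExtentsStatic<1, 1>, Vec> m{}; | |
/// const auto [blob, offset] = m.blobNrAndOffset({}, llama::RecordCoord<0>{}); // blob is always 0 | |
/// \endcode | |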
template< | |
typename TArrayExtents, | |
typename TRecordDim, | |
bool AlignAndPad = true, | |
template<typename> typename FlattenRecordDim = FlattenRecordDimMinimizePadding> | |
struct One : TArrayExtents | |
{ | |
using ArrayExtents = TArrayExtents; | |
using ArrayIndex = typename ArrayExtents::Index; | |
using RecordDim = TRecordDim; | |
static constexpr std::size_t blobCount = 1; | |
constexpr One() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr explicit One(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return ArrayExtents{*this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t | |
{ | |
return flatSizeOf<typename Flattener::FlatRecordDim, AlignAndPad, false>; // no tail padding | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex, RecordCoord<RecordCoords...> = {}) const | |
-> NrAndOffset | |
{ | |
constexpr std::size_t flatFieldIndex = | |
#ifdef __NVCC__ | |
*& // mess with nvcc compiler state to work around a bug | |
#endif | |
Flattener::template flatIndex<RecordCoords...>; | |
constexpr auto offset = flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>; | |
return {0, offset}; | |
} | |
private: | |
using Flattener = FlattenRecordDim<TRecordDim>; | |
}; | |
/// One mapping preserving the alignment of the field types by inserting padding. | |
/// \see One | |
template<typename ArrayExtents, typename RecordDim> | |
using AlignedOne = One<ArrayExtents, RecordDim, true, FlattenRecordDimInOrder>; | |
/// One mapping preserving the alignment of the field types by inserting padding and permuting the field order to | |
/// minimize this padding. | |
/// \see One | |
template<typename ArrayExtents, typename RecordDim> | |
using MinAlignedOne = One<ArrayExtents, RecordDim, true, FlattenRecordDimMinimizePadding>; | |
/// One mapping packing the field types tightly, violating the types' alignment requirements. | |
/// \see One | |
template<typename ArrayExtents, typename RecordDim> | |
using PackedOne = One<ArrayExtents, RecordDim, false, FlattenRecordDimInOrder>; | |
template<typename Mapping> | |
inline constexpr bool isOne = false; | |
template<typename ArrayExtents, typename RecordDim, bool AlignAndPad, template<typename> typename FlattenRecordDim> | |
inline constexpr bool isOne<One<ArrayExtents, RecordDim, AlignAndPad, FlattenRecordDim>> = true; | |
} // namespace llama::mapping | |
// == | |
// == ./mapping/One.hpp == | |
// ============================================================================ | |
// #include <type_traits> // amalgamate: file already included | |
namespace llama | |
{ | |
#ifdef __cpp_lib_concepts | |
template<typename TMapping, Blob BlobType> | |
#else | |
template<typename TMapping, typename BlobType> | |
#endif | |
struct View; | |
namespace internal | |
{ | |
template<typename Allocator, typename RecordDim> | |
using AllocatorBlobType | |
= decltype(std::declval<Allocator>()(std::integral_constant<std::size_t, alignOf<RecordDim>>{}, 0)); | |
LLAMA_SUPPRESS_HOST_DEVICE_WARNING | |
template<typename Allocator, typename Mapping, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE auto makeBlobArray( | |
const Allocator& alloc, | |
const Mapping& mapping, | |
std::integer_sequence<std::size_t, Is...>) | |
-> Array<AllocatorBlobType<Allocator, typename Mapping::RecordDim>, Mapping::blobCount> | |
{ | |
[[maybe_unused]] constexpr auto alignment | |
= alignOf<typename Mapping::RecordDim>; // g++-12 warns that alignment is unused | |
return {alloc(std::integral_constant<std::size_t, alignment>{}, mapping.blobSize(Is))...}; | |
} | |
} // namespace internal | |
/// Same as \ref allocView but does not run field constructors. | |
#ifdef __cpp_lib_concepts | |
template<typename Mapping, BlobAllocator Allocator = bloballoc::Vector> | |
#else | |
template<typename Mapping, typename Allocator = bloballoc::Vector> | |
#endif | |
LLAMA_FN_HOST_ACC_INLINE auto allocViewUninitialized(Mapping mapping = {}, const Allocator& alloc = {}) | |
-> View<Mapping, internal::AllocatorBlobType<Allocator, typename Mapping::RecordDim>> | |
{ | |
auto blobs = internal::makeBlobArray(alloc, mapping, std::make_index_sequence<Mapping::blobCount>{}); | |
return {std::move(mapping), std::move(blobs)}; | |
} | |
namespace internal | |
{ | |
template<typename Mapping, typename RecordCoord, typename = void> | |
struct IsComputed : std::false_type | |
{ | |
}; | |
template<typename Mapping, typename RecordCoord> | |
struct IsComputed<Mapping, RecordCoord, std::void_t<decltype(Mapping::isComputed(RecordCoord{}))>> | |
: std::bool_constant<Mapping::isComputed(RecordCoord{})> | |
{ | |
}; | |
} // namespace internal | |
/// Returns true if the field accessed via the given mapping and record coordinate is a computed value. | |
template<typename Mapping, typename RecordCoord> | |
inline constexpr bool isComputed = internal::IsComputed<Mapping, RecordCoord>::value; | |
/// Runs the constructor of all fields reachable through the given view. Computed fields are not constructed. | |
template<typename Mapping, typename BlobType> | |
LLAMA_FN_HOST_ACC_INLINE void constructFields(View<Mapping, BlobType>& view) | |
{ | |
using View = View<Mapping, BlobType>; | |
using RecordDim = typename View::RecordDim; | |
forEachADCoord( | |
view.mapping().extents(), | |
[&](typename View::ArrayIndex ai) | |
{ | |
if constexpr(isRecord<RecordDim> || internal::IsBoundedArray<RecordDim>::value) | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) | |
{ | |
// TODO(bgruber): we could initialize computed fields if we can write to those. We could | |
// test if the returned value can be cast to a T& and then attempt to write. | |
if constexpr(!isComputed<Mapping, decltype(rc)>) | |
new(&view(ai)(rc)) GetType<RecordDim, decltype(rc)>; | |
}); | |
else if constexpr(!isComputed<Mapping, RecordCoord<>>) | |
new(&view(ai)) RecordDim; | |
}); | |
} | |
/// Creates a view based on the given mapping, e.g. \ref AoS or \ref SoA. For allocating the view's underlying | |
/// memory, the specified allocator callable is used (or the default one, which is \ref bloballoc::Vector). The | |
/// allocator callable is called with the alignment and size of bytes to allocate for each blob of the mapping. | |
/// The constructors are run for all fields by calling \ref constructFields. This function is the preferred way to | |
/// create a \ref View. See also \ref allocViewUninitialized. | |
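/// A minimal usage sketch (Vec is a hypothetical record dimension whose first field is a float leaf): | |
/// \code | |
/// auto view = llama::allocView(llama::mapping::AlignedOne<llama::ArrayExtentsStatic<1, 1>, Vec>{}); | |
/// view(0u)(llama::RecordCoord<0>{}) = 1.0f; // writes the first field of the single record | |
/// \endcode | |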
#ifdef __cpp_lib_concepts | |
template<typename Mapping, BlobAllocator Allocator = bloballoc::Vector> | |
#else | |
template<typename Mapping, typename Allocator = bloballoc::Vector> | |
#endif | |
LLAMA_FN_HOST_ACC_INLINE auto allocView(Mapping mapping = {}, const Allocator& alloc = {}) | |
-> View<Mapping, internal::AllocatorBlobType<Allocator, typename Mapping::RecordDim>> | |
{ | |
auto view = allocViewUninitialized(std::move(mapping), alloc); | |
constructFields(view); | |
return view; | |
} | |
/// Allocates a \ref View holding a single record backed by stack memory (\ref bloballoc::Stack). | |
/// \tparam Dim Dimension of the \ref ArrayExtents of the \ref View. | |
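/// E.g. (sketch, Vec being a hypothetical record dimension): auto v = llama::allocViewStack<1, Vec>(); | |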
template<std::size_t Dim, typename RecordDim> | |
LLAMA_FN_HOST_ACC_INLINE auto allocViewStack() -> decltype(auto) | |
{ | |
constexpr auto mapping = mapping::MinAlignedOne<ArrayExtentsStatic<Dim, 1>, RecordDim>{}; | |
return allocView(mapping, bloballoc::Stack<mapping.blobSize(0)>{}); | |
} | |
template<typename View, typename BoundRecordCoord = RecordCoord<>, bool OwnView = false> | |
struct VirtualRecord; | |
/// A \ref VirtualRecord that owns and holds a single value. | |
template<typename RecordDim> | |
using One = VirtualRecord<decltype(allocViewStack<0, RecordDim>()), RecordCoord<>, true>; | |
// TODO(bgruber): Higher-dimensional iterators might not have good codegen. Multiple nested loops seem to be | |
// superior to a single iterator over multiple dimensions; at least compilers produce better code for them. | |
// std::mdspan also discovered similar difficulties and there was a discussion in WG21 in Oulu 2016 to | |
// remove/postpone iterators from the design. In std::mdspan's design, the iterator iterated over the co-domain. | |
template<typename View> | |
struct Iterator | |
{ | |
using ArrayIndexIterator = llama::ArrayIndexIterator<typename View::ArrayExtents>; | |
using iterator_category = std::random_access_iterator_tag; | |
using value_type = One<typename View::RecordDim>; | |
using difference_type = typename ArrayIndexIterator::difference_type; | |
using pointer = internal::IndirectValue<VirtualRecord<View>>; | |
using reference = VirtualRecord<View>; | |
constexpr Iterator() = default; | |
LLAMA_FN_HOST_ACC_INLINE constexpr Iterator(ArrayIndexIterator arrayIndex, View* view) | |
: arrayIndex(arrayIndex) | |
, view(view) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator++() -> Iterator& | |
{ | |
++arrayIndex; | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator++(int) -> Iterator | |
{ | |
auto tmp = *this; | |
++*this; | |
return tmp; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator--() -> Iterator& | |
{ | |
--arrayIndex; | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator--(int) -> Iterator | |
{ | |
auto tmp{*this}; | |
--*this; | |
return tmp; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator*() const -> reference | |
{ | |
return (*view)(*arrayIndex); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator->() const -> pointer | |
{ | |
return {**this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator[](difference_type i) const -> reference | |
{ | |
return *(*this + i); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator+=(difference_type n) -> Iterator& | |
{ | |
arrayIndex += n; | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator+(Iterator it, difference_type n) -> Iterator | |
{ | |
it += n; | |
return it; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator+(difference_type n, Iterator it) -> Iterator | |
{ | |
return it + n; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto operator-=(difference_type n) -> Iterator& | |
{ | |
arrayIndex -= n; | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator-(Iterator it, difference_type n) -> Iterator | |
{ | |
it -= n; | |
return it; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator-(const Iterator& a, const Iterator& b) -> difference_type | |
{ | |
assert(a.view == b.view); | |
return static_cast<std::ptrdiff_t>(a.arrayIndex - b.arrayIndex); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator==(const Iterator& a, const Iterator& b) -> bool | |
{ | |
assert(a.view == b.view); | |
return a.arrayIndex == b.arrayIndex; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator!=(const Iterator& a, const Iterator& b) -> bool | |
{ | |
return !(a == b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator<(const Iterator& a, const Iterator& b) -> bool | |
{ | |
assert(a.view == b.view); | |
return a.arrayIndex < b.arrayIndex; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator>(const Iterator& a, const Iterator& b) -> bool | |
{ | |
return b < a; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator<=(const Iterator& a, const Iterator& b) -> bool | |
{ | |
return !(a > b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
friend constexpr auto operator>=(const Iterator& a, const Iterator& b) -> bool | |
{ | |
return !(a < b); | |
} | |
ArrayIndexIterator arrayIndex; | |
View* view; | |
}; | |
/// Central LLAMA class holding memory for storage and giving access to values stored there defined by a mapping. A | |
/// view should be created using \ref allocView. | |
/// \tparam TMapping The mapping used by the view to map accesses into memory. | |
/// \tparam BlobType The storage type used by the view holding memory. | |
#ifdef __cpp_lib_concepts | |
template<typename TMapping, Blob BlobType> | |
#else | |
template<typename TMapping, typename BlobType> | |
#endif | |
struct View | |
: private TMapping | |
#if CAN_USE_RANGES | |
, std::ranges::view_base | |
#endif | |
{ | |
static_assert(!std::is_const_v<TMapping>); | |
using Mapping = TMapping; | |
using ArrayExtents = typename Mapping::ArrayExtents; | |
using ArrayIndex = typename Mapping::ArrayIndex; | |
using RecordDim = typename Mapping::RecordDim; | |
using iterator = Iterator<View>; | |
using const_iterator = Iterator<const View>; | |
static_assert( | |
std::is_same_v<Mapping, std::decay_t<Mapping>>, | |
"Mapping must not be const qualified or a reference. Are you using decltype(...) as View template " | |
"argument?"); | |
static_assert( | |
std::is_same_v<ArrayExtents, std::decay_t<ArrayExtents>>, | |
"Mapping::ArrayExtents must not be const qualified or a reference. Are you using decltype(...) as mapping " | |
"template argument?"); | |
View() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
View(Mapping mapping, Array<BlobType, Mapping::blobCount> storageBlobs) | |
: Mapping(std::move(mapping)) | |
, storageBlobs(std::move(storageBlobs)) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto mapping() -> Mapping& | |
{ | |
return static_cast<Mapping&>(*this); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto mapping() const -> const Mapping& | |
{ | |
return static_cast<const Mapping&>(*this); | |
} | |
/// Retrieves the \ref VirtualRecord at the given \ref ArrayIndex index. | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto) | |
{ | |
if constexpr(isRecord<RecordDim> || internal::IsBoundedArray<RecordDim>::value) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return VirtualRecord<const View>{ai, *this}; | |
} | |
else | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return accessor(ai, RecordCoord<>{}); | |
} | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) -> decltype(auto) | |
{ | |
if constexpr(isRecord<RecordDim> || internal::IsBoundedArray<RecordDim>::value) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return VirtualRecord<View>{ai, *this}; | |
} | |
else | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return accessor(ai, RecordCoord<>{}); | |
} | |
} | |
/// Retrieves the \ref VirtualRecord at the \ref ArrayIndex index constructed from the passed component | |
/// indices. | |
template<typename... Indices> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) const -> decltype(auto) | |
{ | |
static_assert( | |
sizeof...(Indices) == ArrayIndex::rank, | |
"Please specify as many indices as you have array dimensions"); | |
static_assert( | |
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>, | |
"Indices must be convertible to std::size_t"); | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return (*this)(ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...}); | |
} | |
template<typename... Indices> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) -> decltype(auto) | |
{ | |
static_assert( | |
sizeof...(Indices) == ArrayIndex::rank, | |
"Please specify as many indices as you have array dimensions"); | |
static_assert( | |
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>, | |
"Indices must be convertible to std::size_t"); | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return (*this)(ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...}); | |
} | |
/// Retrieves the \ref VirtualRecord at the \ref ArrayIndex index constructed from the passed component | |
/// indices. | |
LLAMA_FN_HOST_ACC_INLINE auto operator[](ArrayIndex ai) const -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return (*this)(ai); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto operator[](ArrayIndex ai) -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return (*this)(ai); | |
} | |
/// Retrieves the \ref VirtualRecord at the 1D \ref ArrayIndex index constructed from the passed index. | |
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t index) const -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return (*this)(index); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t index) -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return (*this)(index); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto begin() -> iterator | |
{ | |
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.begin(), this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto begin() const -> const_iterator | |
{ | |
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.begin(), this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto end() -> iterator | |
{ | |
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.end(), this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto end() const -> const_iterator | |
{ | |
return {ArrayIndexRange<ArrayExtents>{mapping().extents()}.end(), this}; | |
} | |
Array<BlobType, Mapping::blobCount> storageBlobs; | |
private: | |
template<typename TView, typename TBoundRecordCoord, bool OwnView> | |
friend struct VirtualRecord; | |
LLAMA_SUPPRESS_HOST_DEVICE_WARNING | |
template<std::size_t... Coords> | |
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai, RecordCoord<Coords...> rc = {}) const -> decltype(auto) | |
{ | |
if constexpr(llama::isComputed<Mapping, RecordCoord<Coords...>>) | |
return mapping().compute(ai, rc, storageBlobs); | |
else | |
{ | |
const auto [nr, offset] = mapping().blobNrAndOffset(ai, rc); | |
using Type = GetType<RecordDim, RecordCoord<Coords...>>; | |
return reinterpret_cast<const Type&>(storageBlobs[nr][offset]); | |
} | |
} | |
LLAMA_SUPPRESS_HOST_DEVICE_WARNING | |
template<std::size_t... Coords> | |
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai, RecordCoord<Coords...> rc = {}) -> decltype(auto) | |
{ | |
if constexpr(llama::isComputed<Mapping, RecordCoord<Coords...>>) | |
return mapping().compute(ai, rc, storageBlobs); | |
else | |
{ | |
const auto [nr, offset] = mapping().blobNrAndOffset(ai, rc); | |
using Type = GetType<RecordDim, RecordCoord<Coords...>>; | |
using QualifiedType = std::conditional_t< | |
std::is_const_v<std::remove_reference_t<decltype(storageBlobs[nr][offset])>>, | |
const Type, | |
Type>; | |
return reinterpret_cast<QualifiedType&>(storageBlobs[nr][offset]); | |
} | |
} | |
}; | |
template<typename View> | |
inline constexpr auto IsView = false; | |
template<typename Mapping, typename BlobType> | |
inline constexpr auto IsView<View<Mapping, BlobType>> = true; | |
/// Acts like a \ref View, but shows only a smaller and/or shifted part of another view it references, the parent | |
/// view. | |
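/// A usage sketch (parentView is a hypothetical rank-1 \ref View): | |
/// \code | |
/// llama::VirtualView shifted{parentView, {10}}; | |
/// shifted(2u); // accesses parentView(12) | |
/// \endcode | |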
template<typename TParentView> | |
struct VirtualView | |
{ | |
using ParentView = TParentView; ///< type of the parent view | |
using Mapping = typename ParentView::Mapping; ///< mapping of the parent view | |
using ArrayExtents = typename Mapping::ArrayExtents; ///< array extents of the parent view | |
using ArrayIndex = typename Mapping::ArrayIndex; ///< array index of the parent view | |
/// Creates a VirtualView given a parent \ref View and offset. | |
LLAMA_FN_HOST_ACC_INLINE | |
VirtualView(ParentView& parentView, ArrayIndex offset) : parentView(parentView), offset(offset) | |
{ | |
} | |
template<std::size_t... Coords> | |
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai) const -> const auto& | |
{ | |
return parentView.template accessor<Coords...>(ArrayIndex{ai + offset}); | |
} | |
template<std::size_t... Coords> | |
LLAMA_FN_HOST_ACC_INLINE auto accessor(ArrayIndex ai) -> auto& | |
{ | |
return parentView.template accessor<Coords...>(ArrayIndex{ai + offset}); | |
} | |
/// Same as \ref View::operator()(ArrayIndex), but shifted by the offset of this \ref VirtualView. | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) const -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return parentView(ArrayIndex{ai + offset}); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(ArrayIndex ai) -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return parentView(ArrayIndex{ai + offset}); | |
} | |
/// Same as corresponding operator in \ref View, but shifted by the offset of this \ref VirtualView. | |
template<typename... Indices> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) const -> decltype(auto) | |
{ | |
static_assert( | |
sizeof...(Indices) == ArrayIndex::rank, | |
"Please specify as many indices as you have array dimensions"); | |
static_assert( | |
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>, | |
"Indices must be convertible to std::size_t"); | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return parentView( | |
ArrayIndex{ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...} + offset}); | |
} | |
template<typename... Indices> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(Indices... indices) -> decltype(auto) | |
{ | |
static_assert( | |
sizeof...(Indices) == ArrayIndex::rank, | |
"Please specify as many indices as you have array dimensions"); | |
static_assert( | |
std::conjunction_v<std::is_convertible<Indices, std::size_t>...>, | |
"Indices must be convertible to std::size_t"); | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return parentView( | |
ArrayIndex{ArrayIndex{static_cast<typename ArrayIndex::value_type>(indices)...} + offset}); | |
} | |
template<std::size_t... Coord> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) const -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return accessor<Coord...>(ArrayIndex{}); | |
} | |
template<std::size_t... Coord> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) -> decltype(auto) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return accessor<Coord...>(ArrayIndex{}); | |
} | |
ParentView& parentView; ///< reference to parent view. | |
const ArrayIndex | |
offset; ///< offset by which this view's \ref ArrayIndex indices are shifted when passed to the parent view. | |
}; | |
} // namespace llama | |
// == | |
// == ./View.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./VirtualRecord.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "View.hpp" // amalgamate: file already expanded | |
#include <iosfwd> | |
// #include <type_traits> // amalgamate: file already included | |
namespace llama | |
{ | |
template<typename View, typename BoundRecordCoord, bool OwnView> | |
struct VirtualRecord; | |
template<typename View> | |
inline constexpr auto is_VirtualRecord = false; | |
template<typename View, typename BoundRecordCoord, bool OwnView> | |
inline constexpr auto is_VirtualRecord<VirtualRecord<View, BoundRecordCoord, OwnView>> = true; | |
/// Creates a single \ref VirtualRecord owning a view with stack memory and copies all values from an existing \ref | |
/// VirtualRecord. | |
template<typename VirtualRecord> | |
LLAMA_FN_HOST_ACC_INLINE auto copyVirtualRecordStack(const VirtualRecord& vd) -> decltype(auto) | |
{ | |
One<typename VirtualRecord::AccessibleRecordDim> temp; | |
temp = vd; | |
return temp; | |
} | |
namespace internal | |
{ | |
template< | |
typename Functor, | |
typename LeftRecord, | |
typename RightView, | |
typename RightBoundRecordDim, | |
bool RightOwnView> | |
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator( | |
LeftRecord& left, | |
const VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>& right) -> LeftRecord& | |
{ | |
using RightRecord = VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>; | |
// If the left and right record dimensions are the same, a single loop is enough and no tag check is needed. | |
// This saves a lot of compilation time. | |
if constexpr(std::is_same_v< | |
typename LeftRecord::AccessibleRecordDim, | |
typename RightRecord::AccessibleRecordDim>) | |
{ | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>([&](auto rc) LLAMA_LAMBDA_INLINE | |
{ Functor{}(left(rc), right(rc)); }); | |
} | |
else | |
{ | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>( | |
[&](auto leftRC) LLAMA_LAMBDA_INLINE | |
{ | |
using LeftInnerCoord = decltype(leftRC); | |
forEachLeafCoord<typename RightRecord::AccessibleRecordDim>( | |
[&](auto rightRC) LLAMA_LAMBDA_INLINE | |
{ | |
using RightInnerCoord = decltype(rightRC); | |
if constexpr(hasSameTags< | |
typename LeftRecord::AccessibleRecordDim, | |
LeftInnerCoord, | |
typename RightRecord::AccessibleRecordDim, | |
RightInnerCoord>) | |
{ | |
Functor{}(left(leftRC), right(rightRC)); | |
} | |
}); | |
}); | |
} | |
return left; | |
} | |
template<typename Functor, typename LeftRecord, typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordArithOperator(LeftRecord& left, const T& right) -> LeftRecord& | |
{ | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>([&](auto leftRC) LLAMA_LAMBDA_INLINE | |
{ Functor{}(left(leftRC), right); }); | |
return left; | |
} | |
template< | |
typename Functor, | |
typename LeftRecord, | |
typename RightView, | |
typename RightBoundRecordDim, | |
bool RightOwnView> | |
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordRelOperator( | |
const LeftRecord& left, | |
const VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>& right) -> bool | |
{ | |
using RightRecord = VirtualRecord<RightView, RightBoundRecordDim, RightOwnView>; | |
bool result = true; | |
// If the left and right record dimensions are the same, a single loop is enough and no tag check is needed. | |
// This saves a lot of compilation time. | |
if constexpr(std::is_same_v< | |
typename LeftRecord::AccessibleRecordDim, | |
typename RightRecord::AccessibleRecordDim>) | |
{ | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE { result &= Functor{}(left(rc), right(rc)); }); | |
} | |
else | |
{ | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>( | |
[&](auto leftRC) LLAMA_LAMBDA_INLINE | |
{ | |
using LeftInnerCoord = decltype(leftRC); | |
forEachLeafCoord<typename RightRecord::AccessibleRecordDim>( | |
[&](auto rightRC) LLAMA_LAMBDA_INLINE | |
{ | |
using RightInnerCoord = decltype(rightRC); | |
if constexpr(hasSameTags< | |
typename LeftRecord::AccessibleRecordDim, | |
LeftInnerCoord, | |
typename RightRecord::AccessibleRecordDim, | |
RightInnerCoord>) | |
{ | |
result &= Functor{}(left(leftRC), right(rightRC)); | |
} | |
}); | |
}); | |
} | |
return result; | |
} | |
template<typename Functor, typename LeftRecord, typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto virtualRecordRelOperator(const LeftRecord& left, const T& right) -> bool | |
{ | |
bool result = true; | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>( | |
[&](auto leftRC) LLAMA_LAMBDA_INLINE { | |
result &= Functor{}( | |
left(leftRC), | |
static_cast<std::remove_reference_t<decltype(left(leftRC))>>(right)); | |
}); | |
return result; | |
} | |
struct Assign | |
{ | |
template<typename A, typename B> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) | |
{ | |
return std::forward<A>(a) = b; | |
} | |
}; | |
struct PlusAssign | |
{ | |
template<typename A, typename B> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) | |
{ | |
return std::forward<A>(a) += b; | |
} | |
}; | |
struct MinusAssign | |
{ | |
template<typename A, typename B> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) | |
{ | |
return std::forward<A>(a) -= b; | |
} | |
}; | |
struct MultiplyAssign | |
{ | |
template<typename A, typename B> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) | |
{ | |
return std::forward<A>(a) *= b; | |
} | |
}; | |
struct DivideAssign | |
{ | |
template<typename A, typename B> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) | |
{ | |
return std::forward<A>(a) /= b; | |
} | |
}; | |
struct ModuloAssign | |
{ | |
template<typename A, typename B> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(A&& a, const B& b) const -> decltype(auto) | |
{ | |
return std::forward<A>(a) %= b; | |
} | |
}; | |
template<typename TWithOptionalConst, typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(TWithOptionalConst& leaf, T) -> std::enable_if_t< | |
!is_VirtualRecord<std::decay_t<TWithOptionalConst>>, | |
std::reference_wrapper<TWithOptionalConst>> | |
{ | |
return leaf; | |
} | |
template<typename VirtualRecord, typename T, std::size_t N, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE auto asTupleImplArr(VirtualRecord&& vd, T(&&)[N], std::index_sequence<Is...>) | |
{ | |
return std::make_tuple(asTupleImpl(vd(RecordCoord<Is>{}), T{})...); | |
} | |
template<typename VirtualRecord, typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(VirtualRecord&& vd, T(&&a)[N]) | |
{ | |
return asTupleImplArr(std::forward<VirtualRecord>(vd), std::move(a), std::make_index_sequence<N>{}); | |
} | |
template<typename VirtualRecord, typename... Fields> | |
LLAMA_FN_HOST_ACC_INLINE auto asTupleImpl(VirtualRecord&& vd, Record<Fields...>) | |
{ | |
return std::make_tuple(asTupleImpl(vd(GetFieldTag<Fields>{}), GetFieldType<Fields>{})...); | |
} | |
template<typename TWithOptionalConst, typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(TWithOptionalConst& leaf, T) | |
-> std::enable_if_t<!is_VirtualRecord<std::decay_t<TWithOptionalConst>>, std::tuple<TWithOptionalConst&>> | |
{ | |
return {leaf}; | |
} | |
template<typename VirtualRecord, typename T, std::size_t N, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImplArr(VirtualRecord&& vd, T(&&)[N], std::index_sequence<Is...>) | |
{ | |
return std::tuple_cat(asFlatTupleImpl(vd(RecordCoord<Is>{}), T{})...); | |
} | |
template<typename VirtualRecord, typename T, std::size_t N> | |
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(VirtualRecord&& vd, T(&&a)[N]) | |
{ | |
return asFlatTupleImplArr(std::forward<VirtualRecord>(vd), std::move(a), std::make_index_sequence<N>{}); | |
} | |
template<typename VirtualRecord, typename... Fields> | |
LLAMA_FN_HOST_ACC_INLINE auto asFlatTupleImpl(VirtualRecord&& vd, Record<Fields...>) | |
{ | |
return std::tuple_cat(asFlatTupleImpl(vd(GetFieldTag<Fields>{}), GetFieldType<Fields>{})...); | |
} | |
template<typename T, typename = void> | |
constexpr inline auto isTupleLike = false; | |
// get<I>(t) and std::tuple_size<T> must be available | |
using std::get; // make sure a get<0>() can be found, so the compiler can compile the trait | |
template<typename T> | |
constexpr inline auto | |
isTupleLike<T, std::void_t<decltype(get<0>(std::declval<T>())), std::tuple_size<T>>> = true; | |
template<typename... Ts> | |
constexpr inline auto dependentFalse = false; | |
template<typename Tuple1, typename Tuple2, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE void assignTuples(Tuple1&& dst, Tuple2&& src, std::index_sequence<Is...>); | |
template<typename T1, typename T2> | |
LLAMA_FN_HOST_ACC_INLINE void assignTupleElement(T1&& dst, T2&& src) | |
{ | |
if constexpr(isTupleLike<std::decay_t<T1>> && isTupleLike<std::decay_t<T2>>) | |
{ | |
static_assert(std::tuple_size_v<std::decay_t<T1>> == std::tuple_size_v<std::decay_t<T2>>); | |
assignTuples(dst, src, std::make_index_sequence<std::tuple_size_v<std::decay_t<T1>>>{}); | |
} | |
else if constexpr(!isTupleLike<std::decay_t<T1>> && !isTupleLike<std::decay_t<T2>>) | |
std::forward<T1>(dst) = std::forward<T2>(src); | |
else | |
static_assert( | |
dependentFalse<T1, T2>, | |
"Elements to assign are not tuple/tuple or non-tuple/non-tuple."); | |
} | |
template<typename Tuple1, typename Tuple2, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE void assignTuples(Tuple1&& dst, Tuple2&& src, std::index_sequence<Is...>) | |
{ | |
static_assert(std::tuple_size_v<std::decay_t<Tuple1>> == std::tuple_size_v<std::decay_t<Tuple2>>); | |
using std::get; | |
(assignTupleElement(get<Is>(std::forward<Tuple1>(dst)), get<Is>(std::forward<Tuple2>(src))), ...); | |
} | |
template<typename T, typename Tuple, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE auto makeFromTuple(Tuple&& src, std::index_sequence<Is...>) | |
{ | |
using std::get; | |
return T{get<Is>(std::forward<Tuple>(src))...}; | |
} | |
template<typename T, typename SFINAE, typename... Args> | |
constexpr inline auto isDirectListInitializableImpl = false; | |
template<typename T, typename... Args> | |
constexpr inline auto | |
isDirectListInitializableImpl<T, std::void_t<decltype(T{std::declval<Args>()...})>, Args...> = true; | |
template<typename T, typename... Args> | |
constexpr inline auto isDirectListInitializable = isDirectListInitializableImpl<T, void, Args...>; | |
template<typename T, typename Tuple> | |
constexpr inline auto isDirectListInitializableFromTuple = false; | |
template<typename T, template<typename...> typename Tuple, typename... Args> | |
constexpr inline auto | |
isDirectListInitializableFromTuple<T, Tuple<Args...>> = isDirectListInitializable<T, Args...>; | |
} // namespace internal | |
/// Virtual record type returned by \ref View after resolving an array dimensions coordinate or partially resolving | |
/// a \ref RecordCoord. A virtual record does not hold data itself (thus the name "virtual"); it just binds enough | |
/// information (array dimensions coord and partial record coord) to retrieve the data from a \ref View later. | |
/// Virtual records should not be created by the user. They are returned from various access functions in \ref View | |
/// and VirtualRecord itself. | |
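/// A usage sketch (hypothetical: ai is an array index into view, X a leaf tag, Y a tag of a nested record): | |
/// \code | |
/// auto vr = view(ai);     // VirtualRecord bound to array index ai | |
/// vr(X{}) = 1.0f;         // resolving to a leaf yields a reference into the view's memory | |
/// auto sub = vr(Y{});     // partial resolution yields another VirtualRecord | |
/// \endcode | |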
template<typename TView, typename TBoundRecordCoord, bool OwnView> | |
struct VirtualRecord : private TView::Mapping::ArrayIndex | |
{ | |
using View = TView; ///< View this virtual record points into. | |
using BoundRecordCoord | |
= TBoundRecordCoord; ///< Record coords into View::RecordDim which are already bound by this VirtualRecord. | |
private: | |
using ArrayIndex = typename View::Mapping::ArrayIndex; | |
using RecordDim = typename View::Mapping::RecordDim; | |
std::conditional_t<OwnView, View, View&> view; | |
public: | |
/// Subtree of the record dimension of View starting at BoundRecordCoord. If BoundRecordCoord is | |
/// `RecordCoord<>` (default) AccessibleRecordDim is the same as `Mapping::RecordDim`. | |
using AccessibleRecordDim = GetType<RecordDim, BoundRecordCoord>; | |
/// Creates an empty VirtualRecord. Only available if the view is owned. Used by llama::One. | |
LLAMA_FN_HOST_ACC_INLINE VirtualRecord() | |
/* requires(OwnView) */ | |
: ArrayIndex{} | |
, view{allocViewStack<0, RecordDim>()} | |
{ | |
static_assert(OwnView, "The default constructor of VirtualRecord is only available if it owns the view."); | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
VirtualRecord(ArrayIndex ai, std::conditional_t<OwnView, View&&, View&> view) | |
: ArrayIndex{ai} | |
, view{static_cast<decltype(view)>(view)} | |
{ | |
} | |
VirtualRecord(const VirtualRecord&) = default; | |
// NOLINTNEXTLINE(cert-oop54-cpp) | |
LLAMA_FN_HOST_ACC_INLINE auto operator=(const VirtualRecord& other) -> VirtualRecord& | |
{ | |
// NOLINTNEXTLINE(cppcoreguidelines-c-copy-assignment-signature,misc-unconventional-assign-operator) | |
return this->operator=<VirtualRecord>(other); | |
} | |
VirtualRecord(VirtualRecord&&) noexcept = default; | |
auto operator=(VirtualRecord&&) noexcept -> VirtualRecord& = default; | |
~VirtualRecord() = default; | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto arrayIndex() const -> ArrayIndex | |
{ | |
return *this; | |
} | |
/// Creates a VirtualRecord from a different VirtualRecord. Only available if the view is owned. Used by | |
/// llama::One. | |
template<typename OtherView, typename OtherBoundRecordCoord, bool OtherOwnView> | |
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) | |
LLAMA_FN_HOST_ACC_INLINE VirtualRecord( | |
const VirtualRecord<OtherView, OtherBoundRecordCoord, OtherOwnView>& virtualRecord) | |
/* requires(OwnView) */ | |
: VirtualRecord() | |
{ | |
static_assert( | |
OwnView, | |
"The copy constructor of VirtualRecord from a different VirtualRecord is only available if it owns " | |
"the " | |
"view."); | |
*this = virtualRecord; | |
} | |
// TODO(bgruber): unify with previous in C++20 and use explicit(cond) | |
/// Create a VirtualRecord from a scalar. Only available if the view is owned. Used by llama::One.
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE explicit VirtualRecord(const T& scalar) | |
/* requires(OwnView) */ | |
: VirtualRecord() | |
{ | |
static_assert( | |
OwnView, | |
"The constructor of VirtualRecord from a scalar is only available if it owns the view."); | |
*this = scalar; | |
} | |
/// Access a record in the record dimension underneath the current virtual record using a \ref RecordCoord. If | |
/// the access resolves to a leaf, a reference to a variable inside the \ref View storage is returned, | |
/// otherwise another virtual record. | |
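/// Example (a sketch for a record dimension where coord 0 is a nested record with a leaf at coord 0, 1):
///     vr(llama::RecordCoord<0>{})    // partially resolves -> another virtual record
///     vr(llama::RecordCoord<0, 1>{}) // resolves to a leaf -> reference to storage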
template<std::size_t... Coord> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) const -> decltype(auto) | |
{ | |
using AbsolutCoord = Cat<BoundRecordCoord, RecordCoord<Coord...>>; | |
using AccessedType = GetType<RecordDim, AbsolutCoord>; | |
if constexpr(isRecord<AccessedType> || internal::IsBoundedArray<AccessedType>::value) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return VirtualRecord<const View, AbsolutCoord>{arrayIndex(), this->view}; | |
} | |
else | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return this->view.accessor(arrayIndex(), AbsolutCoord{}); | |
} | |
} | |
// FIXME(bgruber): remove redundancy | |
template<std::size_t... Coord> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(RecordCoord<Coord...> = {}) -> decltype(auto) | |
{ | |
using AbsolutCoord = Cat<BoundRecordCoord, RecordCoord<Coord...>>; | |
using AccessedType = GetType<RecordDim, AbsolutCoord>; | |
if constexpr(isRecord<AccessedType> || internal::IsBoundedArray<AccessedType>::value) | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return VirtualRecord<View, AbsolutCoord>{arrayIndex(), this->view}; | |
} | |
else | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return this->view.accessor(arrayIndex(), AbsolutCoord{}); | |
} | |
} | |
/// Access a record in the record dimension underneath the current virtual record using a series of tags. If | |
/// the access resolves to a leaf, a reference to a variable inside the \ref View storage is returned, | |
/// otherwise another virtual record. | |
template<typename... Tags> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(Tags...) const -> decltype(auto) | |
{ | |
using RecordCoord = GetCoordFromTags<AccessibleRecordDim, Tags...>; | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return operator()(RecordCoord{}); | |
} | |
// FIXME(bgruber): remove redundancy | |
template<typename... Tags> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(Tags...) -> decltype(auto) | |
{ | |
using RecordCoord = GetCoordFromTags<AccessibleRecordDim, Tags...>; | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return operator()(RecordCoord{}); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto operator=(const T& other) -> VirtualRecord& | |
{ | |
// NOLINTNEXTLINE(cppcoreguidelines-c-copy-assignment-signature,misc-unconventional-assign-operator) | |
return internal::virtualRecordArithOperator<internal::Assign>(*this, other); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto operator+=(const T& other) -> VirtualRecord& | |
{ | |
return internal::virtualRecordArithOperator<internal::PlusAssign>(*this, other); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto operator-=(const T& other) -> VirtualRecord& | |
{ | |
return internal::virtualRecordArithOperator<internal::MinusAssign>(*this, other); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto operator*=(const T& other) -> VirtualRecord& | |
{ | |
return internal::virtualRecordArithOperator<internal::MultiplyAssign>(*this, other); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto operator/=(const T& other) -> VirtualRecord& | |
{ | |
return internal::virtualRecordArithOperator<internal::DivideAssign>(*this, other); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto operator%=(const T& other) -> VirtualRecord& | |
{ | |
return internal::virtualRecordArithOperator<internal::ModuloAssign>(*this, other); | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const VirtualRecord& vd, const T& t) | |
{ | |
return copyVirtualRecordStack(vd) += t; | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const T& t, const VirtualRecord& vd) | |
{ | |
return vd + t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator-(const VirtualRecord& vd, const T& t) | |
{ | |
return copyVirtualRecordStack(vd) -= t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator*(const VirtualRecord& vd, const T& t) | |
{ | |
return copyVirtualRecordStack(vd) *= t; | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator*(const T& t, const VirtualRecord& vd) | |
{ | |
return vd * t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator/(const VirtualRecord& vd, const T& t) | |
{ | |
return copyVirtualRecordStack(vd) /= t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator%(const VirtualRecord& vd, const T& t) | |
{ | |
return copyVirtualRecordStack(vd) %= t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const VirtualRecord& vd, const T& t) -> bool | |
{ | |
return internal::virtualRecordRelOperator<std::equal_to<>>(vd, t); | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const T& t, const VirtualRecord& vd) -> bool | |
{ | |
return vd == t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const VirtualRecord& vd, const T& t) -> bool | |
{ | |
return internal::virtualRecordRelOperator<std::not_equal_to<>>(vd, t); | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const T& t, const VirtualRecord& vd) -> bool | |
{ | |
return vd != t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const VirtualRecord& vd, const T& t) -> bool | |
{ | |
return internal::virtualRecordRelOperator<std::less<>>(vd, t); | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const T& t, const VirtualRecord& vd) -> bool | |
{ | |
return vd > t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const VirtualRecord& vd, const T& t) -> bool | |
{ | |
return internal::virtualRecordRelOperator<std::less_equal<>>(vd, t); | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const T& t, const VirtualRecord& vd) -> bool | |
{ | |
return vd >= t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const VirtualRecord& vd, const T& t) -> bool | |
{ | |
return internal::virtualRecordRelOperator<std::greater<>>(vd, t); | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const T& t, const VirtualRecord& vd) -> bool | |
{ | |
return vd < t; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const VirtualRecord& vd, const T& t) -> bool | |
{ | |
return internal::virtualRecordRelOperator<std::greater_equal<>>(vd, t); | |
} | |
template<typename T, typename = std::enable_if_t<!is_VirtualRecord<T>>> | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const T& t, const VirtualRecord& vd) -> bool | |
{ | |
return vd <= t; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto asTuple() | |
{ | |
return internal::asTupleImpl(*this, AccessibleRecordDim{}); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto asTuple() const | |
{ | |
return internal::asTupleImpl(*this, AccessibleRecordDim{}); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto asFlatTuple() | |
{ | |
return internal::asFlatTupleImpl(*this, AccessibleRecordDim{}); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto asFlatTuple() const | |
{ | |
return internal::asFlatTupleImpl(*this, AccessibleRecordDim{}); | |
} | |
template<std::size_t I> | |
LLAMA_FN_HOST_ACC_INLINE auto get() -> decltype(auto) | |
{ | |
return operator()(RecordCoord<I>{}); | |
} | |
template<std::size_t I> | |
LLAMA_FN_HOST_ACC_INLINE auto get() const -> decltype(auto) | |
{ | |
return operator()(RecordCoord<I>{}); | |
} | |
template<typename TupleLike> | |
LLAMA_FN_HOST_ACC_INLINE auto loadAs() -> TupleLike | |
{ | |
static_assert( | |
internal::isDirectListInitializableFromTuple<TupleLike, decltype(asFlatTuple())>, | |
"TupleLike must be constructible from as many values as this VirtualRecord recursively represents " | |
"like " | |
"this: TupleLike{values...}"); | |
return internal::makeFromTuple<TupleLike>( | |
asFlatTuple(), | |
std::make_index_sequence<std::tuple_size_v<decltype(asFlatTuple())>>{}); | |
} | |
template<typename TupleLike> | |
LLAMA_FN_HOST_ACC_INLINE auto loadAs() const -> TupleLike | |
{ | |
static_assert( | |
internal::isDirectListInitializableFromTuple<TupleLike, decltype(asFlatTuple())>, | |
"TupleLike must be constructible from as many values as this VirtualRecord recursively represents " | |
"like " | |
"this: TupleLike{values...}"); | |
return internal::makeFromTuple<TupleLike>( | |
asFlatTuple(), | |
std::make_index_sequence<std::tuple_size_v<decltype(asFlatTuple())>>{}); | |
} | |
struct Loader | |
{ | |
VirtualRecord& vd; | |
template<typename T> | |
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) | |
LLAMA_FN_HOST_ACC_INLINE operator T() | |
{ | |
return vd.loadAs<T>(); | |
} | |
}; | |
struct LoaderConst | |
{ | |
const VirtualRecord& vd; | |
template<typename T> | |
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) | |
LLAMA_FN_HOST_ACC_INLINE operator T() const | |
{ | |
return vd.loadAs<T>(); | |
} | |
}; | |
LLAMA_FN_HOST_ACC_INLINE auto load() -> Loader | |
{ | |
return {*this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto load() const -> LoaderConst | |
{ | |
return {*this}; | |
} | |
template<typename TupleLike> | |
LLAMA_FN_HOST_ACC_INLINE void store(const TupleLike& t) | |
{ | |
internal::assignTuples(asTuple(), t, std::make_index_sequence<std::tuple_size_v<TupleLike>>{}); | |
} | |
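// Example round trip between a VirtualRecord and ordinary values (a sketch; MyStruct is a
// hypothetical aggregate with one member per leaf of AccessibleRecordDim, and a and b match the
// top-level fields):
//     MyStruct s = vr.load();    // Loader's conversion operator calls loadAs<MyStruct>()
//     vr.store(std::tie(a, b));  // assigns the tuple's elements field by field via asTuple()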
// swap for equal VirtualRecord | |
LLAMA_FN_HOST_ACC_INLINE friend void swap( | |
std::conditional_t<OwnView, VirtualRecord&, VirtualRecord> a, | |
std::conditional_t<OwnView, VirtualRecord&, VirtualRecord> b) noexcept | |
{ | |
forEachLeafCoord<AccessibleRecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
using std::swap; | |
swap(a(rc), b(rc)); | |
}); | |
} | |
}; | |
// swap for heterogeneous VirtualRecord | |
template< | |
typename ViewA, | |
typename BoundRecordDimA, | |
bool OwnViewA, | |
typename ViewB, | |
typename BoundRecordDimB, | |
bool OwnViewB> | |
LLAMA_FN_HOST_ACC_INLINE auto swap( | |
VirtualRecord<ViewA, BoundRecordDimA, OwnViewA>& a, | |
VirtualRecord<ViewB, BoundRecordDimB, OwnViewB>& b) noexcept | |
-> std::enable_if_t<std::is_same_v< | |
typename VirtualRecord<ViewA, BoundRecordDimA, OwnViewA>::AccessibleRecordDim, | |
typename VirtualRecord<ViewB, BoundRecordDimB, OwnViewB>::AccessibleRecordDim>> | |
{ | |
using LeftRecord = VirtualRecord<ViewA, BoundRecordDimA, OwnViewA>; | |
forEachLeafCoord<typename LeftRecord::AccessibleRecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
using std::swap; | |
swap(a(rc), b(rc)); | |
}); | |
} | |
template<typename View, typename BoundRecordCoord, bool OwnView> | |
auto operator<<(std::ostream& os, const VirtualRecord<View, BoundRecordCoord, OwnView>& vr) -> std::ostream& | |
{ | |
using RecordDim = typename VirtualRecord<View, BoundRecordCoord, OwnView>::AccessibleRecordDim; | |
os << "{"; | |
// TODO(bgruber): I tried refactoring both branches into one, but MSVC and icpc have trouble correctly
// discarding the untaken if constexpr branch and not instantiating the templates inside it.
if constexpr(std::is_array_v<RecordDim>) | |
{ | |
constexpr auto size = std::extent_v<RecordDim>; | |
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<size>>( | |
[&](auto ic) | |
{ | |
constexpr std::size_t i = decltype(ic)::value; | |
os << '[' << i << ']' << ": " << vr(RecordCoord<i>{}); | |
if(i + 1 < size) | |
os << ", "; | |
}); | |
} | |
else | |
{ | |
constexpr auto size = boost::mp11::mp_size<RecordDim>::value; | |
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<size>>( | |
[&](auto ic) | |
{ | |
constexpr std::size_t i = decltype(ic)::value; | |
using Field = boost::mp11::mp_at_c<RecordDim, i>; | |
using Tag = GetFieldTag<Field>; | |
os << structName<Tag>() << ": " << vr(RecordCoord<i>{}); | |
if(i + 1 < size) | |
os << ", "; | |
}); | |
} | |
os << "}"; | |
return os; | |
} | |
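// Example: for a record dimension with fields x and y, streaming a virtual record prints
// something like "{x: 1, y: 2}"; bounded arrays are printed as "{[0]: ..., [1]: ...}".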
template<typename VirtualRecordFwd, typename Functor> | |
LLAMA_FN_HOST_ACC_INLINE constexpr void forEachLeaf(VirtualRecordFwd&& vr, Functor&& functor) | |
{ | |
using VirtualRecord = std::remove_reference_t<VirtualRecordFwd>; | |
LLAMA_FORCE_INLINE_RECURSIVE | |
forEachLeafCoord<typename VirtualRecord::AccessibleRecordDim>( | |
[functor = std::forward<Functor>(functor), &vr = vr](auto rc) | |
LLAMA_LAMBDA_INLINE_WITH_SPECIFIERS(constexpr mutable) { std::forward<Functor>(functor)(vr(rc)); }); | |
} | |
} // namespace llama | |
template<typename View, typename BoundRecordCoord, bool OwnView> | |
struct std::tuple_size<llama::VirtualRecord<View, BoundRecordCoord, OwnView>> | |
: boost::mp11::mp_size<typename llama::VirtualRecord<View, BoundRecordCoord, OwnView>::AccessibleRecordDim> | |
{ | |
}; | |
template<std::size_t I, typename View, typename BoundRecordCoord, bool OwnView> | |
struct std::tuple_element<I, llama::VirtualRecord<View, BoundRecordCoord, OwnView>> | |
{ | |
using type = decltype(std::declval<llama::VirtualRecord<View, BoundRecordCoord, OwnView>>().template get<I>()); | |
}; | |
template<std::size_t I, typename View, typename BoundRecordCoord, bool OwnView> | |
struct std::tuple_element<I, const llama::VirtualRecord<View, BoundRecordCoord, OwnView>> | |
{ | |
using type | |
= decltype(std::declval<const llama::VirtualRecord<View, BoundRecordCoord, OwnView>>().template get<I>()); | |
}; | |
#if CAN_USE_RANGES | |
template< | |
typename ViewA, | |
typename BoundA, | |
bool OwnA, | |
typename ViewB, | |
typename BoundB, | |
bool OwnB, | |
template<class> | |
class TQual, | |
template<class> | |
class UQual> | |
struct std::basic_common_reference< | |
llama::VirtualRecord<ViewA, BoundA, OwnA>, | |
llama::VirtualRecord<ViewB, BoundB, OwnB>, | |
TQual, | |
UQual> | |
{ | |
using type = std::enable_if_t< | |
std::is_same_v< | |
typename llama::VirtualRecord<ViewA, BoundA, OwnA>::AccessibleRecordDim, | |
typename llama::VirtualRecord<ViewB, BoundB, OwnB>::AccessibleRecordDim>, | |
llama::One<typename ViewA::RecordDim>>; | |
}; | |
#endif | |
// == | |
// == ./VirtualRecord.hpp == | |
// ============================================================================ | |
// #include <algorithm> // amalgamate: file already included | |
#include <stdexcept> | |
// #include <string> // amalgamate: file already included | |
namespace llama | |
{ | |
// TODO(bgruber): expose blob allocator | |
/// An equivalent of std::vector<T> backed by a \ref View. Elements are never value-initialized, though, and
/// there is no strong exception guarantee.
/// WARNING: This class is experimental. | |
/// @tparam Mapping The mapping to be used for the underlying view. Needs to have 1 array dimension. | |
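/// Example (a sketch; Mapping is any 1D mapping with a dynamic extent over a hypothetical record dimension
/// Particle):
///     llama::Vector<Mapping> v(10);        // 10 records, copy-initialized from One<Particle>{}
///     v.push_back(llama::One<Particle>{}); // grows the capacity by 1.5x when it is exhausted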
template<typename Mapping> | |
struct Vector | |
{ | |
static_assert(Mapping::ArrayExtents::rank == 1, "llama::Vector only supports 1D mappings"); | |
using ViewType = decltype(allocViewUninitialized<Mapping>()); | |
using RecordDim = typename Mapping::RecordDim; | |
using iterator = decltype(std::declval<ViewType>().begin()); | |
using value_type = typename iterator::value_type; | |
Vector() = default; | |
template<typename VirtualRecord = One<RecordDim>> | |
LLAMA_FN_HOST_ACC_INLINE explicit Vector(std::size_t count, const VirtualRecord& value = {}) | |
{ | |
reserve(count); | |
for(std::size_t i = 0; i < count; i++) | |
push_back(value); | |
} | |
template<typename Iterator> | |
LLAMA_FN_HOST_ACC_INLINE Vector(Iterator first, Iterator last) | |
{ | |
if constexpr(std::is_same_v< | |
typename std::iterator_traits<Iterator>::iterator_category, | |
std::random_access_iterator_tag>) | |
reserve(std::distance(first, last)); | |
for(; first != last; ++first) | |
push_back(*first); | |
} | |
Vector(const Vector& other) = default; | |
LLAMA_FN_HOST_ACC_INLINE Vector(Vector&& other) noexcept | |
{ | |
swap(other); | |
} | |
auto operator=(const Vector& other) -> Vector& = default; | |
LLAMA_FN_HOST_ACC_INLINE auto operator=(Vector&& other) noexcept -> Vector& | |
{ | |
swap(other); | |
return *this; | |
} | |
~Vector() = default; | |
// TODO(bgruber): assign | |
LLAMA_FN_HOST_ACC_INLINE auto at(std::size_t i) -> decltype(auto) | |
{ | |
if(i >= m_size) | |
throw std::out_of_range{ | |
"Index " + std::to_string(i) + "out of range [0:" + std::to_string(m_size) + "["}; | |
return m_view(i); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto at(std::size_t i) const -> decltype(auto) | |
{ | |
if(i >= m_size) | |
throw std::out_of_range{ | |
"Index " + std::to_string(i) + "out of range [0:" + std::to_string(m_size) + "["}; | |
return m_view(i); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t i) -> decltype(auto) | |
{ | |
return m_view(i); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto operator[](std::size_t i) const -> decltype(auto) | |
{ | |
return m_view(i); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto front() -> decltype(auto) | |
{ | |
return m_view(0); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto front() const -> decltype(auto) | |
{ | |
return m_view(0); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto back() -> decltype(auto) | |
{ | |
return m_view(m_size - 1); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto back() const -> decltype(auto) | |
{ | |
return m_view(m_size - 1); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto begin() -> decltype(auto) | |
{ | |
return m_view.begin(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto begin() const -> decltype(auto) | |
{ | |
return m_view.begin(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto cbegin() -> decltype(auto) | |
{ | |
return std::as_const(m_view).begin(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto cbegin() const -> decltype(auto) | |
{ | |
return m_view.begin(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto end() -> decltype(auto) | |
{ | |
return m_view.begin() + m_size; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto end() const -> decltype(auto) | |
{ | |
return m_view.begin() + m_size; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto cend() -> decltype(auto) | |
{ | |
return std::as_const(m_view).begin() + m_size; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto cend() const -> decltype(auto) | |
{ | |
return m_view.begin() + m_size; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto empty() const -> bool | |
{ | |
return m_size == 0; | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto size() const -> std::size_t | |
{ | |
return m_size; | |
} | |
LLAMA_FN_HOST_ACC_INLINE void reserve(std::size_t cap) | |
{ | |
if(cap > capacity()) | |
changeCapacity(cap); | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto capacity() const -> std::size_t | |
{ | |
return m_view.mapping().extents()[0]; | |
} | |
LLAMA_FN_HOST_ACC_INLINE void shrink_to_fit() | |
{ | |
changeCapacity(m_size); | |
} | |
LLAMA_FN_HOST_ACC_INLINE void clear() | |
{ | |
m_size = 0; | |
} | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE auto insert(iterator pos, T&& t) -> iterator | |
{ | |
const auto i = pos - begin(); | |
reserve(m_size + 1); // might invalidate pos | |
pos = begin() + i; | |
std::copy_backward(pos, end(), end() + 1); | |
m_view[i] = std::forward<T>(t); | |
m_size++; | |
return pos; | |
} | |
// TODO(bgruber): more insert overloads | |
// TODO(bgruber): emplace | |
LLAMA_FN_HOST_ACC_INLINE auto erase(iterator pos) -> iterator | |
{ | |
std::copy(pos + 1, end(), pos); | |
m_size--; | |
return pos; | |
} | |
// TODO(bgruber): more erase overloads | |
// TODO(bgruber): T here is probably a virtual record. We could also allow any struct that is storable to the | |
// view via VirtualRecord::store(). | |
template<typename T> | |
LLAMA_FN_HOST_ACC_INLINE void push_back(T&& t) | |
{ | |
if(const auto cap = capacity(); m_size == cap) | |
reserve(std::max(cap + cap / 2, m_size + 1)); | |
m_view[m_size++] = std::forward<T>(t); | |
} | |
// TODO(bgruber): emplace_back | |
LLAMA_FN_HOST_ACC_INLINE void pop_back() | |
{ | |
m_size--; | |
} | |
template<typename VirtualRecord = One<RecordDim>> | |
LLAMA_FN_HOST_ACC_INLINE void resize(std::size_t count, const VirtualRecord& value = {}) | |
{ | |
reserve(count); | |
for(std::size_t i = m_size; i < count; i++) | |
m_view[i] = value; | |
m_size = count; | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const Vector& a, const Vector& b) -> bool | |
{ | |
if(a.m_size != b.m_size) | |
return false; | |
return std::equal(a.begin(), a.end(), b.begin()); | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator!=(const Vector& a, const Vector& b) -> bool | |
{ | |
return !(a == b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator<(const Vector& a, const Vector& b) -> bool | |
{ | |
return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end()); | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator<=(const Vector& a, const Vector& b) -> bool | |
{ | |
return !(b < a); | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator>(const Vector& a, const Vector& b) -> bool | |
{ | |
return b < a; | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend auto operator>=(const Vector& a, const Vector& b) -> bool | |
{ | |
return !(a < b); | |
} | |
LLAMA_FN_HOST_ACC_INLINE friend void swap(Vector& a, Vector& b) noexcept | |
{ | |
a.swap(b); | |
} | |
private: | |
LLAMA_FN_HOST_ACC_INLINE void changeCapacity(std::size_t cap) | |
{ | |
auto newView = allocViewUninitialized<Mapping>(Mapping{typename Mapping::ArrayExtents{cap}}); | |
auto b = begin(); | |
std::copy(b, b + std::min(m_size, cap), newView.begin());
using std::swap; | |
swap(m_view, newView); // depends on move semantic of View | |
} | |
LLAMA_FN_HOST_ACC_INLINE void swap(Vector& other) noexcept | |
{ | |
using std::swap; | |
swap(m_view, other.m_view); // depends on move semantic of View | |
swap(m_size, other.m_size); | |
} | |
ViewType m_view = {}; | |
std::size_t m_size = 0; | |
}; | |
} // namespace llama | |
// == | |
// == ./Vector.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./Copy.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "View.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/AoSoA.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
// #include <limits> // amalgamate: file already included | |
namespace llama::mapping | |
{ | |
/// The maximum number of vector lanes that can be used to fetch each leaf type in the record dimension into a | |
/// vector register of the given size in bits. | |
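/// E.g., for a record dimension containing float (32 bit) and double (64 bit) leaves and 256-bit vector
/// registers, maxLanes is min(256/32, 256/64) = 4.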
template<typename RecordDim, std::size_t VectorRegisterBits> | |
inline constexpr std::size_t maxLanes = []() constexpr | |
{ | |
auto max = std::numeric_limits<std::size_t>::max(); | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) | |
{ | |
using AttributeType = GetType<RecordDim, decltype(rc)>; | |
max = std::min(max, VectorRegisterBits / (sizeof(AttributeType) * CHAR_BIT)); | |
}); | |
return max; | |
} | |
(); | |
/// Array of struct of arrays mapping. Used to create a \ref View via \ref allocView. | |
/// \tparam Lanes The size of the inner arrays of this array of struct of arrays. | |
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref | |
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and | |
/// \ref FlattenRecordDimMinimizePadding. | |
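/// E.g., with Lanes = 4 and a record dimension of two floats x and y, memory is laid out as
/// x x x x y y y y | x x x x y y y y | ..., i.e. one block of Lanes records after another.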
template< | |
typename TArrayExtents, | |
typename TRecordDim, | |
std::size_t Lanes, | |
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp, | |
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder> | |
struct AoSoA : private TArrayExtents | |
{ | |
using ArrayExtents = TArrayExtents; | |
using ArrayIndex = typename ArrayExtents::Index; | |
using RecordDim = TRecordDim; | |
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor; | |
static constexpr std::size_t blobCount = 1; | |
constexpr AoSoA() = default; | |
LLAMA_FN_HOST_ACC_INLINE constexpr explicit AoSoA(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return ArrayExtents{*this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t | |
{ | |
return roundUpToMultiple( | |
LinearizeArrayDimsFunctor{}.size(extents()) * sizeOf<RecordDim>, | |
Lanes * sizeOf<RecordDim>); | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const | |
-> NrAndOffset | |
{ | |
constexpr std::size_t flatFieldIndex = | |
#ifdef __NVCC__ | |
*& // mess with nvcc compiler state to workaround bug | |
#endif | |
Flattener::template flatIndex<RecordCoords...>; | |
const auto flatArrayIndex = LinearizeArrayDimsFunctor{}(ai, extents()); | |
const auto blockIndex = flatArrayIndex / Lanes; | |
const auto laneIndex = flatArrayIndex % Lanes; | |
const auto offset = (sizeOf<RecordDim> * Lanes) * blockIndex | |
+ flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, false> * Lanes | |
+ sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>) * laneIndex; | |
return {0, offset}; | |
} | |
private: | |
using Flattener = FlattenRecordDim<TRecordDim>; | |
}; | |
template<std::size_t Lanes, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
struct PreconfiguredAoSoA | |
{ | |
template<typename ArrayExtents, typename RecordDim> | |
using type = AoSoA<ArrayExtents, RecordDim, Lanes, LinearizeArrayDimsFunctor>; | |
}; | |
template<typename Mapping> | |
inline constexpr bool isAoSoA = false; | |
template<typename AD, typename RD, std::size_t L> | |
inline constexpr bool isAoSoA<AoSoA<AD, RD, L>> = true; | |
} // namespace llama::mapping | |
// == | |
// == ./mapping/AoSoA.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./mapping/SoA.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
// #include <limits> // amalgamate: file already included | |
namespace llama::mapping | |
{ | |
/// Struct of array mapping. Used to create a \ref View via \ref allocView. | |
/// \tparam SeparateBuffers If true, every element of the record dimension is mapped to its own buffer. | |
/// \tparam LinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and | |
/// how big the linear domain gets. | |
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened if SeparateBuffers is | |
/// false. See \ref FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref | |
/// FlattenRecordDimDecreasingAlignment and \ref FlattenRecordDimMinimizePadding. | |
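/// E.g., for a record dimension of floats x and y: with SeparateBuffers == true, all x values live in blob 0 and
/// all y values in blob 1; with SeparateBuffers == false, a single blob holds all x values followed by all y
/// values.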
template< | |
typename TArrayExtents, | |
typename TRecordDim, | |
bool SeparateBuffers = true, | |
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp, | |
template<typename> typename FlattenRecordDimSingleBlob = FlattenRecordDimInOrder> | |
struct SoA : private TArrayExtents | |
{ | |
using ArrayExtents = TArrayExtents; | |
using ArrayIndex = typename ArrayExtents::Index; | |
using RecordDim = TRecordDim; | |
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor; | |
static constexpr std::size_t blobCount | |
= SeparateBuffers ? boost::mp11::mp_size<FlatRecordDim<RecordDim>>::value : 1; | |
constexpr SoA() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr explicit SoA(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return ArrayExtents{*this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr auto blobSize([[maybe_unused]] std::size_t blobIndex) const -> std::size_t | |
{ | |
if constexpr(SeparateBuffers) | |
{ | |
constexpr Array<std::size_t, blobCount> typeSizes = []() constexpr | |
{ | |
Array<std::size_t, blobCount> r{}; | |
forEachLeafCoord<RecordDim>([&r, i = 0](auto rc) mutable constexpr | |
{ r[i++] = sizeof(GetType<RecordDim, decltype(rc)>); }); | |
return r; | |
} | |
(); | |
return LinearizeArrayDimsFunctor{}.size(extents()) * typeSizes[blobIndex]; | |
} | |
else | |
{ | |
return LinearizeArrayDimsFunctor{}.size(extents()) * sizeOf<RecordDim>; | |
} | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ad, RecordCoord<RecordCoords...> = {}) const | |
-> NrAndOffset | |
{ | |
if constexpr(SeparateBuffers) | |
{ | |
constexpr auto blob = flatRecordCoord<RecordDim, RecordCoord<RecordCoords...>>; | |
const auto offset = LinearizeArrayDimsFunctor{}(ad, extents()) | |
* sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>); | |
return {blob, offset}; | |
} | |
else | |
{ | |
constexpr std::size_t flatFieldIndex = | |
#ifdef __NVCC__ | |
*& // mess with nvcc compiler state to workaround bug | |
#endif | |
Flattener::template flatIndex<RecordCoords...>; | |
const auto offset = LinearizeArrayDimsFunctor{}(ad, extents()) | |
* sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>) | |
+ flatOffsetOf< | |
typename Flattener::FlatRecordDim, | |
flatFieldIndex, | |
false> * LinearizeArrayDimsFunctor{}.size(extents()); | |
return {0, offset}; | |
} | |
} | |
private: | |
using Flattener = FlattenRecordDimSingleBlob<TRecordDim>; | |
}; | |
/// Struct of array mapping storing the entire layout in a single blob. | |
/// \see SoA | |
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
using SingleBlobSoA = SoA<ArrayExtents, RecordDim, false, LinearizeArrayDimsFunctor>; | |
/// Struct of array mapping storing each attribute of the record dimension in a separate blob. | |
/// \see SoA | |
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
using MultiBlobSoA = SoA<ArrayExtents, RecordDim, true, LinearizeArrayDimsFunctor>; | |
template<bool SeparateBuffers = true, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
struct PreconfiguredSoA | |
{ | |
template<typename ArrayExtents, typename RecordDim> | |
using type = SoA<ArrayExtents, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>; | |
}; | |
template<typename Mapping> | |
inline constexpr bool isSoA = false; | |
template<typename ArrayExtents, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor> | |
inline constexpr bool isSoA<SoA<ArrayExtents, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = true; | |
} // namespace llama::mapping | |
// == | |
// == ./mapping/SoA.hpp == | |
// ============================================================================ | |
#include <cstring> | |
#include <numeric> | |
namespace llama | |
{ | |
namespace internal | |
{ | |
template<typename RecordDim> | |
void assertTrivialCopyable() | |
{ | |
forEachLeafCoord<RecordDim>( | |
[](auto rc) | |
{ | |
static_assert( | |
std::is_trivially_copyable_v<GetType<RecordDim, decltype(rc)>>, | |
"All types in the record dimension must be trivially copyable"); | |
}); | |
} | |
using memcopyFunc = void* (*) (void*, const void*, std::size_t); | |
inline void parallel_memcpy( | |
std::byte* dst, | |
const std::byte* src, | |
std::size_t size, | |
std::size_t threadId = 0, | |
std::size_t threadCount = 1, | |
memcopyFunc singleThreadMemcpy = std::memcpy) | |
{ | |
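// Split the copy evenly among the threads; the last thread additionally copies the remainder.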
const auto sizePerThread = size / threadCount; | |
const auto sizeLastThread = sizePerThread + size % threadCount; | |
const auto sizeThisThread = threadId == threadCount - 1 ? sizeLastThread : sizePerThread; | |
singleThreadMemcpy(dst + threadId * sizePerThread, src + threadId * sizePerThread, sizeThisThread); | |
} | |
} // namespace internal | |
/// Direct memcpy from source view blobs to destination view blobs. Both views need to have the same mappings with | |
/// the same array dimensions. | |
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations. | |
/// @param threadCount Optional. Thread count in case of multi-threaded invocation. | |
template<typename Mapping, typename SrcBlob, typename DstBlob> | |
void blobMemcpy( | |
const View<Mapping, SrcBlob>& srcView, | |
View<Mapping, DstBlob>& dstView, | |
std::size_t threadId = 0, | |
std::size_t threadCount = 1) | |
{ | |
internal::assertTrivialCopyable<typename Mapping::RecordDim>(); | |
// TODO(bgruber): we do not verify if the mappings have other runtime state than the array dimensions | |
if(srcView.mapping().extents() != dstView.mapping().extents()) | |
throw std::runtime_error{"Array dimensions sizes are different"}; | |
// TODO(bgruber): this is maybe not the best parallel copying strategy | |
for(std::size_t i = 0; i < Mapping::blobCount; i++) | |
internal::parallel_memcpy( | |
&dstView.storageBlobs[i][0], | |
&srcView.storageBlobs[i][0], | |
dstView.mapping().blobSize(i), | |
threadId, | |
threadCount); | |
} | |
/// Field-wise copy from source to destination view. Both views need to have the same array and record dimensions. | |
/// @param threadId Optional. Thread id in case of multi-threaded copy. | |
/// @param threadCount Optional. Thread count in case of multi-threaded copy. | |
template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob> | |
void fieldWiseCopy( | |
const View<SrcMapping, SrcBlob>& srcView, | |
View<DstMapping, DstBlob>& dstView, | |
std::size_t threadId = 0, | |
std::size_t threadCount = 1) | |
{ | |
// TODO(bgruber): think if we can remove this restriction | |
static_assert( | |
std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>, | |
"The source and destination record dimensions must be the same"); | |
if(srcView.mapping().extents() != dstView.mapping().extents()) | |
throw std::runtime_error{"Array dimensions sizes are different"}; | |
auto copyOne = [&](auto ai) LLAMA_LAMBDA_INLINE | |
{ | |
forEachLeafCoord<typename DstMapping::RecordDim>([&](auto rc) LLAMA_LAMBDA_INLINE | |
{ dstView(ai)(rc) = srcView(ai)(rc); }); | |
}; | |
constexpr auto dims = SrcMapping::ArrayExtents::rank; | |
const auto extents = srcView.mapping().extents().toArray(); | |
const auto workPerThread = (extents[0] + threadCount - 1) / threadCount; | |
const auto start = threadId * workPerThread; | |
const auto end = std::min((threadId + 1) * workPerThread, extents[0]); | |
for(auto i = start; i < end; i++) | |
{ | |
if constexpr(dims > 1) | |
forEachADCoord(ArrayIndex<dims - 1>{pop_front(extents)}, copyOne, static_cast<std::size_t>(i)); | |
else | |
copyOne(ArrayIndex<dims>{static_cast<std::size_t>(i)}); | |
} | |
} | |
namespace internal | |
{ | |
template<typename Mapping> | |
inline constexpr std::size_t aosoaLanes = 0; | |
template<typename ArrayExtents, typename RecordDim, bool SeparateBuffers, typename LinearizeArrayDimsFunctor> | |
inline constexpr std::size_t aosoaLanes< | |
mapping::SoA<ArrayExtents, RecordDim, SeparateBuffers, LinearizeArrayDimsFunctor>> = std:: | |
numeric_limits<std::size_t>::max(); | |
template<typename ArrayExtents, typename RecordDim, std::size_t Lanes, typename LinearizeArrayDimsFunctor> | |
inline constexpr std::size_t | |
aosoaLanes<mapping::AoSoA<ArrayExtents, RecordDim, Lanes, LinearizeArrayDimsFunctor>> = Lanes; | |
} // namespace internal | |
/// AoSoA copy strategy which transfers data in common blocks. At least one of the two mappings must be an AoSoA
/// mapping; the other one may also be a SoA mapping.
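/// @param readOpt If true, the copy is optimized for contiguous reads from the source view, otherwise for
/// contiguous writes to the destination view.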
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations. | |
/// @param threadCount Optional. Thread count in case of multi-threaded invocation. | |
template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob> | |
void aosoaCommonBlockCopy( | |
const View<SrcMapping, SrcBlob>& srcView, | |
View<DstMapping, DstBlob>& dstView, | |
bool readOpt, | |
std::size_t threadId = 0, | |
std::size_t threadCount = 1) | |
{ | |
// TODO(bgruber): think if we can remove this restriction | |
static_assert( | |
std::is_same_v<typename SrcMapping::RecordDim, typename DstMapping::RecordDim>, | |
"The source and destination record dimensions must be the same"); | |
static_assert( | |
std::is_same_v< | |
typename SrcMapping::LinearizeArrayDimsFunctor, | |
typename DstMapping::LinearizeArrayDimsFunctor>, | |
"Source and destination mapping need to use the same array dimensions linearizer"); | |
using RecordDim = typename SrcMapping::RecordDim; | |
internal::assertTrivialCopyable<RecordDim>(); | |
[[maybe_unused]] static constexpr bool MBSrc = SrcMapping::blobCount > 1; | |
[[maybe_unused]] static constexpr bool MBDst = DstMapping::blobCount > 1; | |
static constexpr auto LanesSrc = internal::aosoaLanes<SrcMapping>; | |
static constexpr auto LanesDst = internal::aosoaLanes<DstMapping>; | |
if(srcView.mapping().extents() != dstView.mapping().extents()) | |
throw std::runtime_error{"Array dimensions sizes are different"}; | |
static constexpr auto srcIsAoSoA = LanesSrc != std::numeric_limits<std::size_t>::max(); | |
static constexpr auto dstIsAoSoA = LanesDst != std::numeric_limits<std::size_t>::max(); | |
static_assert(srcIsAoSoA || dstIsAoSoA, "At least one of the mappings must be an AoSoA mapping"); | |
static_assert( | |
!srcIsAoSoA || std::tuple_size_v<decltype(srcView.storageBlobs)> == 1, | |
"Implementation assumes AoSoA with single blob"); | |
static_assert( | |
!dstIsAoSoA || std::tuple_size_v<decltype(dstView.storageBlobs)> == 1, | |
"Implementation assumes AoSoA with single blob"); | |
const auto flatSize = product(dstView.mapping().extents()); | |
// TODO(bgruber): implement the following by adding additional copy loops for the remaining elements | |
if(!srcIsAoSoA && flatSize % LanesDst != 0) | |
throw std::runtime_error{"Source SoA mapping's total array elements must be evenly divisible by the " | |
"destination AoSoA Lane count."}; | |
if(!dstIsAoSoA && flatSize % LanesSrc != 0) | |
throw std::runtime_error{"Destination SoA mapping's total array elements must be evenly divisible by the " | |
"source AoSoA Lane count."}; | |
// the same as AoSoA::blobNrAndOffset but takes a flat array index | |
auto mapAoSoA = [](std::size_t flatArrayIndex, auto rc, std::size_t Lanes) LLAMA_LAMBDA_INLINE | |
{ | |
const auto blockIndex = flatArrayIndex / Lanes; | |
const auto laneIndex = flatArrayIndex % Lanes; | |
const auto offset = (sizeOf<RecordDim> * Lanes) * blockIndex + offsetOf<RecordDim, decltype(rc)> * Lanes | |
+ sizeof(GetType<RecordDim, decltype(rc)>) * laneIndex; | |
return offset; | |
}; | |
// the same as SoA::blobNrAndOffset but takes a flat array index | |
auto mapSoA = [&](std::size_t flatArrayIndex, auto rc, bool mb) LLAMA_LAMBDA_INLINE | |
{ | |
const auto blob = mb * flatRecordCoord<RecordDim, decltype(rc)>; | |
const auto offset = !mb * offsetOf<RecordDim, decltype(rc)> * flatSize | |
+ sizeof(GetType<RecordDim, decltype(rc)>) * flatArrayIndex; | |
return NrAndOffset{blob, offset}; | |
}; | |
auto mapSrc = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
if constexpr(srcIsAoSoA) | |
return &srcView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, rc, LanesSrc); | |
else | |
{ | |
const auto [blob, off] = mapSoA(flatArrayIndex, rc, MBSrc); | |
return &srcView.storageBlobs[blob][off]; | |
} | |
}; | |
auto mapDst = [&](std::size_t flatArrayIndex, auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
if constexpr(dstIsAoSoA) | |
return &dstView.storageBlobs[0][0] + mapAoSoA(flatArrayIndex, rc, LanesDst); | |
else | |
{ | |
const auto [blob, off] = mapSoA(flatArrayIndex, rc, MBDst); | |
return &dstView.storageBlobs[blob][off]; | |
} | |
}; | |
static constexpr auto L = [] | |
{ | |
if constexpr(srcIsAoSoA && dstIsAoSoA) | |
return std::gcd(LanesSrc, LanesDst); | |
return std::min(LanesSrc, LanesDst); | |
}(); | |
if(readOpt) | |
{ | |
// optimized for linear reading | |
constexpr auto srcL = srcIsAoSoA ? LanesSrc : L; | |
const auto elementsPerThread = flatSize / srcL / threadCount * srcL; | |
{ | |
const auto start = threadId * elementsPerThread; | |
const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread; | |
auto copyLBlock = [&](const std::byte*& threadSrc, std::size_t dstIndex, auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(rc)>); | |
std::memcpy(mapDst(dstIndex, rc), threadSrc, bytes); | |
threadSrc += bytes; | |
}; | |
if constexpr(srcIsAoSoA) | |
{ | |
auto* threadSrc = mapSrc(start, RecordCoord<>{}); | |
for(std::size_t i = start; i < stop; i += LanesSrc) | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
for(std::size_t j = 0; j < LanesSrc; j += L) | |
copyLBlock(threadSrc, i + j, rc); | |
}); | |
} | |
else | |
{ | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
auto* threadSrc = mapSrc(start, rc); | |
for(std::size_t i = start; i < stop; i += L) | |
copyLBlock(threadSrc, i, rc); | |
}); | |
} | |
} | |
} | |
else | |
{ | |
// optimized for linear writing | |
constexpr auto dstL = dstIsAoSoA ? LanesDst : L; | |
const auto elementsPerThread = flatSize / dstL / threadCount * dstL; | |
{ | |
const auto start = threadId * elementsPerThread; | |
const auto stop = threadId == threadCount - 1 ? flatSize : (threadId + 1) * elementsPerThread; | |
auto copyLBlock = [&](std::byte*& threadDst, std::size_t srcIndex, auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
constexpr auto bytes = L * sizeof(GetType<RecordDim, decltype(rc)>); | |
std::memcpy(threadDst, mapSrc(srcIndex, rc), bytes); | |
threadDst += bytes; | |
}; | |
if constexpr(dstIsAoSoA) | |
{ | |
auto* threadDst = mapDst(start, RecordCoord<>{}); | |
for(std::size_t i = start; i < stop; i += LanesDst) | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
for(std::size_t j = 0; j < LanesDst; j += L) | |
copyLBlock(threadDst, i + j, rc); | |
}); | |
} | |
else | |
{ | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) LLAMA_LAMBDA_INLINE | |
{ | |
auto* threadDst = mapDst(start, rc); | |
for(std::size_t i = start; i < stop; i += L) | |
copyLBlock(threadDst, i, rc); | |
}); | |
} | |
} | |
} | |
} | |
/// @brief Generic implementation of \ref copy defaulting to \ref fieldWiseCopy. LLAMA provides several | |
/// specializations of this construct for specific mappings. Users are encouraged to also specialize this template | |
/// with better copy algorithms for further combinations of mappings, if they can and want to provide a better | |
/// implementation. | |
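/// Example of a user-provided specialization (a sketch; MyMappingA and MyMappingB are hypothetical mappings):
///     namespace llama
///     {
///         template<typename ArrayExtents, typename RecordDim>
///         struct Copy<MyMappingA<ArrayExtents, RecordDim>, MyMappingB<ArrayExtents, RecordDim>>
///         {
///             template<typename SrcView, typename DstView>
///             void operator()(const SrcView& src, DstView& dst, std::size_t threadId, std::size_t threadCount)
///                 const
///             {
///                 fieldWiseCopy(src, dst, threadId, threadCount); // or a smarter, mapping-aware strategy
///             }
///         };
///     } // namespace llama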
template<typename SrcMapping, typename DstMapping, typename SFINAE = void> | |
struct Copy | |
{ | |
template<typename SrcView, typename DstView> | |
void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const | |
{ | |
fieldWiseCopy(srcView, dstView, threadId, threadCount); | |
} | |
}; | |
template<typename Mapping> | |
struct Copy<Mapping, Mapping> | |
{ | |
template<typename SrcView, typename DstView> | |
void operator()(const SrcView& srcView, DstView& dstView, std::size_t threadId, std::size_t threadCount) const | |
{ | |
blobMemcpy(srcView, dstView, threadId, threadCount); | |
} | |
}; | |
template< | |
typename ArrayExtents, | |
typename RecordDim, | |
typename LinearizeArrayDims, | |
std::size_t LanesSrc, | |
std::size_t LanesDst> | |
struct Copy< | |
mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>, | |
mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>, | |
std::enable_if_t<LanesSrc != LanesDst>> | |
{ | |
template<typename SrcBlob, typename DstBlob> | |
void operator()( | |
const View<mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView, | |
View<mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView, | |
std::size_t threadId, | |
std::size_t threadCount) | |
{ | |
constexpr auto readOpt = true; // TODO(bgruber): how to choose? | |
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount); | |
} | |
}; | |
template< | |
typename ArrayExtents, | |
typename RecordDim, | |
typename LinearizeArrayDims, | |
std::size_t LanesSrc, | |
bool DstSeparateBuffers> | |
struct Copy< | |
mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>, | |
mapping::SoA<ArrayExtents, RecordDim, DstSeparateBuffers, LinearizeArrayDims>> | |
{ | |
template<typename SrcBlob, typename DstBlob> | |
void operator()( | |
const View<mapping::AoSoA<ArrayExtents, RecordDim, LanesSrc, LinearizeArrayDims>, SrcBlob>& srcView, | |
View<mapping::SoA<ArrayExtents, RecordDim, DstSeparateBuffers, LinearizeArrayDims>, DstBlob>& dstView, | |
std::size_t threadId, | |
std::size_t threadCount) | |
{ | |
constexpr auto readOpt = true; // TODO(bgruber): how to choose? | |
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount); | |
} | |
}; | |
template< | |
typename ArrayExtents, | |
typename RecordDim, | |
typename LinearizeArrayDims, | |
std::size_t LanesDst, | |
bool SrcSeparateBuffers> | |
struct Copy< | |
mapping::SoA<ArrayExtents, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>, | |
mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>> | |
{ | |
template<typename SrcBlob, typename DstBlob> | |
void operator()( | |
const View<mapping::SoA<ArrayExtents, RecordDim, SrcSeparateBuffers, LinearizeArrayDims>, SrcBlob>& | |
srcView, | |
View<mapping::AoSoA<ArrayExtents, RecordDim, LanesDst, LinearizeArrayDims>, DstBlob>& dstView, | |
std::size_t threadId, | |
std::size_t threadCount) | |
{ | |
constexpr auto readOpt = true; // TODO(bgruber): how to choose? | |
aosoaCommonBlockCopy(srcView, dstView, readOpt, threadId, threadCount); | |
} | |
}; | |
/// Copy data from source view to destination view. Both views need to have the same array and record | |
/// dimensions. Delegates to \ref Copy to choose an implementation. | |
/// @param threadId Optional. Zero-based id of calling thread for multi-threaded invocations. | |
/// @param threadCount Optional. Thread count in case of multi-threaded invocation. | |
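/// Example (a sketch; srcMapping and dstMapping are two mappings over the same array and record dimensions):
///     auto srcView = llama::allocView(srcMapping);
///     auto dstView = llama::allocView(dstMapping);
///     llama::copy(srcView, dstView); // delegates to the best matching Copy specialization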
template<typename SrcMapping, typename SrcBlob, typename DstMapping, typename DstBlob> | |
void copy( | |
const View<SrcMapping, SrcBlob>& srcView, | |
View<DstMapping, DstBlob>& dstView, | |
std::size_t threadId = 0, | |
std::size_t threadCount = 1) | |
{ | |
Copy<SrcMapping, DstMapping>{}(srcView, dstView, threadId, threadCount); | |
} | |
} // namespace llama | |
// == | |
// == ./Copy.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./DumpMapping.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
#if !__has_include(<fmt/format.h>) | |
# error DumpMapping.hpp requires the fmt library | |
#endif | |
// #include "ArrayIndexRange.hpp" // amalgamate: file already expanded | |
// #include "Core.hpp" // amalgamate: file already expanded | |
#include <boost/functional/hash.hpp> | |
#include <fmt/format.h> | |
// #include <string> // amalgamate: file already included | |
// #include <vector> // amalgamate: file already included | |
namespace llama | |
{ | |
namespace internal | |
{ | |
template<std::size_t... Coords> | |
auto toVec(RecordCoord<Coords...>) -> std::vector<std::size_t> | |
{ | |
return {Coords...}; | |
} | |
inline auto color(const std::vector<std::size_t>& recordCoord) -> std::size_t | |
{ | |
auto c = boost::hash<std::vector<std::size_t>>{}(recordCoord) & 0xFFFFFF;
c |= 0x404040; // ensure color per channel is at least 0x40. | |
return c; | |
} | |
template<std::size_t Dim> | |
auto formatArrayIndex(const ArrayIndex<Dim>& ai) | |
{ | |
if constexpr(Dim == 1) | |
return std::to_string(ai[0]); | |
else | |
{ | |
std::string s = "{"; | |
for(auto v : ai) | |
{ | |
if(s.size() >= 2) | |
s += ","; | |
s += std::to_string(v); | |
} | |
s += "}"; | |
return s; | |
} | |
} | |
template<std::size_t Dim> | |
struct FieldBox | |
{ | |
ArrayIndex<Dim> arrayIndex; | |
std::vector<std::size_t> recordCoord; | |
std::string recordTags; | |
NrAndOffset nrAndOffset; | |
std::size_t size; | |
}; | |
template<typename Mapping> | |
auto boxesFromMapping(const Mapping& mapping) -> std::vector<FieldBox<Mapping::ArrayIndex::rank>> | |
{ | |
std::vector<FieldBox<Mapping::ArrayIndex::rank>> infos; | |
using RecordDim = typename Mapping::RecordDim; | |
for(auto ai : ArrayIndexRange{mapping.extents()}) | |
{ | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) | |
{ | |
infos.push_back( | |
{ai, | |
internal::toVec(rc), | |
recordCoordTags<RecordDim>(rc), | |
mapping.blobNrAndOffset(ai, rc), | |
sizeof(GetType<RecordDim, decltype(rc)>)}); | |
}); | |
} | |
return infos; | |
} | |
template<std::size_t Dim> | |
auto breakBoxes(std::vector<FieldBox<Dim>> boxes, std::size_t wrapByteCount) -> std::vector<FieldBox<Dim>> | |
{ | |
for(std::size_t i = 0; i < boxes.size(); i++) | |
{ | |
auto& fb = boxes[i]; | |
if(fb.nrAndOffset.offset / wrapByteCount != (fb.nrAndOffset.offset + fb.size - 1) / wrapByteCount) | |
{ | |
const auto remainingSpace = wrapByteCount - fb.nrAndOffset.offset % wrapByteCount; | |
auto newFb = fb; | |
newFb.nrAndOffset.offset = fb.nrAndOffset.offset + remainingSpace; | |
newFb.size = fb.size - remainingSpace; | |
fb.size = remainingSpace; | |
boxes.push_back(newFb); | |
} | |
} | |
return boxes; | |
} | |
inline auto cssClass(std::string tags) | |
{ | |
std::replace(begin(tags), end(tags), '.', '_'); | |
std::replace(begin(tags), end(tags), '<', '_'); | |
std::replace(begin(tags), end(tags), '>', '_'); | |
return tags; | |
}; | |
} // namespace internal | |
/// Returns an SVG image visualizing the memory layout created by the given mapping. The created memory blocks are | |
/// wrapped after wrapByteCount bytes. | |
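/// Example (a sketch; writing the image to a file requires <fstream>):
///     std::ofstream{"layout.svg"} << llama::toSvg(mapping);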
template<typename Mapping> | |
auto toSvg(const Mapping& mapping, std::size_t wrapByteCount = 64, bool breakBoxes = true) -> std::string | |
{ | |
constexpr auto byteSizeInPixel = 30; | |
constexpr auto blobBlockWidth = 60; | |
auto infos = internal::boxesFromMapping(mapping); | |
if(breakBoxes) | |
infos = internal::breakBoxes(std::move(infos), wrapByteCount); | |
std::string svg; | |
std::array<int, Mapping::blobCount + 1> blobYOffset{}; | |
for(std::size_t i = 0; i < Mapping::blobCount; i++) | |
{ | |
const auto blobRows = (mapping.blobSize(i) + wrapByteCount - 1) / wrapByteCount; | |
blobYOffset[i + 1] = blobYOffset[i] + (blobRows + 1) * byteSizeInPixel; // one row gap between blobs | |
const auto height = blobRows * byteSizeInPixel; | |
svg += fmt::format( | |
R"a(<rect x="0" y="{}" width="{}" height="{}" fill="#AAA" stroke="#000"/> | |
<text x="{}" y="{}" fill="#000" text-anchor="middle">Blob: {}</text> | |
)a", | |
blobYOffset[i], | |
blobBlockWidth, | |
height, | |
blobBlockWidth / 2, | |
blobYOffset[i] + height / 2, | |
i); | |
} | |
svg = fmt::format( | |
R"(<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
<svg width="{}" height="{}" xmlns="http://www.w3.org/2000/svg"> | |
<style> | |
.label {{ font: {}px sans-serif; }} | |
</style> | |
)", | |
blobBlockWidth + wrapByteCount * byteSizeInPixel, | |
blobYOffset.back() - byteSizeInPixel, | |
byteSizeInPixel / 2) | |
+ svg; | |
for(const auto& info : infos) | |
{ | |
const auto blobY = blobYOffset[info.nrAndOffset.nr]; | |
auto x = (info.nrAndOffset.offset % wrapByteCount) * byteSizeInPixel + blobBlockWidth; | |
auto y = (info.nrAndOffset.offset / wrapByteCount) * byteSizeInPixel + blobY; | |
const auto fill = internal::color(info.recordCoord); | |
const auto width = byteSizeInPixel * info.size; | |
constexpr auto cropBoxes = true; | |
if(cropBoxes) | |
{ | |
svg += fmt::format( | |
R"(<svg x="{}" y="{}" width="{}" height="{}"> | |
)", | |
x, | |
y, | |
width, | |
byteSizeInPixel); | |
x = 0; | |
y = 0; | |
} | |
svg += fmt::format( | |
R"(<rect x="{}" y="{}" width="{}" height="{}" fill="#{:X}" stroke="#000"/> | |
)", | |
x, | |
y, | |
width, | |
byteSizeInPixel, | |
fill); | |
for(std::size_t i = 1; i < info.size; i++) | |
{ | |
svg += fmt::format( | |
R"(<line x1="{}" y1="{}" x2="{}" y2="{}" stroke="#777"/> | |
)", | |
x + i * byteSizeInPixel, | |
y + byteSizeInPixel * 2 / 3, | |
x + i * byteSizeInPixel, | |
y + byteSizeInPixel); | |
} | |
svg += fmt::format( | |
R"(<text x="{}" y="{}" fill="#000" text-anchor="middle" class="label">{} {}</text> | |
)", | |
x + width / 2, | |
y + byteSizeInPixel * 3 / 4, | |
internal::formatArrayIndex(info.arrayIndex), | |
info.recordTags); | |
if(cropBoxes) | |
svg += R"(</svg> | |
)"; | |
} | |
svg += "</svg>"; | |
return svg; | |
} | |
/// Returns an HTML document visualizing the memory layout created by the given mapping. The visualization is | |
/// resizeable. | |
template<typename Mapping> | |
auto toHtml(const Mapping& mapping) -> std::string | |
{ | |
constexpr auto byteSizeInPixel = 30; | |
constexpr auto rulerLengthInBytes = 512; | |
constexpr auto rulerByteInterval = 8; | |
auto infos = internal::boxesFromMapping(mapping); | |
std::stable_sort( | |
begin(infos), | |
end(infos), | |
[](const auto& a, const auto& b) { | |
return std::tie(a.nrAndOffset.nr, a.nrAndOffset.offset) | |
< std::tie(b.nrAndOffset.nr, b.nrAndOffset.offset); | |
}); | |
infos.erase( | |
std::unique( | |
begin(infos), | |
end(infos), | |
[](const auto& a, const auto& b) { return a.nrAndOffset == b.nrAndOffset; }), | |
end(infos)); | |
std::string html; | |
html += fmt::format( | |
R"(<!DOCTYPE html> | |
<html> | |
<head> | |
<style> | |
.box {{ | |
outline: 1px solid; | |
display: inline-block; | |
white-space: nowrap; | |
height: {}px; | |
background: repeating-linear-gradient(90deg, #0000, #0000 29px, #777 29px, #777 30px); | |
text-align: center; | |
overflow: hidden; | |
vertical-align: middle; | |
}} | |
#ruler {{ | |
background: repeating-linear-gradient(90deg, #0000, #0000 29px, #000 29px, #000 30px); | |
border-bottom: 1px solid; | |
height: 20px; | |
margin-bottom: 20px; | |
}} | |
#ruler div {{ | |
position: absolute; | |
display: inline-block; | |
}} | |
)", | |
byteSizeInPixel); | |
using RecordDim = typename Mapping::RecordDim; | |
forEachLeafCoord<RecordDim>( | |
[&](auto rc) | |
{ | |
constexpr int size = sizeof(GetType<RecordDim, decltype(rc)>); | |
html += fmt::format( | |
R"(.{} {{ | |
width: {}px; | |
background-color: #{:X}; | |
}} | |
)", | |
internal::cssClass(recordCoordTags<RecordDim>(rc)), | |
byteSizeInPixel * size, | |
internal::color(internal::toVec(rc))); | |
}); | |
html += fmt::format(R"(</style> | |
</head> | |
<body> | |
<header id="ruler"> | |
)"); | |
for(auto i = 0; i < rulerLengthInBytes; i += rulerByteInterval) | |
html += fmt::format( | |
R"(</style> | |
<div style="margin-left: {}px;">{}</div>)", | |
i * byteSizeInPixel, | |
i); | |
html += fmt::format(R"( | |
</header> | |
)"); | |
auto currentBlobNr = std::numeric_limits<std::size_t>::max(); | |
for(const auto& info : infos) | |
{ | |
if(currentBlobNr != info.nrAndOffset.nr) | |
{ | |
currentBlobNr = info.nrAndOffset.nr; | |
html += fmt::format("<h1>Blob: {}</h1>", currentBlobNr); | |
} | |
html += fmt::format( | |
R"(<div class="box {0}" title="{1} {2}">{1} {2}</div>)", | |
internal::cssClass(info.recordTags), | |
internal::formatArrayIndex(info.arrayIndex), | |
info.recordTags); | |
} | |
html += R"(</body> | |
</html>)"; | |
return html; | |
} | |
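// A minimal usage sketch (hypothetical file name, assumes <fstream>), dumping the visualization of some mapping | |
// instance, e.g. one of the mappings defined further below: | |
//   std::ofstream{"layout.html"} << llama::toHtml(mapping); | |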
} // namespace llama | |
// == | |
// == ./DumpMapping.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./llama.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
/// \mainpage LLAMA API documentation | |
/// | |
/// LLAMA is a C++17 template header-only library for the abstraction of memory access patterns. It distinguishes | |
/// between the algorithm's view of the memory and the real layout in the background. This enables performance | |
/// portability for multicore, manycore and GPU applications with the very same code. | |
/// | |
/// In contrast to many other solutions, LLAMA can define nested data structures of arbitrary depth and is not | |
/// limited to struct of array and array of struct data layouts. It is also capable of explicitly defining padding, | |
/// blocking, striding and any other run time or compile time access pattern simultaneously. | |
/// | |
/// To achieve this goal, LLAMA is split into mostly independent, orthogonal parts completely written in modern C++17 | |
/// to run on as many architectures and with as many compilers as possible, while still supporting extensions needed | |
/// e.g. to run on GPUs or other many-core hardware. | |
/// | |
/// This page documents the API of LLAMA. The user documentation and an overview about the concepts and ideas can be | |
/// found here: https://llama-doc.rtfd.io | |
/// | |
/// LLAMA is licensed under the LGPL3+. | |
#define LLAMA_VERSION_MAJOR 0 | |
#define LLAMA_VERSION_MINOR 3 | |
#define LLAMA_VERSION_PATCH 0 | |
#ifdef __NVCC__ | |
# pragma push | |
# if __CUDACC_VER_MAJOR__ * 1000 + __CUDACC_VER_MINOR__ >= 11005 | |
# pragma nv_diag_suppress 940 | |
# else | |
# pragma diag_suppress 940 | |
# endif | |
#endif | |
// #include "ArrayExtents.hpp" // amalgamate: file already expanded | |
// #include "ArrayIndexRange.hpp" // amalgamate: file already expanded | |
// #include "BlobAllocators.hpp" // amalgamate: file already expanded | |
// #include "Copy.hpp" // amalgamate: file already expanded | |
// #include "Core.hpp" // amalgamate: file already expanded | |
// #include "Meta.hpp" // amalgamate: file already expanded | |
// #include "Vector.hpp" // amalgamate: file already expanded | |
// #include "View.hpp" // amalgamate: file already expanded | |
// #include "VirtualRecord.hpp" // amalgamate: file already expanded | |
// #include "macros.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/AoS.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
namespace llama::mapping | |
{ | |
/// Array of struct mapping. Used to create a \ref View via \ref allocView. | |
/// \tparam AlignAndPad If true, padding bytes are inserted to guarantee that struct members are properly aligned. | |
/// If false, struct members are tightly packed. | |
/// \tparam TLinearizeArrayDimsFunctor Defines how the array dimensions should be mapped into linear numbers and | |
/// how big the linear domain gets. | |
/// \tparam FlattenRecordDim Defines how the record dimension's fields should be flattened. See \ref | |
/// FlattenRecordDimInOrder, \ref FlattenRecordDimIncreasingAlignment, \ref FlattenRecordDimDecreasingAlignment and | |
/// \ref FlattenRecordDimMinimizePadding. | |
template< | |
typename TArrayExtents, | |
typename TRecordDim, | |
bool AlignAndPad = true, | |
typename TLinearizeArrayDimsFunctor = LinearizeArrayDimsCpp, | |
template<typename> typename FlattenRecordDim = FlattenRecordDimInOrder> | |
struct AoS : private TArrayExtents | |
{ | |
using ArrayExtents = TArrayExtents; | |
using ArrayIndex = typename ArrayExtents::Index; | |
using RecordDim = TRecordDim; | |
using LinearizeArrayDimsFunctor = TLinearizeArrayDimsFunctor; | |
static constexpr std::size_t blobCount = 1; | |
constexpr AoS() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr explicit AoS(ArrayExtents extents, RecordDim = {}) : ArrayExtents(extents) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return *this; | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t | |
{ | |
return LinearizeArrayDimsFunctor{}.size(extents()) | |
* flatSizeOf<typename Flattener::FlatRecordDim, AlignAndPad>; | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const | |
-> NrAndOffset | |
{ | |
constexpr std::size_t flatFieldIndex = | |
#ifdef __NVCC__ | |
*& // mess with nvcc compiler state to workaround bug | |
#endif | |
Flattener::template flatIndex<RecordCoords...>; | |
const auto offset | |
= LinearizeArrayDimsFunctor{}(ai, extents()) | |
* flatSizeOf< | |
typename Flattener::FlatRecordDim, | |
AlignAndPad> + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>; | |
return {0, offset}; | |
} | |
private: | |
using Flattener = FlattenRecordDim<TRecordDim>; | |
}; | |
/// Array of struct mapping preserving the alignment of the field types by inserting padding. | |
/// \see AoS | |
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
using AlignedAoS = AoS<ArrayExtents, RecordDim, true, LinearizeArrayDimsFunctor>; | |
/// Array of struct mapping preserving the alignment of the field types by inserting padding and permuting the | |
/// field order to minimize this padding. \see AoS | |
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
using MinAlignedAoS | |
= AoS<ArrayExtents, RecordDim, true, LinearizeArrayDimsFunctor, FlattenRecordDimMinimizePadding>; | |
/// Array of struct mapping packing the field types tightly, violating the types' alignment requirements. | |
/// \see AoS | |
template<typename ArrayExtents, typename RecordDim, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
using PackedAoS = AoS<ArrayExtents, RecordDim, false, LinearizeArrayDimsFunctor>; | |
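// A minimal usage sketch, assuming the llama::allocView and llama::ArrayExtentsDynamic facilities from the parts | |
// of this header expanded earlier (the tags X and Y and the extent 1024 are illustrative): | |
//   struct X{}; | |
//   struct Y{}; | |
//   using Vec = llama::Record<llama::Field<X, float>, llama::Field<Y, double>>; | |
//   using Extents = llama::ArrayExtentsDynamic<1>; | |
//   auto view = llama::allocView(llama::mapping::AlignedAoS<Extents, Vec>{Extents{1024}}); | |
//   view(0)(Y{}) = 3.14; // first element, field Y | |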
template<bool AlignAndPad = true, typename LinearizeArrayDimsFunctor = LinearizeArrayDimsCpp> | |
struct PreconfiguredAoS | |
{ | |
template<typename ArrayExtents, typename RecordDim> | |
using type = AoS<ArrayExtents, RecordDim, AlignAndPad, LinearizeArrayDimsFunctor>; | |
}; | |
template<typename Mapping> | |
inline constexpr bool isAoS = false; | |
template< | |
typename ArrayExtents, | |
typename RecordDim, | |
bool AlignAndPad, | |
typename LinearizeArrayDimsFunctor, | |
template<typename> | |
typename FlattenRecordDim> | |
inline constexpr bool | |
isAoS<AoS<ArrayExtents, RecordDim, AlignAndPad, LinearizeArrayDimsFunctor, FlattenRecordDim>> = true; | |
} // namespace llama::mapping | |
// == | |
// == ./mapping/AoS.hpp == | |
// ============================================================================ | |
// #include "mapping/AoSoA.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/Bytesplit.hpp == | |
// == | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
namespace llama::mapping | |
{ | |
namespace internal | |
{ | |
template<typename T> | |
using ReplaceByByteArray = std::byte[sizeof(T)]; | |
template<typename RecordDim> | |
using SplitBytes = TransformLeaves<RecordDim, ReplaceByByteArray>; | |
} // namespace internal | |
template<typename TArrayExtents, typename TRecordDim, template<typename, typename> typename InnerMapping> | |
struct Bytesplit : private InnerMapping<TArrayExtents, internal::SplitBytes<TRecordDim>> | |
{ | |
using Inner = InnerMapping<TArrayExtents, internal::SplitBytes<TRecordDim>>; | |
using ArrayExtents = typename Inner::ArrayExtents; | |
using ArrayIndex = typename Inner::ArrayIndex; | |
using RecordDim = TRecordDim; // hide Inner::RecordDim | |
using Inner::blobCount; | |
using Inner::blobSize; | |
using Inner::extents; | |
using Inner::Inner; | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr explicit Bytesplit(TArrayExtents extents, TRecordDim = {}) : Inner(extents) | |
{ | |
} | |
template<std::size_t... RecordCoords> | |
static constexpr auto isComputed(RecordCoord<RecordCoords...>) | |
{ | |
return true; | |
} | |
template<typename QualifiedBase, typename RC, typename BlobArray> | |
struct Reference | |
{ | |
QualifiedBase& innerMapping; | |
ArrayIndex ai; | |
BlobArray& blobs; | |
using DstType = GetType<TRecordDim, RC>; | |
// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions) | |
operator DstType() const | |
{ | |
DstType v; | |
auto* p = reinterpret_cast<std::byte*>(&v); | |
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<sizeof(DstType)>>( | |
[&](auto ic) | |
{ | |
constexpr auto i = decltype(ic)::value; | |
const auto [nr, off] = innerMapping.blobNrAndOffset(ai, Cat<RC, RecordCoord<i>>{}); | |
p[i] = blobs[nr][off]; | |
}); | |
return v; | |
} | |
auto operator=(DstType v) -> Reference& | |
{ | |
auto* p = reinterpret_cast<std::byte*>(&v); | |
boost::mp11::mp_for_each<boost::mp11::mp_iota_c<sizeof(DstType)>>( | |
[&](auto ic) | |
{ | |
constexpr auto i = decltype(ic)::value; | |
const auto [nr, off] = innerMapping.blobNrAndOffset(ai, Cat<RC, RecordCoord<i>>{}); | |
blobs[nr][off] = p[i]; | |
}); | |
return *this; | |
} | |
}; | |
template<std::size_t... RecordCoords, typename BlobArray> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute( | |
typename Inner::ArrayIndex ai, | |
RecordCoord<RecordCoords...>, | |
BlobArray& blobs) const | |
{ | |
return Reference<decltype(*this), RecordCoord<RecordCoords...>, BlobArray>{*this, ai, blobs}; | |
} | |
}; | |
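// A minimal sketch, reusing Vec and Extents from the AoS sketch above: every field is replaced by a byte array of | |
// equal size, which the inner mapping then lays out; reads and writes go through the Reference proxy returned by | |
// compute(): | |
//   using ByteSplitAoS = llama::mapping::Bytesplit<Extents, Vec, llama::mapping::PreconfiguredAoS<>::type>; | |
//   auto view = llama::allocView(ByteSplitAoS{Extents{1024}}); | |
//   view(0)(Y{}) = 3.14; // written as sizeof(double) individual bytes | |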
} // namespace llama::mapping | |
// == | |
// == ./mapping/Bytesplit.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./mapping/Heatmap.hpp == | |
// == | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
// #include <array> // amalgamate: file already included | |
#include <atomic> | |
#include <sstream> | |
// #include <vector> // amalgamate: file already included | |
namespace llama::mapping | |
{ | |
/// Forwards all calls to the inner mapping. Counts all accesses made to all bytes, allowing a heatmap to be | |
/// extracted. | |
/// \tparam Mapping The type of the inner mapping. | |
template<typename Mapping, typename CountType = std::size_t> | |
struct Heatmap | |
{ | |
using ArrayExtents = typename Mapping::ArrayExtents; | |
using ArrayIndex = typename Mapping::ArrayIndex; | |
using RecordDim = typename Mapping::RecordDim; | |
static constexpr std::size_t blobCount = Mapping::blobCount; | |
constexpr Heatmap() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
explicit Heatmap(Mapping mapping) : mapping(mapping) | |
{ | |
for(std::size_t i = 0; i < blobCount; i++) | |
byteHits[i] = std::vector<std::atomic<CountType>>(blobSize(i)); | |
} | |
Heatmap(const Heatmap&) = delete; | |
auto operator=(const Heatmap&) -> Heatmap& = delete; | |
Heatmap(Heatmap&&) noexcept = default; | |
auto operator=(Heatmap&&) noexcept -> Heatmap& = default; | |
~Heatmap() = default; | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return mapping.extents(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t i) const -> std::size_t | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return mapping.blobSize(i); | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> rc = {}) const | |
-> NrAndOffset | |
{ | |
const auto nao = mapping.blobNrAndOffset(ai, rc); | |
for(std::size_t i = 0; i < sizeof(GetType<RecordDim, RecordCoord<RecordCoords...>>); i++) | |
byteHits[nao.nr][nao.offset + i]++; | |
return nao; | |
} | |
auto toGnuplotScript(std::size_t wrapAfterBytes = 64) const -> std::string | |
{ | |
std::stringstream f; | |
f << "#!/usr/bin/gnuplot -p\n$data << EOD\n"; | |
for(std::size_t i = 0; i < blobCount; i++) | |
{ | |
std::size_t byteCount = 0; | |
for(const auto& hits : byteHits[i]) | |
f << hits << ((++byteCount % wrapAfterBytes == 0) ? '\n' : ' '); | |
while(byteCount++ % wrapAfterBytes != 0) | |
f << "0 "; | |
f << '\n'; | |
} | |
f << R"(EOD | |
set view map | |
set xtics format "" | |
set x2tics autofreq 8 | |
set yrange [] reverse | |
set link x2; set link y2 | |
set ylabel "Cacheline" | |
set x2label "Byte" | |
plot $data matrix with image axes x2y1 | |
)"; | |
return f.str(); | |
} | |
Mapping mapping; | |
mutable std::array<std::vector<std::atomic<CountType>>, blobCount> byteHits; | |
}; | |
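// A minimal sketch, reusing Vec and Extents from the AoS sketch above (assumes <fstream>, and that the view makes | |
// its mapping accessible as in the View definition expanded earlier): | |
//   auto view = llama::allocView(llama::mapping::Heatmap{llama::mapping::AlignedAoS<Extents, Vec>{Extents{1024}}}); | |
//   // ... run the accesses under investigation on view ... | |
//   std::ofstream{"heatmap.sh"} << view.mapping.toGnuplotScript(); | |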
} // namespace llama::mapping | |
// == | |
// == ./mapping/Heatmap.hpp == | |
// ============================================================================ | |
// #include "mapping/One.hpp" // amalgamate: file already expanded | |
// #include "mapping/SoA.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/Split.hpp == | |
// == | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
namespace llama::mapping | |
{ | |
namespace internal | |
{ | |
template<typename... Fields, std::size_t FirstCoord, std::size_t... Coords> | |
auto partitionRecordDim(Record<Fields...>, RecordCoord<FirstCoord, Coords...>) | |
{ | |
using namespace boost::mp11; | |
using Rec = Record<Fields...>; | |
if constexpr(sizeof...(Coords) == 0) | |
{ | |
using Part1 = Record<mp_at_c<Rec, FirstCoord>>; | |
using Part2 = mp_erase_c<Rec, FirstCoord, FirstCoord + 1>; | |
return mp_list<Part1, Part2>{}; | |
} | |
else | |
{ | |
using FieldTag = GetTag<Rec, RecordCoord<FirstCoord>>; | |
using FieldType = GetType<Rec, RecordCoord<FirstCoord>>; | |
using InnerPartition = decltype(partitionRecordDim(FieldType{}, RecordCoord<Coords...>{})); | |
using Part1 = Record<Field<FieldTag, mp_first<InnerPartition>>>; | |
using Part2 = mp_replace_at_c<Rec, FirstCoord, Field<FieldTag, mp_second<InnerPartition>>>; | |
return mp_list<Part1, Part2>{}; | |
} | |
} | |
template<typename Acc, typename TagList> | |
struct PartitionFoldOpImpl | |
{ | |
using Part1Before = boost::mp11::mp_first<Acc>; | |
using Part2Before = boost::mp11::mp_second<Acc>; | |
using R = decltype(partitionRecordDim(Part2Before{}, GetCoordFromTags<Part2Before, TagList>{})); | |
using Part1After = boost::mp11::mp_first<R>; | |
using Part2After = boost::mp11::mp_second<R>; | |
using type = boost::mp11::mp_list<MergedRecordDims<Part1Before, Part1After>, Part2After>; | |
}; | |
template<typename Acc, typename TagList> | |
using PartitionFoldOp = typename PartitionFoldOpImpl<Acc, TagList>::type; | |
template<typename... Fields, typename... RCs> | |
auto partitionRecordDim(Record<Fields...>, boost::mp11::mp_list<RCs...>) | |
{ | |
using namespace boost::mp11; | |
using Initial = mp_list<Record<>, Record<Fields...>>; // initially, nothing selected for mapping 1 | |
return mp_fold<mp_list<GetTags<Record<Fields...>, RCs>...>, Initial, PartitionFoldOp>{}; | |
} | |
// workaround for nvcc 11.3 and below: we cannot put the decltype() directly into the Split class | |
template<typename RecordDim, typename RecordCoordForMapping1> | |
struct PartitionedRecordDim | |
{ | |
using type = decltype(partitionRecordDim(RecordDim{}, RecordCoordForMapping1{})); | |
}; | |
template<typename RC, typename RecordCoordForMapping1> | |
inline constexpr bool isSelected = RecordCoordCommonPrefixIsSame<RecordCoordForMapping1, RC>; | |
template<typename RC> | |
struct IsSelectedPredicate | |
{ | |
template<typename RecordCoordForMapping1> | |
using fn = boost::mp11::mp_bool<isSelected<RC, RecordCoordForMapping1>>; | |
}; | |
template<typename RC, typename... RecordCoordsForMapping1> | |
inline constexpr bool isSelected<RC, boost::mp11::mp_list<RecordCoordsForMapping1...>> = boost::mp11:: | |
mp_any_of_q<boost::mp11::mp_list<RecordCoordsForMapping1...>, IsSelectedPredicate<RC>>::value; | |
} // namespace internal | |
/// Mapping which splits off a part of the record dimension and maps it differently than the rest. | |
/// \tparam RecordCoordForMapping1 A \ref RecordCoord or a list of RecordCoords selecting the part of the record | |
/// dimension to be mapped differently. | |
/// \tparam MappingTemplate1 The mapping used for the selected part of the record dimension. | |
/// \tparam MappingTemplate2 The mapping used for the not selected part of the record dimension. | |
/// \tparam SeparateBlobs If true, both pieces of the record dimension are mapped to separate blobs. | |
template< | |
typename TArrayExtents, | |
typename TRecordDim, | |
typename RecordCoordForMapping1, | |
template<typename...> | |
typename MappingTemplate1, | |
template<typename...> | |
typename MappingTemplate2, | |
bool SeparateBlobs = false> | |
struct Split | |
{ | |
using ArrayExtents = TArrayExtents; | |
using ArrayIndex = typename ArrayExtents::Index; | |
using RecordDim = TRecordDim; | |
using RecordDimPartitions = typename internal::PartitionedRecordDim<RecordDim, RecordCoordForMapping1>::type; | |
using RecordDim1 = boost::mp11::mp_first<RecordDimPartitions>; | |
using RecordDim2 = boost::mp11::mp_second<RecordDimPartitions>; | |
using Mapping1 = MappingTemplate1<ArrayExtents, RecordDim1>; | |
using Mapping2 = MappingTemplate2<ArrayExtents, RecordDim2>; | |
static constexpr std::size_t blobCount = SeparateBlobs ? Mapping1::blobCount + Mapping2::blobCount : 1; | |
static_assert(SeparateBlobs || Mapping1::blobCount == 1); | |
static_assert(SeparateBlobs || Mapping2::blobCount == 1); | |
constexpr Split() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
constexpr explicit Split(ArrayExtents extents) : mapping1(extents), mapping2(extents) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return mapping1.extents(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize([[maybe_unused]] std::size_t i) const -> std::size_t | |
{ | |
if constexpr(SeparateBlobs) | |
{ | |
if(i < Mapping1::blobCount) | |
return mapping1.blobSize(i); | |
return mapping2.blobSize(i - Mapping1::blobCount); | |
} | |
else | |
return mapping1.blobSize(0) + mapping2.blobSize(0); | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const | |
-> NrAndOffset | |
{ | |
using Tags = GetTags<RecordDim, RecordCoord<RecordCoords...>>; | |
if constexpr(internal::isSelected<RecordCoord<RecordCoords...>, RecordCoordForMapping1>) | |
return mapping1.blobNrAndOffset(ai, GetCoordFromTags<RecordDim1, Tags>{}); | |
else | |
{ | |
auto nrAndOffset = mapping2.blobNrAndOffset(ai, GetCoordFromTags<RecordDim2, Tags>{}); | |
if constexpr(SeparateBlobs) | |
nrAndOffset.nr += Mapping1::blobCount; | |
else | |
{ | |
for(std::size_t i = 0; i < Mapping1::blobCount; i++) | |
nrAndOffset.offset += mapping1.blobSize(i); | |
} | |
return nrAndOffset; | |
} | |
} | |
Mapping1 mapping1; | |
Mapping2 mapping2; | |
}; | |
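// A minimal sketch, reusing Vec and Extents from the AoS sketch above: field X (record coordinate 0) is split off | |
// and mapped into its own blob, while the remaining record dimension is handled by the second mapping: | |
//   using SplitMapping = llama::mapping:: | |
//       Split<Extents, Vec, llama::RecordCoord<0>, llama::mapping::PackedAoS, llama::mapping::PackedAoS, true>; | |
//   static_assert(SplitMapping::blobCount == 2); | |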
template< | |
typename RecordCoordsForMapping1, | |
template<typename...> | |
typename MappingTemplate1, | |
template<typename...> | |
typename MappingTemplate2, | |
bool SeparateBlobs = false> | |
struct PreconfiguredSplit | |
{ | |
template<typename ArrayExtents, typename RecordDim> | |
using type = Split< | |
ArrayExtents, | |
RecordDim, | |
RecordCoordsForMapping1, | |
MappingTemplate1, | |
MappingTemplate2, | |
SeparateBlobs>; | |
}; | |
} // namespace llama::mapping | |
// == | |
// == ./mapping/Split.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./mapping/Trace.hpp == | |
// == | |
// #pragma once | |
// #include "Common.hpp" // amalgamate: file already expanded | |
// #include <atomic> // amalgamate: file already included | |
// #include <iostream> // amalgamate: file already included | |
// #include <string> // amalgamate: file already included | |
#include <unordered_map> | |
namespace llama::mapping | |
{ | |
/// Forwards all calls to the inner mapping. Traces all accesses made through this mapping and prints a summary on | |
/// destruction. | |
/// \tparam Mapping The type of the inner mapping. | |
template<typename Mapping> | |
struct Trace | |
{ | |
using ArrayExtents = typename Mapping::ArrayExtents; | |
using ArrayIndex = typename Mapping::ArrayIndex; | |
using RecordDim = typename Mapping::RecordDim; | |
static constexpr std::size_t blobCount = Mapping::blobCount; | |
constexpr Trace() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
explicit Trace(Mapping mapping, bool printOnDestruction = true) | |
: mapping(mapping) | |
, printOnDestruction(printOnDestruction) | |
{ | |
forEachLeafCoord<RecordDim>([&](auto rc) { fieldHits[recordCoordTags<RecordDim>(rc)] = 0; }); | |
} | |
Trace(const Trace&) = delete; | |
auto operator=(const Trace&) -> Trace& = delete; | |
Trace(Trace&&) noexcept = default; | |
auto operator=(Trace&&) noexcept -> Trace& = default; | |
~Trace() | |
{ | |
if(printOnDestruction && !fieldHits.empty()) | |
print(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents | |
{ | |
return mapping.extents(); | |
} | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t i) const -> std::size_t | |
{ | |
LLAMA_FORCE_INLINE_RECURSIVE | |
return mapping.blobSize(i); | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> rc = {}) const | |
-> NrAndOffset | |
{ | |
const static auto name = recordCoordTags<RecordDim>(RecordCoord<RecordCoords...>{}); | |
fieldHits.at(name)++; | |
LLAMA_FORCE_INLINE_RECURSIVE return mapping.blobNrAndOffset(ai, rc); | |
} | |
void print() const | |
{ | |
std::cout << "Trace mapping, number of accesses:\n"; | |
for(const auto& [k, v] : fieldHits) | |
std::cout << '\t' << k << ":\t" << v << '\n'; | |
} | |
Mapping mapping; | |
mutable std::unordered_map<std::string, std::atomic<std::size_t>> fieldHits; | |
bool printOnDestruction; | |
}; | |
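// A minimal sketch, reusing Vec and Extents from the AoS sketch above: each field access through the view bumps a | |
// counter, and the per-field totals are printed when the view (and thus the Trace mapping) is destroyed: | |
//   auto view = llama::allocView(llama::mapping::Trace{llama::mapping::AlignedAoS<Extents, Vec>{Extents{1024}}}); | |
//   view(0)(X{}) += 1.0f; // counted as one access to field X | |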
} // namespace llama::mapping | |
// == | |
// == ./mapping/Trace.hpp == | |
// ============================================================================ | |
// ============================================================================ | |
// == ./mapping/tree/Mapping.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "../Common.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/tree/Functors.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// ============================================================================ | |
// == ./mapping/tree/TreeFromDimensions.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "../../Core.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./Tuple.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "Meta.hpp" // amalgamate: file already expanded | |
// #include "macros.hpp" // amalgamate: file already expanded | |
namespace llama | |
{ | |
template<typename... Elements> | |
struct Tuple | |
{ | |
}; | |
/// Tuple class like `std::tuple` but suitable for use with offloading devices like GPUs. | |
template<typename TFirstElement, typename... Elements> | |
struct Tuple<TFirstElement, Elements...> | |
{ | |
using FirstElement = TFirstElement; | |
using RestTuple = Tuple<Elements...>; | |
constexpr Tuple() = default; | |
/// Construct a tuple from values of the same types as the tuple stores. | |
LLAMA_FN_HOST_ACC_INLINE constexpr explicit Tuple(FirstElement first, Elements... rest) | |
: first(std::move(first)) | |
, rest(std::move(rest)...) | |
{ | |
} | |
/// Construct a tuple from forwarded values of potentially different types as the tuple stores. | |
// SFINAE away this ctor if tuple elements cannot be constructed from ctor arguments | |
template< | |
typename T, | |
typename... Ts, | |
std::enable_if_t< | |
sizeof...(Elements) == sizeof...(Ts) | |
&& std::is_constructible_v<TFirstElement, T> && (std::is_constructible_v<Elements, Ts> && ...), | |
int> = 0> | |
LLAMA_FN_HOST_ACC_INLINE constexpr explicit Tuple(T&& firstArg, Ts&&... restArgs) | |
: first(std::forward<T>(firstArg)) | |
, rest(std::forward<Ts>(restArgs)...) | |
{ | |
} | |
FirstElement first; ///< the first element (if existing) | |
#ifndef __NVCC__ | |
[[no_unique_address]] // nvcc 11.3 ICE | |
#endif | |
RestTuple rest; ///< the remaining elements | |
}; | |
template<typename... Elements> | |
Tuple(Elements...) -> Tuple<std::remove_cv_t<std::remove_reference_t<Elements>>...>; | |
template<std::size_t Pos, typename... Elements> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto get(Tuple<Elements...>& tuple) -> auto& | |
{ | |
if constexpr(Pos == 0) | |
return tuple.first; | |
else | |
return get<Pos - 1>(tuple.rest); | |
} | |
template<std::size_t Pos, typename... Elements> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto get(const Tuple<Elements...>& tuple) -> const auto& | |
{ | |
if constexpr(Pos == 0) | |
return tuple.first; | |
else | |
return get<Pos - 1>(tuple.rest); | |
} | |
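// A minimal sketch of constructing and accessing a Tuple (class template argument deduction via the guide above): | |
//   llama::Tuple t{1, 2.0f, 'c'}; // deduced as llama::Tuple<int, float, char> | |
//   llama::get<1>(t) = 4.2f; // element access analogous to std::get | |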
} // namespace llama | |
template<typename... Elements> | |
struct std::tuple_size<llama::Tuple<Elements...>> | |
{ | |
static constexpr auto value = sizeof...(Elements); | |
}; | |
template<std::size_t I, typename... Elements> | |
struct std::tuple_element<I, llama::Tuple<Elements...>> | |
{ | |
using type = boost::mp11::mp_at_c<llama::Tuple<Elements...>, I>; | |
}; | |
namespace llama | |
{ | |
namespace internal | |
{ | |
template<typename... Elements, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto areEqual( | |
const Tuple<Elements...>& a, | |
const Tuple<Elements...>& b, | |
std::index_sequence<Is...>) -> bool | |
{ | |
return ((get<Is>(a) == get<Is>(b)) && ...); | |
} | |
} // namespace internal | |
template<typename... ElementsA, typename... ElementsB> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator==(const Tuple<ElementsA...>& a, const Tuple<ElementsB...>& b) | |
-> bool | |
{ | |
using namespace boost::mp11; | |
if constexpr(sizeof...(ElementsA) == sizeof...(ElementsB)) | |
if constexpr(mp_apply<mp_all, mp_transform<std::is_same, mp_list<ElementsA...>, mp_list<ElementsB...>>>:: | |
value) | |
return internal::areEqual(a, b, std::make_index_sequence<sizeof...(ElementsA)>{}); | |
return false; | |
} | |
template<typename... ElementsA, typename... ElementsB> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto operator!=(const Tuple<ElementsA...>& a, const Tuple<ElementsB...>& b) | |
-> bool | |
{ | |
return !(a == b); | |
} | |
namespace internal | |
{ | |
template<typename Tuple1, typename Tuple2, size_t... Is1, size_t... Is2> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleCatImpl( | |
const Tuple1& t1, | |
const Tuple2& t2, | |
std::index_sequence<Is1...>, | |
std::index_sequence<Is2...>) | |
{ | |
return Tuple{get<Is1>(t1)..., get<Is2>(t2)...}; | |
} | |
} // namespace internal | |
template<typename Tuple1, typename Tuple2> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleCat(const Tuple1& t1, const Tuple2& t2) | |
{ | |
return internal::tupleCatImpl( | |
t1, | |
t2, | |
std::make_index_sequence<std::tuple_size_v<Tuple1>>{}, | |
std::make_index_sequence<std::tuple_size_v<Tuple2>>{}); | |
} | |
namespace internal | |
{ | |
template<std::size_t Pos, typename Tuple, typename Replacement> | |
struct TupleReplaceImpl | |
{ | |
LLAMA_FN_HOST_ACC_INLINE | |
auto operator()(Tuple const tuple, Replacement const replacement) | |
{ | |
return tupleCat( | |
llama::Tuple{tuple.first}, | |
TupleReplaceImpl<Pos - 1, typename Tuple::RestTuple, Replacement>()(tuple.rest, replacement)); | |
}; | |
}; | |
template<typename... Elements, typename Replacement> | |
struct TupleReplaceImpl<0, Tuple<Elements...>, Replacement> | |
{ | |
LLAMA_FN_HOST_ACC_INLINE | |
auto operator()(Tuple<Elements...> tuple, Replacement const replacement) | |
{ | |
return tupleCat(Tuple{replacement}, tuple.rest); | |
}; | |
}; | |
template<typename OneElement, typename Replacement> | |
struct TupleReplaceImpl<0, Tuple<OneElement>, Replacement> | |
{ | |
LLAMA_FN_HOST_ACC_INLINE | |
auto operator()(Tuple<OneElement>, Replacement const replacement) | |
{ | |
return Tuple{replacement}; | |
} | |
}; | |
} // namespace internal | |
/// Creates a copy of a tuple with the element at position Pos replaced by replacement. | |
template<std::size_t Pos, typename Tuple, typename Replacement> | |
LLAMA_FN_HOST_ACC_INLINE auto tupleReplace(Tuple tuple, Replacement replacement) | |
{ | |
return internal::TupleReplaceImpl<Pos, Tuple, Replacement>()(tuple, replacement); | |
} | |
namespace internal | |
{ | |
template<size_t... Is, typename... Elements, typename Functor> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleTransformHelper( | |
std::index_sequence<Is...>, | |
const Tuple<Elements...>& tuple, | |
const Functor& functor) | |
{ | |
// FIXME(bgruber): nvcc fails to compile | |
// Tuple{functor(get<Is>(tuple))...} | |
return Tuple<decltype(functor(std::declval<Elements>()))...>{functor(get<Is>(tuple))...}; | |
} | |
} // namespace internal | |
/// Applies a functor to every element of a tuple, creating a new tuple with the result of the element | |
/// transformations. The functor needs to implement a template `operator()` to which all tuple elements are passed. | |
// TODO(bgruber): replace by mp11 version in Boost 1.74. | |
template<typename... Elements, typename Functor> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto tupleTransform(const Tuple<Elements...>& tuple, const Functor& functor) | |
{ | |
return internal::tupleTransformHelper(std::make_index_sequence<sizeof...(Elements)>{}, tuple, functor); | |
} | |
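// A minimal sketch, reusing t from the Tuple sketch above; a generic lambda provides the required template | |
// operator(): | |
//   auto doubled = llama::tupleTransform(t, [](auto v) { return v + v; }); // yields llama::Tuple<int, float, int> | |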
/// Returns a copy of the tuple without the first element. | |
template<typename... Elements> | |
LLAMA_FN_HOST_ACC_INLINE constexpr auto pop_front(const Tuple<Elements...>& tuple) | |
{ | |
return tuple.rest; | |
} | |
} // namespace llama | |
// == | |
// == ./Tuple.hpp == | |
// ============================================================================ | |
// #include <cstddef> // amalgamate: file already included | |
// #include <string> // amalgamate: file already included | |
// #include <type_traits> // amalgamate: file already included | |
namespace llama::mapping::tree | |
{ | |
template<typename T> | |
inline constexpr auto one = 1; | |
template<> | |
inline constexpr auto one<boost::mp11::mp_size_t<1>> = boost::mp11::mp_size_t<1>{}; | |
template<typename TIdentifier, typename TType, typename CountType = std::size_t> | |
struct Leaf | |
{ | |
using Identifier = TIdentifier; | |
using Type = TType; | |
const CountType count = one<CountType>; | |
}; | |
template<typename TIdentifier, typename TChildrenTuple, typename CountType = std::size_t> | |
struct Node | |
{ | |
using Identifier = TIdentifier; | |
using ChildrenTuple = TChildrenTuple; | |
const CountType count = one<CountType>; | |
const ChildrenTuple childs = {}; | |
}; | |
template<std::size_t ChildIndex = 0, typename ArrayIndexType = std::size_t> | |
struct TreeCoordElement | |
{ | |
static constexpr boost::mp11::mp_size_t<ChildIndex> childIndex = {}; | |
const ArrayIndexType arrayIndex = {}; | |
}; | |
template<std::size_t... Coords> | |
using TreeCoord = Tuple<TreeCoordElement<Coords, boost::mp11::mp_size_t<0>>...>; | |
namespace internal | |
{ | |
template<typename... Coords, std::size_t... Is> | |
auto treeCoordToString(Tuple<Coords...> treeCoord, std::index_sequence<Is...>) -> std::string | |
{ | |
auto s | |
= ((std::to_string(get<Is>(treeCoord).arrayIndex) + ":" + std::to_string(get<Is>(treeCoord).childIndex) | |
+ ", ") | |
+ ...); | |
s.resize(s.length() - 2); | |
return s; | |
} | |
} // namespace internal | |
template<typename TreeCoord> | |
auto treeCoordToString(TreeCoord treeCoord) -> std::string | |
{ | |
return std::string("[ ") | |
+ internal::treeCoordToString(treeCoord, std::make_index_sequence<std::tuple_size_v<TreeCoord>>{}) | |
+ std::string(" ]"); | |
} | |
namespace internal | |
{ | |
template<typename Tag, typename RecordDim, typename CountType> | |
struct CreateTreeElement | |
{ | |
using type = Leaf<Tag, RecordDim, boost::mp11::mp_size_t<1>>; | |
}; | |
template<typename Tag, typename... Fields, typename CountType> | |
struct CreateTreeElement<Tag, Record<Fields...>, CountType> | |
{ | |
using type = Node< | |
Tag, | |
Tuple< | |
typename CreateTreeElement<GetFieldTag<Fields>, GetFieldType<Fields>, boost::mp11::mp_size_t<1>>:: | |
type...>, | |
CountType>; | |
}; | |
template<typename Tag, typename ChildType, std::size_t Count, typename CountType> | |
struct CreateTreeElement<Tag, ChildType[Count], CountType> | |
{ | |
template<std::size_t... Is> | |
static auto createChildren(std::index_sequence<Is...>) | |
{ | |
return Tuple< | |
typename CreateTreeElement<RecordCoord<Is>, ChildType, boost::mp11::mp_size_t<1>>::type...>{}; | |
} | |
using type = Node<Tag, decltype(createChildren(std::make_index_sequence<Count>{})), CountType>; | |
}; | |
template<typename Leaf, std::size_t Count> | |
struct WrapInNNodes | |
{ | |
using type = Node<NoName, Tuple<typename WrapInNNodes<Leaf, Count - 1>::type>>; | |
}; | |
template<typename Leaf> | |
struct WrapInNNodes<Leaf, 0> | |
{ | |
using type = Leaf; | |
}; | |
template<typename RecordDim> | |
using TreeFromRecordDimImpl = typename CreateTreeElement<NoName, RecordDim, std::size_t>::type; | |
} // namespace internal | |
template<typename RecordDim> | |
using TreeFromRecordDim = internal::TreeFromRecordDimImpl<RecordDim>; | |
template<typename ArrayExtents, typename RecordDim> | |
using TreeFromDimensions = | |
typename internal::WrapInNNodes<internal::TreeFromRecordDimImpl<RecordDim>, ArrayExtents::rank - 1>::type; | |
template<typename RecordDim, std::size_t N, std::size_t Pos = 0> | |
LLAMA_FN_HOST_ACC_INLINE auto createTree(const ArrayIndex<N>& size) | |
{ | |
if constexpr(Pos == N - 1) | |
return TreeFromRecordDim<RecordDim>{size[N - 1]}; | |
else | |
{ | |
Tuple inner{createTree<RecordDim, N, Pos + 1>(size)}; | |
return Node<NoName, decltype(inner)>{size[Pos], inner}; | |
} | |
}; | |
namespace internal | |
{ | |
template< | |
typename ArrayIndex, | |
std::size_t... ADIndices, | |
std::size_t FirstRecordCoord, | |
std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE auto createTreeCoord( | |
const ArrayIndex& ai, | |
std::index_sequence<ADIndices...>, | |
RecordCoord<FirstRecordCoord, RecordCoords...>) | |
{ | |
return Tuple{ | |
TreeCoordElement<(ADIndices == ArrayIndex::rank - 1 ? FirstRecordCoord : 0)>{ai[ADIndices]}..., | |
TreeCoordElement<RecordCoords, boost::mp11::mp_size_t<0>>{}..., | |
TreeCoordElement<0, boost::mp11::mp_size_t<0>>{}}; | |
} | |
} // namespace internal | |
template<typename RecordCoord, typename ArrayIndex> | |
LLAMA_FN_HOST_ACC_INLINE auto createTreeCoord(const ArrayIndex& ai) | |
{ | |
return internal::createTreeCoord(ai, std::make_index_sequence<ArrayIndex::rank>{}, RecordCoord{}); | |
} | |
} // namespace llama::mapping::tree | |
// == | |
// == ./mapping/tree/TreeFromDimensions.hpp == | |
// ============================================================================ | |
namespace llama::mapping::tree::functor | |
{ | |
/// Functor for \ref tree::Mapping. Does nothing with the mapping tree; used for testing. | |
struct Idem | |
{ | |
template<typename Tree> | |
LLAMA_FN_HOST_ACC_INLINE auto basicToResult(const Tree& tree) const -> Tree | |
{ | |
return tree; | |
} | |
template<typename Tree, typename TreeCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const TreeCoord& basicCoord, const Tree&) const | |
-> TreeCoord | |
{ | |
return basicCoord; | |
} | |
template<typename Tree, typename TreeCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const TreeCoord& resultCoord, const Tree&) const | |
-> TreeCoord | |
{ | |
return resultCoord; | |
} | |
}; | |
/// Functor for \ref tree::Mapping. Moves all run time parts to the leaves, creating a SoA layout. | |
struct LeafOnlyRT | |
{ | |
template<typename Tree> | |
LLAMA_FN_HOST_ACC_INLINE auto basicToResult(Tree tree) const | |
{ | |
return basicToResultImpl(tree, 1); | |
} | |
template<typename Tree, typename BasicCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const BasicCoord& basicCoord, const Tree& tree) const | |
{ | |
return basicCoordToResultCoordImpl(basicCoord, tree); | |
} | |
template<typename Tree, typename ResultCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const ResultCoord& resultCoord, const Tree& /*tree*/) | |
const -> ResultCoord | |
{ | |
return resultCoord; | |
} | |
private: | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE static auto basicToResultImpl( | |
const Node<Identifier, Type, CountType>& node, | |
std::size_t arraySize) | |
{ | |
auto children = tupleTransform( | |
node.childs, | |
[&](auto element) { return basicToResultImpl(element, LLAMA_COPY(node.count) * arraySize); }); | |
return Node<Identifier, decltype(children), boost::mp11::mp_size_t<1>>{{}, children}; | |
} | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE static auto basicToResultImpl( | |
const Leaf<Identifier, Type, CountType>& leaf, | |
std::size_t arraySize) | |
{ | |
return Leaf<Identifier, Type, std::size_t>{LLAMA_COPY(leaf.count) * arraySize}; | |
} | |
template<typename BasicCoord, typename NodeOrLeaf> | |
LLAMA_FN_HOST_ACC_INLINE static auto basicCoordToResultCoordImpl( | |
const BasicCoord& basicCoord, | |
const NodeOrLeaf& nodeOrLeaf, | |
std::size_t arraySize = 0) | |
{ | |
if constexpr(std::tuple_size_v<BasicCoord> == 1) | |
return Tuple{TreeCoordElement<BasicCoord::FirstElement::childIndex>{ | |
arraySize + LLAMA_COPY(basicCoord.first.arrayIndex)}}; | |
else | |
{ | |
const auto& branch = get<BasicCoord::FirstElement::childIndex>(nodeOrLeaf.childs); | |
auto first = TreeCoordElement<BasicCoord::FirstElement::childIndex, boost::mp11::mp_size_t<0>>{}; | |
return tupleCat( | |
Tuple{first}, | |
basicCoordToResultCoordImpl( | |
basicCoord.rest, | |
branch, | |
(arraySize + LLAMA_COPY(basicCoord.first.arrayIndex)) * LLAMA_COPY(branch.count))); | |
} | |
} | |
}; | |
namespace internal | |
{ | |
template<typename TreeCoord, typename Node> | |
LLAMA_FN_HOST_ACC_INLINE auto getNode(const Node& node) | |
{ | |
if constexpr(std::is_same_v<TreeCoord, Tuple<>>) | |
return node; | |
else | |
return getNode<typename TreeCoord::RestTuple>(get<TreeCoord::FirstElement::childIndex>(node.childs)); | |
} | |
template<typename TreeCoord, typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto changeNodeRuntime( | |
const Node<Identifier, Type, CountType>& tree, | |
std::size_t newValue) | |
{ | |
if constexpr(std::is_same_v<TreeCoord, Tuple<>>) | |
return Node<Identifier, Type>{newValue, tree.childs}; | |
else | |
{ | |
auto current = get<TreeCoord::FirstElement::childIndex>(tree.childs); | |
auto replacement = changeNodeRuntime<typename TreeCoord::RestTuple>(current, newValue); | |
auto children = tupleReplace<TreeCoord::FirstElement::childIndex>(tree.childs, replacement); | |
return Node<Identifier, decltype(children)>{tree.count, children}; | |
} | |
} | |
template<typename TreeCoord, typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto changeNodeRuntime( | |
const Leaf<Identifier, Type, CountType>& /*tree*/, | |
std::size_t newValue) | |
{ | |
return Leaf<Identifier, Type, std::size_t>{newValue}; | |
} | |
struct ChangeNodeChildsRuntimeFunctor | |
{ | |
const std::size_t newValue; | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(const Node<Identifier, Type, CountType>& element) const | |
{ | |
return Node<Identifier, Type, std::size_t>{element.count * newValue, element.childs}; | |
} | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto operator()(const Leaf<Identifier, Type, CountType>& element) const | |
{ | |
return Leaf<Identifier, Type, std::size_t>{element.count * newValue}; | |
} | |
}; | |
template<typename TreeCoord, typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto changeNodeChildsRuntime( | |
const Node<Identifier, Type, CountType>& tree, | |
std::size_t newValue) | |
{ | |
if constexpr(std::is_same_v<TreeCoord, Tuple<>>) | |
{ | |
auto children = tupleTransform(tree.childs, ChangeNodeChildsRuntimeFunctor{newValue}); | |
return Node<Identifier, decltype(children)>{tree.count, children}; | |
} | |
else | |
{ | |
auto current = get<TreeCoord::FirstElement::childIndex>(tree.childs); | |
auto replacement = changeNodeChildsRuntime<typename TreeCoord::RestTuple>(current, newValue); | |
auto children = tupleReplace<TreeCoord::FirstElement::childIndex>(tree.childs, replacement); | |
return Node<Identifier, decltype(children)>{tree.count, children}; | |
} | |
} | |
template<typename TreeCoord, typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto changeNodeChildsRuntime( | |
const Leaf<Identifier, Type, CountType>& tree, | |
std::size_t /*newValue*/) | |
{ | |
return tree; | |
} | |
} // namespace internal | |
/// Functor for \ref tree::Mapping. Moves the run time part of a node one level down in the direction of the leaves | |
/// by the given amount (run time or compile time value). | |
/// \tparam TreeCoord Tree coordinate in the mapping tree whose run time part shall be moved down one level. | |
/// \see tree::Mapping | |
template<typename TreeCoord, typename Amount = std::size_t> | |
struct MoveRTDown | |
{ | |
const Amount amount = {}; | |
template<typename Tree> | |
LLAMA_FN_HOST_ACC_INLINE auto basicToResult(const Tree& tree) const | |
{ | |
return internal::changeNodeChildsRuntime<TreeCoord>( | |
internal::changeNodeRuntime<TreeCoord>( | |
tree, | |
// NOLINTNEXTLINE(clang-analyzer-core.DivideZero) | |
(internal::getNode<TreeCoord>(tree).count + amount - 1) / amount), | |
amount); | |
} | |
template<typename Tree, typename BasicCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const BasicCoord& basicCoord, const Tree& tree) const | |
{ | |
return basicCoordToResultCoordImpl<TreeCoord>(basicCoord, tree); | |
} | |
template<typename Tree, typename ResultCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const ResultCoord& resultCoord, const Tree&) const | |
-> ResultCoord | |
{ | |
return resultCoord; | |
} | |
private: | |
template<typename InternalTreeCoord, typename BasicCoord, typename Tree> | |
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoordImpl(const BasicCoord& basicCoord, const Tree& tree) const | |
{ | |
if constexpr(std::is_same_v<InternalTreeCoord, Tuple<>>) | |
{ | |
if constexpr(std::is_same_v<BasicCoord, Tuple<>>) | |
return Tuple{}; | |
else | |
{ | |
const auto& childTree = get<BasicCoord::FirstElement::childIndex>(tree.childs); | |
const auto rt1 = basicCoord.first.arrayIndex / amount; | |
const auto rt2 | |
= basicCoord.first.arrayIndex % amount * childTree.count + basicCoord.rest.first.arrayIndex; | |
auto rt1Child = TreeCoordElement<BasicCoord::FirstElement::childIndex>{rt1}; | |
auto rt2Child = TreeCoordElement<BasicCoord::RestTuple::FirstElement::childIndex>{rt2}; | |
return tupleCat(Tuple{rt1Child}, tupleCat(Tuple{rt2Child}, pop_front(basicCoord.rest))); | |
} | |
} | |
else | |
{ | |
if constexpr(InternalTreeCoord::FirstElement::childIndex != BasicCoord::FirstElement::childIndex) | |
return basicCoord; | |
else | |
{ | |
auto rest = basicCoordToResultCoordImpl<typename InternalTreeCoord::RestTuple>( | |
pop_front(basicCoord), | |
get<BasicCoord::FirstElement::childIndex>(tree.childs)); | |
return tupleCat(Tuple{basicCoord.first}, rest); | |
} | |
} | |
} | |
}; | |
template<typename TreeCoord, std::size_t Amount> | |
using MoveRTDownFixed = MoveRTDown<TreeCoord, boost::mp11::mp_size_t<Amount>>; | |
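// A minimal sketch: moving a factor of 8 from the run time count at the tree root down towards the leaves, which | |
// creates a blocked (AoSoA-like) layout when passed to the tree Mapping below: | |
//   auto op = llama::mapping::tree::functor::MoveRTDownFixed<llama::mapping::tree::TreeCoord<>, 8>{}; | |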
} // namespace llama::mapping::tree::functor | |
// == | |
// == ./mapping/tree/Functors.hpp == | |
// ============================================================================ | |
// #include "TreeFromDimensions.hpp" // amalgamate: file already expanded | |
// ============================================================================ | |
// == ./mapping/tree/toString.hpp == | |
// == | |
// Copyright 2018 Alexander Matthes | |
// SPDX-License-Identifier: GPL-3.0-or-later | |
// #pragma once | |
// #include "TreeFromDimensions.hpp" // amalgamate: file already expanded | |
// #include <boost/core/demangle.hpp> // amalgamate: file already included | |
// #include <string> // amalgamate: file already included | |
#include <typeinfo> | |
namespace llama::mapping::tree | |
{ | |
template<typename T> | |
auto toString(T) -> std::string | |
{ | |
return "Unknown"; | |
} | |
// handles array indices | |
template<std::size_t I> | |
inline auto toString(RecordCoord<I>) -> std::string | |
{ | |
return ""; | |
} | |
inline auto toString(NoName) -> std::string | |
{ | |
return ""; | |
} | |
template<typename... Elements> | |
auto toString(Tuple<Elements...> tree) -> std::string | |
{ | |
if constexpr(sizeof...(Elements) > 1) | |
return toString(tree.first) + " , " + toString(tree.rest); | |
else | |
return toString(tree.first); | |
} | |
namespace internal | |
{ | |
inline void replace_all(std::string& str, const std::string& search, const std::string& replace) | |
{ | |
std::string::size_type i = 0; | |
while((i = str.find(search, i)) != std::string::npos) | |
{ | |
str.replace(i, search.length(), replace); | |
i += replace.length(); | |
} | |
} | |
template<typename NodeOrLeaf> | |
auto countAndIdentToString(const NodeOrLeaf& nodeOrLeaf) -> std::string | |
{ | |
auto r = std::to_string(nodeOrLeaf.count); | |
if constexpr(std::is_same_v<std::decay_t<decltype(nodeOrLeaf.count)>, std::size_t>) | |
r += "R"; // runtime | |
else | |
r += "C"; // compile time | |
r += std::string{" * "} + toString(typename NodeOrLeaf::Identifier{}); | |
return r; | |
} | |
} // namespace internal | |
template<typename Identifier, typename Type, typename CountType> | |
auto toString(const Node<Identifier, Type, CountType>& node) -> std::string | |
{ | |
return internal::countAndIdentToString(node) + "[ " + toString(node.childs) + " ]"; | |
} | |
template<typename Identifier, typename Type, typename CountType> | |
auto toString(const Leaf<Identifier, Type, CountType>& leaf) -> std::string | |
{ | |
auto raw = boost::core::demangle(typeid(Type).name()); | |
#ifdef _MSC_VER | |
internal::replace_all(raw, " __cdecl(void)", ""); | |
#endif | |
#ifdef __GNUG__ | |
internal::replace_all(raw, " ()", ""); | |
#endif | |
return internal::countAndIdentToString(leaf) + "(" + raw + ")"; | |
} | |
} // namespace llama::mapping::tree | |
// == | |
// == ./mapping/tree/toString.hpp == | |
// ============================================================================ | |
// #include <type_traits> // amalgamate: file already included | |
namespace llama::mapping::tree | |
{ | |
namespace internal | |
{ | |
template<typename Tree, typename TreeOperationList> | |
struct MergeFunctors | |
{ | |
}; | |
template<typename Tree, typename... Operations> | |
struct MergeFunctors<Tree, Tuple<Operations...>> | |
{ | |
boost::mp11::mp_first<Tuple<Operations...>> operation = {}; | |
using ResultTree = decltype(operation.basicToResult(Tree())); | |
ResultTree treeAfterOp; | |
MergeFunctors<ResultTree, boost::mp11::mp_drop_c<Tuple<Operations...>, 1>> next = {}; | |
MergeFunctors() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
MergeFunctors(const Tree& tree, const Tuple<Operations...>& treeOperationList) | |
: operation(treeOperationList.first) | |
, treeAfterOp(operation.basicToResult(tree)) | |
, next(treeAfterOp, pop_front(treeOperationList)) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto basicToResult(const Tree& tree) const | |
{ | |
if constexpr(sizeof...(Operations) > 1) | |
return next.basicToResult(treeAfterOp); | |
else if constexpr(sizeof...(Operations) == 1) | |
return operation.basicToResult(tree); | |
else | |
return tree; | |
} | |
template<typename TreeCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(const TreeCoord& basicCoord, const Tree& tree) const | |
{ | |
if constexpr(sizeof...(Operations) >= 1) | |
return next.basicCoordToResultCoord( | |
operation.basicCoordToResultCoord(basicCoord, tree), | |
treeAfterOp); | |
else | |
return basicCoord; | |
} | |
template<typename TreeCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(const TreeCoord& resultCoord, const Tree& tree) const | |
{ | |
if constexpr(sizeof...(Operations) >= 1) | |
return next.resultCoordToBasicCoord( | |
operation.resultCoordToBasicCoord(resultCoord, tree), | |
operation.basicToResult(tree)); | |
else | |
return resultCoord; | |
} | |
}; | |
template<typename Tree> | |
struct MergeFunctors<Tree, Tuple<>> | |
{ | |
MergeFunctors() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
MergeFunctors(const Tree&, const Tuple<>&) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto basicToResult(const Tree& tree) const | |
{ | |
return tree; | |
} | |
template<typename TreeCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto basicCoordToResultCoord(TreeCoord const& basicCoord, Tree const& /*tree*/) | |
const -> TreeCoord | |
{ | |
return basicCoord; | |
} | |
template<typename TreeCoord> | |
LLAMA_FN_HOST_ACC_INLINE auto resultCoordToBasicCoord(TreeCoord const& resultCoord, Tree const& /*tree*/) | |
const -> TreeCoord | |
{ | |
return resultCoord; | |
} | |
}; | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Node<Identifier, Type, CountType>& node) -> std::size_t; | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Leaf<Identifier, Type, CountType>& leaf) -> std::size_t; | |
template<typename... Children, std::size_t... Is, typename Count> | |
LLAMA_FN_HOST_ACC_INLINE auto getChildrenBlobSize( | |
const Tuple<Children...>& childs, | |
std::index_sequence<Is...> /*ii*/, | |
const Count& count) -> std::size_t | |
{ | |
return count * (getTreeBlobSize(get<Is>(childs)) + ...); | |
} | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Node<Identifier, Type, CountType>& node) -> std::size_t | |
{ | |
constexpr std::size_t childCount = boost::mp11::mp_size<std::decay_t<decltype(node.childs)>>::value; | |
return getChildrenBlobSize(node.childs, std::make_index_sequence<childCount>{}, LLAMA_COPY(node.count)); | |
} | |
template<typename Identifier, typename Type, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Leaf<Identifier, Type, CountType>& leaf) -> std::size_t | |
{ | |
return leaf.count * sizeof(Type); | |
} | |
template<typename Childs, typename CountType> | |
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobSize(const Childs& childs, const CountType& count) -> std::size_t | |
{ | |
return getTreeBlobSize(Node<NoName, Childs, CountType>{count, childs}); | |
} | |
template<std::size_t MaxPos, typename Identifier, typename Type, typename CountType, std::size_t... Is> | |
LLAMA_FN_HOST_ACC_INLINE auto sumChildrenSmallerThan( | |
const Node<Identifier, Type, CountType>& node, | |
std::index_sequence<Is...>) -> std::size_t | |
{ | |
return ((getTreeBlobSize(get<Is>(node.childs)) * (Is < MaxPos)) + ...); | |
} | |
template<typename Tree, typename... Coords> | |
LLAMA_FN_HOST_ACC_INLINE auto getTreeBlobByte(const Tree& tree, const Tuple<Coords...>& treeCoord) | |
-> std::size_t | |
{ | |
const auto firstArrayIndex = treeCoord.first.arrayIndex; | |
if constexpr(sizeof...(Coords) > 1) | |
{ | |
constexpr auto firstChildIndex = decltype(treeCoord.first.childIndex)::value; | |
return getTreeBlobSize(tree.childs, firstArrayIndex) | |
+ sumChildrenSmallerThan<firstChildIndex>( | |
tree, | |
std::make_index_sequence<std::tuple_size_v<typename Tree::ChildrenTuple>>{}) | |
+ getTreeBlobByte(get<firstChildIndex>(tree.childs), treeCoord.rest); | |
} | |
else | |
return sizeof(typename Tree::Type) * firstArrayIndex; | |
} | |
} // namespace internal | |
/// An experimental attempt to provide a general purpose description of a mapping. \ref Array and record | |
/// dimensions are represented by a compile time tree data structure. This tree is mapped into memory by means of a | |
/// breadth-first tree traversal. By specifying additional tree operations, the tree can be modified at compile | |
/// time before being mapped to memory. | |
template<typename TArrayExtents, typename TRecordDim, typename TreeOperationList> | |
struct Mapping : private TArrayExtents | |
{ | |
using ArrayExtents = TArrayExtents; | |
using ArrayIndex = typename ArrayExtents::Index; | |
using RecordDim = TRecordDim; | |
using BasicTree = TreeFromDimensions<ArrayExtents, RecordDim>; | |
// TODO(bgruber): support more than one blob | |
static constexpr std::size_t blobCount = 1; | |
using MergedFunctors = internal::MergeFunctors<BasicTree, TreeOperationList>; | |
BasicTree basicTree; | |
MergedFunctors mergedFunctors; | |
using ResultTree = decltype(mergedFunctors.basicToResult(basicTree)); | |
ResultTree resultTree; | |
Mapping() = default; | |
LLAMA_FN_HOST_ACC_INLINE | |
Mapping(ArrayExtents extents, TreeOperationList treeOperationList, RecordDim = {}) | |
: ArrayExtents(extents) | |
, basicTree(createTree<RecordDim>(extents.toArray())) | |
, mergedFunctors(basicTree, treeOperationList) | |
, resultTree(mergedFunctors.basicToResult(basicTree)) | |
{ | |
} | |
LLAMA_FN_HOST_ACC_INLINE auto extents() const -> ArrayExtents | |
{ | |
return ArrayExtents{*this}; | |
} | |
LLAMA_FN_HOST_ACC_INLINE | |
auto blobSize(std::size_t const) const -> std::size_t | |
{ | |
return internal::getTreeBlobSize(resultTree); | |
} | |
template<std::size_t... RecordCoords> | |
LLAMA_FN_HOST_ACC_INLINE auto blobNrAndOffset(ArrayIndex ai, RecordCoord<RecordCoords...> = {}) const | |
-> NrAndOffset | |
{ | |
auto const basicTreeCoord = createTreeCoord<RecordCoord<RecordCoords...>>(ai); | |
auto const resultTreeCoord = mergedFunctors.basicCoordToResultCoord(basicTreeCoord, basicTree); | |
const auto offset = internal::getTreeBlobByte(resultTree, resultTreeCoord); | |
return {0, offset}; | |
} | |
}; | |
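// A minimal sketch, reusing Vec and Extents from the AoS sketch further above: the LeafOnlyRT functor moves all | |
// run time counts to the leaves, yielding an SoA-like layout: | |
//   auto ops = llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT{}}; | |
//   auto view = llama::allocView(llama::mapping::tree::Mapping<Extents, Vec, decltype(ops)>{Extents{1024}, ops}); | |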
} // namespace llama::mapping::tree | |
// == | |
// == ./mapping/tree/Mapping.hpp == | |
// ============================================================================ | |
#ifdef __NVCC__ | |
# pragma pop | |
#endif | |
// == | |
// == ./llama.hpp == | |
// ============================================================================ |