Gist bradphelan/726908c726ff4c134791c908210064b9 — last active August 4, 2023 09:15.
Save this file to your computer or open it in GitHub Desktop.
Note: this file contains bidirectional Unicode text that may be interpreted or compiled
differently than what appears below. To review it, open the file in an editor that
reveals hidden Unicode characters.
#pragma once | |
#include <type_traits> | |
#include <iterator> | |
#include <iostream> | |
#include <fstream> | |
#include <mutex> | |
#include <stack> | |
#include <queue> | |
#include <vector> | |
#include <algorithm> | |
#include <memory> | |
#include <atomic> | |
#include <thread> | |
#include <future> | |
#include <functional> | |
#include <unordered_map> | |
#include <unordered_set> | |
#include <sstream> | |
#include <list> | |
#include <numeric> | |
#include <random> | |
#include <iomanip> | |
#include <cassert> | |
#include <cmath> | |
#include <array> | |
#include <string> | |
#include <variant> | |
#include <optional> | |
#include <cstdlib> | |
#include <cstdio> | |
#include <string> | |
#define TF_OS_LINUX 0 | |
#define TF_OS_DRAGONFLY 0 | |
#define TF_OS_FREEBSD 0 | |
#define TF_OS_NETBSD 0 | |
#define TF_OS_OPENBSD 0 | |
#define TF_OS_DARWIN 0 | |
#define TF_OS_WINDOWS 0 | |
#define TF_OS_CNK 0 | |
#define TF_OS_HURD 0 | |
#define TF_OS_SOLARIS 0 | |
#define TF_OS_UNIX 0 | |
#ifdef _WIN32 | |
#undef TF_OS_WINDOWS | |
#define TF_OS_WINDOWS 1 | |
#endif | |
#ifdef __CYGWIN__ | |
#undef TF_OS_WINDOWS | |
#define TF_OS_WINDOWS 1 | |
#endif | |
#if (defined __APPLE__ && defined __MACH__) | |
#undef TF_OS_DARWIN | |
#define TF_OS_DARWIN 1 | |
#endif | |
// in some ppc64 linux installations, only the second condition is met | |
#if (defined __linux) | |
#undef TF_OS_LINUX | |
#define TF_OS_LINUX 1 | |
#elif (defined __linux__) | |
#undef TF_OS_LINUX | |
#define TF_OS_LINUX 1 | |
#else | |
#endif | |
#if (defined __DragonFly__) | |
#undef TF_OS_DRAGONFLY | |
#define TF_OS_DRAGONFLY 1 | |
#endif | |
#if (defined __FreeBSD__) | |
#undef TF_OS_FREEBSD | |
#define TF_OS_FREEBSD 1 | |
#endif | |
#if (defined __NetBSD__) | |
#undef TF_OS_NETBSD | |
#define TF_OS_NETBSD 1 | |
#endif | |
#if (defined __OpenBSD__) | |
#undef TF_OS_OPENBSD | |
#define TF_OS_OPENBSD 1 | |
#endif | |
#if (defined __bgq__) | |
#undef TF_OS_CNK | |
#define TF_OS_CNK 1 | |
#endif | |
#if (defined __GNU__) | |
#undef TF_OS_HURD | |
#define TF_OS_HURD 1 | |
#endif | |
#if (defined __sun) | |
#undef TF_OS_SOLARIS | |
#define TF_OS_SOLARIS 1 | |
#endif | |
#if (1 != \ | |
TF_OS_LINUX + TF_OS_DRAGONFLY + TF_OS_FREEBSD + TF_OS_NETBSD + \ | |
TF_OS_OPENBSD + TF_OS_DARWIN + TF_OS_WINDOWS + TF_OS_HURD + \ | |
TF_OS_SOLARIS) | |
#define TF_OS_UNKNOWN 1 | |
#endif | |
#if TF_OS_LINUX || TF_OS_DRAGONFLY || TF_OS_FREEBSD || TF_OS_NETBSD || \ | |
TF_OS_OPENBSD || TF_OS_DARWIN || TF_OS_HURD || TF_OS_SOLARIS | |
#undef TF_OS_UNIX | |
#define TF_OS_UNIX 1 | |
#endif | |
//----------------------------------------------------------------------------- | |
// Cache line alignment | |
//----------------------------------------------------------------------------- | |
#if defined(__i386__) || defined(__x86_64__) | |
#define TF_CACHELINE_SIZE 64 | |
#elif defined(__powerpc64__) | |
// TODO | |
// This is the L1 D-cache line size of our Power7 machines. | |
// Need to check if this is appropriate for other PowerPC64 systems. | |
#define TF_CACHELINE_SIZE 128 | |
#elif defined(__arm__) | |
// Cache line sizes for ARM: These values are not strictly correct since | |
// cache line sizes depend on implementations, not architectures. | |
// There are even implementations with cache line sizes configurable | |
// at boot time. | |
#if defined(__ARM_ARCH_5T__) | |
#define TF_CACHELINE_SIZE 32 | |
#elif defined(__ARM_ARCH_7A__) | |
#define TF_CACHELINE_SIZE 64 | |
#endif | |
#endif | |
#ifndef TF_CACHELINE_SIZE | |
// A reasonable default guess. Note that overestimates tend to waste more | |
// space, while underestimates tend to waste more time. | |
#define TF_CACHELINE_SIZE 64 | |
#endif | |
//----------------------------------------------------------------------------- | |
// pause | |
//----------------------------------------------------------------------------- | |
//#if __has_include (<immintrin.h>) | |
// #define TF_HAS_MM_PAUSE 1 | |
// #include <immintrin.h> | |
//#endif | |
namespace tf { | |
// Struct: CachelineAligned | |
// Due to prefetch, we typically do 2x cacheline for the alignment. | |
template <typename T> | |
struct CachelineAligned { | |
alignas (2*TF_CACHELINE_SIZE) T data; | |
}; | |
// Function: get_env | |
inline std::string get_env(const std::string& str) { | |
#ifdef _MSC_VER | |
char *ptr = nullptr; | |
size_t len = 0; | |
if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { | |
std::string res(ptr, len); | |
std::free(ptr); | |
return res; | |
} | |
return ""; | |
#else | |
auto ptr = std::getenv(str.c_str()); | |
return ptr ? ptr : ""; | |
#endif | |
} | |
// Function: has_env | |
inline bool has_env(const std::string& str) { | |
#ifdef _MSC_VER | |
char *ptr = nullptr; | |
size_t len = 0; | |
if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { | |
std::string res(ptr, len); | |
std::free(ptr); | |
return true; | |
} | |
return false; | |
#else | |
auto ptr = std::getenv(str.c_str()); | |
return ptr ? true : false; | |
#endif | |
} | |
// Procedure: relax_cpu | |
//inline void relax_cpu() { | |
//#ifdef TF_HAS_MM_PAUSE | |
// _mm_pause(); | |
//#endif | |
//} | |
} // end of namespace tf ----------------------------------------------------- | |
namespace tf { | |
//----------------------------------------------------------------------------- | |
// Traits | |
//----------------------------------------------------------------------------- | |
//// Struct: dependent_false | |
//template <typename... T> | |
//struct dependent_false { | |
// static constexpr bool value = false; | |
//}; | |
// | |
//template <typename... T> | |
//constexpr auto dependent_false_v = dependent_false<T...>::value; | |
template<typename> inline constexpr bool dependent_false_v = false; | |
// ---------------------------------------------------------------------------- | |
// is_pod | |
//----------------------------------------------------------------------------- | |
template <typename T> | |
struct is_pod { | |
static const bool value = std::is_trivial_v<T> && | |
std::is_standard_layout_v<T>; | |
}; | |
template <typename T> | |
constexpr bool is_pod_v = is_pod<T>::value; | |
//----------------------------------------------------------------------------- | |
// NoInit | |
//----------------------------------------------------------------------------- | |
template <typename T> | |
struct NoInit { | |
//static_assert(is_pod_v<T>, "NoInit only supports POD type"); | |
// constructor without initialization | |
NoInit () noexcept {} | |
// implicit conversion T -> NoInit<T> | |
constexpr NoInit (T value) noexcept : v{value} {} | |
// implicit conversion NoInit<T> -> T | |
constexpr operator T () const noexcept { return v; } | |
T v; | |
}; | |
//----------------------------------------------------------------------------- | |
// Move-On-Copy | |
//----------------------------------------------------------------------------- | |
// Struct: MoveOnCopyWrapper | |
template <typename T> | |
struct MoC { | |
MoC(T&& rhs) : object(std::move(rhs)) {} | |
MoC(const MoC& other) : object(std::move(other.object)) {} | |
T& get() { return object; } | |
mutable T object; | |
}; | |
template <typename T> | |
auto make_moc(T&& m) { | |
return MoC<T>(std::forward<T>(m)); | |
} | |
//----------------------------------------------------------------------------- | |
// Visitors. | |
//----------------------------------------------------------------------------- | |
//// Overloadded. | |
//template <typename... Ts> | |
//struct Visitors : Ts... { | |
// using Ts::operator()... ; | |
//}; | |
// | |
//template <typename... Ts> | |
//Visitors(Ts...) -> Visitors<Ts...>; | |
// ---------------------------------------------------------------------------- | |
// std::variant | |
// ---------------------------------------------------------------------------- | |
template <typename T, typename> | |
struct get_index; | |
template <size_t I, typename... Ts> | |
struct get_index_impl {}; | |
template <size_t I, typename T, typename... Ts> | |
struct get_index_impl<I, T, T, Ts...> : std::integral_constant<size_t, I>{}; | |
template <size_t I, typename T, typename U, typename... Ts> | |
struct get_index_impl<I, T, U, Ts...> : get_index_impl<I+1, T, Ts...>{}; | |
template <typename T, typename... Ts> | |
struct get_index<T, std::variant<Ts...>> : get_index_impl<0, T, Ts...>{}; | |
template <typename T, typename... Ts> | |
constexpr auto get_index_v = get_index<T, Ts...>::value; | |
// ---------------------------------------------------------------------------- | |
// unwrap_reference | |
// ---------------------------------------------------------------------------- | |
template <class T> | |
struct unwrap_reference { using type = T; }; | |
template <class U> | |
struct unwrap_reference<std::reference_wrapper<U>> { using type = U&; }; | |
template<class T> | |
using unwrap_reference_t = typename unwrap_reference<T>::type; | |
template< class T > | |
struct unwrap_ref_decay : unwrap_reference<std::decay_t<T>> {}; | |
template<class T> | |
using unwrap_ref_decay_t = typename unwrap_ref_decay<T>::type; | |
// ---------------------------------------------------------------------------- | |
// stateful iterators | |
// ---------------------------------------------------------------------------- | |
// STL-styled iterator | |
template <typename B, typename E> | |
struct stateful_iterator { | |
using TB = std::decay_t<unwrap_ref_decay_t<B>>; | |
using TE = std::decay_t<unwrap_ref_decay_t<E>>; | |
static_assert(std::is_same_v<TB, TE>, "decayed iterator types must match"); | |
using type = TB; | |
}; | |
template <typename B, typename E> | |
using stateful_iterator_t = typename stateful_iterator<B, E>::type; | |
// raw integral index | |
template <typename B, typename E, typename S> | |
struct stateful_index { | |
using TB = std::decay_t<unwrap_ref_decay_t<B>>; | |
using TE = std::decay_t<unwrap_ref_decay_t<E>>; | |
using TS = std::decay_t<unwrap_ref_decay_t<S>>; | |
static_assert( | |
std::is_integral_v<TB>, "decayed beg index must be an integral type" | |
); | |
static_assert( | |
std::is_integral_v<TE>, "decayed end index must be an integral type" | |
); | |
static_assert( | |
std::is_integral_v<TS>, "decayed step must be an integral type" | |
); | |
static_assert( | |
std::is_same_v<TB, TE> && std::is_same_v<TE, TS>, | |
"decayed index and step types must match" | |
); | |
using type = TB; | |
}; | |
template <typename B, typename E, typename S> | |
using stateful_index_t = typename stateful_index<B, E, S>::type; | |
// ---------------------------------------------------------------------------- | |
// visit a tuple with a functor at runtime | |
// ---------------------------------------------------------------------------- | |
template <typename Func, typename Tuple, size_t N = 0> | |
void visit_tuple(Func func, Tuple& tup, size_t idx) { | |
if (N == idx) { | |
std::invoke(func, std::get<N>(tup)); | |
return; | |
} | |
if constexpr (N + 1 < std::tuple_size_v<Tuple>) { | |
return visit_tuple<Func, Tuple, N + 1>(func, tup, idx); | |
} | |
} | |
// ---------------------------------------------------------------------------- | |
// unroll loop | |
// ---------------------------------------------------------------------------- | |
// Template unrolled looping construct. | |
template<auto beg, auto end, auto step, bool valid = (beg < end)> | |
struct Unroll { | |
template<typename F> | |
static void eval(F f) { | |
f(beg); | |
Unroll<beg + step, end, step>::eval(f); | |
} | |
}; | |
template<auto beg, auto end, auto step> | |
struct Unroll<beg, end, step, false> { | |
template<typename F> | |
static void eval(F) { } | |
}; | |
template<auto beg, auto end, auto step, typename F> | |
void unroll(F f) { | |
Unroll<beg, end, step>::eval(f); | |
} | |
// ---------------------------------------------------------------------------- | |
// make types of variant unique | |
// ---------------------------------------------------------------------------- | |
template <typename T, typename... Ts> | |
struct filter_duplicates { using type = T; }; | |
template <template <typename...> class C, typename... Ts, typename U, typename... Us> | |
struct filter_duplicates<C<Ts...>, U, Us...> | |
: std::conditional_t<(std::is_same_v<U, Ts> || ...) | |
, filter_duplicates<C<Ts...>, Us...> | |
, filter_duplicates<C<Ts..., U>, Us...>> {}; | |
template <typename T> | |
struct unique_variant; | |
template <typename... Ts> | |
struct unique_variant<std::variant<Ts...>> : filter_duplicates<std::variant<>, Ts...> {}; | |
template <typename T> | |
using unique_variant_t = typename unique_variant<T>::type; | |
// ---------------------------------------------------------------------------- | |
// check if it is default compare | |
// ---------------------------------------------------------------------------- | |
template <typename T> struct is_std_compare : std::false_type { }; | |
template <typename T> struct is_std_compare<std::less<T>> : std::true_type { }; | |
template <typename T> struct is_std_compare<std::greater<T>> : std::true_type { }; | |
template <typename T> | |
constexpr static bool is_std_compare_v = is_std_compare<T>::value; | |
// ---------------------------------------------------------------------------- | |
// check if all types are the same | |
// ---------------------------------------------------------------------------- | |
template<bool...> | |
struct bool_pack; | |
template<bool... bs> | |
using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>; | |
template <typename T, typename... Ts> | |
using all_same = all_true<std::is_same_v<T, Ts>...>; | |
template <typename T, typename... Ts> | |
constexpr bool all_same_v = all_same<T, Ts...>::value; | |
} // end of namespace tf. ---------------------------------------------------- | |
#include <cstddef> | |
#include <type_traits> | |
namespace tf { | |
template <typename T> | |
constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, bool> | |
is_range_invalid(T beg, T end, T step) { | |
return ((step == 0 && beg != end) || | |
(beg < end && step <= 0) || | |
(beg > end && step >= 0)); | |
} | |
template <typename T> | |
constexpr std::enable_if_t<std::is_integral<std::decay_t<T>>::value, size_t> | |
distance(T beg, T end, T step) { | |
return (end - beg + step + (step > 0 ? -1 : 1)) / step; | |
} | |
} // end of namespace tf ----------------------------------------------------- | |
// 2020/03/13 - modified by Tsung-Wei Huang | |
// - fixed bug in aligning memory | |
// | |
// 2020/02/02 - modified by Tsung-Wei Huang | |
// - new implementation motivated by Hoard | |
// | |
// 2019/07/10 - modified by Tsung-Wei Huang | |
// - replace raw pointer with smart pointer | |
// | |
// 2019/06/13 - created by Tsung-Wei Huang | |
// - implemented an object pool class | |
#include <thread> | |
#include <atomic> | |
#include <mutex> | |
#include <vector> | |
#include <cassert> | |
#include <cstddef> | |
namespace tf { | |
#define TF_ENABLE_POOLABLE_ON_THIS \ | |
template <typename T, size_t S> friend class ObjectPool; \ | |
void* _object_pool_block | |
// Class: ObjectPool | |
// | |
// The class implements an efficient thread-safe object pool motivated | |
// by the Hoard memory allocator algorithm. | |
// Different from the normal memory allocator, object pool allocates | |
// only one object at a time. | |
// | |
// Internall, we use the following variables to maintain blocks and heaps: | |
// X: size in byte of a item slot | |
// M: number of items per block | |
// F: emptiness threshold | |
// B: number of bins per local heap (bin[B-1] is the full list) | |
// W: number of items per bin | |
// K: shrinkness constant | |
// | |
// Example scenario 1: | |
// M = 30 | |
// F = 4 | |
// W = (30+4-1)/4 = 8 | |
// | |
// b0: 0, 1, 2, 3, 4, 5, 6, 7 | |
// b1: 8, 9, 10, 11, 12, 13, 14, 15 | |
// b2: 16, 17, 18, 19, 20, 21, 22, 23 | |
// b3: 24, 25, 26, 27, 28, 29 | |
// b4: 30 (anything equal to M) | |
// | |
// Example scenario 2: | |
// M = 32 | |
// F = 4 | |
// W = (32+4-1)/4 = 8 | |
// b0: 0, 1, 2, 3, 4, 5, 6, 7 | |
// b1: 8, 9, 10, 11, 12, 13, 14, 15 | |
// b2: 16, 17, 18, 19, 20, 21, 22, 23 | |
// b3: 24, 25, 26, 27, 28, 29, 30, 31 | |
// b4: 32 (anything equal to M) | |
// | |
template <typename T, size_t S = 65536> | |
class ObjectPool { | |
// the data column must be sufficient to hold the pointer in freelist | |
constexpr static size_t X = (std::max)(sizeof(T*), sizeof(T)); | |
//constexpr static size_t X = sizeof(long double) + std::max(sizeof(T*), sizeof(T)); | |
//constexpr static size_t M = (S - offsetof(Block, data)) / X; | |
constexpr static size_t M = S / X; | |
constexpr static size_t F = 4; | |
constexpr static size_t B = F + 1; | |
constexpr static size_t W = (M + F - 1) / F; | |
constexpr static size_t K = 4; | |
static_assert( | |
S && (!(S & (S-1))), "block size S must be a power of two" | |
); | |
static_assert( | |
M >= 128, "block size S must be larger enough to pool at least 128 objects" | |
); | |
struct Blocklist { | |
Blocklist* prev; | |
Blocklist* next; | |
}; | |
struct GlobalHeap { | |
std::mutex mutex; | |
Blocklist list; | |
}; | |
struct LocalHeap { | |
std::mutex mutex; | |
Blocklist lists[B]; | |
size_t u {0}; | |
size_t a {0}; | |
}; | |
struct Block { | |
std::atomic<LocalHeap*> heap; | |
Blocklist list_node; | |
size_t i; | |
size_t u; | |
T* top; | |
// long double padding; | |
char data[S]; | |
}; | |
public: | |
/** | |
@brief constructs an object pool from a number of anticipated threads | |
*/ | |
explicit ObjectPool(unsigned = std::thread::hardware_concurrency()); | |
/** | |
@brief destructs the object pool | |
*/ | |
~ObjectPool(); | |
/** | |
@brief acquires a pointer to a object constructed from a given argument list | |
*/ | |
template <typename... ArgsT> | |
T* animate(ArgsT&&... args); | |
/** | |
@brief recycles a object pointed by @c ptr and destroys it | |
*/ | |
void recycle(T* ptr); | |
size_t num_bins_per_local_heap() const; | |
size_t num_objects_per_bin() const; | |
size_t num_objects_per_block() const; | |
size_t num_available_objects() const; | |
size_t num_allocated_objects() const; | |
size_t capacity() const; | |
size_t num_local_heaps() const; | |
size_t num_global_heaps() const; | |
size_t num_heaps() const; | |
float emptiness_threshold() const; | |
private: | |
const size_t _lheap_mask; | |
GlobalHeap _gheap; | |
std::vector<LocalHeap> _lheaps; | |
LocalHeap& _this_heap(); | |
constexpr unsigned _next_pow2(unsigned n) const; | |
template <class P, class Q> | |
constexpr size_t _offset_in_class(const Q P::*member) const; | |
template <class P, class Q> | |
constexpr P* _parent_class_of(Q*, const Q P::*member); | |
template <class P, class Q> | |
constexpr P* _parent_class_of(const Q*, const Q P::*member) const; | |
constexpr Block* _block_of(Blocklist*); | |
constexpr Block* _block_of(const Blocklist*) const; | |
size_t _bin(size_t) const; | |
T* _allocate(Block*); | |
void _deallocate(Block*, T*); | |
void _blocklist_init_head(Blocklist*); | |
void _blocklist_add_impl(Blocklist*, Blocklist*, Blocklist*); | |
void _blocklist_push_front(Blocklist*, Blocklist*); | |
void _blocklist_push_back(Blocklist*, Blocklist*); | |
void _blocklist_del_impl(Blocklist*, Blocklist*); | |
void _blocklist_del(Blocklist*); | |
void _blocklist_replace(Blocklist*, Blocklist*); | |
void _blocklist_move_front(Blocklist*, Blocklist*); | |
void _blocklist_move_back(Blocklist*, Blocklist*); | |
bool _blocklist_is_first(const Blocklist*, const Blocklist*); | |
bool _blocklist_is_last(const Blocklist*, const Blocklist*); | |
bool _blocklist_is_empty(const Blocklist*); | |
bool _blocklist_is_singular(const Blocklist*); | |
template <typename C> | |
void _for_each_block_safe(Blocklist*, C&&); | |
template <typename C> | |
void _for_each_block(Blocklist*, C&&); | |
}; | |
// ---------------------------------------------------------------------------- | |
// ObjectPool definition | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename T, size_t S> | |
ObjectPool<T, S>::ObjectPool(unsigned t) : | |
//_heap_mask {(_next_pow2(t) << 1) - 1u}, | |
//_heap_mask { _next_pow2(t<<1) - 1u }, | |
//_heap_mask {(t << 1) - 1}, | |
_lheap_mask { _next_pow2((t+1) << 1) - 1 }, | |
_lheaps { _lheap_mask + 1 } { | |
_blocklist_init_head(&_gheap.list); | |
for(auto& h : _lheaps) { | |
for(size_t i=0; i<B; ++i) { | |
_blocklist_init_head(&h.lists[i]); | |
} | |
} | |
} | |
// Destructor | |
template <typename T, size_t S> | |
ObjectPool<T, S>::~ObjectPool() { | |
// clear local heaps | |
for(auto& h : _lheaps) { | |
for(size_t i=0; i<B; ++i) { | |
_for_each_block_safe(&h.lists[i], [] (Block* b) { | |
//std::free(b); | |
delete b; | |
}); | |
} | |
} | |
// clear global heap | |
_for_each_block_safe(&_gheap.list, [] (Block* b) { | |
//std::free(b); | |
delete b; | |
}); | |
} | |
// Function: num_bins_per_local_heap | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_bins_per_local_heap() const { | |
return B; | |
} | |
// Function: num_objects_per_bin | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_objects_per_bin() const { | |
return W; | |
} | |
// Function: num_objects_per_block | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_objects_per_block() const { | |
return M; | |
} | |
// Function: emptiness_threshold | |
template <typename T, size_t S> | |
float ObjectPool<T, S>::emptiness_threshold() const { | |
return 1.0f/F; | |
} | |
// Function: num_global_heaps | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_global_heaps() const { | |
return 1; | |
} | |
// Function: num_lheaps | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_local_heaps() const { | |
return _lheaps.size(); | |
} | |
// Function: num_heaps | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_heaps() const { | |
return _lheaps.size() + 1; | |
} | |
// Function: capacity | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::capacity() const { | |
size_t n = 0; | |
// global heap | |
for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { | |
n += M; | |
}; | |
// local heap | |
for(auto& h : _lheaps) { | |
n += h.a; | |
} | |
return n; | |
} | |
// Function: num_available_objects | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_available_objects() const { | |
size_t n = 0; | |
// global heap | |
for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { | |
n += (M - _block_of(p)->u); | |
}; | |
// local heap | |
for(auto& h : _lheaps) { | |
n += (h.a - h.u); | |
} | |
return n; | |
} | |
// Function: num_allocated_objects | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::num_allocated_objects() const { | |
size_t n = 0; | |
// global heap | |
for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { | |
n += _block_of(p)->u; | |
}; | |
// local heap | |
for(auto& h : _lheaps) { | |
n += h.u; | |
} | |
return n; | |
} | |
// Function: _bin | |
template <typename T, size_t S> | |
size_t ObjectPool<T, S>::_bin(size_t u) const { | |
return u == M ? F : u/W; | |
} | |
// Function: _offset_in_class | |
template <typename T, size_t S> | |
template <class P, class Q> | |
constexpr size_t ObjectPool<T, S>::_offset_in_class( | |
const Q P::*member) const { | |
return (size_t) &( reinterpret_cast<P*>(0)->*member); | |
} | |
// C macro: parent_class_of(list_pointer, Block, list) | |
// C++: parent_class_of(list_pointer, &Block::list) | |
template <typename T, size_t S> | |
template <class P, class Q> | |
constexpr P* ObjectPool<T, S>::_parent_class_of( | |
Q* ptr, const Q P::*member | |
) { | |
return (P*)( (char*)ptr - _offset_in_class(member)); | |
} | |
// Function: _parent_class_of | |
template <typename T, size_t S> | |
template <class P, class Q> | |
constexpr P* ObjectPool<T, S>::_parent_class_of( | |
const Q* ptr, const Q P::*member | |
) const { | |
return (P*)( (char*)ptr - _offset_in_class(member)); | |
} | |
// Function: _block_of | |
template <typename T, size_t S> | |
constexpr typename ObjectPool<T, S>::Block* | |
ObjectPool<T, S>::_block_of(Blocklist* list) { | |
return _parent_class_of(list, &Block::list_node); | |
} | |
// Function: _block_of | |
template <typename T, size_t S> | |
constexpr typename ObjectPool<T, S>::Block* | |
ObjectPool<T, S>::_block_of(const Blocklist* list) const { | |
return _parent_class_of(list, &Block::list_node); | |
} | |
// Procedure: initialize a list head | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_init_head(Blocklist *list) { | |
list->next = list; | |
list->prev = list; | |
} | |
// Procedure: _blocklist_add_impl | |
// Insert a new entry between two known consecutive entries. | |
// | |
// This is only for internal list manipulation where we know | |
// the prev/next entries already! | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_add_impl( | |
Blocklist *curr, Blocklist *prev, Blocklist *next | |
) { | |
next->prev = curr; | |
curr->next = next; | |
curr->prev = prev; | |
prev->next = curr; | |
} | |
// list_push_front - add a new entry | |
// @curr: curr entry to be added | |
// @head: list head to add it after | |
// | |
// Insert a new entry after the specified head. | |
// This is good for implementing stacks. | |
// | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_push_front( | |
Blocklist *curr, Blocklist *head | |
) { | |
_blocklist_add_impl(curr, head, head->next); | |
} | |
// list_add_tail - add a new entry | |
// @curr: curr entry to be added | |
// @head: list head to add it before | |
// | |
// Insert a new entry before the specified head. | |
// This is useful for implementing queues. | |
// | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_push_back( | |
Blocklist *curr, Blocklist *head | |
) { | |
_blocklist_add_impl(curr, head->prev, head); | |
} | |
// Delete a list entry by making the prev/next entries | |
// point to each other. | |
// | |
// This is only for internal list manipulation where we know | |
// the prev/next entries already! | |
// | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_del_impl( | |
Blocklist * prev, Blocklist * next | |
) { | |
next->prev = prev; | |
prev->next = next; | |
} | |
// _blocklist_del - deletes entry from list. | |
// @entry: the element to delete from the list. | |
// Note: list_empty() on entry does not return true after this, the entry is | |
// in an undefined state. | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_del(Blocklist *entry) { | |
_blocklist_del_impl(entry->prev, entry->next); | |
entry->next = nullptr; | |
entry->prev = nullptr; | |
} | |
// list_replace - replace old entry by new one | |
// @old : the element to be replaced | |
// @curr : the new element to insert | |
// | |
// If @old was empty, it will be overwritten. | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_replace( | |
Blocklist *old, Blocklist *curr | |
) { | |
curr->next = old->next; | |
curr->next->prev = curr; | |
curr->prev = old->prev; | |
curr->prev->next = curr; | |
} | |
// list_move - delete from one list and add as another's head | |
// @list: the entry to move | |
// @head: the head that will precede our entry | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_move_front( | |
Blocklist *list, Blocklist *head | |
) { | |
_blocklist_del_impl(list->prev, list->next); | |
_blocklist_push_front(list, head); | |
} | |
// list_move_tail - delete from one list and add as another's tail | |
// @list: the entry to move | |
// @head: the head that will follow our entry | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_blocklist_move_back( | |
Blocklist *list, Blocklist *head | |
) { | |
_blocklist_del_impl(list->prev, list->next); | |
_blocklist_push_back(list, head); | |
} | |
// list_is_first - tests whether @list is the last entry in list @head | |
// @list: the entry to test | |
// @head: the head of the list | |
template <typename T, size_t S> | |
bool ObjectPool<T, S>::_blocklist_is_first( | |
const Blocklist *list, const Blocklist *head | |
) { | |
return list->prev == head; | |
} | |
// list_is_last - tests whether @list is the last entry in list @head | |
// @list: the entry to test | |
// @head: the head of the list | |
template <typename T, size_t S> | |
bool ObjectPool<T, S>::_blocklist_is_last( | |
const Blocklist *list, const Blocklist *head | |
) { | |
return list->next == head; | |
} | |
// list_empty - tests whether a list is empty | |
// @head: the list to test. | |
template <typename T, size_t S> | |
bool ObjectPool<T, S>::_blocklist_is_empty(const Blocklist *head) { | |
return head->next == head; | |
} | |
// list_is_singular - tests whether a list has just one entry. | |
// @head: the list to test. | |
template <typename T, size_t S> | |
bool ObjectPool<T, S>::_blocklist_is_singular( | |
const Blocklist *head | |
) { | |
return !_blocklist_is_empty(head) && (head->next == head->prev); | |
} | |
// Procedure: _for_each_block | |
template <typename T, size_t S> | |
template <typename C> | |
void ObjectPool<T, S>::_for_each_block(Blocklist* head, C&& c) { | |
Blocklist* p; | |
for(p=head->next; p!=head; p=p->next) { | |
c(_block_of(p)); | |
} | |
} | |
// Procedure: _for_each_block_safe | |
// Iterate each item of a list - safe to free | |
template <typename T, size_t S> | |
template <typename C> | |
void ObjectPool<T, S>::_for_each_block_safe(Blocklist* head, C&& c) { | |
Blocklist* p; | |
Blocklist* t; | |
for(p=head->next, t=p->next; p!=head; p=t, t=p->next) { | |
c(_block_of(p)); | |
} | |
} | |
// Function: _allocate | |
// allocate a spot from the block | |
template <typename T, size_t S> | |
T* ObjectPool<T, S>::_allocate(Block* s) { | |
if(s->top == nullptr) { | |
return reinterpret_cast<T*>(s->data + s->i++ * X); | |
} | |
else { | |
T* retval = s->top; | |
s->top = *(reinterpret_cast<T**>(s->top)); | |
return retval; | |
} | |
} | |
// Procedure: _deallocate | |
template <typename T, size_t S> | |
void ObjectPool<T, S>::_deallocate(Block* s, T* ptr) { | |
*(reinterpret_cast<T**>(ptr)) = s->top; | |
s->top = ptr; | |
} | |
// Function: allocate | |
template <typename T, size_t S> | |
template <typename... ArgsT> | |
T* ObjectPool<T, S>::animate(ArgsT&&... args) { | |
//std::cout << "construct a new item\n"; | |
// my logically mapped heap | |
LocalHeap& h = _this_heap(); | |
Block* s {nullptr}; | |
h.mutex.lock(); | |
// scan the list of superblocks from the most full to the least full | |
int f = static_cast<int>(F-1); | |
for(; f>=0; f--) { | |
if(!_blocklist_is_empty(&h.lists[f])) { | |
s = _block_of(h.lists[f].next); | |
break; | |
} | |
} | |
// no superblock found | |
if(f == -1) { | |
// check heap 0 for a superblock | |
_gheap.mutex.lock(); | |
if(!_blocklist_is_empty(&_gheap.list)) { | |
s = _block_of(_gheap.list.next); | |
//printf("get a superblock from global heap %lu\n", s->u); | |
assert(s->u < M && s->heap == nullptr); | |
f = static_cast<int>(_bin(s->u + 1)); | |
_blocklist_move_front(&s->list_node, &h.lists[f]); | |
s->heap = &h; // must be within the global heap lock | |
_gheap.mutex.unlock(); | |
h.u = h.u + s->u; | |
h.a = h.a + M; | |
} | |
// create a new block | |
else { | |
//printf("create a new superblock\n"); | |
_gheap.mutex.unlock(); | |
f = 0; | |
//s = static_cast<Block*>(std::malloc(sizeof(Block))); | |
s = new Block(); | |
if(s == nullptr) { | |
throw std::bad_alloc(); | |
} | |
s->heap = &h; | |
s->i = 0; | |
s->u = 0; | |
s->top = nullptr; | |
_blocklist_push_front(&s->list_node, &h.lists[f]); | |
h.a = h.a + M; | |
} | |
} | |
// the superblock must have at least one space | |
//assert(s->u < M); | |
//printf("%lu %lu %lu\n", h.u, h.a, s->u); | |
//assert(h.u < h.a); | |
h.u = h.u + 1; | |
s->u = s->u + 1; | |
// take one item from the superblock | |
T* mem = _allocate(s); | |
int b = static_cast<int>(_bin(s->u)); | |
if(b != f) { | |
//printf("move superblock from list[%d] to list[%d]\n", f, b); | |
_blocklist_move_front(&s->list_node, &h.lists[b]); | |
} | |
//std::cout << "s.i " << s->i << '\n' | |
// << "s.u " << s->u << '\n' | |
// << "h.u " << h.u << '\n' | |
// << "h.a " << h.a << '\n'; | |
h.mutex.unlock(); | |
//printf("allocate %p (s=%p)\n", mem, s); | |
new (mem) T(std::forward<ArgsT>(args)...); | |
mem->_object_pool_block = s; | |
return mem; | |
} | |
// Function: recycle
// Destroys the object and returns its slot to the owning superblock,
// possibly migrating a mostly-empty superblock back to the global heap.
template <typename T, size_t S>
void ObjectPool<T, S>::recycle(T* mem) {
  //Block* s = *reinterpret_cast<Block**>(
  //  reinterpret_cast<char*>(mem) - sizeof(Block**)
  //);
  //Block* s= *(reinterpret_cast<Block**>(mem) - O); // (mem) - 1
  // the owning superblock was recorded at allocation time (see animate)
  Block* s = static_cast<Block*>(mem->_object_pool_block);
  mem->~T();
  //printf("deallocate %p (s=%p) M=%lu W=%lu X=%lu\n", mem, s, M, W, X);
  // here we need a loop because when we lock the heap,
  // other threads may have removed the superblock to another heap
  bool sync = false;
  do {
    LocalHeap* h = s->heap.load(std::memory_order_relaxed);
    // the block is in global heap
    if(h == nullptr) {
      std::lock_guard<std::mutex> glock(_gheap.mutex);
      // recheck under the lock that the block still belongs to the global heap
      if(s->heap == h) {
        sync = true;
        _deallocate(s, mem);
        s->u = s->u - 1;
      }
    }
    else {
      std::lock_guard<std::mutex> llock(h->mutex);
      // recheck under the lock that the block still belongs to heap h
      if(s->heap == h) {
        sync = true;
        // deallocate the item from the superblock
        size_t f = _bin(s->u);
        _deallocate(s, mem);
        s->u = s->u - 1;
        h->u = h->u - 1;
        size_t b = _bin(s->u);
        if(b != f) {
          //printf("move superblock from list[%d] to list[%d]\n", f, b);
          // usage count crossed a bin boundary; rebin the superblock
          _blocklist_move_front(&s->list_node, &h->lists[b]);
        }
        // transfer a mostly-empty superblock to global heap when the local
        // heap is sufficiently under-utilized (emptiness thresholds K and F)
        if((h->u + K*M < h->a) && (h->u < ((F-1) * h->a / F))) {
          for(size_t i=0; i<F; i++) {
            if(!_blocklist_is_empty(&h->lists[i])) {
              Block* x = _block_of(h->lists[i].next);
              //printf("transfer a block (x.u=%lu/x.i=%lu) to the global heap\n", x->u, x->i);
              assert(h->u > x->u && h->a > M);
              h->u = h->u - x->u;
              h->a = h->a - M;
              // detach from the local heap before taking the global lock
              x->heap = nullptr;
              std::lock_guard<std::mutex> glock(_gheap.mutex);
              _blocklist_move_front(&x->list_node, &_gheap.list);
              break;
            }
          }
        }
      }
    }
  } while(!sync);
  //std::cout << "s.i " << s->i << '\n'
  //          << "s.u " << s->u << '\n';
}
// Function: _this_heap | |
template <typename T, size_t S> | |
typename ObjectPool<T, S>::LocalHeap& | |
ObjectPool<T, S>::_this_heap() { | |
// here we don't use thread local since object pool might be | |
// created and destroyed multiple times | |
//thread_local auto hv = std::hash<std::thread::id>()(std::this_thread::get_id()); | |
//return _lheaps[hv & _lheap_mask]; | |
return _lheaps[ | |
std::hash<std::thread::id>()(std::this_thread::get_id()) & _lheap_mask | |
]; | |
} | |
// Function: _next_pow2 | |
template <typename T, size_t S> | |
constexpr unsigned ObjectPool<T, S>::_next_pow2(unsigned n) const { | |
if(n == 0) return 1; | |
n--; | |
n |= n >> 1; | |
n |= n >> 2; | |
n |= n >> 4; | |
n |= n >> 8; | |
n |= n >> 16; | |
n++; | |
return n; | |
} | |
} // end namespace tf -------------------------------------------------------- | |
#include <atomic> | |
namespace tf { | |
// Rounds the given 64-bit unsigned integer up to the nearest power of two
// that is >= x (x == 0 yields 1).
template <typename T, std::enable_if_t<
  (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 8) , void
>* = nullptr>
constexpr T next_pow2(T x) {
  if(x == 0) {
    return 1;
  }
  --x;
  // propagate the most significant set bit into all lower positions
  for(unsigned shift = 1; shift < 64; shift <<= 1) {
    x |= (x >> shift);
  }
  return x + 1;
}
// Rounds the given 32-bit unsigned integer up to the nearest power of two
// that is >= x (x == 0 yields 1).
template <typename T, std::enable_if_t<
  (std::is_unsigned_v<std::decay_t<T>> && sizeof(T) == 4), void
>* = nullptr>
constexpr T next_pow2(T x) {
  if(x == 0) {
    return 1;
  }
  --x;
  // propagate the most significant set bit into all lower positions
  for(unsigned shift = 1; shift < 32; shift <<= 1) {
    x |= (x >> shift);
  }
  return x + 1;
}
// Checks whether the given integer is a power of two.
template <typename T, std::enable_if_t<
  std::is_integral_v<std::decay_t<T>>, void>* = nullptr
>
constexpr bool is_pow2(const T& x) {
  // a power of two has exactly one bit set
  return (x != 0) && ((x & (x - 1)) == 0);
}
//// finds the ceil of x divided by b | |
//template <typename T, std::enable_if_t< | |
// std::is_integral_v<std::decay_t<T>>, void>* = nullptr | |
//> | |
//constexpr T ceil(const T& x, const T& y) { | |
// //return (x + y - 1) / y; | |
// return (x-1) / y + 1; | |
//} | |
/**
@brief returns floor(log2(n)); assumes n > 0
*/
template<typename T>
constexpr int log2(T n) {
  int result = 0;
  // count how many times n can be halved before reaching 1
  for(; n > 1; n >>= 1) {
    ++result;
  }
  return result;
}
/**
@brief finds the iterator whose dereferenced value is the median of the
three given iterators, using the given comparator
*/
template <typename RandItr, typename C>
RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) {
  if(cmp(*l, *m)) {
    if(cmp(*m, *r)) {
      return m;
    }
    return cmp(*l, *r) ? r : l;
  }
  if(cmp(*r, *m)) {
    return m;
  }
  return cmp(*r, *l) ? r : l;
}
/**
@brief finds the pseudo median of a range of items using nine evenly
spread samples
*/
template <typename RandItr, typename C>
RandItr pseudo_median_of_nine(RandItr beg, RandItr end, C cmp) {
  const size_t len = std::distance(beg, end);
  const size_t step = len >> 3;
  // median of the first, middle, and last thirds, then the median of those
  RandItr m1 = median_of_three(beg, beg + step, beg + (step * 2), cmp);
  RandItr m2 = median_of_three(
    beg + (step * 3), beg + (step * 4), beg + (step * 5), cmp
  );
  RandItr m3 = median_of_three(beg + (step * 6), beg + (step * 7), end - 1, cmp);
  return median_of_three(m1, m2, m3, cmp);
}
/**
@brief orders two dereferenced iterators so the smaller value (under cmp)
comes first
*/
template<typename Iter, typename Compare>
void sort2(Iter x, Iter y, Compare cmp) {
  if(cmp(*y, *x)) {
    std::iter_swap(x, y);
  }
}
/**
@brief sorts three dereferenced iterators into ascending order under the
given comparison function (a three-element sorting network)
*/
template<typename Iter, typename Compare>
void sort3(Iter a, Iter b, Iter c, Compare comp) {
  if(comp(*b, *a)) std::iter_swap(a, b);
  if(comp(*c, *b)) std::iter_swap(b, c);
  if(comp(*b, *a)) std::iter_swap(a, b);
}
/**
@brief generates a program-wide unique id of the given integral type
(thread-safe); ids start at 0 and increase by one per call
*/
template <typename T, std::enable_if_t<std::is_integral_v<T>, void>* = nullptr>
T unique_id() {
  // one shared counter per instantiated integral type T
  static std::atomic<T> next_id{0};
  return next_id.fetch_add(1, std::memory_order_relaxed);
}
/**
@brief atomically raises v to max_v if max_v is larger (lock-free CAS loop)
*/
template <typename T>
inline void atomic_max(std::atomic<T>& v, const T& max_v) noexcept {
  T observed = v.load(std::memory_order_relaxed);
  // retry while the stored value is still smaller than max_v;
  // compare_exchange_weak refreshes 'observed' on failure
  while(observed < max_v) {
    if(v.compare_exchange_weak(observed, max_v, std::memory_order_relaxed,
                               std::memory_order_relaxed)) {
      break;
    }
  }
}
/**
@brief atomically lowers v to min_v if min_v is smaller (lock-free CAS loop)
*/
template <typename T>
inline void atomic_min(std::atomic<T>& v, const T& min_v) noexcept {
  T observed = v.load(std::memory_order_relaxed);
  // retry while the stored value is still larger than min_v;
  // compare_exchange_weak refreshes 'observed' on failure
  while(observed > min_v) {
    if(v.compare_exchange_weak(observed, min_v, std::memory_order_relaxed,
                               std::memory_order_relaxed)) {
      break;
    }
  }
}
} // end of namespace tf ----------------------------------------------------- | |
// small vector modified from llvm | |
#include <algorithm> | |
#include <cassert> | |
#include <cstddef> | |
#include <cstdlib> | |
#include <cstring> | |
#include <initializer_list> | |
#include <iterator> | |
#include <memory> | |
#if defined(__GNUC__) | |
#define TF_LIKELY(x) (__builtin_expect((x), 1)) | |
#define TF_UNLIKELY(x) (__builtin_expect((x), 0)) | |
#else | |
#define TF_LIKELY(x) (x) | |
#define TF_UNLIKELY(x) (x) | |
#endif | |
/** | |
@file small_vector.hpp | |
@brief small vector include file | |
*/ | |
namespace tf { namespace detail { | |
/**
@private
@brief NextCapacity - returns the next power of two (in 64 bits) that is
strictly greater than A; returns zero on overflow.
This function assumes A to be positive.
*/
inline uint64_t NextCapacity(uint64_t A) {
  // smear the top set bit into every lower position ...
  for(unsigned shift = 1; shift < 64; shift <<= 1) {
    A |= (A >> shift);
  }
  // ... so A+1 is the next power of two (wraps to 0 when the top bit is set)
  return A + 1;
}
}} // end of namespace tf::detail -------------------------------------------- | |
namespace tf { | |
/**
@private
@brief trait that is true when T is POD-like (standard layout and trivial),
enabling the memcpy-based SmallVector specialization
*/
template <typename T>
struct IsPod : std::bool_constant<std::is_standard_layout_v<T> &&
                                  std::is_trivial_v<T>> {};
/**
@private
@brief Type-erased base of SmallVector: holds the begin/end/capacity
pointers and the byte-level growth routine shared by all POD instantiations.
*/
class SmallVectorBase {
protected:
  // [BeginX, EndX) is the in-use region; [BeginX, CapacityX) is the
  // allocated region. All three may point into the inline buffer that the
  // derived class lays out directly after this object.
  void *BeginX, *EndX, *CapacityX;
protected:
  SmallVectorBase(void *FirstEl, size_t Size)
    : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
  /// This is an implementation of the grow() method which only works
  /// on POD-like data types and is out of line to reduce code duplication.
  /// FirstEl is the address of the inline buffer; TSize is sizeof(T).
  void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize){
    size_t CurSizeBytes = size_in_bytes();
    size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
    if (NewCapacityInBytes < MinSizeInBytes) {
      NewCapacityInBytes = MinSizeInBytes;
    }
    void *NewElts;
    if (BeginX == FirstEl) {
      // growing out of the inline buffer: switch to heap storage
      NewElts = std::malloc(NewCapacityInBytes);
      // Copy the elements over. No need to run dtors on PODs.
      memcpy(NewElts, this->BeginX, CurSizeBytes);
    } else {
      // If this wasn't grown from the inline copy, grow the allocated space.
      NewElts = realloc(this->BeginX, NewCapacityInBytes);
    }
    // NOTE(review): malloc/realloc results are used unchecked; on allocation
    // failure this dereferences nullptr (the original assert is disabled).
    //assert(NewElts && "Out of memory");
    this->EndX = (char*)NewElts+CurSizeBytes;
    this->BeginX = NewElts;
    this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
  }
public:
  /// This returns size()*sizeof(T).
  size_t size_in_bytes() const {
    return size_t((char*)EndX - (char*)BeginX);
  }
  /// capacity_in_bytes - This returns capacity()*sizeof(T).
  size_t capacity_in_bytes() const {
    return size_t((char*)CapacityX - (char*)BeginX);
  }
  bool empty() const { return BeginX == EndX; }
};
/** | |
@private | |
*/ | |
template <typename T, unsigned N> struct SmallVectorStorage; | |
/**
@private
@brief SmallVector machinery independent of whether T is POD-like:
the inline first element, iterator accessors, and size/capacity queries.
*/
template <typename T, typename = void>
class SmallVectorTemplateCommon : public SmallVectorBase {
private:
  template <typename, unsigned> friend struct SmallVectorStorage;
  // Raw, suitably aligned storage for one X; never implicitly constructed.
  template <typename X>
  struct AlignedUnionType {
    alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))];
  };
  // Allocate raw space for N elements of type T. If T has a ctor or dtor, we
  // don't want it to be automatically run, so we need to represent the space as
  // something else. Use an array of char of sufficient alignment.
  // deprecated in c++23
  //typedef typename std::aligned_union<1, T>::type U;
  typedef AlignedUnionType<T> U;
  U FirstEl;
  // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
protected:
  SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {}
  // Forward to the byte-level grower, supplying the inline-buffer address.
  void grow_pod(size_t MinSizeInBytes, size_t TSize) {
    SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
  }
  /// Return true if this is a smallvector which has not had dynamic
  /// memory allocated for it.
  bool isSmall() const {
    return BeginX == static_cast<const void*>(&FirstEl);
  }
  /// Put this vector in a state of being small.
  void resetToSmall() {
    BeginX = EndX = CapacityX = &FirstEl;
  }
  void setEnd(T *P) { this->EndX = P; }
public:
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  typedef T value_type;
  typedef T *iterator;
  typedef const T *const_iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  typedef T &reference;
  typedef const T &const_reference;
  typedef T *pointer;
  typedef const T *const_pointer;
  // forward iterator creation methods.
  inline iterator begin() { return (iterator)this->BeginX; }
  inline const_iterator begin() const { return (const_iterator)this->BeginX; }
  inline iterator end() { return (iterator)this->EndX; }
  inline const_iterator end() const { return (const_iterator)this->EndX; }
protected:
  iterator capacity_ptr() { return (iterator)this->CapacityX; }
  const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;}
public:
  // reverse iterator creation methods.
  reverse_iterator rbegin() { return reverse_iterator(end()); }
  const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); }
  reverse_iterator rend() { return reverse_iterator(begin()); }
  const_reverse_iterator rend() const { return const_reverse_iterator(begin());}
  inline size_type size() const { return end()-begin(); }
  inline size_type max_size() const { return size_type(-1) / sizeof(T); }
  /// Return the total number of elements in the currently allocated buffer.
  size_t capacity() const { return capacity_ptr() - begin(); }
  /// Return a pointer to the vector's buffer, even if empty().
  pointer data() { return pointer(begin()); }
  /// Return a pointer to the vector's buffer, even if empty().
  const_pointer data() const { return const_pointer(begin()); }
  // Unchecked element access (the bounds assert is disabled).
  inline reference operator[](size_type idx) {
    //assert(idx < size());
    return begin()[idx];
  }
  inline const_reference operator[](size_type idx) const {
    //assert(idx < size());
    return begin()[idx];
  }
  reference front() {
    //assert(!empty());
    return begin()[0];
  }
  const_reference front() const {
    //assert(!empty());
    return begin()[0];
  }
  reference back() {
    //assert(!empty());
    return end()[-1];
  }
  const_reference back() const {
    //assert(!empty());
    return end()[-1];
  }
};
/**
@private
@brief SmallVector element operations for non-POD T: construction,
destruction, and moves must invoke real constructors and destructors.
*/
template <typename T, bool isPodLike>
class SmallVectorTemplateBase : public SmallVectorTemplateCommon<T> {
protected:
  SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
  // Destroy the elements of [S, E) in reverse order.
  static void destroy_range(T *S, T *E) {
    while (S != E) {
      --E;
      E->~T();
    }
  }
  /// Move the range [I, E) into the uninitialized memory starting with "Dest",
  /// constructing elements as needed.
  template<typename It1, typename It2>
  static void uninitialized_move(It1 I, It1 E, It2 Dest) {
    std::uninitialized_copy(std::make_move_iterator(I),
                            std::make_move_iterator(E), Dest);
  }
  /// Copy the range [I, E) onto the uninitialized memory starting with "Dest",
  /// constructing elements as needed.
  template<typename It1, typename It2>
  static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
    std::uninitialized_copy(I, E, Dest);
  }
  /// Grow the allocated memory (without initializing new elements), doubling
  /// the size of the allocated memory. Guarantees space for at least one more
  /// element, or MinSize more elements if specified.
  void grow(size_t MinSize = 0);
public:
  void push_back(const T &Elt) {
    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
      this->grow();
    ::new ((void*) this->end()) T(Elt);
    this->setEnd(this->end()+1);
  }
  void push_back(T &&Elt) {
    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
      this->grow();
    ::new ((void*) this->end()) T(::std::move(Elt));
    this->setEnd(this->end()+1);
  }
  void pop_back() {
    this->setEnd(this->end()-1);
    this->end()->~T();
  }
};
/** | |
@private | |
*/ | |
template <typename T, bool isPodLike> | |
void SmallVectorTemplateBase<T, isPodLike>::grow(size_t MinSize) { | |
size_t CurCapacity = this->capacity(); | |
size_t CurSize = this->size(); | |
// Always grow, even from zero. | |
size_t NewCapacity = size_t(tf::detail::NextCapacity(CurCapacity+2)); | |
if (NewCapacity < MinSize) | |
NewCapacity = MinSize; | |
T *NewElts = static_cast<T*>(std::malloc(NewCapacity*sizeof(T))); | |
// Move the elements over. | |
this->uninitialized_move(this->begin(), this->end(), NewElts); | |
// Destroy the original elements. | |
destroy_range(this->begin(), this->end()); | |
// If this wasn't grown from the inline copy, deallocate the old space. | |
if (!this->isSmall()) | |
std::free(this->begin()); | |
this->setEnd(NewElts+CurSize); | |
this->BeginX = NewElts; | |
this->CapacityX = this->begin()+NewCapacity; | |
} | |
/**
@private
@brief SmallVector element operations specialized for POD-like T:
destruction is a no-op and copies/moves collapse to memcpy.
*/
template <typename T>
class SmallVectorTemplateBase<T, true> : public SmallVectorTemplateCommon<T> {
protected:
  SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon<T>(Size) {}
  // No need to do a destroy loop for POD's.
  static void destroy_range(T *, T *) {}
  /// Move the range [I, E) onto the uninitialized memory
  /// starting with "Dest", constructing elements into it as needed.
  template<typename It1, typename It2>
  static void uninitialized_move(It1 I, It1 E, It2 Dest) {
    // Just do a copy.
    uninitialized_copy(I, E, Dest);
  }
  /// Copy the range [I, E) onto the uninitialized memory
  /// starting with "Dest", constructing elements into it as needed.
  template<typename It1, typename It2>
  static void uninitialized_copy(It1 I, It1 E, It2 Dest) {
    // Arbitrary iterator types; just use the basic implementation.
    std::uninitialized_copy(I, E, Dest);
  }
  /// Copy the range [I, E) onto the uninitialized memory
  /// starting with "Dest", constructing elements into it as needed.
  /// Overload selected when both ranges are pointers to the same type.
  template <typename T1, typename T2>
  static void uninitialized_copy(
    T1 *I, T1 *E, T2 *Dest,
    typename std::enable_if<std::is_same<typename std::remove_const<T1>::type,
                                         T2>::value>::type * = nullptr) {
    // Use memcpy for PODs iterated by pointers (which includes SmallVector
    // iterators): std::uninitialized_copy optimizes to memmove, but we can
    // use memcpy here. Note that I and E are iterators and thus might be
    // invalid for memcpy if they are equal.
    if (I != E)
      memcpy(Dest, I, (E - I) * sizeof(T));
  }
  /// Double the size of the allocated memory, guaranteeing space for at
  /// least one more element or MinSize if specified.
  void grow(size_t MinSize = 0) {
    this->grow_pod(MinSize*sizeof(T), sizeof(T));
  }
public:
  void push_back(const T &Elt) {
    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
      this->grow();
    memcpy(this->end(), &Elt, sizeof(T));
    this->setEnd(this->end()+1);
  }
  // No destructor call needed for PODs; just shrink the end pointer.
  void pop_back() {
    this->setEnd(this->end()-1);
  }
};
/**
@private
@brief Size-independent SmallVector interface: all the mutating container
operations (resize, append, insert, erase, assignment) that do not depend
on the number of inline elements N.
*/
template <typename T>
class SmallVectorImpl : public SmallVectorTemplateBase<T, IsPod<T>::value> {
  typedef SmallVectorTemplateBase<T, IsPod<T>::value> SuperClass;
  SmallVectorImpl(const SmallVectorImpl&) = delete;
public:
  typedef typename SuperClass::iterator iterator;
  typedef typename SuperClass::const_iterator const_iterator;
  typedef typename SuperClass::size_type size_type;
protected:
  // Default ctor - Initialize to empty.
  explicit SmallVectorImpl(unsigned N)
    : SmallVectorTemplateBase<T, IsPod<T>::value>(N*sizeof(T)) {
  }
public:
  ~SmallVectorImpl() {
    // Destroy the constructed elements in the vector.
    this->destroy_range(this->begin(), this->end());
    // If this wasn't grown from the inline copy, deallocate the old space.
    if (!this->isSmall())
      std::free(this->begin());
  }
  // Destroys all elements but keeps the current capacity.
  void clear() {
    this->destroy_range(this->begin(), this->end());
    this->EndX = this->BeginX;
  }
  // Resize to N elements, value-initializing any newly created ones.
  void resize(size_type N) {
    if (N < this->size()) {
      this->destroy_range(this->begin()+N, this->end());
      this->setEnd(this->begin()+N);
    } else if (N > this->size()) {
      if (this->capacity() < N)
        this->grow(N);
      for (auto I = this->end(), E = this->begin() + N; I != E; ++I)
        new (&*I) T();
      this->setEnd(this->begin()+N);
    }
  }
  // Resize to N elements, copy-constructing new ones from NV.
  void resize(size_type N, const T &NV) {
    if (N < this->size()) {
      this->destroy_range(this->begin()+N, this->end());
      this->setEnd(this->begin()+N);
    } else if (N > this->size()) {
      if (this->capacity() < N)
        this->grow(N);
      std::uninitialized_fill(this->end(), this->begin()+N, NV);
      this->setEnd(this->begin()+N);
    }
  }
  void reserve(size_type N) {
    if (this->capacity() < N)
      this->grow(N);
  }
  // Removes and returns the last element by move.
  T pop_back_val() {
    T Result = ::std::move(this->back());
    this->pop_back();
    return Result;
  }
  void swap(SmallVectorImpl &RHS);
  /// Add the specified range to the end of the SmallVector.
  template<typename in_iter>
  void append(in_iter in_start, in_iter in_end) {
    size_type NumInputs = std::distance(in_start, in_end);
    // Grow allocated space if needed.
    if (NumInputs > size_type(this->capacity_ptr()-this->end()))
      this->grow(this->size()+NumInputs);
    // Copy the new elements over.
    this->uninitialized_copy(in_start, in_end, this->end());
    this->setEnd(this->end() + NumInputs);
  }
  /// Add NumInputs copies of Elt to the end of the SmallVector.
  void append(size_type NumInputs, const T &Elt) {
    // Grow allocated space if needed.
    if (NumInputs > size_type(this->capacity_ptr()-this->end()))
      this->grow(this->size()+NumInputs);
    // Copy the new elements over.
    std::uninitialized_fill_n(this->end(), NumInputs, Elt);
    this->setEnd(this->end() + NumInputs);
  }
  void append(std::initializer_list<T> IL) {
    append(IL.begin(), IL.end());
  }
  // Replace the contents with NumElts copies of Elt.
  void assign(size_type NumElts, const T &Elt) {
    clear();
    if (this->capacity() < NumElts)
      this->grow(NumElts);
    this->setEnd(this->begin()+NumElts);
    std::uninitialized_fill(this->begin(), this->end(), Elt);
  }
  void assign(std::initializer_list<T> IL) {
    clear();
    append(IL);
  }
  // Erase the element at CI, shifting the tail down by one.
  iterator erase(const_iterator CI) {
    // Just cast away constness because this is a non-const member function.
    iterator I = const_cast<iterator>(CI);
    //assert(I >= this->begin() && "Iterator to erase is out of bounds.");
    //assert(I < this->end() && "Erasing at past-the-end iterator.");
    iterator N = I;
    // Shift all elts down one.
    std::move(I+1, this->end(), I);
    // Drop the last elt.
    this->pop_back();
    return(N);
  }
  // Erase the range [CS, CE), shifting the tail down.
  iterator erase(const_iterator CS, const_iterator CE) {
    // Just cast away constness because this is a non-const member function.
    iterator S = const_cast<iterator>(CS);
    iterator E = const_cast<iterator>(CE);
    //assert(S >= this->begin() && "Range to erase is out of bounds.");
    //assert(S <= E && "Trying to erase invalid range.");
    //assert(E <= this->end() && "Trying to erase past the end.");
    iterator N = S;
    // Shift all elts down.
    iterator I = std::move(E, this->end(), S);
    // Drop the last elts.
    this->destroy_range(I, this->end());
    this->setEnd(I);
    return(N);
  }
  iterator insert(iterator I, T &&Elt) {
    if (I == this->end()) {  // Important special case for empty vector.
      this->push_back(::std::move(Elt));
      return this->end()-1;
    }
    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
    //assert(I <= this->end() && "Inserting past the end of the vector.");
    if (this->EndX >= this->CapacityX) {
      // remember the insertion offset: grow() invalidates I
      size_t EltNo = I-this->begin();
      this->grow();
      I = this->begin()+EltNo;
    }
    ::new ((void*) this->end()) T(::std::move(this->back()));
    // Push everything else over.
    std::move_backward(I, this->end()-1, this->end());
    this->setEnd(this->end()+1);
    // If we just moved the element we're inserting, be sure to update
    // the reference.
    T *EltPtr = &Elt;
    if (I <= EltPtr && EltPtr < this->EndX)
      ++EltPtr;
    *I = ::std::move(*EltPtr);
    return I;
  }
  iterator insert(iterator I, const T &Elt) {
    if (I == this->end()) {  // Important special case for empty vector.
      this->push_back(Elt);
      return this->end()-1;
    }
    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
    //assert(I <= this->end() && "Inserting past the end of the vector.");
    if (this->EndX >= this->CapacityX) {
      // remember the insertion offset: grow() invalidates I
      size_t EltNo = I-this->begin();
      this->grow();
      I = this->begin()+EltNo;
    }
    ::new ((void*) this->end()) T(std::move(this->back()));
    // Push everything else over.
    std::move_backward(I, this->end()-1, this->end());
    this->setEnd(this->end()+1);
    // If we just moved the element we're inserting, be sure to update
    // the reference.
    const T *EltPtr = &Elt;
    if (I <= EltPtr && EltPtr < this->EndX)
      ++EltPtr;
    *I = *EltPtr;
    return I;
  }
  iterator insert(iterator I, size_type NumToInsert, const T &Elt) {
    // Convert iterator to elt# to avoid invalidating iterator when we reserve()
    size_t InsertElt = I - this->begin();
    if (I == this->end()) {  // Important special case for empty vector.
      append(NumToInsert, Elt);
      return this->begin()+InsertElt;
    }
    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
    //assert(I <= this->end() && "Inserting past the end of the vector.");
    // Ensure there is enough space.
    reserve(this->size() + NumToInsert);
    // Uninvalidate the iterator.
    I = this->begin()+InsertElt;
    // If there are more elements between the insertion point and the end of the
    // range than there are being inserted, we can use a simple approach to
    // insertion. Since we already reserved space, we know that this won't
    // reallocate the vector.
    if (size_t(this->end()-I) >= NumToInsert) {
      T *OldEnd = this->end();
      append(std::move_iterator<iterator>(this->end() - NumToInsert),
             std::move_iterator<iterator>(this->end()));
      // Copy the existing elements that get replaced.
      std::move_backward(I, OldEnd-NumToInsert, OldEnd);
      std::fill_n(I, NumToInsert, Elt);
      return I;
    }
    // Otherwise, we're inserting more elements than exist already, and we're
    // not inserting at the end.
    // Move over the elements that we're about to overwrite.
    T *OldEnd = this->end();
    this->setEnd(this->end() + NumToInsert);
    size_t NumOverwritten = OldEnd-I;
    this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
    // Replace the overwritten part.
    std::fill_n(I, NumOverwritten, Elt);
    // Insert the non-overwritten middle part.
    std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt);
    return I;
  }
  template<typename ItTy>
  iterator insert(iterator I, ItTy From, ItTy To) {
    // Convert iterator to elt# to avoid invalidating iterator when we reserve()
    size_t InsertElt = I - this->begin();
    if (I == this->end()) {  // Important special case for empty vector.
      append(From, To);
      return this->begin()+InsertElt;
    }
    //assert(I >= this->begin() && "Insertion iterator is out of bounds.");
    //assert(I <= this->end() && "Inserting past the end of the vector.");
    size_t NumToInsert = std::distance(From, To);
    // Ensure there is enough space.
    reserve(this->size() + NumToInsert);
    // Uninvalidate the iterator.
    I = this->begin()+InsertElt;
    // If there are more elements between the insertion point and the end of the
    // range than there are being inserted, we can use a simple approach to
    // insertion. Since we already reserved space, we know that this won't
    // reallocate the vector.
    if (size_t(this->end()-I) >= NumToInsert) {
      T *OldEnd = this->end();
      append(std::move_iterator<iterator>(this->end() - NumToInsert),
             std::move_iterator<iterator>(this->end()));
      // Copy the existing elements that get replaced.
      std::move_backward(I, OldEnd-NumToInsert, OldEnd);
      std::copy(From, To, I);
      return I;
    }
    // Otherwise, we're inserting more elements than exist already, and we're
    // not inserting at the end.
    // Move over the elements that we're about to overwrite.
    T *OldEnd = this->end();
    this->setEnd(this->end() + NumToInsert);
    size_t NumOverwritten = OldEnd-I;
    this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten);
    // Replace the overwritten part.
    for (T *J = I; NumOverwritten > 0; --NumOverwritten) {
      *J = *From;
      ++J; ++From;
    }
    // Insert the non-overwritten middle part.
    this->uninitialized_copy(From, To, OldEnd);
    return I;
  }
  void insert(iterator I, std::initializer_list<T> IL) {
    insert(I, IL.begin(), IL.end());
  }
  // Construct a new element in place at the end.
  template <typename... ArgTypes> void emplace_back(ArgTypes &&... Args) {
    if (TF_UNLIKELY(this->EndX >= this->CapacityX))
      this->grow();
    ::new ((void *)this->end()) T(std::forward<ArgTypes>(Args)...);
    this->setEnd(this->end() + 1);
  }
  SmallVectorImpl &operator=(const SmallVectorImpl &RHS);
  SmallVectorImpl &operator=(SmallVectorImpl &&RHS);
  bool operator==(const SmallVectorImpl &RHS) const {
    if (this->size() != RHS.size()) return false;
    return std::equal(this->begin(), this->end(), RHS.begin());
  }
  bool operator!=(const SmallVectorImpl &RHS) const {
    return !(*this == RHS);
  }
  bool operator<(const SmallVectorImpl &RHS) const {
    return std::lexicographical_compare(this->begin(), this->end(),
                                        RHS.begin(), RHS.end());
  }
  /// Set the array size to \p N, which the current array must have enough
  /// capacity for.
  ///
  /// This does not construct or destroy any elements in the vector.
  ///
  /// Clients can use this in conjunction with capacity() to write past the end
  /// of the buffer when they know that more elements are available, and only
  /// update the size later. This avoids the cost of value initializing elements
  /// which will only be overwritten.
  void set_size(size_type N) {
    //assert(N <= this->capacity());
    this->setEnd(this->begin() + N);
  }
};
// Swaps the contents of two SmallVectorImpls. O(1) pointer swap when both
// are heap-allocated; otherwise falls back to element-wise swapping so that
// inline buffers are handled correctly.
template <typename T>
void SmallVectorImpl<T>::swap(SmallVectorImpl<T> &RHS) {
  if (this == &RHS) return;
  // We can only avoid copying elements if neither vector is small.
  if (!this->isSmall() && !RHS.isSmall()) {
    std::swap(this->BeginX, RHS.BeginX);
    std::swap(this->EndX, RHS.EndX);
    std::swap(this->CapacityX, RHS.CapacityX);
    return;
  }
  // Make sure each side can hold the other's elements.
  if (RHS.size() > this->capacity())
    this->grow(RHS.size());
  if (this->size() > RHS.capacity())
    RHS.grow(this->size());
  // Swap the shared elements.
  size_t NumShared = this->size();
  if (NumShared > RHS.size()) NumShared = RHS.size();
  for (size_type i = 0; i != NumShared; ++i)
    std::swap((*this)[i], RHS[i]);
  // Copy over the extra elts.
  if (this->size() > RHS.size()) {
    size_t EltDiff = this->size() - RHS.size();
    this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end());
    RHS.setEnd(RHS.end()+EltDiff);
    this->destroy_range(this->begin()+NumShared, this->end());
    this->setEnd(this->begin()+NumShared);
  } else if (RHS.size() > this->size()) {
    size_t EltDiff = RHS.size() - this->size();
    this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end());
    this->setEnd(this->end() + EltDiff);
    this->destroy_range(RHS.begin()+NumShared, RHS.end());
    RHS.setEnd(RHS.begin()+NumShared);
  }
}
// Copy assignment: reuses already-constructed elements where possible and
// only grows (destroying current elements first) when capacity is too small.
template <typename T>
SmallVectorImpl<T> &SmallVectorImpl<T>::
operator=(const SmallVectorImpl<T> &RHS) {
  // Avoid self-assignment.
  if (this == &RHS) return *this;
  // If we already have sufficient space, assign the common elements, then
  // destroy any excess.
  size_t RHSSize = RHS.size();
  size_t CurSize = this->size();
  if (CurSize >= RHSSize) {
    // Assign common elements.
    iterator NewEnd;
    if (RHSSize)
      NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin());
    else
      NewEnd = this->begin();
    // Destroy excess elements.
    this->destroy_range(NewEnd, this->end());
    // Trim.
    this->setEnd(NewEnd);
    return *this;
  }
  // If we have to grow to have enough elements, destroy the current elements.
  // This allows us to avoid copying them during the grow.
  // FIXME: don't do this if they're efficiently moveable.
  if (this->capacity() < RHSSize) {
    // Destroy current elements.
    this->destroy_range(this->begin(), this->end());
    this->setEnd(this->begin());
    CurSize = 0;
    this->grow(RHSSize);
  } else if (CurSize) {
    // Otherwise, use assignment for the already-constructed elements.
    std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin());
  }
  // Copy construct the new elements in place.
  this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(),
                           this->begin()+CurSize);
  // Set end.
  this->setEnd(this->begin()+RHSSize);
  return *this;
}
// Move-assigns the contents of RHS into this vector.
//
// If RHS owns a heap buffer, that buffer is stolen outright (O(1)) and RHS
// is reset to its empty small state. Otherwise elements are moved
// individually, mirroring the copy-assignment strategy, and RHS is cleared.
template <typename T>
SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
  // Avoid self-assignment.
  if (this == &RHS) return *this;
  // If the RHS isn't small, clear this vector and then steal its buffer.
  if (!RHS.isSmall()) {
    this->destroy_range(this->begin(), this->end());
    if (!this->isSmall()) std::free(this->begin());
    this->BeginX = RHS.BeginX;
    this->EndX = RHS.EndX;
    this->CapacityX = RHS.CapacityX;
    RHS.resetToSmall();
    return *this;
  }
  // If we already have sufficient space, assign the common elements, then
  // destroy any excess.
  size_t RHSSize = RHS.size();
  size_t CurSize = this->size();
  if (CurSize >= RHSSize) {
    // Assign common elements.
    iterator NewEnd = this->begin();
    if (RHSSize)
      NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd);
    // Destroy excess elements and trim the bounds.
    this->destroy_range(NewEnd, this->end());
    this->setEnd(NewEnd);
    // Clear the RHS.
    RHS.clear();
    return *this;
  }
  // If we have to grow to have enough elements, destroy the current elements.
  // This allows us to avoid copying them during the grow.
  // FIXME: this may not actually make any sense if we can efficiently move
  // elements.
  if (this->capacity() < RHSSize) {
    // Destroy current elements.
    this->destroy_range(this->begin(), this->end());
    this->setEnd(this->begin());
    CurSize = 0;
    this->grow(RHSSize);
  } else if (CurSize) {
    // Otherwise, use assignment for the already-constructed elements.
    std::move(RHS.begin(), RHS.begin()+CurSize, this->begin());
  }
  // Move-construct the new elements in place.
  this->uninitialized_move(RHS.begin()+CurSize, RHS.end(),
                           this->begin()+CurSize);
  // Set end.
  this->setEnd(this->begin()+RHSSize);
  RHS.clear();
  return *this;
}
/**
@private
Inline element storage for SmallVector. Only N - 1 slots are declared here;
presumably the first inline element lives in the base class so that the
small-mode begin() pointer is well defined — TODO confirm against
SmallVectorTemplateCommon (not visible in this chunk).
*/
template <typename T, unsigned N>
struct SmallVectorStorage {
  /**
  @private
  Raw (suitably sized/aligned) storage; elements are constructed lazily.
  */
  typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1];
};
/**
@private
N == 1: no extra slots are needed beyond the base class.
*/
template <typename T> struct SmallVectorStorage<T, 1> {};
/**
@private
N == 0: no inline storage at all.
*/
template <typename T> struct SmallVectorStorage<T, 0> {};
/**
@brief class to define a vector optimized for small array
@tparam T data type
@tparam N threshold of the number of elements in the initial storage
The class defines a C++ STL-styled vector (a variable-sized array)
optimized for the case when the array is small.
It contains some number of elements in-place,
which allows it to avoid heap allocation when the actual number of
elements is below that threshold. This allows normal @em small cases to be
fast without losing generality for large inputs.
All the methods in [std::vector](https://en.cppreference.com/w/cpp/container/vector)
can apply to this class.
The class is stripped from the LLVM codebase.
*/
template <typename T, unsigned N = 2>
class SmallVector : public SmallVectorImpl<T> {
  /// Inline space for elements which aren't stored in the base class.
  SmallVectorStorage<T, N> Storage;
public:
  /**
  @brief constructs an empty vector
  */
  SmallVector() : SmallVectorImpl<T>(N) {
  }
  /**
  @brief constructs a vector with @c Size copies of elements with value @c value
  */
  explicit SmallVector(size_t Size, const T &Value = T())
    : SmallVectorImpl<T>(N) {
    this->assign(Size, Value);
  }
  /**
  @brief constructs a vector with the contents of the range
  <tt>[S, E)</tt>
  */
  template<typename ItTy>
  SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
    this->append(S, E);
  }
  //template <typename RangeTy>
  //explicit SmallVector(const tf::iterator_range<RangeTy> &R)
  //  : SmallVectorImpl<T>(N) {
  //  this->append(R.begin(), R.end());
  //}
  /**
  @brief constructs a vector with the contents of the initializer list @c IL
  */
  SmallVector(std::initializer_list<T> IL) : SmallVectorImpl<T>(N) {
    this->assign(IL);
  }
  /**
  @brief constructs the vector with the copy of the contents of @c RHS
  */
  SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
    // Delegate to the base-class copy assignment; an empty RHS needs no work.
    if (!RHS.empty())
      SmallVectorImpl<T>::operator=(RHS);
  }
  /**
  @brief constructs the vector with the contents of @c RHS using move semantics
  */
  SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
    if (!RHS.empty())
      SmallVectorImpl<T>::operator=(::std::move(RHS));
  }
  /**
  @brief replaces the contents with a copy of the contents of @c RHS
  */
  const SmallVector &operator=(const SmallVector &RHS) {
    SmallVectorImpl<T>::operator=(RHS);
    return *this;
  }
  /**
  @brief replaces the contents with the contents of @c RHS using move semantics
  */
  const SmallVector &operator=(SmallVector &&RHS) {
    SmallVectorImpl<T>::operator=(::std::move(RHS));
    return *this;
  }
  /**
  @brief constructs a vector with the contents of @c RHS using move semantics
  */
  SmallVector(SmallVectorImpl<T> &&RHS) : SmallVectorImpl<T>(N) {
    // Accepts any SmallVectorImpl (i.e., a SmallVector of a different N).
    if (!RHS.empty())
      SmallVectorImpl<T>::operator=(::std::move(RHS));
  }
  /**
  @brief replaces the contents with the contents of @c RHS using move semantics
  */
  const SmallVector &operator=(SmallVectorImpl<T> &&RHS) {
    SmallVectorImpl<T>::operator=(::std::move(RHS));
    return *this;
  }
  /**
  @brief replaces the contents with the copy of the contents of an initializer list @c IL
  */
  const SmallVector &operator=(std::initializer_list<T> IL) {
    this->assign(IL);
    return *this;
  }
};
template<typename T, unsigned N> | |
static inline size_t capacity_in_bytes(const SmallVector<T, N> &X) { | |
return X.capacity_in_bytes(); | |
} | |
} // end tf namespace --------------------------------------------------------- | |
namespace std {
/// Implement std::swap in terms of SmallVector swap.
/// NOTE(review): these are *overloads* added to namespace std, which the
/// standard does not sanction (only specializations of std templates are
/// allowed). The pattern is inherited from LLVM and works on the major
/// toolchains; confirm before relying on it in strict-conformance builds.
template<typename T>
inline void
swap(tf::SmallVectorImpl<T> &LHS, tf::SmallVectorImpl<T> &RHS) {
  LHS.swap(RHS);
}
/// Implement std::swap in terms of SmallVector swap.
template<typename T, unsigned N>
inline void
swap(tf::SmallVector<T, N> &LHS, tf::SmallVector<T, N> &RHS) {
  LHS.swap(RHS);
}
} // end of namespace std ----------------------------------------------------
#include <algorithm>
#include <array>
#include <cassert>
#include <chrono>
#include <cmath>
#include <deque>
#include <forward_list>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <memory>
#include <numeric>
#include <optional>
#include <queue>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <tuple>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <variant>
#include <vector>
namespace tf { | |
// ----------------------------------------------------------------------------
// Supported C++ STL type
// ----------------------------------------------------------------------------

// Trait: true iff T is a specialization of std::basic_string.
template <typename>
struct is_std_basic_string : std::false_type {};

template <typename... Ts>
struct is_std_basic_string<std::basic_string<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_basic_string_v = is_std_basic_string<T>::value;

// Trait: true iff T is a specialization of std::array.
template <typename>
struct is_std_array : std::false_type {};

template <typename U, size_t M>
struct is_std_array<std::array<U, M>> : std::true_type {};

template <typename T>
constexpr bool is_std_array_v = is_std_array<T>::value;

// Trait: true iff T is a specialization of std::vector.
template <typename>
struct is_std_vector : std::false_type {};

template <typename... Ts>
struct is_std_vector<std::vector<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_vector_v = is_std_vector<T>::value;

// Trait: true iff T is a specialization of std::deque.
template <typename>
struct is_std_deque : std::false_type {};

template <typename... Ts>
struct is_std_deque<std::deque<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_deque_v = is_std_deque<T>::value;
// Trait: true iff T is a specialization of std::list.
template <typename>
struct is_std_list : std::false_type {};

template <typename... Ts>
struct is_std_list<std::list<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_list_v = is_std_list<T>::value;

// Trait: true iff T is a specialization of std::forward_list.
template <typename>
struct is_std_forward_list : std::false_type {};

template <typename... Ts>
struct is_std_forward_list<std::forward_list<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_forward_list_v = is_std_forward_list<T>::value;

// Trait: true iff T is a specialization of std::map.
template <typename>
struct is_std_map : std::false_type {};

template <typename... Ts>
struct is_std_map<std::map<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_map_v = is_std_map<T>::value;

// Trait: true iff T is a specialization of std::unordered_map.
template <typename>
struct is_std_unordered_map : std::false_type {};

template <typename... Ts>
struct is_std_unordered_map<std::unordered_map<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_unordered_map_v = is_std_unordered_map<T>::value;

// Trait: true iff T is a specialization of std::set.
template <typename>
struct is_std_set : std::false_type {};

template <typename... Ts>
struct is_std_set<std::set<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_set_v = is_std_set<T>::value;

// Trait: true iff T is a specialization of std::unordered_set.
template <typename>
struct is_std_unordered_set : std::false_type {};

template <typename... Ts>
struct is_std_unordered_set<std::unordered_set<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_unordered_set_v = is_std_unordered_set<T>::value;
// Trait: true iff T is a specialization of std::variant.
template <typename>
struct is_std_variant : std::false_type {};

template <typename... Ts>
struct is_std_variant<std::variant<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_variant_v = is_std_variant<T>::value;

// Trait: true iff T is a specialization of std::optional.
template <typename>
struct is_std_optional : std::false_type {};

template <typename... Ts>
struct is_std_optional<std::optional<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_optional_v = is_std_optional<T>::value;

// Trait: true iff T is a specialization of std::unique_ptr.
template <typename>
struct is_std_unique_ptr : std::false_type {};

template <typename... Ts>
struct is_std_unique_ptr<std::unique_ptr<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_unique_ptr_v = is_std_unique_ptr<T>::value;

// Trait: true iff T is a specialization of std::shared_ptr.
template <typename>
struct is_std_shared_ptr : std::false_type {};

template <typename... Ts>
struct is_std_shared_ptr<std::shared_ptr<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_shared_ptr_v = is_std_shared_ptr<T>::value;

// Trait: true iff T is a specialization of std::chrono::duration.
template <typename>
struct is_std_duration : std::false_type {};

template <typename... Ts>
struct is_std_duration<std::chrono::duration<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_duration_v = is_std_duration<T>::value;

// Trait: true iff T is a specialization of std::chrono::time_point.
template <typename>
struct is_std_time_point : std::false_type {};

template <typename... Ts>
struct is_std_time_point<std::chrono::time_point<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_time_point_v = is_std_time_point<T>::value;

// Trait: true iff T is a specialization of std::tuple.
template <typename>
struct is_std_tuple : std::false_type {};

template <typename... Ts>
struct is_std_tuple<std::tuple<Ts...>> : std::true_type {};

template <typename T>
constexpr bool is_std_tuple_v = is_std_tuple<T>::value;
//-----------------------------------------------------------------------------
// Type extraction.
//-----------------------------------------------------------------------------
// ExtractType: forward declaration
// Yields the idx-th template argument type of a class template instantiation
// C<T0, T1, ...> (used below to pick a std::variant alternative by index).
template <size_t, typename>
struct ExtractType;
// ExtractType_t: alias interface
template <size_t idx, typename C>
using ExtractType_t = typename ExtractType<idx, C>::type;
// ExtractType: base — index 0 selects the first template argument.
template <template <typename...> typename C, typename T, typename... RestT>
struct ExtractType <0, C<T, RestT...>> {
  using type = T;
};
// ExtractType: base — a non-template type extracts to itself.
template <typename T>
struct ExtractType <0, T> {
  using type = T;
};
// ExtractType: recursive definition — drop the head, decrement the index.
template <size_t idx, template <typename...> typename C, typename T, typename... RestT>
struct ExtractType <idx, C<T, RestT...>> : ExtractType<idx-1, C<RestT...>> {
};
// ----------------------------------------------------------------------------
// Size Wrapper
// ----------------------------------------------------------------------------
// Struct: SizeTag
// Class that wraps a given size item which can be customized.
// When constructed from an lvalue it holds a reference (so a Deserializer can
// load through it into the caller's variable); from an rvalue it stores a
// decayed copy.
template <typename T>
class SizeTag {
  public:
  // Reference-preserving storage type: T itself for lvalue references,
  // otherwise the decayed value type.
  using type = std::conditional_t<std::is_lvalue_reference_v<T>, T, std::decay_t<T>>;
  SizeTag(T&& item) : _item(std::forward<T>(item)) {}
  SizeTag& operator = (const SizeTag&) = delete;
  inline const T& get() const {return _item;}
  // Archive hooks: serialize/deserialize the wrapped size value.
  template <typename ArchiverT>
  auto save(ArchiverT & ar) const { return ar(_item); }
  template <typename ArchiverT>
  auto load(ArchiverT & ar) { return ar(_item); }
  private:
  type _item;
};
// Function: make_size_tag | |
template <typename T> | |
SizeTag<T> make_size_tag(T&& t) { | |
return { std::forward<T>(t) }; | |
} | |
// ----------------------------------------------------------------------------
// Map Item Wrapper
// ----------------------------------------------------------------------------
// Class: MapItem
// Bundles a key/value pair for archiving. Like SizeTag, lvalue arguments are
// captured by reference (so load() writes through to the originals) and
// rvalues are stored by value.
template <typename KeyT, typename ValueT>
class MapItem {
  public:
  using KeyType = std::conditional_t <std::is_lvalue_reference_v<KeyT>, KeyT, std::decay_t<KeyT>>;
  using ValueType = std::conditional_t <std::is_lvalue_reference_v<ValueT>, ValueT, std::decay_t<ValueT>>;
  MapItem(KeyT&& k, ValueT&& v) : _key(std::forward<KeyT>(k)), _value(std::forward<ValueT>(v)) {}
  MapItem& operator = (const MapItem&) = delete;
  inline const KeyT& key() const { return _key; }
  inline const ValueT& value() const { return _value; }
  // Archive hooks: serialize/deserialize key then value, in that order.
  template <typename ArchiverT>
  auto save(ArchiverT & ar) const { return ar(_key, _value); }
  template <typename ArchiverT>
  auto load(ArchiverT & ar) { return ar(_key, _value); }
  private:
  KeyType _key;
  ValueType _value;
};
// Function: make_kv_pair | |
template <typename KeyT, typename ValueT> | |
MapItem<KeyT, ValueT> make_kv_pair(KeyT&& k, ValueT&& v) { | |
return { std::forward<KeyT>(k), std::forward<ValueT>(v) }; | |
} | |
// ----------------------------------------------------------------------------
// Serializer Definition
// ----------------------------------------------------------------------------
// True if T can be archived without a user-provided save() method:
// arithmetic types, enums, and the STL containers/vocabulary types detected
// by the traits above. Anything else falls through to the custom-save
// overload of Serializer::_save.
template <typename T>
constexpr auto is_default_serializable_v = (
  std::is_arithmetic_v<T> ||
  std::is_enum_v<T> ||
  is_std_basic_string_v<T> ||
  is_std_vector_v<T> ||
  is_std_deque_v<T> ||
  is_std_list_v<T> ||
  is_std_forward_list_v<T> ||
  is_std_map_v<T> ||
  is_std_unordered_map_v<T> ||
  is_std_set_v<T> ||
  is_std_unordered_set_v<T> ||
  is_std_duration_v<T> ||
  is_std_time_point_v<T> ||
  is_std_variant_v<T> ||
  is_std_optional_v<T> ||
  is_std_tuple_v<T> ||
  is_std_array_v<T>
);
// Class: Serializer
// Binary serializer. operator() archives each item into the wrapped Stream
// (via Stream::write) and returns the number of bytes written as SizeType.
// Dispatch is by enable_if on std::decay_t<T>: one _save overload per
// supported category, plus a fallback that calls the item's own save()
// member for non-default-serializable types. The out-of-line definitions
// below must match these declarations exactly.
template <typename Stream, typename SizeType = std::streamsize>
class Serializer {
  public:
  Serializer(Stream& stream);
  template <typename... T>
  SizeType operator()(T&&... items);
  private:
  // The underlying output stream (held by reference; must outlive *this).
  Stream& _stream;
  // Fallback: types without default serialization use their save() member.
  template <typename T,
    std::enable_if_t<!is_default_serializable_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<
      is_std_deque_v<std::decay_t<T>> ||
      is_std_list_v<std::decay_t<T>>,
      void
    >* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<
      is_std_forward_list_v<std::decay_t<T>>,
      void
    >* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<
      is_std_map_v<std::decay_t<T>> ||
      is_std_unordered_map_v<std::decay_t<T>>,
      void
    >* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<
      is_std_set_v<std::decay_t<T>> ||
      is_std_unordered_set_v<std::decay_t<T>>,
      void
    >* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
  template <typename T,
    std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _save(T&&);
};
// Constructor
// Binds the serializer to an output stream; the stream must outlive it.
template <typename Stream, typename SizeType>
Serializer<Stream, SizeType>::Serializer(Stream& stream) : _stream(stream) {
}
// Operator () | |
template <typename Stream, typename SizeType> | |
template <typename... T> | |
SizeType Serializer<Stream, SizeType>::operator() (T&&... items) { | |
return (_save(std::forward<T>(items)) + ...); | |
} | |
// arithmetic data type
// Writes the raw object representation (sizeof(t) bytes) to the stream.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  _stream.write(reinterpret_cast<const char*>(std::addressof(t)), sizeof(t));
  return sizeof(t);
}
// std::basic_string
// Writes the character count (via SizeTag) followed by the raw character
// payload in one contiguous block.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  using U = std::decay_t<T>;
  auto sz = _save(make_size_tag(t.size()));
  _stream.write(
    reinterpret_cast<const char*>(t.data()),
    t.size()*sizeof(typename U::value_type)
  );
  return sz + t.size()*sizeof(typename U::value_type);
}
// std::vector | |
template <typename Stream, typename SizeType> | |
template <typename T, | |
std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* | |
> | |
SizeType Serializer<Stream, SizeType>::_save(T&& t) { | |
using U = std::decay_t<T>; | |
auto sz = _save(make_size_tag(t.size())); | |
if constexpr (std::is_arithmetic_v<typename U::value_type>) { | |
_stream.write( | |
reinterpret_cast<const char*>(t.data()), | |
t.size() * sizeof(typename U::value_type) | |
); | |
sz += t.size() * sizeof(typename U::value_type); | |
} else { | |
for(auto&& item : t) { | |
sz += _save(item); | |
} | |
} | |
return sz; | |
} | |
// std::list and std::deque
// Writes the element count, then each element in iteration order.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_deque_v<std::decay_t<T>> ||
                   is_std_list_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  auto sz = _save(make_size_tag(t.size()));
  for(auto&& item : t) {
    sz += _save(item);
  }
  return sz;
}
// std::forward_list
// forward_list has no size(); the count is computed with an O(n) distance
// pass before the elements are written.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_forward_list_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  auto sz = _save(make_size_tag(std::distance(t.begin(), t.end())));
  for(auto&& item : t) {
    sz += _save(item);
  }
  return sz;
}
// std::map and std::unordered_map
// Writes the entry count, then each key/value pair (key first) via MapItem.
template <typename Stream, typename SizeType>
template <typename T, std::enable_if_t<
  is_std_map_v<std::decay_t<T>> ||
  is_std_unordered_map_v<std::decay_t<T>>,
  void
>*>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  auto sz = _save(make_size_tag(t.size()));
  for(auto&& [k, v] : t) {
    sz += _save(make_kv_pair(k, v));
  }
  return sz;
}
// std::set and std::unordered_set
// Writes the element count, then each element in iteration order.
template <typename Stream, typename SizeType>
template <typename T, std::enable_if_t<
  is_std_set_v<std::decay_t<T>> ||
  is_std_unordered_set_v<std::decay_t<T>>,
  void
>*>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  auto sz = _save(make_size_tag(t.size()));
  for(auto&& item : t) {
    sz += _save(item);
  }
  return sz;
}
// enum data type
// Saved as the enum's underlying integer type.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  using U = std::decay_t<T>;
  return _save(static_cast<std::underlying_type_t<U>>(t));
}
// duration data type
// Saved as the tick count (the duration's representation type).
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  return _save(t.count());
}
// time point data type
// Saved as the duration since the clock's epoch.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  return _save(t.time_since_epoch());
}
// optional data type
// Writes a has-value flag, then the contained value only when engaged.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  if(bool flag = t.has_value(); flag) {
    return _save(flag) + _save(*t);
  }
  else {
    return _save(flag);
  }
}
// variant type
// Writes the active alternative's index, then its value via std::visit.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  return _save(t.index()) +
         std::visit([&] (auto&& arg){ return _save(arg);}, t);
}
// tuple type
// Saves each member in order; the "+ ... + 0" binary fold also accepts an
// empty tuple.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  return std::apply(
    [&] (auto&&... args) {
      return (_save(std::forward<decltype(args)>(args)) + ... + 0);
    },
    std::forward<T>(t)
  );
}
// array
// The size is a compile-time constant, so no count is written. Arithmetic
// element types go out as one raw block; others element by element.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  using U = std::decay_t<T>;
  static_assert(std::tuple_size<U>::value > 0, "Array size can't be zero");
  SizeType sz;
  if constexpr(std::is_arithmetic_v<typename U::value_type>) {
    _stream.write(reinterpret_cast<const char*>(t.data()), sizeof(t));
    sz = sizeof(t);
  }
  else {
    sz = 0;
    for(auto&& item : t) {
      sz += _save(item);
    }
  }
  return sz;
}
// custom save method
// Fallback for non-default-serializable types: delegates to the object's
// own save(Archiver&) member, which is expected to return the byte count.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<!is_default_serializable_v<std::decay_t<T>>, void>*
>
SizeType Serializer<Stream, SizeType>::_save(T&& t) {
  return t.save(*this);
}
// ----------------------------------------------------------------------------
// DeSerializer Definition
// ----------------------------------------------------------------------------
// True if T can be loaded without a user-provided load() method; mirrors
// is_default_serializable_v so every default-saved type can be read back.
template <typename T>
constexpr auto is_default_deserializable_v =
  std::is_arithmetic_v<T> ||
  std::is_enum_v<T> ||
  is_std_basic_string_v<T> ||
  is_std_vector_v<T> ||
  is_std_deque_v<T> ||
  is_std_list_v<T> ||
  is_std_forward_list_v<T> ||
  is_std_map_v<T> ||
  is_std_unordered_map_v<T> ||
  is_std_set_v<T> ||
  is_std_unordered_set_v<T> ||
  is_std_duration_v<T> ||
  is_std_time_point_v<T> ||
  is_std_variant_v<T> ||
  is_std_optional_v<T> ||
  is_std_tuple_v<T> ||
  is_std_array_v<T>;
// Class: Deserializer
// Binary deserializer: the inverse of Serializer. operator() reads each item
// from the wrapped Stream (via Stream::read) and returns the number of bytes
// consumed as SizeType. Dispatch mirrors Serializer::_save; _variant_helper
// walks variant alternatives at compile time to reactivate the stored index.
// The out-of-line definitions below must match these declarations exactly.
template <typename Stream, typename SizeType = std::streamsize>
class Deserializer {
  public:
  Deserializer(Stream& stream);
  template <typename... T>
  SizeType operator()(T&&... items);
  private:
  // The underlying input stream (held by reference; must outlive *this).
  Stream& _stream;
  // Function: _variant_helper (recursion terminator — index exhausted)
  template <
    size_t I = 0, typename... ArgsT,
    std::enable_if_t<I==sizeof...(ArgsT)>* = nullptr
  >
  SizeType _variant_helper(size_t, std::variant<ArgsT...>&);
  // Function: _variant_helper (recursive case)
  template <
    size_t I = 0, typename... ArgsT,
    std::enable_if_t<I<sizeof...(ArgsT)>* = nullptr
  >
  SizeType _variant_helper(size_t, std::variant<ArgsT...>&);
  template <typename T,
    std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<
      is_std_deque_v<std::decay_t<T>> ||
      is_std_list_v<std::decay_t<T>> ||
      is_std_forward_list_v<std::decay_t<T>>,
      void
    >* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_map_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_unordered_map_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_set_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_unordered_set_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  template <typename T,
    std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
  // Fallback: types without default deserialization use their load() member.
  template <typename T,
    std::enable_if_t<!is_default_deserializable_v<std::decay_t<T>>, void>* = nullptr
  >
  SizeType _load(T&&);
};
// Constructor
// Binds the deserializer to an input stream; the stream must outlive it.
template <typename Stream, typename SizeType>
Deserializer<Stream, SizeType>::Deserializer(Stream& stream) : _stream(stream) {
}
// Operator () | |
template <typename Stream, typename SizeType> | |
template <typename... T> | |
SizeType Deserializer<Stream, SizeType>::operator() (T&&... items) { | |
return (_load(std::forward<T>(items)) + ...); | |
} | |
// Function: _variant_helper
// Recursion terminator: the compile-time index walked past the last
// alternative, so there is nothing to load.
template <typename Stream, typename SizeType>
template <size_t I, typename... ArgsT, std::enable_if_t<I==sizeof...(ArgsT)>*>
SizeType Deserializer<Stream, SizeType>::_variant_helper(size_t, std::variant<ArgsT...>&) {
  return 0;
}
// Function: _variant_helper
// Walks the alternatives at compile time until the runtime counter i reaches
// zero, then activates that alternative (default-constructing it if it is
// not already active) and loads its value from the stream.
template <typename Stream, typename SizeType>
template <size_t I, typename... ArgsT, std::enable_if_t<I<sizeof...(ArgsT)>*>
SizeType Deserializer<Stream, SizeType>::_variant_helper(size_t i, std::variant<ArgsT...>& v) {
  if(i == 0) {
    using type = ExtractType_t<I, std::variant<ArgsT...>>;
    if(v.index() != I) {
      static_assert(
        std::is_default_constructible<type>::value,
        "Failed to archive variant (type should be default constructible T())"
      );
      v = type();
    }
    return _load(*std::get_if<type>(&v));
  }
  return _variant_helper<I+1, ArgsT...>(i-1, v);
}
// arithmetic data type
// Reads sizeof(t) raw bytes directly into the referenced object.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<std::is_arithmetic_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  _stream.read(reinterpret_cast<char*>(std::addressof(t)), sizeof(t));
  return sizeof(t);
}
// std::basic_string
// Reads the character count, resizes the string, then reads the payload
// straight into its buffer (basic_string::data() is writable since C++17).
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_basic_string_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_chars;
  auto sz = _load(make_size_tag(num_chars));
  t.resize(num_chars);
  _stream.read(reinterpret_cast<char*>(t.data()), num_chars*sizeof(typename U::value_type));
  return sz + num_chars*sizeof(typename U::value_type);
}
// std::vector
// Loads the element count, then the elements: arithmetic element types are
// read in a single bulk read into the contiguous buffer; other types are
// loaded element-by-element through _load. Returns total bytes consumed.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_vector_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_data;
  auto sz = _load(make_size_tag(num_data));
  if constexpr(std::is_arithmetic_v<typename U::value_type>) {
    // fast path: contiguous storage allows one raw read
    t.resize(num_data);
    _stream.read(reinterpret_cast<char*>(t.data()), num_data * sizeof(typename U::value_type));
    sz += num_data * sizeof(typename U::value_type);
  }
  else {
    t.resize(num_data);
    for(auto && v : t) {
      sz += _load(v);
    }
  }
  return sz;
}
// std::list, std::deque, and std::forward_list
// Loads the element count, resizes the container (all three containers,
// including forward_list, provide resize), and loads each element in order.
// These containers are not contiguous, so no bulk-read fast path exists.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_deque_v<std::decay_t<T>> ||
                   is_std_list_v<std::decay_t<T>> ||
                   is_std_forward_list_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_data;
  auto sz = _load(make_size_tag(num_data));
  t.resize(num_data);
  for(auto && v : t) {
    sz += _load(v);
  }
  return sz;
}
// std::map
// Clears the map, then loads num_data key/value pairs. Each pair is loaded
// into local k/v buffers and moved into the map with emplace_hint; the hint
// (previous insertion position) is cheap when keys arrive in sorted order.
// k and v are re-populated by _load after each move.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_map_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_data;
  auto sz = _load(make_size_tag(num_data));
  t.clear();
  auto hint = t.begin();
  typename U::key_type k;
  typename U::mapped_type v;
  for(size_t i=0; i<num_data; ++i) {
    sz += _load(make_kv_pair(k, v));
    hint = t.emplace_hint(hint, std::move(k), std::move(v));
  }
  return sz;
}
// std::unordered_map
// Clears the map and reserves the final bucket count up front to avoid
// rehashing, then loads and moves each key/value pair in. The local k/v
// buffers are re-populated by _load after each move.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_unordered_map_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_data;
  auto sz = _load(make_size_tag(num_data));
  t.clear();
  t.reserve(num_data);
  typename U::key_type k;
  typename U::mapped_type v;
  for(size_t i=0; i<num_data; ++i) {
    sz += _load(make_kv_pair(k, v));
    t.emplace(std::move(k), std::move(v));
  }
  return sz;
}
// std::set
// Clears the set, then loads num_data keys one at a time into a local buffer
// and moves each into the set via emplace_hint (cheap for sorted input).
// k is re-populated by _load after each move.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_set_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_data;
  auto sz = _load(make_size_tag(num_data));
  t.clear();
  auto hint = t.begin();
  typename U::key_type k;
  for(size_t i=0; i<num_data; ++i) {
    sz += _load(k);
    hint = t.emplace_hint(hint, std::move(k));
  }
  return sz;
}
// std::unordered_set
// Clears the set and reserves capacity up front to avoid rehashing, then
// loads each key into a local buffer and moves it in. k is re-populated by
// _load after each move.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_unordered_set_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::size_type num_data;
  auto sz = _load(make_size_tag(num_data));
  t.clear();
  t.reserve(num_data);
  typename U::key_type k;
  for(size_t i=0; i<num_data; ++i) {
    sz += _load(k);
    t.emplace(std::move(k));
  }
  return sz;
}
// enum data type
// Loads the enum's underlying integer representation and casts it back to
// the enum type. Returns the bytes consumed by the underlying value.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<std::is_enum_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  std::underlying_type_t<U> k;
  auto sz = _load(k);
  t = static_cast<U>(k);
  return sz;
}
// duration data type
// Loads the duration's tick count (rep) and reconstructs the duration from
// it. Returns the bytes consumed by the tick count.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_duration_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::rep count;
  auto s = _load(count);
  t = U{count};
  return s;
}
// time point data type
// Loads the elapsed duration (time since the clock's epoch, as serialized by
// the matching save path) and reconstructs the time point from it.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_time_point_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  typename U::duration elapsed;
  auto s = _load(elapsed);
  t = U{elapsed};
  return s;
}
// optional data type
// Loads a bool presence flag first; if set, ensures the optional is engaged
// (default-constructing the value if needed) and loads into it, otherwise
// resets the optional to empty.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_optional_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  bool has_value;
  auto s = _load(has_value);
  if(has_value) {
    // engage the optional so there is a live object to load into
    if(!t) {
      t = typename U::value_type();
    }
    s += _load(*t);
  }
  else {
    t.reset();
  }
  return s;
}
// variant type
// Loads the serialized alternative index, then dispatches to _variant_helper,
// which activates and loads the matching alternative.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_variant_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  std::decay_t<decltype(t.index())> idx;
  auto s = _load(idx);
  return s + _variant_helper(idx, t);
}
// tuple type
// Loads every tuple element in declaration order via std::apply; the
// "+ ... + 0" fold keeps the expression valid for empty tuples.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_tuple_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  return std::apply(
    [&] (auto&&... args) {
      return (_load(std::forward<decltype(args)>(args)) + ... + 0);
    },
    std::forward<T>(t)
  );
}
// std::array
// Loads a fixed-size array: no size tag is needed since the size is part of
// the type. Arithmetic element types are read in one raw bulk read; other
// types are loaded element-by-element.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<is_std_array_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  using U = std::decay_t<T>;
  static_assert(std::tuple_size<U>::value > 0, "Array size can't be zero");
  SizeType sz;
  if constexpr(std::is_arithmetic_v<typename U::value_type>) {
    // fast path: read the whole contiguous array at once
    _stream.read(reinterpret_cast<char*>(t.data()), sizeof(t));
    sz = sizeof(t);
  }
  else {
    sz = 0;
    for(auto && v : t) {
      sz += _load(v);
    }
  }
  return sz;
}
// custom load method
// Fallback for types that are not default-deserializable: delegates to the
// type's own T::load(Archive&) member, which returns the bytes consumed.
template <typename Stream, typename SizeType>
template <typename T,
  std::enable_if_t<!is_default_deserializable_v<std::decay_t<T>>, void>*
>
SizeType Deserializer<Stream, SizeType>::_load(T&& t) {
  return t.load(*this);
}
} // end of namespace tf -----------------------------------------------------
#include <iostream> | |
#include <sstream> | |
#include <exception> | |
#include <iostream> | |
#include <string> | |
namespace tf {

// Procedure: ostreamize
//
// Streams every token, in order, into the given output stream using a C++17
// left fold over operator<< (the file already relies on fold expressions
// elsewhere). This replaces the former recursive overload pair with a single
// variadic function and, as a backward-compatible generalization, also
// accepts an empty token pack (a no-op), which previously failed to compile.
//
// @param os     output stream to write to
// @param tokens zero or more values streamable via operator<<
template <typename... T>
void ostreamize(std::ostream& os, T&&... tokens) {
  (os << ... << std::forward<T>(tokens));
}

// Function: stringify
//
// Builds a std::string by streaming all arguments into an ostringstream.
//
// @param args zero or more values streamable via operator<<
// @return concatenation of the arguments' stream representations
//         (empty string for an empty argument list)
template <typename... ArgsT>
std::string stringify(ArgsT&&... args) {
  std::ostringstream oss;
  ostreamize(oss, std::forward<ArgsT>(args)...);
  return oss.str();
}

}  // end of namespace tf -----------------------------------------------------
namespace tf {

// Procedure: throw_re
// Composes a "[file:line] <args...>" message and throws it wrapped in a
// std::runtime_error. Typically invoked through the TF_THROW macro, which
// supplies __FILE__ and __LINE__.
template <typename... ArgsT>
void throw_re(const char* fname, const size_t line, ArgsT&&... args) {
  std::ostringstream msg;
  msg << "[" << fname << ":" << line << "] ";
  (msg << ... << std::forward<ArgsT>(args));
  throw std::runtime_error(msg.str());
}

}  // ------------------------------------------------------------------------
#define TF_THROW(...) tf::throw_re(__FILE__, __LINE__, __VA_ARGS__); | |
namespace tf {

// ----------------------------------------------------------------------------
// taskflow
// ----------------------------------------------------------------------------
// Forward declarations of the core task-graph types so later headers can
// refer to them by pointer/reference without needing full definitions.

class AsyncTopology;
class Node;
class Graph;
class FlowBuilder;
class Semaphore;
class Subflow;
class Runtime;
class Task;
class TaskView;
class Taskflow;
class Topology;
class TopologyBase;
class Executor;
class Worker;
class WorkerView;
class ObserverInterface;
class ChromeTracingObserver;
class TFProfObserver;
class TFProfManager;

template <typename T>
class Future;

template <typename...Fs>
class Pipeline;

// ----------------------------------------------------------------------------
// cudaFlow
// ----------------------------------------------------------------------------

class cudaFlowNode;
class cudaFlowGraph;
class cudaTask;
class cudaFlow;
class cudaFlowCapturer;
class cudaFlowOptimizerBase;
class cudaFlowLinearOptimizer;
class cudaFlowSequentialOptimizer;
class cudaFlowRoundRobinOptimizer;

// ----------------------------------------------------------------------------
// syclFlow
// ----------------------------------------------------------------------------

class syclNode;
class syclGraph;
class syclTask;
class syclFlow;

}  // end of namespace tf -----------------------------------------------------
#include <vector> | |
#include <mutex> | |
/** | |
@file semaphore.hpp | |
@brief semaphore include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Semaphore | |
// ---------------------------------------------------------------------------- | |
/** | |
@class Semaphore | |
@brief class to create a semaphore object for building a concurrency constraint
A semaphore creates a constraint that limits the maximum concurrency, | |
i.e., the number of workers, in a set of tasks. | |
You can let a task acquire/release one or multiple semaphores before/after | |
executing its work. | |
A task can acquire and release a semaphore, | |
or just acquire or just release it. | |
A tf::Semaphore object starts with an initial count. | |
As long as that count is above 0, tasks can acquire the semaphore and do | |
their work. | |
If the count is 0 or less, a task trying to acquire the semaphore will not run | |
but goes to a waiting list of that semaphore. | |
When the semaphore is released by another task, | |
it reschedules all tasks on that waiting list. | |
@code{.cpp} | |
tf::Executor executor(8); // create an executor of 8 workers | |
tf::Taskflow taskflow; | |
tf::Semaphore semaphore(1); // create a semaphore with initial count 1 | |
std::vector<tf::Task> tasks { | |
taskflow.emplace([](){ std::cout << "A" << std::endl; }), | |
taskflow.emplace([](){ std::cout << "B" << std::endl; }), | |
taskflow.emplace([](){ std::cout << "C" << std::endl; }), | |
taskflow.emplace([](){ std::cout << "D" << std::endl; }), | |
taskflow.emplace([](){ std::cout << "E" << std::endl; }) | |
}; | |
for(auto & task : tasks) { // each task acquires and release the semaphore | |
task.acquire(semaphore); | |
task.release(semaphore); | |
} | |
executor.run(taskflow).wait(); | |
@endcode | |
The above example creates five tasks with no dependencies between them. | |
Under normal circumstances, the five tasks would be executed concurrently. | |
However, this example has a semaphore with initial count 1, | |
and all tasks need to acquire that semaphore before running and release that | |
semaphore after they are done. | |
This arrangement limits the number of concurrently running tasks to only one. | |
*/ | |
class Semaphore {

  friend class Node;

  public:

    /**
    @brief constructs a semaphore with the given counter

    A semaphore creates a constraint that limits the maximum concurrency,
    i.e., the number of workers, in a set of tasks.

    @code{.cpp}
    tf::Semaphore semaphore(4);  // concurrency constraint of 4 workers
    @endcode
    */
    explicit Semaphore(size_t max_workers);

    /**
    @brief queries the counter value (not thread-safe during the run)
    */
    size_t count() const;

  private:

    std::mutex _mtx;              // protects _counter and _waiters
    size_t _counter;              // remaining concurrency slots
    std::vector<Node*> _waiters;  // nodes blocked waiting for a slot

    // acquires a slot, or registers the node as a waiter when none is free
    bool _try_acquire_or_wait(Node*);

    // releases a slot and returns the nodes that were waiting on it
    std::vector<Node*> _release();
};
// Constructor: initializes the concurrency budget to max_workers slots.
inline Semaphore::Semaphore(size_t max_workers) :
  _counter(max_workers) {
}
// Procedure: _try_acquire_or_wait
// Atomically takes one concurrency slot if available; otherwise enqueues the
// node on the waiting list so a later _release can reschedule it.
// Returns true on acquisition, false when the node had to wait.
inline bool Semaphore::_try_acquire_or_wait(Node* me) {
  std::lock_guard<std::mutex> lock(_mtx);
  if(_counter == 0) {
    // no slot free: park the node until someone releases
    _waiters.push_back(me);
    return false;
  }
  --_counter;
  return true;
}
inline std::vector<Node*> Semaphore::_release() { | |
std::lock_guard<std::mutex> lock(_mtx); | |
++_counter; | |
std::vector<Node*> r{std::move(_waiters)}; | |
return r; | |
} | |
// Function: count
// Unsynchronized snapshot of the remaining slots; only meaningful when no
// run is in flight (as the declaration's doc comment warns).
inline size_t Semaphore::count() const {
  return _counter;
}
} // end of namespace tf. --------------------------------------------------- | |
#define TF_ENABLE_PROFILER "TF_ENABLE_PROFILER" | |
namespace tf { | |
} // end of namespace tf ----------------------------------------------------- | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// class: TopologyBase | |
// Base of all topology types; holds the shared cancellation flag that its
// friends (Executor, Node, Future) read and write during a run.
class TopologyBase {

  friend class Executor;
  friend class Node;

  template <typename T>
  friend class Future;

  protected:

    std::atomic<bool> _is_cancelled { false };
};
// ---------------------------------------------------------------------------- | |
// class: Topology | |
class Topology : public TopologyBase {

  friend class Executor;
  friend class Runtime;

  public:

    template <typename P, typename C>
    Topology(Taskflow&, P&&, C&&);

  private:

    Taskflow& _taskflow;                    // taskflow this topology belongs to
    std::promise<void> _promise;            // presumably fulfilled when the run
                                            // finishes — set by Executor (not visible here)
    SmallVector<Node*> _sources;            // NOTE(review): looks like the entry
                                            // nodes of the run — confirm against Executor
    std::function<bool()> _pred;            // run-again predicate (see constructor)
    std::function<void()> _call;            // callback stored at construction
    std::atomic<size_t> _join_counter {0};  // counter used by the scheduler
                                            // (semantics not visible here)
};
// Constructor
// Captures the owning taskflow plus the predicate and callback; _sources and
// _join_counter start from their member defaults (empty / zero).
template <typename P, typename C>
Topology::Topology(Taskflow& tf, P&& p, C&& c):
  _taskflow(tf),
  _pred {std::forward<P>(p)},
  _call {std::forward<C>(c)} {
}
} // end of namespace tf. ---------------------------------------------------- | |
#if defined(_MSC_VER) | |
#define TF_FORCE_INLINE __forceinline | |
#elif defined(__GNUC__) && __GNUC__ > 3 | |
#define TF_FORCE_INLINE __attribute__((__always_inline__)) inline | |
#else | |
#define TF_FORCE_INLINE inline | |
#endif | |
#if defined(_MSC_VER) | |
#define TF_NO_INLINE __declspec(noinline) | |
#elif defined(__GNUC__) && __GNUC__ > 3 | |
#define TF_NO_INLINE __attribute__((__noinline__)) | |
#else | |
#define TF_NO_INLINE | |
#endif | |
/** | |
@file tsq.hpp | |
@brief task queue include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Task Types | |
// ---------------------------------------------------------------------------- | |
/** | |
@enum TaskPriority | |
@brief enumeration of all task priority values | |
A priority is an enumerated value of type @c unsigned. | |
Currently, %Taskflow defines three priority levels, | |
@c HIGH, @c NORMAL, and @c LOW, starting from 0, 1, to 2. | |
That is, the lower the value, the higher the priority. | |
*/ | |
// values are compared numerically: the lower the value, the higher the priority
enum class TaskPriority : unsigned {
  /** @brief value of the highest priority (i.e., 0) */
  HIGH = 0,
  /** @brief value of the normal priority (i.e., 1) */
  NORMAL = 1,
  /** @brief value of the lowest priority (i.e., 2) */
  LOW = 2,
  /** @brief conventional value for iterating priority values (one past LOW) */
  MAX = 3
};
// ---------------------------------------------------------------------------- | |
// Task Queue | |
// ---------------------------------------------------------------------------- | |
/** | |
@class: TaskQueue | |
@tparam T data type (must be a pointer type) | |
@tparam TF_MAX_PRIORITY maximum level of the priority | |
@brief class to create a lock-free unbounded single-producer multiple-consumer queue | |
This class implements the work-stealing queue described in the paper, | |
<a href="https://www.di.ens.fr/~zappa/readings/ppopp13.pdf">Correct and Efficient Work-Stealing for Weak Memory Models</a>, | |
and extends it to include priority. | |
Only the queue owner can perform pop and push operations, | |
while others can steal data from the queue simultaneously. | |
Priority starts from zero (highest priority) to the template value | |
`TF_MAX_PRIORITY-1` (lowest priority). | |
All operations are associated with priority values to indicate | |
the corresponding queues to which an operation is applied. | |
The default template value, `TF_MAX_PRIORITY`, is `TaskPriority::MAX` | |
which applies only three priority levels to the task queue. | |
@code{.cpp} | |
auto [A, B, C, D, E] = taskflow.emplace( | |
[] () { }, | |
[&] () { | |
std::cout << "Task B: " << counter++ << '\n'; // 0 | |
}, | |
[&] () { | |
std::cout << "Task C: " << counter++ << '\n'; // 2 | |
}, | |
[&] () { | |
std::cout << "Task D: " << counter++ << '\n'; // 1 | |
}, | |
[] () { } | |
); | |
A.precede(B, C, D); | |
E.succeed(B, C, D); | |
B.priority(tf::TaskPriority::HIGH); | |
C.priority(tf::TaskPriority::LOW); | |
D.priority(tf::TaskPriority::NORMAL); | |
executor.run(taskflow).wait(); | |
@endcode | |
In the above example, we have a task graph of five tasks, | |
@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D | |
can run in simultaneously when @c A finishes. | |
Since we only use one worker thread in the executor,
we can deterministically run @c B first, then @c D, and @c C | |
in order of their priority values. | |
The output is as follows: | |
@code{.shell-session} | |
Task B: 0 | |
Task D: 1 | |
Task C: 2 | |
@endcode | |
*/ | |
template <typename T, unsigned TF_MAX_PRIORITY = static_cast<unsigned>(TaskPriority::MAX)>
class TaskQueue {

  static_assert(TF_MAX_PRIORITY > 0, "TF_MAX_PRIORITY must be at least one");
  static_assert(std::is_pointer_v<T>, "T must be a pointer type");

  // Ring-buffer storage of one work-stealing deque (Chase-Lev style).
  // Indices grow monotonically; (i & M) maps them into the buffer.
  struct Array {

    int64_t C;          // capacity (kept a power of two by ctor/resize)
    int64_t M;          // index mask, C-1
    std::atomic<T>* S;  // the slots

    explicit Array(int64_t c) :
      C {c},
      M {c-1},
      S {new std::atomic<T>[static_cast<size_t>(C)]} {
    }

    ~Array() {
      delete [] S;
    }

    int64_t capacity() const noexcept {
      return C;
    }

    void push(int64_t i, T o) noexcept {
      S[i & M].store(o, std::memory_order_relaxed);
    }

    T pop(int64_t i) noexcept {
      return S[i & M].load(std::memory_order_relaxed);
    }

    // Allocates a buffer of twice the capacity and copies the live range
    // [t, b) into it; the caller retires this array (see resize_array).
    Array* resize(int64_t b, int64_t t) {
      Array* ptr = new Array {2*C};
      for(int64_t i=t; i!=b; ++i) {
        ptr->push(i, pop(i));
      }
      return ptr;
    }

  };

  // One independent deque per priority level; top/bottom are cache-line
  // aligned to keep owner and thieves off each other's lines.
  // Doubling the alignment by 2 seems to generate the most
  // decent performance.
  CachelineAligned<std::atomic<int64_t>> _top[TF_MAX_PRIORITY];
  CachelineAligned<std::atomic<int64_t>> _bottom[TF_MAX_PRIORITY];
  std::atomic<Array*> _array[TF_MAX_PRIORITY];
  std::vector<Array*> _garbage[TF_MAX_PRIORITY];  // retired buffers, freed in dtor

  //std::atomic<T> _cache {nullptr};

  public:

    /**
    @brief constructs the queue with a given capacity

    @param capacity the capacity of the queue (must be power of 2)
    */
    explicit TaskQueue(int64_t capacity = 512);

    /**
    @brief destructs the queue
    */
    ~TaskQueue();

    /**
    @brief queries if the queue is empty at the time of this call
    */
    bool empty() const noexcept;

    /**
    @brief queries if the queue is empty at a specific priority value
    */
    bool empty(unsigned priority) const noexcept;

    /**
    @brief queries the number of items at the time of this call
    */
    size_t size() const noexcept;

    /**
    @brief queries the number of items with the given priority
           at the time of this call
    */
    size_t size(unsigned priority) const noexcept;

    /**
    @brief queries the capacity of the queue
    */
    int64_t capacity() const noexcept;

    /**
    @brief queries the capacity of the queue at a specific priority value
    */
    int64_t capacity(unsigned priority) const noexcept;

    /**
    @brief inserts an item to the queue

    @param item the item to push to the queue
    @param priority priority value of the item to push (default = 0)

    Only the owner thread can insert an item to the queue.
    The operation can trigger the queue to resize its capacity
    if more space is required.
    */
    TF_FORCE_INLINE void push(T item, unsigned priority);

    /**
    @brief pops out an item from the queue

    Only the owner thread can pop out an item from the queue.
    The return can be a @c nullptr if this operation failed (empty queue).
    */
    T pop();

    /**
    @brief pops out an item with a specific priority value from the queue

    @param priority priority of the item to pop

    Only the owner thread can pop out an item from the queue.
    The return can be a @c nullptr if this operation failed (empty queue).
    */
    TF_FORCE_INLINE T pop(unsigned priority);

    /**
    @brief steals an item from the queue

    Any threads can try to steal an item from the queue.
    The return can be a @c nullptr if this operation failed (not necessarily empty).
    */
    T steal();

    /**
    @brief steals an item with a specific priority value from the queue

    @param priority priority of the item to steal

    Any threads can try to steal an item from the queue.
    The return can be a @c nullptr if this operation failed (not necessarily empty).
    */
    T steal(unsigned priority);

  private:

    // grows the deque of priority p and retires the old array into _garbage
    TF_NO_INLINE Array* resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t);
};
// Constructor
// Initializes one empty deque (top = bottom = 0) per priority level.
template <typename T, unsigned TF_MAX_PRIORITY>
TaskQueue<T, TF_MAX_PRIORITY>::TaskQueue(int64_t c) {
  // the capacity must be a nonzero power of two so (i & M) can replace modulo
  assert(c && (!(c & (c-1))));
  unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){
    _top[p].data.store(0, std::memory_order_relaxed);
    _bottom[p].data.store(0, std::memory_order_relaxed);
    _array[p].store(new Array{c}, std::memory_order_relaxed);
    _garbage[p].reserve(32);
  });
}
// Destructor
// Frees the retired (resized-away) arrays first, then the live array of
// each priority level.
template <typename T, unsigned TF_MAX_PRIORITY>
TaskQueue<T, TF_MAX_PRIORITY>::~TaskQueue() {
  unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){
    for(auto a : _garbage[p]) {
      delete a;
    }
    delete _array[p].load();
  });
}
// Function: empty | |
template <typename T, unsigned TF_MAX_PRIORITY> | |
bool TaskQueue<T, TF_MAX_PRIORITY>::empty() const noexcept { | |
for(unsigned i=0; i<TF_MAX_PRIORITY; i++) { | |
if(!empty(i)) { | |
return false; | |
} | |
} | |
return true; | |
} | |
// Function: empty
// A bucket is empty when bottom has not advanced past top. Both loads are
// relaxed, so the result is only a snapshot under concurrency.
template <typename T, unsigned TF_MAX_PRIORITY>
bool TaskQueue<T, TF_MAX_PRIORITY>::empty(unsigned p) const noexcept {
  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
  int64_t t = _top[p].data.load(std::memory_order_relaxed);
  return (b <= t);
}
// Function: size
// Sums the sizes of all priority buckets via the compile-time unroll helper.
// The i==0 step assigns (rather than accumulates) into s, so the
// uninitialized declaration is never read.
template <typename T, unsigned TF_MAX_PRIORITY>
size_t TaskQueue<T, TF_MAX_PRIORITY>::size() const noexcept {
  size_t s;
  unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { s = i ? size(i) + s : size(i); });
  return s;
}
// Function: size
// Snapshot of the bucket size (bottom - top), clamped to zero because pop
// may transiently leave bottom below top.
template <typename T, unsigned TF_MAX_PRIORITY>
size_t TaskQueue<T, TF_MAX_PRIORITY>::size(unsigned p) const noexcept {
  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
  int64_t t = _top[p].data.load(std::memory_order_relaxed);
  return static_cast<size_t>(b >= t ? b - t : 0);
}
// Function: push
// Owner-only operation: appends an item at the bottom of the deque for the
// given priority, growing the ring buffer first when it is full.
template <typename T, unsigned TF_MAX_PRIORITY>
TF_FORCE_INLINE void TaskQueue<T, TF_MAX_PRIORITY>::push(T o, unsigned p) {
  // only the owner writes _bottom, so a relaxed load sees the latest value;
  // top is acquired to observe concurrent steals
  int64_t b = _bottom[p].data.load(std::memory_order_relaxed);
  int64_t t = _top[p].data.load(std::memory_order_acquire);
  Array* a = _array[p].load(std::memory_order_relaxed);

  // queue is full
  if(a->capacity() - 1 < (b - t)) {
    a = resize_array(a, p, b, t);
  }

  a->push(b, o);
  // release fence: make the stored item visible to thieves before the new
  // bottom index is published
  std::atomic_thread_fence(std::memory_order_release);
  _bottom[p].data.store(b + 1, std::memory_order_relaxed);
}
// Function: pop | |
template <typename T, unsigned TF_MAX_PRIORITY> | |
T TaskQueue<T, TF_MAX_PRIORITY>::pop() { | |
for(unsigned i=0; i<TF_MAX_PRIORITY; i++) { | |
if(auto t = pop(i); t) { | |
return t; | |
} | |
} | |
return nullptr; | |
} | |
// Function: pop
// Owner-only operation: takes one item from the bottom of the deque at
// priority p, racing against thieves only when a single item remains.
template <typename T, unsigned TF_MAX_PRIORITY>
TF_FORCE_INLINE T TaskQueue<T, TF_MAX_PRIORITY>::pop(unsigned p) {
  // tentatively reserve the bottom slot by decrementing bottom
  int64_t b = _bottom[p].data.load(std::memory_order_relaxed) - 1;
  Array* a = _array[p].load(std::memory_order_relaxed);
  _bottom[p].data.store(b, std::memory_order_relaxed);
  // order the bottom update before reading top (owner/thief handshake)
  std::atomic_thread_fence(std::memory_order_seq_cst);
  int64_t t = _top[p].data.load(std::memory_order_relaxed);

  T item {nullptr};

  if(t <= b) {
    item = a->pop(b);
    if(t == b) {
      // exactly one item left: race concurrent thieves via CAS on top;
      // losing the CAS means the last item just got stolen
      if(!_top[p].data.compare_exchange_strong(t, t+1,
                                               std::memory_order_seq_cst,
                                               std::memory_order_relaxed)) {
        item = nullptr;
      }
      _bottom[p].data.store(b + 1, std::memory_order_relaxed);
    }
  }
  else {
    // the deque was empty: undo the tentative reservation
    _bottom[p].data.store(b + 1, std::memory_order_relaxed);
  }

  return item;
}
// Function: steal | |
template <typename T, unsigned TF_MAX_PRIORITY> | |
T TaskQueue<T, TF_MAX_PRIORITY>::steal() { | |
for(unsigned i=0; i<TF_MAX_PRIORITY; i++) { | |
if(auto t = steal(i); t) { | |
return t; | |
} | |
} | |
return nullptr; | |
} | |
// Function: steal
// Thief-side operation: reads an item at top and claims it by CAS-advancing
// top. A failed CAS means another thief (or the owner's pop) won the race, in
// which case nullptr is returned even though the deque may be non-empty.
template <typename T, unsigned TF_MAX_PRIORITY>
T TaskQueue<T, TF_MAX_PRIORITY>::steal(unsigned p) {
  int64_t t = _top[p].data.load(std::memory_order_acquire);
  // order the top read before the bottom read (mirrors the fence in pop)
  std::atomic_thread_fence(std::memory_order_seq_cst);
  int64_t b = _bottom[p].data.load(std::memory_order_acquire);

  T item {nullptr};

  if(t < b) {
    Array* a = _array[p].load(std::memory_order_consume);
    // read the candidate before the CAS; it only becomes ours if the CAS wins
    item = a->pop(t);
    if(!_top[p].data.compare_exchange_strong(t, t+1,
                                             std::memory_order_seq_cst,
                                             std::memory_order_relaxed)) {
      return nullptr;
    }
  }

  return item;
}
// Function: capacity | |
template <typename T, unsigned TF_MAX_PRIORITY> | |
int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity() const noexcept { | |
size_t s; | |
unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { | |
s = i ? capacity(i) + s : capacity(i); | |
}); | |
return s; | |
} | |
// Function: capacity
// Capacity of the current ring buffer of the bucket at priority p.
template <typename T, unsigned TF_MAX_PRIORITY>
int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity(unsigned p) const noexcept {
  return _array[p].load(std::memory_order_relaxed)->capacity();
}
// Function: resize_array
// Doubles the ring buffer of priority p by copying the live range [t, b) into
// a new array. The old array cannot be freed immediately — concurrent thieves
// may still be reading it — so it is retired into _garbage and freed in the
// destructor.
template <typename T, unsigned TF_MAX_PRIORITY>
TF_NO_INLINE typename TaskQueue<T, TF_MAX_PRIORITY>::Array*
TaskQueue<T, TF_MAX_PRIORITY>::resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t) {
  Array* tmp = a->resize(b, t);
  _garbage[p].push_back(a);
  std::swap(a, tmp);
  _array[p].store(a, std::memory_order_release);
  // Note: the original paper using relaxed causes t-san to complain
  //_array.store(a, std::memory_order_relaxed);
  return a;
}
} // end of namespace tf ----------------------------------------------------- | |
/** | |
@file graph.hpp | |
@brief graph include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Class: Graph | |
// ---------------------------------------------------------------------------- | |
/** | |
@class Graph | |
@brief class to create a graph object | |
A graph is the ultimate storage for a task dependency graph and is the main | |
gateway to interact with an executor. | |
A graph manages a set of nodes in a global object pool that animates and | |
recycles node objects efficiently without going through repetitive and | |
expensive memory allocations and deallocations. | |
This class is mainly used for creating an opaque graph object in a custom | |
class to interact with the executor through taskflow composition. | |
A graph object is move-only. | |
*/ | |
class Graph {

  friend class Node;
  friend class FlowBuilder;
  friend class Subflow;
  friend class Taskflow;
  friend class Executor;

  public:

    /**
    @brief constructs a graph object
    */
    Graph() = default;

    /**
    @brief disabled copy constructor
    */
    Graph(const Graph&) = delete;

    /**
    @brief constructs a graph using move semantics
    */
    Graph(Graph&&);

    /**
    @brief destructs the graph object
    */
    ~Graph();

    /**
    @brief disabled copy assignment operator
    */
    Graph& operator = (const Graph&) = delete;

    /**
    @brief assigns a graph using move semantics
    */
    Graph& operator = (Graph&&);

    /**
    @brief queries if the graph is empty
    */
    bool empty() const;

    /**
    @brief queries the number of nodes in the graph
    */
    size_t size() const;

    /**
    @brief clears the graph
    */
    void clear();

  private:

    // raw node pointers of this graph; allocation and recycling are handled
    // by the helpers below, whose definitions are not visible here
    std::vector<Node*> _nodes;

    void _clear();           // presumably releases all nodes — definition not in view
    void _clear_detached();  // presumably releases detached nodes only — verify
    void _merge(Graph&&);    // absorbs the nodes of another graph
    void _erase(Node*);      // removes one node from _nodes

    /**
    @private
    */
    template <typename ...ArgsT>
    Node* _emplace_back(ArgsT&&...);
};
// ---------------------------------------------------------------------------- | |
/** | |
@class Runtime | |
@brief class to include a runtime object in a task | |
A runtime object allows users to interact with the | |
scheduling runtime inside a task, such as scheduling an active task, | |
spawning a subflow, and so on. | |
@code{.cpp} | |
tf::Task A, B, C, D; | |
std::tie(A, B, C, D) = taskflow.emplace( | |
[] () { return 0; }, | |
[&C] (tf::Runtime& rt) { // C must be captured by reference | |
std::cout << "B\n"; | |
rt.schedule(C); | |
}, | |
[] () { std::cout << "C\n"; }, | |
[] () { std::cout << "D\n"; } | |
); | |
A.precede(B, C, D); | |
executor.run(taskflow).wait(); | |
@endcode | |
A runtime object is associated with the worker and the executor | |
that runs the task. | |
*/ | |
class Runtime {

  friend class Executor;
  friend class FlowBuilder;

  public:

  /**
  @brief obtains the running executor

  The running executor of a runtime task is the executor that runs
  the parent taskflow of that runtime task.

  @code{.cpp}
  tf::Executor executor;
  tf::Taskflow taskflow;
  taskflow.emplace([&](tf::Runtime& rt){
    assert(&(rt.executor()) == &executor);
  });
  executor.run(taskflow).wait();
  @endcode
  */
  Executor& executor();

  /**
  @brief schedules an active task immediately to the worker's queue

  @param task the given active task to schedule immediately

  This member function immediately schedules an active task to the
  task queue of the associated worker in the runtime task.
  An active task is a task in a running taskflow.
  The task may or may not be running, and scheduling that task
  will immediately put the task into the task queue of the worker
  that is running the runtime task.
  Consider the following example:

  @code{.cpp}
  tf::Task A, B, C, D;
  std::tie(A, B, C, D) = taskflow.emplace(
    [] () { return 0; },
    [&C] (tf::Runtime& rt) {  // C must be captured by reference
      std::cout << "B\n";
      rt.schedule(C);
    },
    [] () { std::cout << "C\n"; },
    [] () { std::cout << "D\n"; }
  );
  A.precede(B, C, D);
  executor.run(taskflow).wait();
  @endcode

  The executor will first run the condition task @c A which returns @c 0
  to inform the scheduler to go to the runtime task @c B.
  During the execution of @c B, it directly schedules task @c C without
  going through the normal taskflow graph scheduling process.
  At this moment, task @c C is active because its parent taskflow is running.
  When the taskflow finishes, we will see both @c B and @c C in the output.
  */
  void schedule(Task task);

  /**
  @brief runs the given callable asynchronously

  @tparam F callable type
  @param f callable object

  The method creates an asynchronous task to launch the given
  function on the given arguments.
  The difference to tf::Executor::async is that the created asynchronous task
  pertains to the runtime.
  When the runtime joins, all asynchronous tasks created from the runtime
  are guaranteed to finish after the join returns.
  For example:

  @code{.cpp}
  std::atomic<int> counter(0);
  taskflow.emplace([&](tf::Runtime& rt){
    auto fu1 = rt.async([&](){ counter++; });
    auto fu2 = rt.async([&](){ counter++; });
    fu1.get();
    fu2.get();
    assert(counter == 2);

    // spawn 100 asynchronous tasks from the worker of the runtime
    for(int i=0; i<100; i++) {
      rt.async([&](){ counter++; });
    }

    // explicitly join the 100 asynchronous tasks
    rt.join();
    assert(counter == 102);
  });
  @endcode

  This method is thread-safe and can be called by multiple workers
  that hold the reference to the runtime.
  For example, the code below spawns 100 tasks from the worker of
  a runtime, and each of the 100 tasks spawns another task
  that will be run by another worker.

  @code{.cpp}
  std::atomic<int> counter(0);
  taskflow.emplace([&](tf::Runtime& rt){
    // worker of the runtime spawns 100 tasks each spawning another task
    // that will be run by another worker
    for(int i=0; i<100; i++) {
      rt.async([&](){
        counter++;
        rt.async([](){ counter++; });
      });
    }

    // explicitly join the 100 asynchronous tasks
    rt.join();
    assert(counter == 200);
  });
  @endcode
  */
  template <typename F>
  auto async(F&& f);

  /**
  @brief similar to tf::Runtime::async but assigns the task a name

  @tparam F callable type
  @param name assigned name to the task
  @param f callable

  @code{.cpp}
  taskflow.emplace([&](tf::Runtime& rt){
    auto future = rt.async("my task", [](){});
    future.get();
  });
  @endcode
  */
  template <typename F>
  auto async(const std::string& name, F&& f);

  /**
  @brief runs the given function asynchronously without returning any future object

  @tparam F callable type
  @param f callable

  This member function is more efficient than tf::Runtime::async
  and is encouraged when there is no data to return.

  @code{.cpp}
  std::atomic<int> counter(0);
  taskflow.emplace([&](tf::Runtime& rt){
    for(int i=0; i<100; i++) {
      rt.silent_async([&](){ counter++; });
    }
    rt.join();
    assert(counter == 100);
  });
  @endcode

  This member function is thread-safe.
  */
  template <typename F>
  void silent_async(F&& f);

  /**
  @brief similar to tf::Runtime::silent_async but assigns the task a name

  @tparam F callable type
  @param name assigned name to the task
  @param f callable

  @code{.cpp}
  taskflow.emplace([&](tf::Runtime& rt){
    rt.silent_async("my task", [](){});
    rt.join();
  });
  @endcode
  */
  template <typename F>
  void silent_async(const std::string& name, F&& f);

  /**
  @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime

  @tparam F callable type
  @param name assigned name to the task
  @param f callable

  This method bypasses the caller-worker check performed by the executor
  and thus can only be called by the worker of this runtime.

  @code{.cpp}
  taskflow.emplace([&](tf::Runtime& rt){
    // running by the worker of this runtime
    rt.silent_async_unchecked("my task", [](){});
    rt.join();
  });
  @endcode
  */
  template <typename F>
  void silent_async_unchecked(const std::string& name, F&& f);

  /**
  @brief co-runs the given target and waits until it completes

  A target can be one of the following forms:
    + a dynamic task to spawn a subflow or
    + a composable graph object with `tf::Graph& T::graph()` defined

  @code{.cpp}
  // co-run a subflow and wait until all tasks complete
  taskflow.emplace([](tf::Runtime& rt){
    rt.corun([](tf::Subflow& sf){
      tf::Task A = sf.emplace([](){});
      tf::Task B = sf.emplace([](){});
    });
  });

  // co-run a taskflow and wait until all tasks complete
  tf::Taskflow taskflow1, taskflow2;
  taskflow1.emplace([](){ std::cout << "running taskflow1\n"; });
  taskflow2.emplace([&](tf::Runtime& rt){
    std::cout << "running taskflow2\n";
    rt.corun(taskflow1);
  });
  executor.run(taskflow2).wait();
  @endcode

  Although tf::Runtime::corun blocks until the operation completes,
  the caller thread (worker) is not blocked (e.g., sleeping or holding any lock).
  Instead, the caller thread joins the work-stealing loop of the executor
  and returns when all tasks in the target complete.
  */
  template <typename T>
  void corun(T&& target);

  /**
  @brief keeps running the work-stealing loop until the predicate becomes true

  @tparam P predicate type
  @param predicate a boolean predicate to indicate when to stop the loop

  The method keeps the caller worker running in the work-stealing loop
  until the stop predicate becomes true.
  */
  template <typename P>
  void corun_until(P&& predicate);

  /**
  @brief joins all asynchronous tasks spawned by this runtime

  Immediately joins all asynchronous tasks (tf::Runtime::async,
  tf::Runtime::silent_async).
  Unlike tf::Subflow::join, you can join multiple times from
  a tf::Runtime object.

  @code{.cpp}
  std::atomic<size_t> counter{0};
  taskflow.emplace([&](tf::Runtime& rt){
    // spawn 100 async tasks and join
    for(int i=0; i<100; i++) {
      rt.silent_async([&](){ counter++; });
    }
    rt.join();
    assert(counter == 100);

    // spawn another 100 async tasks and join
    for(int i=0; i<100; i++) {
      rt.silent_async([&](){ counter++; });
    }
    rt.join();
    assert(counter == 200);
  });
  @endcode

  @attention
  Only the worker of this tf::Runtime can issue join.
  */
  inline void join();

  /**
  @brief acquires a reference to the underlying worker
  */
  inline Worker& worker();

  protected:

  /**
  @private
  Constructs a runtime handle bound to an executor, a worker, and the
  parent node that is running the runtime task.
  */
  explicit Runtime(Executor&, Worker&, Node*);

  /**
  @private
  executor that runs the parent taskflow of this runtime task
  */
  Executor& _executor;

  /**
  @private
  worker that is running the runtime task
  */
  Worker& _worker;

  /**
  @private
  parent node associated with this runtime task
  */
  Node* _parent;

  /**
  @private
  */
  template <typename F>
  auto _async(Worker& w, const std::string& name, F&& f);

  /**
  @private
  */
  template <typename F>
  void _silent_async(Worker& w, const std::string& name, F&& f);
};
// constructor: binds the runtime to its executor, its worker, and the
// parent node of the runtime task; all three are stored by reference/pointer
// and must outlive this Runtime object
inline Runtime::Runtime(Executor& e, Worker& w, Node* p) :
  _executor{e},
  _worker  {w},
  _parent  {p}{
}
// Function: executor
// Returns the executor that runs the parent taskflow of this runtime task.
inline Executor& Runtime::executor() {
  return this->_executor;
}
// Function: worker
// Returns the worker that is running this runtime task.
inline Worker& Runtime::worker() {
  return this->_worker;
}
// ----------------------------------------------------------------------------
// Node
// ----------------------------------------------------------------------------

/**
@private
A node in a taskflow graph. A node stores the callable work (as a variant
over all supported task kinds), its graph connectivity (successors and
dependents), scheduling state, and optional semaphores.
*/
class Node {

  friend class Graph;
  friend class Task;
  friend class TaskView;
  friend class Taskflow;
  friend class Executor;
  friend class FlowBuilder;
  friend class Subflow;
  friend class Runtime;

  // lifecycle states of a dependent-async task
  enum class AsyncState : int {
    UNFINISHED = 0,
    LOCKED = 1,
    FINISHED = 2
  };

  TF_ENABLE_POOLABLE_ON_THIS;

  // state bit flags (combined in _state)
  constexpr static int CONDITIONED = 1;  // set when a dependent is a (multi-)condition task
  constexpr static int DETACHED    = 2;  // set for nodes of a detached subflow
  constexpr static int ACQUIRED    = 4;  // semaphore-related flag — semantics defined by the executor
  constexpr static int READY       = 8;  // scheduling flag — semantics defined by the executor

  // a node with no work assigned holds std::monostate
  using Placeholder = std::monostate;

  // static work handle: a plain callable, optionally taking a Runtime&
  struct Static {

    template <typename C>
    Static(C&&);

    std::variant<
      std::function<void()>, std::function<void(Runtime&)>
    > work;
  };

  // dynamic work handle: spawns a subflow graph at runtime
  struct Dynamic {

    template <typename C>
    Dynamic(C&&);

    std::function<void(Subflow&)> work;
    Graph subgraph;
  };

  // condition work handle: returns the index of the successor to run
  struct Condition {

    template <typename C>
    Condition(C&&);

    std::variant<
      std::function<int()>, std::function<int(Runtime&)>
    > work;
  };

  // multi-condition work handle: returns the indices of successors to run
  struct MultiCondition {

    template <typename C>
    MultiCondition(C&&);

    std::variant<
      std::function<SmallVector<int>()>, std::function<SmallVector<int>(Runtime&)>
    > work;
  };

  // module work handle: references an external composable graph
  struct Module {

    template <typename T>
    Module(T&);

    Graph& graph;
  };

  // async work handle
  struct Async {

    template <typename T>
    Async(T&&);

    std::function<void()> work;
  };

  // dependent-async work handle (no future); tracks completion via state
  struct DependentAsync {

    template <typename C>
    DependentAsync(C&&);

    std::function<void()> work;

    std::atomic<AsyncState> state {AsyncState::UNFINISHED};
  };

  using handle_t = std::variant<
    Placeholder,      // placeholder
    Static,           // static tasking
    Dynamic,          // dynamic tasking
    Condition,        // conditional tasking
    MultiCondition,   // multi-conditional tasking
    Module,           // composable tasking
    Async,            // async tasking
    DependentAsync    // dependent async tasking (no future)
  >;

  // semaphores this node acquires before and releases after running
  struct Semaphores {
    SmallVector<Semaphore*> to_acquire;
    SmallVector<Semaphore*> to_release;
  };

  public:

  // variant indices of handle_t, used as task-type tags
  constexpr static auto PLACEHOLDER     = get_index_v<Placeholder, handle_t>;
  constexpr static auto STATIC          = get_index_v<Static, handle_t>;
  constexpr static auto DYNAMIC         = get_index_v<Dynamic, handle_t>;
  constexpr static auto CONDITION       = get_index_v<Condition, handle_t>;
  constexpr static auto MULTI_CONDITION = get_index_v<MultiCondition, handle_t>;
  constexpr static auto MODULE          = get_index_v<Module, handle_t>;
  constexpr static auto ASYNC           = get_index_v<Async, handle_t>;
  constexpr static auto DEPENDENT_ASYNC = get_index_v<DependentAsync, handle_t>;

  Node() = default;

  template <typename... Args>
  Node(const std::string&, unsigned, Topology*, Node*, size_t, Args&&... args);

  ~Node();

  size_t num_successors() const;
  size_t num_dependents() const;
  size_t num_strong_dependents() const;
  size_t num_weak_dependents() const;

  const std::string& name() const;

  private:

  std::string _name;                       // user-assigned task name

  unsigned _priority {0};                  // task priority (smaller = higher)

  Topology* _topology {nullptr};           // topology of the running taskflow
  Node* _parent {nullptr};                 // parent node (e.g., subflow owner)

  void* _data {nullptr};                   // opaque user data (see Task::data)

  SmallVector<Node*> _successors;          // outgoing edges
  SmallVector<Node*> _dependents;          // incoming edges

  std::atomic<int> _state {0};             // bit-or of the state flags above
  std::atomic<size_t> _join_counter {0};   // remaining strong dependencies

  std::unique_ptr<Semaphores> _semaphores; // lazily allocated by Task::acquire/release

  handle_t _handle;                        // the work of this node

  void _precede(Node*);
  void _set_up_join_counter();

  bool _is_cancelled() const;
  bool _is_conditioner() const;
  bool _acquire_all(SmallVector<Node*>&);

  SmallVector<Node*> _release_all();
};
// ----------------------------------------------------------------------------
// Node Object Pool
// ----------------------------------------------------------------------------

/**
@private
Global object pool that recycles Node objects to amortize allocation cost;
`inline` gives a single definition across translation units.
*/
inline ObjectPool<Node> node_pool;
// ---------------------------------------------------------------------------- | |
// Definition for Node::Static | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename C> | |
Node::Static::Static(C&& c) : work {std::forward<C>(c)} { | |
} | |
// ---------------------------------------------------------------------------- | |
// Definition for Node::Dynamic | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename C> | |
Node::Dynamic::Dynamic(C&& c) : work {std::forward<C>(c)} { | |
} | |
// ---------------------------------------------------------------------------- | |
// Definition for Node::Condition | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename C> | |
Node::Condition::Condition(C&& c) : work {std::forward<C>(c)} { | |
} | |
// ---------------------------------------------------------------------------- | |
// Definition for Node::MultiCondition | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename C> | |
Node::MultiCondition::MultiCondition(C&& c) : work {std::forward<C>(c)} { | |
} | |
// ---------------------------------------------------------------------------- | |
// Definition for Node::Module | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename T> | |
inline Node::Module::Module(T& obj) : graph{ obj.graph() } { | |
} | |
// ---------------------------------------------------------------------------- | |
// Definition for Node::Async | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename C> | |
Node::Async::Async(C&& c) : work {std::forward<C>(c)} { | |
} | |
// ---------------------------------------------------------------------------- | |
// Definition for Node::DependentAsync | |
// ---------------------------------------------------------------------------- | |
// Constructor | |
template <typename C> | |
Node::DependentAsync::DependentAsync(C&& c) : work {std::forward<C>(c)} { | |
} | |
// ----------------------------------------------------------------------------
// Definition for Node
// ----------------------------------------------------------------------------

// Constructor
//   name         - user-visible task name
//   priority     - task priority (smaller value = higher priority)
//   topology     - topology of the running taskflow (may be nullptr)
//   parent       - parent node (may be nullptr)
//   join_counter - initial number of strong dependencies
//   args...      - forwarded to construct the work handle variant
template <typename... Args>
Node::Node(
  const std::string& name,
  unsigned priority,
  Topology* topology,
  Node* parent,
  size_t join_counter,
  Args&&... args
) :
  _name         {name},
  _priority     {priority},
  _topology     {topology},
  _parent       {parent},
  _join_counter {join_counter},
  _handle       {std::forward<Args>(args)...} {
}
//Node::Node(Args&&... args): _handle{std::forward<Args>(args)...} {
//}

// Destructor
// Recursively nested subflows would otherwise be destroyed through a chain
// of recursive Node destructor calls; to avoid stack overflow we flatten the
// entire subgraph hierarchy into one worklist and recycle nodes iteratively.
inline Node::~Node() {
  // this is to avoid stack overflow
  if(_handle.index() == DYNAMIC) {
    // using std::get_if instead of std::get makes this compatible
    // with older macOS versions
    // the result of std::get_if is guaranteed to be non-null
    // due to the index check above
    auto& subgraph = std::get_if<Dynamic>(&_handle)->subgraph;

    // seed the worklist with the direct children and detach them from
    // the subgraph so their destructors won't recurse
    std::vector<Node*> nodes;
    nodes.reserve(subgraph.size());

    std::move(
      subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes)
    );
    subgraph._nodes.clear();

    // breadth-style expansion: appending to `nodes` while scanning it pulls
    // every transitively nested subflow node into the same flat list
    size_t i = 0;

    while(i < nodes.size()) {

      if(nodes[i]->_handle.index() == DYNAMIC) {
        auto& sbg = std::get_if<Dynamic>(&(nodes[i]->_handle))->subgraph;
        std::move(
          sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes)
        );
        sbg._nodes.clear();
      }

      ++i;
    }

    //auto& np = Graph::_node_pool();
    // recycle every flattened node; each destructor now sees an empty subgraph
    for(i=0; i<nodes.size(); ++i) {
      node_pool.recycle(nodes[i]);
    }
  }
}
// Procedure: _precede
// Adds a directed edge this -> v by updating both adjacency lists.
inline void Node::_precede(Node* v) {
  v->_dependents.push_back(this);
  _successors.push_back(v);
}
// Function: num_successors
// Number of outgoing edges of this node.
inline size_t Node::num_successors() const {
  return this->_successors.size();
}
// Function: num_dependents
// Number of incoming edges of this node.
inline size_t Node::num_dependents() const {
  return this->_dependents.size();
}
// Function: num_weak_dependents
// Counts dependents that are (multi-)condition tasks; such edges are "weak"
// and do not contribute to the join counter.
inline size_t Node::num_weak_dependents() const {
  size_t count = 0;
  for(Node* dependent : _dependents) {
    if(dependent->_is_conditioner()) {
      ++count;
    }
  }
  return count;
}
// Function: num_strong_dependents
// Counts dependents that are not (multi-)condition tasks; such edges are
// "strong" and contribute to the join counter.
inline size_t Node::num_strong_dependents() const {
  size_t count = 0;
  for(Node* dependent : _dependents) {
    if(!dependent->_is_conditioner()) {
      ++count;
    }
  }
  return count;
}
// Function: name
// User-assigned name of this node.
inline const std::string& Node::name() const {
  return this->_name;
}
// Function: _is_conditioner | |
inline bool Node::_is_conditioner() const { | |
return _handle.index() == Node::CONDITION || | |
_handle.index() == Node::MULTI_CONDITION; | |
} | |
// Function: _is_cancelled | |
inline bool Node::_is_cancelled() const { | |
return _topology && _topology->_is_cancelled.load(std::memory_order_relaxed); | |
} | |
// Procedure: _set_up_join_counter | |
inline void Node::_set_up_join_counter() { | |
size_t c = 0; | |
for(auto p : _dependents) { | |
//if(p->_handle.index() == Node::CONDITION) { | |
if(p->_is_conditioner()) { | |
_state.fetch_or(Node::CONDITIONED, std::memory_order_relaxed); | |
} | |
else { | |
c++; | |
} | |
} | |
_join_counter.store(c, std::memory_order_release); | |
} | |
// Function: _acquire_all
// Tries to acquire every semaphore in _semaphores->to_acquire.
// On success returns true. If any acquisition fails, the node is parked on
// that semaphore's wait list (via _try_acquire_or_wait), all previously
// acquired semaphores are released in reverse order, any nodes woken by
// those releases are appended to `nodes`, and false is returned.
inline bool Node::_acquire_all(SmallVector<Node*>& nodes) {

  auto& to_acquire = _semaphores->to_acquire;

  for(size_t i = 0; i < to_acquire.size(); ++i) {
    if(!to_acquire[i]->_try_acquire_or_wait(this)) {
      // roll back: release semaphores [0, i) in reverse order
      // (j runs 1..i so that i-j walks i-1 down to 0)
      for(size_t j = 1; j <= i; ++j) {
        auto r = to_acquire[i-j]->_release();
        nodes.insert(std::end(nodes), std::begin(r), std::end(r));
      }
      return false;
    }
  }
  return true;
}
// Function: _release_all
// Releases every semaphore in _semaphores->to_release and returns the
// union of nodes woken up by those releases.
inline SmallVector<Node*> Node::_release_all() {

  auto& to_release = _semaphores->to_release;

  SmallVector<Node*> waked;
  for(size_t i = 0; i < to_release.size(); ++i) {
    auto r = to_release[i]->_release();
    waked.insert(waked.end(), r.begin(), r.end());
  }

  return waked;
}
// ---------------------------------------------------------------------------- | |
// Node Deleter | |
// ---------------------------------------------------------------------------- | |
/** | |
@private | |
*/ | |
struct NodeDeleter { | |
void operator ()(Node* ptr) { | |
node_pool.recycle(ptr); | |
} | |
}; | |
// ----------------------------------------------------------------------------
// Graph definition
// ----------------------------------------------------------------------------

// Destructor: recycles all owned nodes back to the pool.
inline Graph::~Graph() {
  _clear();
}
// Move constructor: steals the node list; `other` is left empty.
inline Graph::Graph(Graph&& other) :
  _nodes {std::move(other._nodes)} {
}
// Move assignment
// Recycles the current nodes and steals `other`'s node list, leaving
// `other` empty. The self-assignment guard prevents a self-move from
// recycling the nodes and then moving from the now-cleared list.
inline Graph& Graph::operator = (Graph&& other) {
  if(this != &other) {
    _clear();
    _nodes = std::move(other._nodes);
  }
  return *this;
}
// Procedure: clear
// Public wrapper: recycles all nodes and empties the graph.
inline void Graph::clear() {
  _clear();
}
// Procedure: clear | |
inline void Graph::_clear() { | |
for(auto node : _nodes) { | |
node_pool.recycle(node); | |
} | |
_nodes.clear(); | |
} | |
// Procedure: _clear_detached
// Removes and recycles only the nodes whose DETACHED state bit is set.
// std::partition moves the kept (non-detached) nodes to the front and
// returns the boundary; everything at or past `mid` is recycled and the
// list is shrunk to the kept prefix. Note that partition does not
// preserve the relative order of the kept nodes.
inline void Graph::_clear_detached() {

  auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) {
    return !(node->_state.load(std::memory_order_relaxed) & Node::DETACHED);
  });

  for(auto itr = mid; itr != _nodes.end(); ++itr) {
    node_pool.recycle(*itr);
  }
  _nodes.resize(std::distance(_nodes.begin(), mid));
}
// Procedure: _merge
// Transfers ownership of all nodes in `g` into this graph; `g` ends empty.
inline void Graph::_merge(Graph&& g) {
  _nodes.insert(_nodes.end(), g._nodes.begin(), g._nodes.end());
  g._nodes.clear();
}
// Function: _erase
// Removes `node` from the graph (if present) and recycles it to the pool;
// a node not owned by this graph is left untouched.
inline void Graph::_erase(Node* node) {
  auto itr = std::find(_nodes.begin(), _nodes.end(), node);
  if(itr == _nodes.end()) {
    return;
  }
  _nodes.erase(itr);
  node_pool.recycle(node);
}
// Function: size
// Number of nodes owned by this graph.
inline size_t Graph::size() const {
  return this->_nodes.size();
}
// Function: empty | |
inline bool Graph::empty() const { | |
return _nodes.empty(); | |
} | |
/** | |
@private | |
*/ | |
template <typename ...ArgsT> | |
Node* Graph::_emplace_back(ArgsT&&... args) { | |
_nodes.push_back(node_pool.animate(std::forward<ArgsT>(args)...)); | |
return _nodes.back(); | |
} | |
} // end of namespace tf. --------------------------------------------------- | |
/** | |
@file task.hpp | |
@brief task include file | |
*/ | |
namespace tf { | |
// ----------------------------------------------------------------------------
// Task Types
// ----------------------------------------------------------------------------

/**
@enum TaskType

@brief enumeration of all task types
*/
enum class TaskType : int {
  /** @brief placeholder task type (no work assigned) */
  PLACEHOLDER = 0,
  /** @brief static task type */
  STATIC,
  /** @brief dynamic (subflow) task type */
  DYNAMIC,
  /** @brief condition task type (covers multi-condition tasks as well) */
  CONDITION,
  /** @brief module task type */
  MODULE,
  /** @brief asynchronous task type (covers dependent-async tasks as well) */
  ASYNC,
  /** @brief undefined task type (for internal use only) */
  UNDEFINED
};
/**
@private
@brief array of all task types (used for iterating task types);
UNDEFINED is intentionally excluded since it is internal-only
*/
inline constexpr std::array<TaskType, 6> TASK_TYPES = {
  TaskType::PLACEHOLDER,
  TaskType::STATIC,
  TaskType::DYNAMIC,
  TaskType::CONDITION,
  TaskType::MODULE,
  TaskType::ASYNC,
};
/** | |
@brief convert a task type to a human-readable string | |
The name of each task type is the lowercase string of its characters.
@code{.cpp} | |
TaskType::PLACEHOLDER -> "placeholder" | |
TaskType::STATIC -> "static" | |
TaskType::DYNAMIC -> "subflow" | |
TaskType::CONDITION -> "condition" | |
TaskType::MODULE -> "module" | |
TaskType::ASYNC -> "async" | |
@endcode | |
*/ | |
// Maps a task type to its human-readable name; unknown values
// (including TaskType::UNDEFINED) map to "undefined".
inline const char* to_string(TaskType type) {
  switch(type) {
    case TaskType::PLACEHOLDER: return "placeholder";
    case TaskType::STATIC:      return "static";
    case TaskType::DYNAMIC:     return "subflow";
    case TaskType::CONDITION:   return "condition";
    case TaskType::MODULE:      return "module";
    case TaskType::ASYNC:       return "async";
    default:                    return "undefined";
  }
}
// ----------------------------------------------------------------------------
// Task Traits
// ----------------------------------------------------------------------------

/**
@brief determines if a callable is a dynamic task

A dynamic task is a callable object constructible from std::function<void(Subflow&)>.
The Runtime& exclusion disambiguates generic callables (e.g., generic lambdas)
that would otherwise be invocable with both Subflow& and Runtime&.
*/
template <typename C>
constexpr bool is_dynamic_task_v =
  std::is_invocable_r_v<void, C, Subflow&> &&
  !std::is_invocable_r_v<void, C, Runtime&>;
/**
@brief determines if a callable is a condition task

A condition task is a callable object constructible from std::function<int()>
or std::function<int(tf::Runtime&)>.
Dynamic tasks are excluded so a callable cannot classify as both.
*/
template <typename C>
constexpr bool is_condition_task_v =
  (std::is_invocable_r_v<int, C> || std::is_invocable_r_v<int, C, Runtime&>) &&
  !is_dynamic_task_v<C>;
/**
@brief determines if a callable is a multi-condition task

A multi-condition task is a callable object constructible from
std::function<tf::SmallVector<int>()> or
std::function<tf::SmallVector<int>(tf::Runtime&)>.
Dynamic tasks are excluded so a callable cannot classify as both.
*/
template <typename C>
constexpr bool is_multi_condition_task_v =
  (std::is_invocable_r_v<SmallVector<int>, C> ||
  std::is_invocable_r_v<SmallVector<int>, C, Runtime&>) &&
  !is_dynamic_task_v<C>;
/**
@brief determines if a callable is a static task

A static task is a callable object constructible from std::function<void()>
or std::function<void(tf::Runtime&)>.
All more specific task categories are excluded first, making static the
fallback classification (note int-returning callables are also invocable
with a discarded result, hence the condition-task exclusions).
*/
template <typename C>
constexpr bool is_static_task_v =
  (std::is_invocable_r_v<void, C> || std::is_invocable_r_v<void, C, Runtime&>) &&
  !is_condition_task_v<C> &&
  !is_multi_condition_task_v<C> &&
  !is_dynamic_task_v<C>;
// ----------------------------------------------------------------------------
// Task
// ----------------------------------------------------------------------------

/**
@class Task

@brief class to create a task handle over a node in a taskflow graph

A task is a wrapper over a node in a taskflow graph.
It provides a set of methods for users to access and modify the attributes of
the associated node in the taskflow graph.
A task is a very lightweight object (i.e., only storing a node pointer) that
can be trivially copied around,
and it does not own the lifetime of the associated node.
*/
class Task {

  friend class FlowBuilder;
  friend class Runtime;
  friend class Taskflow;
  friend class TaskView;
  friend class Executor;

  public:

  /**
  @brief constructs an empty task
  */
  Task() = default;

  /**
  @brief constructs the task with the copy of the other task
  */
  Task(const Task& other);

  /**
  @brief replaces the contents with a copy of the other task
  */
  Task& operator = (const Task&);

  /**
  @brief replaces the contents with a null pointer
  */
  Task& operator = (std::nullptr_t);

  /**
  @brief compares if two tasks are associated with the same graph node
  */
  bool operator == (const Task& rhs) const;

  /**
  @brief compares if two tasks are not associated with the same graph node
  */
  bool operator != (const Task& rhs) const;

  /**
  @brief queries the name of the task
  */
  const std::string& name() const;

  /**
  @brief queries the number of successors of the task
  */
  size_t num_successors() const;

  /**
  @brief queries the number of predecessors of the task
  */
  size_t num_dependents() const;

  /**
  @brief queries the number of strong dependents of the task
  */
  size_t num_strong_dependents() const;

  /**
  @brief queries the number of weak dependents of the task
  */
  size_t num_weak_dependents() const;

  /**
  @brief assigns a name to the task

  @param name a @std_string acceptable string

  @return @c *this
  */
  Task& name(const std::string& name);

  /**
  @brief assigns a callable

  @tparam C callable type

  @param callable callable to construct a task

  @return @c *this
  */
  template <typename C>
  Task& work(C&& callable);

  /**
  @brief creates a module task from a taskflow

  @tparam T object type
  @param object a custom object that defines @c T::graph() method

  @return @c *this
  */
  template <typename T>
  Task& composed_of(T& object);

  /**
  @brief adds precedence links from this to other tasks

  @tparam Ts parameter pack

  @param tasks one or multiple tasks

  @return @c *this
  */
  template <typename... Ts>
  Task& precede(Ts&&... tasks);

  /**
  @brief adds precedence links from other tasks to this

  @tparam Ts parameter pack

  @param tasks one or multiple tasks

  @return @c *this
  */
  template <typename... Ts>
  Task& succeed(Ts&&... tasks);

  /**
  @brief makes the task release this semaphore
  */
  Task& release(Semaphore& semaphore);

  /**
  @brief makes the task acquire this semaphore
  */
  Task& acquire(Semaphore& semaphore);

  /**
  @brief assigns pointer to user data

  @param data pointer to user data

  The following example shows how to attach user data to a task and
  run the task iteratively while changing the data value:

  @code{.cpp}
  tf::Executor executor;
  tf::Taskflow taskflow("attach data to a task");

  int data;

  // create a task and attach it the data
  auto A = taskflow.placeholder();
  A.data(&data).work([A](){
    auto d = *static_cast<int*>(A.data());
    std::cout << "data is " << d << std::endl;
  });

  // run the taskflow iteratively with changing data
  for(data = 0; data<10; data++){
    executor.run(taskflow).wait();
  }
  @endcode

  @return @c *this
  */
  Task& data(void* data);

  /**
  @brief assigns a priority value to the task

  A priority value can be one of the following three levels,
  tf::TaskPriority::HIGH (numerically equivalent to 0),
  tf::TaskPriority::NORMAL (numerically equivalent to 1), and
  tf::TaskPriority::LOW (numerically equivalent to 2).
  The smaller the priority value, the higher the priority.
  */
  Task& priority(TaskPriority p);

  /**
  @brief queries the priority value of the task
  */
  TaskPriority priority() const;

  /**
  @brief resets the task handle to null
  */
  void reset();

  /**
  @brief resets the associated work to a placeholder
  */
  void reset_work();

  /**
  @brief queries if the task handle is empty (i.e., does not point to
         any task node)
  */
  bool empty() const;

  /**
  @brief queries if the task has a work assigned
  */
  bool has_work() const;

  /**
  @brief applies a visitor callable to each successor of the task
  */
  template <typename V>
  void for_each_successor(V&& visitor) const;

  /**
  @brief applies a visitor callable to each dependent of the task
  */
  template <typename V>
  void for_each_dependent(V&& visitor) const;

  /**
  @brief obtains a hash value of the underlying node
  */
  size_t hash_value() const;

  /**
  @brief returns the task type
  */
  TaskType type() const;

  /**
  @brief dumps the task through an output stream
  */
  void dump(std::ostream& ostream) const;

  /**
  @brief queries pointer to user data
  */
  void* data() const;

  private:

  Task(Node*);

  // non-owning pointer to the underlying graph node
  Node* _node {nullptr};
};
// Constructor: wraps a (non-owning) node pointer.
inline Task::Task(Node* node) : _node {node} {
}
// Copy constructor: shallow copy of the node pointer (tasks do not own nodes).
inline Task::Task(const Task& rhs) : _node {rhs._node} {
}
// Function: precede
// Adds an edge from this task to every task in the pack
// (fold over the comma operator).
template <typename... Ts>
Task& Task::precede(Ts&&... tasks) {
  (_node->_precede(tasks._node), ...);
  //_precede(std::forward<Ts>(tasks)...);
  return *this;
}
// Function: succeed
// Adds an edge from every task in the pack to this task
// (fold over the comma operator).
template <typename... Ts>
Task& Task::succeed(Ts&&... tasks) {
  (tasks._node->_precede(_node), ...);
  //_succeed(std::forward<Ts>(tasks)...);
  return *this;
}
// Function: composed_of
// Replaces the node's work with a module handle referencing object.graph();
// the object must outlive the task.
template <typename T>
Task& Task::composed_of(T& object) {
  _node->_handle.emplace<Node::Module>(object);
  return *this;
}
// Operator =: rebinds this handle to the other task's node.
inline Task& Task::operator = (const Task& rhs) {
  _node = rhs._node;
  return *this;
}
// Operator =: resets this handle to null.
inline Task& Task::operator = (std::nullptr_t ptr) {
  _node = ptr;
  return *this;
}
// Operator ==: two tasks are equal when they wrap the same graph node.
inline bool Task::operator == (const Task& rhs) const {
  return this->_node == rhs._node;
}
// Operator !=: defined as the negation of operator ==.
inline bool Task::operator != (const Task& rhs) const {
  return !(*this == rhs);
}
// Function: name (setter)
// Assigns a user-visible name to the underlying node.
inline Task& Task::name(const std::string& name) {
  _node->_name = name;
  return *this;
}
// Function: acquire
// Registers a semaphore this task must acquire before it runs; the
// semaphore list is allocated lazily on first use.
inline Task& Task::acquire(Semaphore& s) {
  if(_node->_semaphores == nullptr) {
    _node->_semaphores = std::make_unique<Node::Semaphores>();
  }
  _node->_semaphores->to_acquire.push_back(&s);
  return *this;
}
// Function: release
// Registers a semaphore this task releases after it runs; the
// semaphore list is allocated lazily on first use.
inline Task& Task::release(Semaphore& s) {
  if(_node->_semaphores == nullptr) {
    _node->_semaphores = std::make_unique<Node::Semaphores>();
  }
  _node->_semaphores->to_release.push_back(&s);
  return *this;
}
// Procedure: reset
// Nulls the handle; the underlying node (if any) is untouched.
inline void Task::reset() {
  _node = nullptr;
}
// Procedure: reset_work
// Replaces the node's work with the placeholder (std::monostate) alternative.
inline void Task::reset_work() {
  _node->_handle.emplace<std::monostate>();
}
// Function: name (getter)
inline const std::string& Task::name() const {
  return _node->_name;
}
// Function: num_dependents — forwards to the underlying node.
inline size_t Task::num_dependents() const {
  return _node->num_dependents();
}
// Function: num_strong_dependents — forwards to the underlying node.
inline size_t Task::num_strong_dependents() const {
  return _node->num_strong_dependents();
}
// Function: num_weak_dependents — forwards to the underlying node.
inline size_t Task::num_weak_dependents() const {
  return _node->num_weak_dependents();
}
// Function: num_successors
// Forwards to the node's successor count.
inline size_t Task::num_successors() const {
  return _node->num_successors();
}
// Function: empty
// True when this handle is not bound to any node.
inline bool Task::empty() const {
  return _node == nullptr;
}
// Function: has_work | |
inline bool Task::has_work() const { | |
return _node ? _node->_handle.index() != 0 : false; | |
} | |
// Function: task_type
// Maps the node's variant index to the public TaskType enumeration.
// Note: MULTI_CONDITION is deliberately reported as CONDITION, and
// DEPENDENT_ASYNC as ASYNC — the public API does not distinguish them.
inline TaskType Task::type() const {
  switch(_node->_handle.index()) {
    case Node::PLACEHOLDER:     return TaskType::PLACEHOLDER;
    case Node::STATIC:          return TaskType::STATIC;
    case Node::DYNAMIC:         return TaskType::DYNAMIC;
    case Node::CONDITION:       return TaskType::CONDITION;
    case Node::MULTI_CONDITION: return TaskType::CONDITION;
    case Node::MODULE:          return TaskType::MODULE;
    case Node::ASYNC:           return TaskType::ASYNC;
    case Node::DEPENDENT_ASYNC: return TaskType::ASYNC;
    default:                    return TaskType::UNDEFINED;
  }
}
// Function: for_each_successor | |
template <typename V> | |
void Task::for_each_successor(V&& visitor) const { | |
for(size_t i=0; i<_node->_successors.size(); ++i) { | |
visitor(Task(_node->_successors[i])); | |
} | |
} | |
// Function: for_each_dependent | |
template <typename V> | |
void Task::for_each_dependent(V&& visitor) const { | |
for(size_t i=0; i<_node->_dependents.size(); ++i) { | |
visitor(Task(_node->_dependents[i])); | |
} | |
} | |
// Function: hash_value
// Hashes the node pointer; two handles to the same node hash equally.
inline size_t Task::hash_value() const {
  return std::hash<Node*>{}(_node);
}
// Procedure: dump | |
inline void Task::dump(std::ostream& os) const { | |
os << "task "; | |
if(name().empty()) os << _node; | |
else os << name(); | |
os << " [type=" << to_string(type()) << ']'; | |
} | |
// Function: work
// Assigns a callable to this task. The callable's signature, inspected
// by the is_*_task_v traits, selects the node variant at compile time
// (static, dynamic, condition, or multi-condition task).
template <typename C>
Task& Task::work(C&& c) {
  if constexpr(is_static_task_v<C>) {
    _node->_handle.emplace<Node::Static>(std::forward<C>(c));
  }
  else if constexpr(is_dynamic_task_v<C>) {
    _node->_handle.emplace<Node::Dynamic>(std::forward<C>(c));
  }
  else if constexpr(is_condition_task_v<C>) {
    _node->_handle.emplace<Node::Condition>(std::forward<C>(c));
  }
  else if constexpr(is_multi_condition_task_v<C>) {
    _node->_handle.emplace<Node::MultiCondition>(std::forward<C>(c));
  }
  else {
    // Reject any callable that matches none of the task signatures.
    static_assert(dependent_false_v<C>, "invalid task callable");
  }
  return *this;
}
// Function: data
// Returns the opaque user data pointer attached to the node.
inline void* Task::data() const {
  return _node->_data;
}
// Function: data
// Attaches an opaque user data pointer to the node (not owned).
inline Task& Task::data(void* data) {
  _node->_data = data;
  return *this;
}
// Function: priority
// Stores the priority as its underlying unsigned value on the node.
inline Task& Task::priority(TaskPriority p) {
  _node->_priority = static_cast<unsigned>(p);
  return *this;
}
// Function: priority
// Converts the node's stored unsigned priority back to TaskPriority.
inline TaskPriority Task::priority() const {
  return static_cast<TaskPriority>(_node->_priority);
}
// ---------------------------------------------------------------------------- | |
// global ostream | |
// ---------------------------------------------------------------------------- | |
/**
@brief overload of ostream inserter operator for Task
*/
inline std::ostream& operator << (std::ostream& os, const Task& task) {
  // Delegates to Task::dump, which prints "task <name> [type=...]".
  task.dump(os);
  return os;
}
// ---------------------------------------------------------------------------- | |
// Task View | |
// ---------------------------------------------------------------------------- | |
/**
@class TaskView

@brief class to access task information from the observer interface
*/
class TaskView {

  friend class Executor;

  public:

    /**
    @brief queries the name of the task
    */
    const std::string& name() const;

    /**
    @brief queries the number of successors of the task
    */
    size_t num_successors() const;

    /**
    @brief queries the number of predecessors of the task
    */
    size_t num_dependents() const;

    /**
    @brief queries the number of strong dependents of the task
    */
    size_t num_strong_dependents() const;

    /**
    @brief queries the number of weak dependents of the task
    */
    size_t num_weak_dependents() const;

    /**
    @brief applies a visitor callable to each successor of the task
    */
    template <typename V>
    void for_each_successor(V&& visitor) const;

    /**
    @brief applies a visitor callable to each dependent of the task
    */
    template <typename V>
    void for_each_dependent(V&& visitor) const;

    /**
    @brief queries the task type
    */
    TaskType type() const;

    /**
    @brief obtains a hash value of the underlying node
    */
    size_t hash_value() const;

  private:

    // Constructed only by the executor; holds a non-owning reference,
    // so a view must not outlive the node it observes.
    TaskView(const Node&);
    TaskView(const TaskView&) = default;

    const Node& _node;
};
// Constructor
// Binds the view to the observed node (non-owning reference).
inline TaskView::TaskView(const Node& node) : _node {node} {
}
// Function: name
// Returns the name stored on the observed node.
inline const std::string& TaskView::name() const {
  return _node._name;
}
// Function: num_dependents
// Forwards to the observed node's dependent count.
inline size_t TaskView::num_dependents() const {
  return _node.num_dependents();
}
// Function: num_strong_dependents
// Forwards to the observed node's strong dependent count.
inline size_t TaskView::num_strong_dependents() const {
  return _node.num_strong_dependents();
}
// Function: num_weak_dependents
// Forwards to the observed node's weak dependent count.
inline size_t TaskView::num_weak_dependents() const {
  return _node.num_weak_dependents();
}
// Function: num_successors
// Forwards to the observed node's successor count.
inline size_t TaskView::num_successors() const {
  return _node.num_successors();
}
// Function: type
// Maps the node's variant index to TaskType, mirroring Task::type:
// MULTI_CONDITION reports as CONDITION and DEPENDENT_ASYNC as ASYNC.
inline TaskType TaskView::type() const {
  switch(_node._handle.index()) {
    case Node::PLACEHOLDER:     return TaskType::PLACEHOLDER;
    case Node::STATIC:          return TaskType::STATIC;
    case Node::DYNAMIC:         return TaskType::DYNAMIC;
    case Node::CONDITION:       return TaskType::CONDITION;
    case Node::MULTI_CONDITION: return TaskType::CONDITION;
    case Node::MODULE:          return TaskType::MODULE;
    case Node::ASYNC:           return TaskType::ASYNC;
    case Node::DEPENDENT_ASYNC: return TaskType::ASYNC;
    default:                    return TaskType::UNDEFINED;
  }
}
// Function: hash_value
// Hashes the address of the observed node; consistent with
// std::hash<tf::Task> on a handle to the same node.
inline size_t TaskView::hash_value() const {
  return std::hash<const Node*>{}(&_node);
}
// Function: for_each_successor | |
template <typename V> | |
void TaskView::for_each_successor(V&& visitor) const { | |
for(size_t i=0; i<_node._successors.size(); ++i) { | |
visitor(TaskView(*_node._successors[i])); | |
} | |
} | |
// Function: for_each_dependent | |
template <typename V> | |
void TaskView::for_each_dependent(V&& visitor) const { | |
for(size_t i=0; i<_node._dependents.size(); ++i) { | |
visitor(TaskView(*_node._dependents[i])); | |
} | |
} | |
} // end of namespace tf. --------------------------------------------------- | |
namespace std {

/**
@struct hash

@brief hash specialization for std::hash<tf::Task>
*/
template <>
struct hash<tf::Task> {
  // Delegates to Task::hash_value (hash of the underlying node pointer).
  auto operator() (const tf::Task& task) const noexcept {
    return task.hash_value();
  }
};

/**
@struct hash

@brief hash specialization for std::hash<tf::TaskView>
*/
template <>
struct hash<tf::TaskView> {
  // Delegates to TaskView::hash_value (hash of the viewed node's address).
  auto operator() (const tf::TaskView& task_view) const noexcept {
    return task_view.hash_value();
  }
};

}  // end of namespace std ----------------------------------------------------
// 2019/02/09 - created by Tsung-Wei Huang | |
// - modified the event count from Eigen | |
#include <iostream> | |
#include <vector> | |
#include <cstdlib> | |
#include <cstdio> | |
#include <atomic> | |
#include <memory> | |
#include <deque> | |
#include <mutex> | |
#include <condition_variable> | |
#include <thread> | |
#include <algorithm> | |
#include <numeric> | |
#include <cassert> | |
// This file is part of Eigen, a lightweight C++ template library | |
// for linear algebra. | |
// | |
// Copyright (C) 2016 Dmitry Vyukov <[email protected]> | |
// | |
// This Source Code Form is subject to the terms of the Mozilla | |
// Public License v. 2.0. If a copy of the MPL was not distributed | |
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. | |
namespace tf { | |
// Notifier allows to wait for arbitrary predicates in non-blocking | |
// algorithms. Think of condition variable, but wait predicate does not need to | |
// be protected by a mutex. Usage: | |
// Waiting thread does: | |
// | |
// if (predicate) | |
// return act(); | |
// Notifier::Waiter& w = waiters[my_index]; | |
// ec.prepare_wait(&w); | |
// if (predicate) { | |
// ec.cancel_wait(&w); | |
// return act(); | |
// } | |
// ec.commit_wait(&w); | |
// | |
// Notifying thread does: | |
// | |
// predicate = true; | |
// ec.notify(true); | |
// | |
// notify is cheap if there are no waiting threads. prepare_wait/commit_wait are not
// cheap, but they are executed only if the preceding predicate check has
// failed.
// | |
// Algorithm outline:
// There are two main variables: predicate (managed by user) and _state. | |
// Operation closely resembles Dekker mutual algorithm: | |
// https://en.wikipedia.org/wiki/Dekker%27s_algorithm | |
// Waiting thread sets _state then checks predicate, Notifying thread sets | |
// predicate then checks _state. Due to seq_cst fences in between these | |
// operations it is guaranteed that either the waiter will see the predicate change
// and won't block, or notifying thread will see _state change and will unblock | |
// the waiter, or both. But it can't happen that both threads don't see each | |
// other changes, which would lead to deadlock. | |
class Notifier {

  friend class Executor;

  public:

  // Per-thread wait node. Each worker owns one Waiter for its lifetime;
  // committed waiters are linked through `next` into the lock-free stack
  // encoded in the low bits of _state.
  struct Waiter {
    std::atomic<Waiter*> next;   // next waiter on the committed-wait stack
    std::mutex mu;               // protects `state` for park/unpark handshake
    std::condition_variable cv;  // the waiter blocks on this in _park
    uint64_t epoch;              // _state snapshot taken in prepare_wait
    unsigned state;              // kNotSignaled / kWaiting / kSignaled
    enum {
      kNotSignaled,
      kWaiting,
      kSignaled,
    };
  };

  // N is the number of waiter slots (one per worker). The stack encoding
  // stores waiter indices in kStackBits bits, hence the size assertion.
  explicit Notifier(size_t N) : _waiters{N} {
    assert(_waiters.size() < (1 << kWaiterBits) - 1);
    // Initialize epoch to something close to overflow to test overflow.
    _state = kStackMask | (kEpochMask - kEpochInc * _waiters.size() * 2);
  }

  ~Notifier() {
    // Ensure there are no waiters.
    assert((_state.load() & (kStackMask | kWaiterMask)) == kStackMask);
  }

  // prepare_wait prepares for waiting.
  // After calling this function the thread must re-check the wait predicate
  // and call either cancel_wait or commit_wait passing the same Waiter object.
  void prepare_wait(Waiter* w) {
    w->epoch = _state.fetch_add(kWaiterInc, std::memory_order_relaxed);
    // seq_cst fence pairs with the fence in notify(): either the waiter
    // sees the predicate change, or the notifier sees the waiter count.
    std::atomic_thread_fence(std::memory_order_seq_cst);
  }

  // commit_wait commits waiting: moves this thread from the prewait
  // counter onto the waiter stack and blocks until unparked.
  void commit_wait(Waiter* w) {
    w->state = Waiter::kNotSignaled;
    // Modification epoch of this waiter.
    uint64_t epoch =
        (w->epoch & kEpochMask) +
        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
    uint64_t state = _state.load(std::memory_order_seq_cst);
    for (;;) {
      if (int64_t((state & kEpochMask) - epoch) < 0) {
        // The preceding waiter has not decided on its fate. Wait until it
        // calls either cancel_wait or commit_wait, or is notified.
        std::this_thread::yield();
        state = _state.load(std::memory_order_seq_cst);
        continue;
      }
      // We've already been notified.
      if (int64_t((state & kEpochMask) - epoch) > 0) return;
      // Remove this thread from prewait counter and add it to the waiter list.
      assert((state & kWaiterMask) != 0);
      uint64_t newstate = state - kWaiterInc + kEpochInc;
      //newstate = (newstate & ~kStackMask) | (w - &_waiters[0]);
      newstate = static_cast<uint64_t>((newstate & ~kStackMask) | static_cast<uint64_t>(w - &_waiters[0]));
      if ((state & kStackMask) == kStackMask)
        w->next.store(nullptr, std::memory_order_relaxed);
      else
        w->next.store(&_waiters[state & kStackMask], std::memory_order_relaxed);
      if (_state.compare_exchange_weak(state, newstate,
                                       std::memory_order_release))
        break;
    }
    _park(w);
  }

  // cancel_wait cancels effects of the previous prepare_wait call.
  void cancel_wait(Waiter* w) {
    uint64_t epoch =
        (w->epoch & kEpochMask) +
        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
    uint64_t state = _state.load(std::memory_order_relaxed);
    for (;;) {
      if (int64_t((state & kEpochMask) - epoch) < 0) {
        // The preceding waiter has not decided on its fate. Wait until it
        // calls either cancel_wait or commit_wait, or is notified.
        std::this_thread::yield();
        state = _state.load(std::memory_order_relaxed);
        continue;
      }
      // We've already been notified.
      if (int64_t((state & kEpochMask) - epoch) > 0) return;
      // Remove this thread from prewait counter.
      assert((state & kWaiterMask) != 0);
      if (_state.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
                                       std::memory_order_relaxed))
        return;
    }
  }

  // notify wakes one or all waiting threads.
  // Must be called after changing the associated wait predicate.
  void notify(bool all) {
    // Pairs with the seq_cst fence in prepare_wait (Dekker-style protocol).
    std::atomic_thread_fence(std::memory_order_seq_cst);
    uint64_t state = _state.load(std::memory_order_acquire);
    for (;;) {
      // Easy case: no waiters.
      if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
        return;
      uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
      uint64_t newstate;
      if (all) {
        // Reset prewait counter and empty wait list.
        newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
      } else if (waiters) {
        // There is a thread in pre-wait state, unblock it.
        newstate = state + kEpochInc - kWaiterInc;
      } else {
        // Pop a waiter from list and unpark it.
        Waiter* w = &_waiters[state & kStackMask];
        Waiter* wnext = w->next.load(std::memory_order_relaxed);
        uint64_t next = kStackMask;
        //if (wnext != nullptr) next = wnext - &_waiters[0];
        if (wnext != nullptr) next = static_cast<uint64_t>(wnext - &_waiters[0]);
        // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
        // can't happen because a waiter is re-pushed onto the stack only after
        // it was in the pre-wait state which inevitably leads to epoch
        // increment.
        newstate = (state & kEpochMask) + next;
      }
      if (_state.compare_exchange_weak(state, newstate,
                                       std::memory_order_acquire)) {
        if (!all && waiters) return; // unblocked pre-wait thread
        if ((state & kStackMask) == kStackMask) return;
        Waiter* w = &_waiters[state & kStackMask];
        // notify(true) unparks the whole chain; notify(false) detaches one.
        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
        _unpark(w);
        return;
      }
    }
  }

  // notify n workers (all of them when n covers every waiter slot)
  void notify_n(size_t n) {
    if(n >= _waiters.size()) {
      notify(true);
    }
    else {
      for(size_t k=0; k<n; ++k) {
        notify(false);
      }
    }
  }

  // number of waiter slots (== number of workers)
  size_t size() const {
    return _waiters.size();
  }

  private:

  // _state layout:
  // - low kStackBits is a stack of waiters committed wait.
  // - next kWaiterBits is count of waiters in prewait state.
  // - next kEpochBits is modification counter.
  static const uint64_t kStackBits = 16;
  static const uint64_t kStackMask = (1ull << kStackBits) - 1;
  static const uint64_t kWaiterBits = 16;
  static const uint64_t kWaiterShift = 16;
  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
                                      << kWaiterShift;
  static const uint64_t kWaiterInc = 1ull << kWaiterBits;
  static const uint64_t kEpochBits = 32;
  static const uint64_t kEpochShift = 32;
  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
  static const uint64_t kEpochInc = 1ull << kEpochShift;

  std::atomic<uint64_t> _state;
  std::vector<Waiter> _waiters;

  // Blocks the calling thread until its waiter is marked kSignaled.
  void _park(Waiter* w) {
    std::unique_lock<std::mutex> lock(w->mu);
    while (w->state != Waiter::kSignaled) {
      w->state = Waiter::kWaiting;
      w->cv.wait(lock);
    }
  }

  // Signals every waiter on the chain starting at `waiters`, notifying
  // the condition variable only for threads already parked.
  void _unpark(Waiter* waiters) {
    Waiter* next = nullptr;
    for (Waiter* w = waiters; w; w = next) {
      next = w->next.load(std::memory_order_relaxed);
      unsigned state;
      {
        std::unique_lock<std::mutex> lock(w->mu);
        state = w->state;
        w->state = Waiter::kSignaled;
      }
      // Avoid notifying if it wasn't waiting.
      if (state == Waiter::kWaiting) w->cv.notify_one();
    }
  }

};
} // namespace tf ------------------------------------------------------------ | |
/** | |
@file worker.hpp | |
@brief worker include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Class Definition: Worker | |
// ---------------------------------------------------------------------------- | |
/** | |
@class Worker | |
@brief class to create a worker in an executor | |
The class is primarily used by the executor to perform the work-stealing algorithm.
Users can access a worker object and alter its property | |
(e.g., changing the thread affinity in a POSIX-like system) | |
using tf::WorkerInterface. | |
*/ | |
class Worker {

  friend class Executor;
  friend class WorkerView;

  public:

    /**
    @brief queries the worker id associated with its parent executor

    A worker id is an unsigned integer in the range <tt>[0, N)</tt>,
    where @c N is the number of workers spawned at the construction
    time of the executor.
    */
    inline size_t id() const { return _id; }

    /**
    @brief acquires a pointer access to the underlying thread
    */
    inline std::thread* thread() const { return _thread; }

    /**
    @brief queries the size of the queue (i.e., number of enqueued tasks to
           run) associated with the worker
    */
    inline size_t queue_size() const { return _wsq.size(); }

    /**
    @brief queries the current capacity of the queue
    */
    inline size_t queue_capacity() const { return static_cast<size_t>(_wsq.capacity()); }

  private:

    size_t _id;                  // index of this worker within the executor
    size_t _vtm;                 // NOTE(review): presumably the victim index for stealing — confirm in Executor
    Executor* _executor;         // parent executor (non-owning)
    std::thread* _thread;        // underlying thread (non-owning)
    Notifier::Waiter* _waiter;   // this worker's wait node in the executor's Notifier
    std::default_random_engine _rdgen { std::random_device{}() };  // per-worker RNG
    TaskQueue<Node*> _wsq;       // this worker's work-stealing queue
    Node* _cache;                // NOTE(review): appears to cache a next node to run — confirm in Executor
};
// ---------------------------------------------------------------------------- | |
// Class Definition: PerThreadWorker | |
// ---------------------------------------------------------------------------- | |
/** | |
@private | |
*/ | |
//struct PerThreadWorker { | |
// | |
// Worker* worker; | |
// | |
// PerThreadWorker() : worker {nullptr} {} | |
// | |
// PerThreadWorker(const PerThreadWorker&) = delete; | |
// PerThreadWorker(PerThreadWorker&&) = delete; | |
// | |
// PerThreadWorker& operator = (const PerThreadWorker&) = delete; | |
// PerThreadWorker& operator = (PerThreadWorker&&) = delete; | |
//}; | |
/** | |
@private | |
*/ | |
//inline PerThreadWorker& this_worker() { | |
// thread_local PerThreadWorker worker; | |
// return worker; | |
//} | |
// ---------------------------------------------------------------------------- | |
// Class Definition: WorkerView | |
// ---------------------------------------------------------------------------- | |
/** | |
@class WorkerView | |
@brief class to create an immutable view of a worker in an executor | |
An executor keeps a set of internal worker threads to run tasks. | |
A worker view provides users an immutable interface to observe | |
when a worker runs a task, and the view object is only accessible | |
from an observer derived from tf::ObserverInterface. | |
*/ | |
class WorkerView {

  friend class Executor;

  public:

    /**
    @brief queries the worker id associated with its parent executor

    A worker id is an unsigned integer in the range <tt>[0, N)</tt>,
    where @c N is the number of workers spawned at the construction
    time of the executor.
    */
    size_t id() const;

    /**
    @brief queries the size of the queue (i.e., number of pending tasks to
           run) associated with the worker
    */
    size_t queue_size() const;

    /**
    @brief queries the current capacity of the queue
    */
    size_t queue_capacity() const;

  private:

    // Constructed only by the executor; holds a non-owning reference,
    // so a view must not outlive the worker it observes.
    WorkerView(const Worker&);
    WorkerView(const WorkerView&) = default;

    const Worker& _worker;

};
// Constructor
// Binds the view to the observed worker (non-owning reference).
inline WorkerView::WorkerView(const Worker& w) : _worker{w} {
}
// function: id
// Returns the observed worker's index within its executor.
inline size_t WorkerView::id() const {
  return _worker._id;
}
// Function: queue_size
// Forwards to the worker's work-stealing queue size.
inline size_t WorkerView::queue_size() const {
  return _worker._wsq.size();
}
// Function: queue_capacity
// Forwards to the worker's work-stealing queue capacity.
inline size_t WorkerView::queue_capacity() const {
  return static_cast<size_t>(_worker._wsq.capacity());
}
// ---------------------------------------------------------------------------- | |
// Class Definition: WorkerInterface | |
// ---------------------------------------------------------------------------- | |
/** | |
@class WorkerInterface | |
@brief class to configure worker behavior in an executor | |
The tf::WorkerInterface class lets users interact with the executor | |
to customize the worker behavior, | |
such as calling custom methods before and after a worker enters and leaves | |
the loop. | |
When you create an executor, it spawns a set of workers to run tasks. | |
The interaction between the executor and its spawned workers looks like | |
the following: | |
for(size_t n=0; n<num_workers; n++) { | |
create_thread([](Worker& worker) | |
// pre-processing executor-specific worker information | |
// ... | |
// enter the scheduling loop | |
// Here, WorkerInterface::scheduler_prologue is invoked, if any | |
while(1) { | |
perform_work_stealing_algorithm(); | |
if(stop) { | |
break; | |
} | |
} | |
// leaves the scheduling loop and joins this worker thread | |
// Here, WorkerInterface::scheduler_epilogue is invoked, if any | |
); | |
} | |
@note | |
Methods defined in tf::WorkerInterface are not thread-safe and may be
invoked by multiple workers concurrently.
*/ | |
class WorkerInterface {

  public:

  /**
  @brief default destructor
  */
  virtual ~WorkerInterface() = default;

  /**
  @brief method to call before a worker enters the scheduling loop
  @param worker a reference to the worker

  The method is called by the constructor of an executor.
  */
  virtual void scheduler_prologue(Worker& worker) = 0;

  /**
  @brief method to call after a worker leaves the scheduling loop
  @param worker a reference to the worker
  @param ptr a pointer to the exception thrown by the scheduling loop
             (null when the loop exited normally)

  The method is called by the constructor of an executor.
  */
  virtual void scheduler_epilogue(Worker& worker, std::exception_ptr ptr) = 0;

};
/** | |
@brief helper function to create an instance derived from tf::WorkerInterface | |
@tparam T type derived from tf::WorkerInterface | |
@tparam ArgsT argument types to construct @c T | |
@param args arguments to forward to the constructor of @c T | |
*/ | |
template <typename T, typename... ArgsT> | |
std::shared_ptr<T> make_worker_interface(ArgsT&&... args) { | |
static_assert( | |
std::is_base_of_v<WorkerInterface, T>, | |
"T must be derived from WorkerInterface" | |
); | |
return std::make_shared<T>(std::forward<ArgsT>(args)...); | |
} | |
}  // end of namespace tf -----------------------------------------------------
/** | |
@file observer.hpp | |
@brief observer include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// timeline data structure | |
// ---------------------------------------------------------------------------- | |
/** | |
@brief default time point type of observers | |
*/ | |
using observer_stamp_t = std::chrono::time_point<std::chrono::steady_clock>; | |
/** | |
@private | |
*/ | |
/**
@private
One recorded task execution: its name, type, and start/end timestamps.
*/
struct Segment {

  std::string name;       // name of the executed task
  TaskType type;          // type of the executed task
  observer_stamp_t beg;   // execution start time
  observer_stamp_t end;   // execution end time

  // Serialization hooks; the Archiver type is supplied by the caller.
  template <typename Archiver>
  auto save(Archiver& ar) const {
    return ar(name, type, beg, end);
  }

  template <typename Archiver>
  auto load(Archiver& ar) {
    return ar(name, type, beg, end);
  }

  Segment() = default;

  Segment(
    const std::string& n, TaskType t, observer_stamp_t b, observer_stamp_t e
  ) : name {n}, type {t}, beg {b}, end {e} {
  }

  // duration of this segment
  auto span() const {
    return end-beg;
  }
};
/** | |
@private | |
*/ | |
/**
@private
Move-only record of all segments observed against a common time origin.
*/
struct Timeline {

  size_t uid;                // unique id of this timeline
  observer_stamp_t origin;   // time origin that segment stamps are relative to
  // NOTE(review): three-level nesting — presumably [observer][worker][segment];
  // confirm against the profiler code that fills it.
  std::vector<std::vector<std::vector<Segment>>> segments;

  Timeline() = default;

  // Move-only: copying a full timeline is disallowed.
  Timeline(const Timeline& rhs) = delete;
  Timeline(Timeline&& rhs) = default;

  Timeline& operator = (const Timeline& rhs) = delete;
  Timeline& operator = (Timeline&& rhs) = default;

  // Serialization hooks; the Archiver type is supplied by the caller.
  template <typename Archiver>
  auto save(Archiver& ar) const {
    return ar(uid, origin, segments);
  }

  template <typename Archiver>
  auto load(Archiver& ar) {
    return ar(uid, origin, segments);
  }
};
/** | |
@private | |
*/ | |
/**
@private
Move-only aggregate of timelines collected from executors.
*/
struct ProfileData {

  std::vector<Timeline> timelines;

  ProfileData() = default;

  // Move-only: copying the whole profile is disallowed.
  ProfileData(const ProfileData& rhs) = delete;
  ProfileData(ProfileData&& rhs) = default;

  ProfileData& operator = (const ProfileData& rhs) = delete;
  ProfileData& operator = (ProfileData&&) = default;

  // Serialization hooks; the Archiver type is supplied by the caller.
  template <typename Archiver>
  auto save(Archiver& ar) const {
    return ar(timelines);
  }

  template <typename Archiver>
  auto load(Archiver& ar) {
    return ar(timelines);
  }
};
// ---------------------------------------------------------------------------- | |
// observer interface | |
// ---------------------------------------------------------------------------- | |
/** | |
@class: ObserverInterface | |
@brief class to derive an executor observer | |
The tf::ObserverInterface class allows users to define custom methods to monitor | |
the behaviors of an executor. This is particularly useful when you want to | |
inspect the performance of an executor and visualize when each thread | |
participates in the execution of a task. | |
To prevent users from direct access to the internal threads and tasks, | |
tf::ObserverInterface provides immutable wrappers, | |
tf::WorkerView and tf::TaskView, over workers and tasks. | |
Please refer to tf::WorkerView and tf::TaskView for details. | |
Example usage: | |
@code{.cpp} | |
struct MyObserver : public tf::ObserverInterface { | |
MyObserver(const std::string& name) { | |
std::cout << "constructing observer " << name << '\n'; | |
} | |
void set_up(size_t num_workers) override final { | |
std::cout << "setting up observer with " << num_workers << " workers\n"; | |
} | |
void on_entry(WorkerView w, tf::TaskView tv) override final { | |
std::ostringstream oss; | |
oss << "worker " << w.id() << " ready to run " << tv.name() << '\n'; | |
std::cout << oss.str(); | |
} | |
void on_exit(WorkerView w, tf::TaskView tv) override final { | |
std::ostringstream oss; | |
oss << "worker " << w.id() << " finished running " << tv.name() << '\n'; | |
std::cout << oss.str(); | |
} | |
}; | |
tf::Taskflow taskflow; | |
tf::Executor executor; | |
// insert tasks into taskflow | |
// ... | |
// create a custom observer | |
std::shared_ptr<MyObserver> observer = executor.make_observer<MyObserver>("MyObserver"); | |
// run the taskflow | |
executor.run(taskflow).wait(); | |
@endcode | |
*/ | |
class ObserverInterface {

  public:

  /**
  @brief virtual destructor
  */
  virtual ~ObserverInterface() = default;

  /**
  @brief constructor-like method to call when the executor observer is fully created
  @param num_workers the number of the worker threads in the executor
  */
  virtual void set_up(size_t num_workers) = 0;

  /**
  @brief method to call before a worker thread executes a closure
  @param wv an immutable view of this worker thread
  @param task_view a constant wrapper object to the task
  */
  virtual void on_entry(WorkerView wv, TaskView task_view) = 0;

  /**
  @brief method to call after a worker thread executed a closure
  @param wv an immutable view of this worker thread
  @param task_view a constant wrapper object to the task
  */
  virtual void on_exit(WorkerView wv, TaskView task_view) = 0;
};
// ---------------------------------------------------------------------------- | |
// ChromeObserver definition | |
// ---------------------------------------------------------------------------- | |
/** | |
@class: ChromeObserver | |
@brief class to create an observer based on Chrome tracing format | |
A tf::ChromeObserver inherits tf::ObserverInterface and defines methods to dump | |
the observed thread activities into a format that can be visualized through | |
@ChromeTracing. | |
@code{.cpp} | |
tf::Taskflow taskflow; | |
tf::Executor executor; | |
// insert tasks into taskflow | |
// ... | |
// create a custom observer | |
std::shared_ptr<tf::ChromeObserver> observer = executor.make_observer<tf::ChromeObserver>(); | |
// run the taskflow | |
executor.run(taskflow).wait(); | |
// dump the thread activities to a chrome-tracing format. | |
observer->dump(std::cout); | |
@endcode | |
*/ | |
class ChromeObserver : public ObserverInterface {

  friend class Executor;

  // data structure to record each task execution
  struct Segment {

    std::string name;        // task name shown in the trace
    observer_stamp_t beg;    // execution start time
    observer_stamp_t end;    // execution end time

    Segment(
      const std::string& n,
      observer_stamp_t b,
      observer_stamp_t e
    );
  };

  // data structure to store the entire execution timeline
  struct Timeline {
    observer_stamp_t origin;                            // common time origin
    std::vector<std::vector<Segment>> segments;         // per-worker segments
    std::vector<std::stack<observer_stamp_t>> stacks;   // per-worker on_entry stamps
  };

  public:

    /**
    @brief dumps the timelines into a @ChromeTracing format through
           an output stream
    */
    void dump(std::ostream& ostream) const;

    /**
    @brief dumps the timelines into a @ChromeTracing format
    */
    inline std::string dump() const;

    /**
    @brief clears the timeline data
    */
    inline void clear();

    /**
    @brief queries the number of tasks observed
    */
    inline size_t num_tasks() const;

  private:

    inline void set_up(size_t num_workers) override final;
    inline void on_entry(WorkerView w, TaskView task_view) override final;
    inline void on_exit(WorkerView w, TaskView task_view) override final;

    Timeline _timeline;
};
// constructor
// Records one observed task execution interval.
inline ChromeObserver::Segment::Segment(
  const std::string& n, observer_stamp_t b, observer_stamp_t e
) :
  name {n}, beg {b}, end {e} {
}
// Procedure: set_up
// Sizes per-worker storage, pre-reserves segment space, and records the
// trace origin timestamp.
inline void ChromeObserver::set_up(size_t num_workers) {
  _timeline.segments.resize(num_workers);
  _timeline.stacks.resize(num_workers);
  for(auto& worker_segments : _timeline.segments) {
    worker_segments.reserve(32);
  }
  _timeline.origin = observer_stamp_t::clock::now();
}
// Procedure: on_entry
// Pushes the current timestamp onto the worker's stack; the matching
// on_exit pops it to form a segment.
inline void ChromeObserver::on_entry(WorkerView wv, TaskView) {
  _timeline.stacks[wv.id()].push(observer_stamp_t::clock::now());
}
// Procedure: on_exit
// Pops the matching on_entry timestamp and records a completed segment
// for this worker.
inline void ChromeObserver::on_exit(WorkerView wv, TaskView tv) {
  size_t w = wv.id();
  assert(!_timeline.stacks[w].empty());
  auto beg = _timeline.stacks[w].top();
  _timeline.stacks[w].pop();
  _timeline.segments[w].emplace_back(
    tv.name(), beg, observer_stamp_t::clock::now()
  );
}
// Function: clear
// Discards all recorded segments and drains any unmatched on_entry
// timestamps, keeping the per-worker storage itself.
inline void ChromeObserver::clear() {
  for(size_t i = 0; i < _timeline.segments.size(); ++i) {
    _timeline.segments[i].clear();
    auto& stamp_stack = _timeline.stacks[i];
    while(!stamp_stack.empty()) {
      stamp_stack.pop();
    }
  }
}
// Procedure: dump
// Serializes all recorded segments as a Chrome-tracing JSON array of
// complete ("ph":"X") events; timestamps/durations are microseconds
// relative to the recorded origin.
inline void ChromeObserver::dump(std::ostream& os) const {

  using namespace std::chrono;

  // Find the first worker that recorded any segment; workers before it
  // contribute nothing and need no separating comma.
  size_t first;
  for(first = 0; first<_timeline.segments.size(); ++first) {
    if(_timeline.segments[first].size() > 0) {
      break;
    }
  }

  os << '[';

  for(size_t w=first; w<_timeline.segments.size(); w++) {

    // Comma between the previous worker's events and this worker's.
    if(w != first && _timeline.segments[w].size() > 0) {
      os << ',';
    }

    for(size_t i=0; i<_timeline.segments[w].size(); i++) {

      os << '{'<< "\"cat\":\"ChromeObserver\",";

      // name field (falls back to "<worker>_<index>" for unnamed tasks)
      os << "\"name\":\"";
      if(_timeline.segments[w][i].name.empty()) {
        os << w << '_' << i;
      }
      else {
        os << _timeline.segments[w][i].name;
      }
      os << "\",";

      // segment field
      os << "\"ph\":\"X\","
         << "\"pid\":1,"
         << "\"tid\":" << w << ','
         << "\"ts\":" << duration_cast<microseconds>(
                           _timeline.segments[w][i].beg - _timeline.origin
                         ).count() << ','
         << "\"dur\":" << duration_cast<microseconds>(
                            _timeline.segments[w][i].end - _timeline.segments[w][i].beg
                          ).count();

      if(i != _timeline.segments[w].size() - 1) {
        os << "},";
      }
      else {
        os << '}';
      }
    }
  }
  os << "]\n";
}
// Function: dump | |
inline std::string ChromeObserver::dump() const { | |
std::ostringstream oss; | |
dump(oss); | |
return oss.str(); | |
} | |
// Function: num_tasks | |
inline size_t ChromeObserver::num_tasks() const { | |
return std::accumulate( | |
_timeline.segments.begin(), _timeline.segments.end(), size_t{0}, | |
[](size_t sum, const auto& exe){ | |
return sum + exe.size(); | |
} | |
); | |
} | |
// ---------------------------------------------------------------------------- | |
// TFProfObserver definition | |
// ---------------------------------------------------------------------------- | |
/** | |
@class TFProfObserver | |
@brief class to create an observer based on the built-in taskflow profiler format | |
A tf::TFProfObserver inherits tf::ObserverInterface and defines methods to dump | |
the observed thread activities into a format that can be visualized through | |
@TFProf. | |
@code{.cpp} | |
tf::Taskflow taskflow; | |
tf::Executor executor; | |
// insert tasks into taskflow | |
// ... | |
// create a custom observer | |
std::shared_ptr<tf::TFProfObserver> observer = executor.make_observer<tf::TFProfObserver>(); | |
// run the taskflow | |
executor.run(taskflow).wait(); | |
// dump the thread activities to Taskflow Profiler format. | |
observer->dump(std::cout); | |
@endcode | |
*/ | |
class TFProfObserver : public ObserverInterface { | |
friend class Executor; | |
friend class TFProfManager; | |
/** @private overall task summary */ | |
struct TaskSummary { | |
size_t count {0}; | |
size_t total_span {0}; | |
size_t min_span; | |
size_t max_span; | |
float avg_span() const { return total_span * 1.0f / count; } | |
}; | |
/** @private worker summary at a level */ | |
struct WorkerSummary { | |
size_t id; | |
size_t level; | |
size_t count {0}; | |
size_t total_span {0}; | |
size_t min_span{0}; | |
size_t max_span{0}; | |
std::array<TaskSummary, TASK_TYPES.size()> tsum; | |
float avg_span() const { return total_span * 1.0f / count; } | |
//return count < 2 ? 0.0f : total_delay * 1.0f / (count-1); | |
}; | |
/** @private */ | |
struct Summary { | |
std::array<TaskSummary, TASK_TYPES.size()> tsum; | |
std::vector<WorkerSummary> wsum; | |
void dump_tsum(std::ostream&) const; | |
void dump_wsum(std::ostream&) const; | |
void dump(std::ostream&) const; | |
}; | |
public: | |
/** | |
@brief dumps the timelines into a @TFProf format through | |
an output stream | |
*/ | |
void dump(std::ostream& ostream) const; | |
/** | |
@brief dumps the timelines into a JSON string | |
*/ | |
std::string dump() const; | |
/** | |
@brief shows the summary report through an output stream | |
*/ | |
void summary(std::ostream& ostream) const; | |
/** | |
@brief returns the summary report in a string | |
*/ | |
std::string summary() const; | |
/** | |
@brief clears the timeline data | |
*/ | |
void clear(); | |
/** | |
@brief queries the number of tasks observed | |
*/ | |
size_t num_tasks() const; | |
/** | |
@brief queries the number of observed workers | |
*/ | |
size_t num_workers() const; | |
private: | |
Timeline _timeline; | |
std::vector<std::stack<observer_stamp_t>> _stacks; | |
inline void set_up(size_t num_workers) override final; | |
inline void on_entry(WorkerView, TaskView) override final; | |
inline void on_exit(WorkerView, TaskView) override final; | |
}; | |
// dump the task summary | |
inline void TFProfObserver::Summary::dump_tsum(std::ostream& os) const { | |
// task summary | |
size_t type_w{10}, count_w{5}, time_w{9}, avg_w{8}, min_w{8}, max_w{8}; | |
std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ | |
if(i.count == 0) return; | |
count_w = std::max(count_w, std::to_string(i.count).size()); | |
}); | |
std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ | |
if(i.count == 0) return; | |
time_w = std::max(time_w, std::to_string(i.total_span).size()); | |
}); | |
std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ | |
if(i.count == 0) return; | |
avg_w = std::max(time_w, std::to_string(i.avg_span()).size()); | |
}); | |
std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ | |
if(i.count == 0) return; | |
min_w = std::max(min_w, std::to_string(i.min_span).size()); | |
}); | |
std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ | |
if(i.count == 0) return; | |
max_w = std::max(max_w, std::to_string(i.max_span).size()); | |
}); | |
os << std::setw(type_w) << "-Task-" | |
<< std::setw(count_w+2) << "Count" | |
<< std::setw(time_w+2) << "Time (us)" | |
<< std::setw(avg_w+2) << "Avg (us)" | |
<< std::setw(min_w+2) << "Min (us)" | |
<< std::setw(max_w+2) << "Max (us)" | |
<< '\n'; | |
for(size_t i=0; i<TASK_TYPES.size(); i++) { | |
if(tsum[i].count == 0) { | |
continue; | |
} | |
os << std::setw(type_w) << to_string(TASK_TYPES[i]) | |
<< std::setw(count_w+2) << tsum[i].count | |
<< std::setw(time_w+2) << tsum[i].total_span | |
<< std::setw(avg_w+2) << std::to_string(tsum[i].avg_span()) | |
<< std::setw(min_w+2) << tsum[i].min_span | |
<< std::setw(max_w+2) << tsum[i].max_span | |
<< '\n'; | |
} | |
} | |
// dump the worker summary | |
// Dumps the per-worker (per-level) summary as an aligned table.
// Each worker/level row is followed by one sub-row per observed task type,
// then a per-worker total row.
inline void TFProfObserver::Summary::dump_wsum(std::ostream& os) const {
  // minimum column widths: worker, task type, level, count, time, avg, min, max
  size_t w_w{10}, t_w{10}, l_w{5}, c_w{5}, d_w{9}, avg_w{8}, min_w{8}, max_w{8};
  // widen each column to fit the widest value it will display
  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
    if(i.count == 0) return;
    l_w = std::max(l_w, std::to_string(i.level).size());
  });
  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
    if(i.count == 0) return;
    c_w = std::max(c_w, std::to_string(i.count).size());
  });
  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
    if(i.count == 0) return;
    d_w = std::max(d_w, std::to_string(i.total_span).size());
  });
  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
    if(i.count == 0) return;
    avg_w = std::max(avg_w, std::to_string(i.avg_span()).size());
  });
  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
    if(i.count == 0) return;
    min_w = std::max(min_w, std::to_string(i.min_span).size());
  });
  std::for_each(wsum.begin(), wsum.end(), [&](const auto& i){
    if(i.count == 0) return;
    max_w = std::max(max_w, std::to_string(i.max_span).size());
  });
  // header row
  os << std::setw(w_w) << "-Worker-"
     << std::setw(l_w+2) << "Level"
     << std::setw(t_w) << "Task"
     << std::setw(c_w+2) << "Count"
     << std::setw(d_w+2) << "Time (us)"
     << std::setw(avg_w+2) << "Avg (us)"
     << std::setw(min_w+2) << "Min (us)"
     << std::setw(max_w+2) << "Max (us)"
     << '\n';
  for(const auto& ws : wsum) {
    if(ws.count == 0) {
      continue;
    }
    os << std::setw(w_w) << ws.id
       << std::setw(l_w+2) << ws.level;
    bool first = true;
    for(size_t i=0; i<TASK_TYPES.size(); i++) {
      if(ws.tsum[i].count == 0) {
        continue;
      }
      // the first task row continues the worker/level line; subsequent
      // rows are padded past the worker and level columns
      os << (first ? std::setw(t_w) : std::setw(w_w + l_w + 2 + t_w));
      first = false;
      os << to_string(TASK_TYPES[i])
         << std::setw(c_w+2) << ws.tsum[i].count
         << std::setw(d_w+2) << ws.tsum[i].total_span
         << std::setw(avg_w+2) << std::to_string(ws.tsum[i].avg_span())
         << std::setw(min_w+2) << ws.tsum[i].min_span
         << std::setw(max_w+2) << ws.tsum[i].max_span
         << '\n';
    }
    // per-worker summary row, right-aligned into the Count column
    os << std::setw(w_w + l_w + t_w + c_w + 4) << ws.count
       << std::setw(d_w+2) << ws.total_span
       << std::setw(avg_w+2) << std::to_string(ws.avg_span())
       << std::setw(min_w+2) << ws.min_span
       << std::setw(max_w+2) << ws.max_span
       << '\n';
    //for(size_t j=0; j<w_w+l_w+t_w+4; j++) os << ' ';
    //for(size_t j=0; j<c_w+d_w+avg_w+min_w+max_w+8; j++) os << '-';
    //os <<'\n';
  }
}
// dump the summary report through an ostream | |
// Dumps the full summary report: the task table, a blank line, then
// the worker table.
inline void TFProfObserver::Summary::dump(std::ostream& os) const {
  dump_tsum(os);
  os.put('\n');
  dump_wsum(os);
}
// Procedure: set_up | |
// Procedure: set_up
// Sizes the per-worker storage, tags the timeline with a unique id,
// and records the observation origin time.
inline void TFProfObserver::set_up(size_t num_workers) {
  _timeline.segments.resize(num_workers);
  _stacks.resize(num_workers);
  _timeline.uid = unique_id<size_t>();
  _timeline.origin = observer_stamp_t::clock::now();
}
// Procedure: on_entry | |
// Procedure: on_entry
// Pushes the task start time onto the calling worker's stack.
inline void TFProfObserver::on_entry(WorkerView wv, TaskView) {
  auto now = observer_stamp_t::clock::now();
  _stacks[wv.id()].push(now);
}
// Procedure: on_exit | |
// Procedure: on_exit
// Records the finished task into the segment list of its nesting level.
// The stack depth before the pop equals the nesting depth of the task,
// so after the pop, _stacks[w].size() is the task's level index.
inline void TFProfObserver::on_exit(WorkerView wv, TaskView tv) {
  size_t w = wv.id();
  assert(!_stacks[w].empty());
  // grow the per-level segment vectors to cover the current nesting depth
  if(_stacks[w].size() > _timeline.segments[w].size()) {
    _timeline.segments[w].resize(_stacks[w].size());
  }
  auto beg = _stacks[w].top();
  _stacks[w].pop();
  // after the pop, the stack size equals the level of the finished task
  _timeline.segments[w][_stacks[w].size()].emplace_back(
    tv.name(), tv.type(), beg, observer_stamp_t::clock::now()
  );
}
// Function: clear | |
inline void TFProfObserver::clear() { | |
for(size_t w=0; w<_timeline.segments.size(); ++w) { | |
for(size_t l=0; l<_timeline.segments[w].size(); ++l) { | |
_timeline.segments[w][l].clear(); | |
} | |
while(!_stacks[w].empty()) { | |
_stacks[w].pop(); | |
} | |
} | |
} | |
// Procedure: dump | |
// Procedure: dump
// Serializes the timeline into the Taskflow Profiler (TFProf) JSON
// format: one object per (worker, level) with its list of spans.
// Span endpoints are microseconds relative to _timeline.origin.
inline void TFProfObserver::dump(std::ostream& os) const {
  using namespace std::chrono;
  // find the first worker that has recorded segments
  size_t first;
  for(first = 0; first<_timeline.segments.size(); ++first) {
    if(_timeline.segments[first].size() > 0) {
      break;
    }
  }
  // no timeline data to dump
  if(first == _timeline.segments.size()) {
    os << "{}\n";
    return;
  }
  os << "{\"executor\":\"" << _timeline.uid << "\",\"data\":[";
  // 'comma' tracks whether a previous (worker, level) object was emitted
  bool comma = false;
  for(size_t w=first; w<_timeline.segments.size(); w++) {
    for(size_t l=0; l<_timeline.segments[w].size(); l++) {
      if(_timeline.segments[w][l].empty()) {
        continue;
      }
      if(comma) {
        os << ',';
      }
      else {
        comma = true;
      }
      os << "{\"worker\":" << w << ",\"level\":" << l << ",\"data\":[";
      for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) {
        const auto& s = _timeline.segments[w][l][i];
        if(i) os << ',';
        // span: [begin, end] in microseconds since the origin
        os << "{\"span\":["
           << duration_cast<microseconds>(s.beg - _timeline.origin).count()
           << ","
           << duration_cast<microseconds>(s.end - _timeline.origin).count()
           << "],";
        // name (unnamed tasks get a synthetic "<worker>_<index>" label)
        os << "\"name\":\"";
        if(s.name.empty()) {
          os << w << '_' << i;
        }
        else {
          os << s.name;
        }
        os << "\",";
        // e.g., category "type": "Condition Task"
        os << "\"type\":\"" << to_string(s.type) << "\"";
        os << "}";
      }
      os << "]}";
    }
  }
  os << "]}\n";
}
// Function: dump | |
inline std::string TFProfObserver::dump() const { | |
std::ostringstream oss; | |
dump(oss); | |
return oss.str(); | |
} | |
// Procedure: summary | |
inline void TFProfObserver::summary(std::ostream& os) const { | |
using namespace std::chrono; | |
Summary summary; | |
std::optional<observer_stamp_t> view_beg, view_end; | |
// find the first non-empty worker | |
size_t first; | |
for(first = 0; first<_timeline.segments.size(); ++first) { | |
if(_timeline.segments[first].size() > 0) { | |
break; | |
} | |
} | |
// not timeline data to dump | |
if(first == _timeline.segments.size()) { | |
goto end_of_summary; | |
} | |
for(size_t w=first; w<_timeline.segments.size(); w++) { | |
for(size_t l=0; l<_timeline.segments[w].size(); l++) { | |
if(_timeline.segments[w][l].empty()) { | |
continue; | |
} | |
// worker w at level l | |
WorkerSummary ws; | |
ws.id = w; | |
ws.level = l; | |
ws.count = _timeline.segments[w][l].size(); | |
// scan all tasks at level l | |
for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) { | |
// update the entire span | |
auto& s = _timeline.segments[w][l][i]; | |
view_beg = view_beg ? std::min(*view_beg, s.beg) : s.beg; | |
view_end = view_end ? std::max(*view_end, s.end) : s.end; | |
// update the task summary | |
size_t t = duration_cast<microseconds>(s.end - s.beg).count(); | |
auto& x = summary.tsum[static_cast<int>(s.type)]; | |
x.count += 1; | |
x.total_span += t; | |
x.min_span = (x.count == 1) ? t : std::min(t, x.min_span); | |
x.max_span = (x.count == 1) ? t : std::max(t, x.max_span); | |
// update the worker summary | |
ws.total_span += t; | |
ws.min_span = (i == 0) ? t : std::min(t, ws.min_span); | |
ws.max_span = (i == 0) ? t : std::max(t, ws.max_span); | |
auto&y = ws.tsum[static_cast<int>(s.type)]; | |
y.count += 1; | |
y.total_span += t; | |
y.min_span = (y.count == 1) ? t : std::min(t, y.min_span); | |
y.max_span = (y.count == 1) ? t : std::max(t, y.max_span); | |
// update the delay | |
//if(i) { | |
// size_t d = duration_cast<nanoseconds>( | |
// s.beg - _timeline.segments[w][l][i-1].end | |
// ).count(); | |
// ws.total_delay += d; | |
// ws.min_delay = (i == 1) ? d : std::min(ws.min_delay, d); | |
// ws.max_delay = (i == 1) ? d : std::max(ws.max_delay, d); | |
//} | |
} | |
summary.wsum.push_back(ws); | |
} | |
} | |
end_of_summary: | |
size_t view = 0; | |
if(view_beg && view_end) { | |
view = duration_cast<microseconds>(*view_end - *view_beg).count(); | |
} | |
os << "==Observer " << _timeline.uid << ": " | |
<< num_workers() << " workers completed " | |
<< num_tasks() << " tasks in " | |
<< view << " us\n"; | |
summary.dump(os); | |
} | |
// Procedure: summary | |
inline std::string TFProfObserver::summary() const { | |
std::ostringstream oss; | |
summary(oss); | |
return oss.str(); | |
} | |
// Function: num_tasks | |
// Function: num_tasks
// Counts all recorded segments across every worker and nesting level.
inline size_t TFProfObserver::num_tasks() const {
  size_t total = 0;
  for(const auto& worker : _timeline.segments) {
    for(const auto& level : worker) {
      total += level.size();
    }
  }
  return total;
}
// Function: num_workers | |
// Function: num_workers
// Counts workers that recorded at least one level of segments.
inline size_t TFProfObserver::num_workers() const {
  size_t active = 0;
  for(const auto& segs : _timeline.segments) {
    if(!segs.empty()) {
      ++active;
    }
  }
  return active;
}
// ---------------------------------------------------------------------------- | |
// TFProfManager | |
// ---------------------------------------------------------------------------- | |
/** | |
@private | |
*/ | |
class TFProfManager {

  friend class Executor;

  public:

    ~TFProfManager();

    // non-copyable singleton
    TFProfManager(const TFProfManager&) = delete;
    TFProfManager& operator=(const TFProfManager&) = delete;

    // access the process-wide singleton instance
    static TFProfManager& get();

    // dump all managed observers' timelines to the given stream
    void dump(std::ostream& ostream) const;

  private:

    // output path read from the environment at construction; may be empty
    const std::string _fpath;

    // guards _observers against concurrent registration
    std::mutex _mutex;
    std::vector<std::shared_ptr<TFProfObserver>> _observers;

    TFProfManager();

    // registers an observer to be reported/serialized at destruction
    void _manage(std::shared_ptr<TFProfObserver> observer);
};
// constructor | |
// Constructor: reads the profiler output path from the environment
// (TF_ENABLE_PROFILER); an empty path means "report to stderr".
inline TFProfManager::TFProfManager() :
  _fpath(get_env(TF_ENABLE_PROFILER)) {
}
// Procedure: manage | |
inline void TFProfManager::_manage(std::shared_ptr<TFProfObserver> observer) { | |
std::lock_guard lock(_mutex); | |
_observers.push_back(std::move(observer)); | |
} | |
// Procedure: dump | |
inline void TFProfManager::dump(std::ostream& os) const { | |
for(size_t i=0; i<_observers.size(); ++i) { | |
if(i) os << ','; | |
_observers[i]->dump(os); | |
} | |
} | |
// Destructor | |
// Destructor
// Flushes all observed timelines at program shutdown:
//  - if _fpath opens successfully, write either binary (.tfp) or JSON;
//  - otherwise (e.g., empty path), print a text summary to stderr.
inline TFProfManager::~TFProfManager() {
  std::ofstream ofs(_fpath);
  if(ofs) {
    // .tfp: binary serialization for the Taskflow profiler
    // NOTE(review): rfind(".tfp") != npos matches the substring anywhere
    // in the path, not only as a trailing extension — confirm intended.
    if(_fpath.rfind(".tfp") != std::string::npos) {
      ProfileData data;
      data.timelines.reserve(_observers.size());
      for(size_t i=0; i<_observers.size(); ++i) {
        // moves each observer's timeline; observers are unusable afterwards
        data.timelines.push_back(std::move(_observers[i]->_timeline));
      }
      Serializer<std::ofstream> serializer(ofs);
      serializer(data);
    }
    // .json (any non-.tfp path falls through to JSON output)
    else { // if(_fpath.rfind(".json") != std::string::npos) {
      ofs << "[\n";
      for(size_t i=0; i<_observers.size(); ++i) {
        if(i) ofs << ',';
        _observers[i]->dump(ofs);
      }
      ofs << "]\n";
    }
  }
  // do a summary report in stderr for each observer
  else {
    std::ostringstream oss;
    for(size_t i=0; i<_observers.size(); ++i) {
      _observers[i]->summary(oss);
    }
    fprintf(stderr, "%s", oss.str().c_str());
  }
}
// Function: get | |
// Function: get
// Meyers singleton — function-local static initialization is
// thread-safe since C++11.
inline TFProfManager& TFProfManager::get() {
  static TFProfManager instance;
  return instance;
}
// ---------------------------------------------------------------------------- | |
// Identifier for Each Built-in Observer | |
// ---------------------------------------------------------------------------- | |
/** @enum ObserverType | |
@brief enumeration of all observer types | |
*/ | |
enum class ObserverType : int {
  TFPROF = 0,   // built-in Taskflow profiler format (TFProfObserver)
  CHROME,       // Chrome tracing format (ChromeObserver)
  UNDEFINED     // unknown/unspecified observer type
};
/** | |
@brief convert an observer type to a human-readable string | |
*/ | |
// Maps an observer type to its human-readable name; any value outside
// the known enumerators yields "undefined".
inline const char* to_string(ObserverType type) {
  if(type == ObserverType::TFPROF) {
    return "tfprof";
  }
  if(type == ObserverType::CHROME) {
    return "chrome";
  }
  return "undefined";
}
} // end of namespace tf ----------------------------------------------------- | |
// reference: | |
// - gomp: https://github.com/gcc-mirror/gcc/blob/master/libgomp/iter.c | |
// - komp: https://github.com/llvm-mirror/openmp/blob/master/runtime/src/kmp_dispatch.cpp | |
/** | |
@file partitioner.hpp | |
@brief partitioner include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Partitioner Base | |
// ---------------------------------------------------------------------------- | |
/** | |
@class PartitionerBase | |
@brief class to derive a partitioner for scheduling parallel algorithms | |
The class provides base methods to derive a partitioner that can be used | |
to schedule parallel iterations (e.g., tf::Taskflow::for_each). | |
A partitioner defines the scheduling method for running parallel algorithms,
such as tf::Taskflow::for_each, tf::Taskflow::reduce, and so on.
By default, we provide the following partitioners: | |
+ tf::GuidedPartitioner to enable guided scheduling algorithm of adaptive chunk size | |
+ tf::DynamicPartitioner to enable dynamic scheduling algorithm of equal chunk size | |
+ tf::StaticPartitioner to enable static scheduling algorithm of static chunk size | |
+ tf::RandomPartitioner to enable random scheduling algorithm of random chunk size | |
Depending on applications, partitioning algorithms can impact the performance | |
a lot. | |
For example, if a parallel-iteration workload contains a regular work unit per | |
iteration, tf::StaticPartitioner can deliver the best performance. | |
On the other hand, if the work unit per iteration is irregular and unbalanced, | |
tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner. | |
In most situations, tf::GuidedPartitioner can deliver decent performance and | |
is thus used as our default partitioner. | |
*/ | |
class PartitionerBase {

  public:

  /**
  @brief default constructor
  */
  PartitionerBase() = default;

  /**
  @brief construct a partitioner with the given chunk size
  */
  explicit PartitionerBase(size_t chunk_size) : _chunk_size {chunk_size} {}

  /**
  @brief query the chunk size of this partitioner
  */
  [[nodiscard]] size_t chunk_size() const { return _chunk_size; }

  /**
  @brief update the chunk size of this partitioner
  */
  void chunk_size(size_t cz) { _chunk_size = cz; }

  protected:

  /**
  @brief chunk size (0 lets the derived scheduling algorithm pick a default)
  */
  size_t _chunk_size{0};
};
// ---------------------------------------------------------------------------- | |
// Guided Partitioner | |
// ---------------------------------------------------------------------------- | |
/** | |
@class GuidedPartitioner | |
@brief class to construct a guided partitioner for scheduling parallel algorithms | |
The size of a partition is proportional to the number of unassigned iterations | |
divided by the number of workers, | |
and the size will gradually decrease to the given chunk size. | |
The last partition may be smaller than the chunk size. | |
*/ | |
class GuidedPartitioner : public PartitionerBase { | |
public: | |
/** | |
@brief default constructor | |
*/ | |
GuidedPartitioner() : PartitionerBase{1} {} | |
/** | |
@brief construct a guided partitioner with the given chunk size | |
*/ | |
explicit GuidedPartitioner(size_t sz) : PartitionerBase (sz) {} | |
// -------------------------------------------------------------------------- | |
// scheduling methods | |
// -------------------------------------------------------------------------- | |
/** | |
@private | |
*/ | |
template <typename F, | |
std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr | |
> | |
void loop( | |
size_t N, | |
size_t W, | |
std::atomic<size_t>& next, | |
F&& func | |
) const { | |
size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; | |
size_t p1 = 2 * W * (chunk_size + 1); | |
float p2 = 0.5f / static_cast<float>(W); | |
size_t curr_b = next.load(std::memory_order_relaxed); | |
while(curr_b < N) { | |
size_t r = N - curr_b; | |
// fine-grained | |
if(r < p1) { | |
while(1) { | |
curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); | |
if(curr_b >= N) { | |
return; | |
} | |
func(curr_b, std::min(curr_b + chunk_size, N)); | |
} | |
break; | |
} | |
// coarse-grained | |
else { | |
size_t q = static_cast<size_t>(p2 * r); | |
if(q < chunk_size) { | |
q = chunk_size; | |
} | |
//size_t curr_e = (q <= r) ? curr_b + q : N; | |
size_t curr_e = std::min(curr_b + q, N); | |
if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, | |
std::memory_order_relaxed)) { | |
func(curr_b, curr_e); | |
curr_b = next.load(std::memory_order_relaxed); | |
} | |
} | |
} | |
} | |
/** | |
@private | |
*/ | |
template <typename F, | |
std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr | |
> | |
void loop_until( | |
size_t N, | |
size_t W, | |
std::atomic<size_t>& next, | |
F&& func | |
) const { | |
size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; | |
size_t p1 = 2 * W * (chunk_size + 1); | |
float p2 = 0.5f / static_cast<float>(W); | |
size_t curr_b = next.load(std::memory_order_relaxed); | |
while(curr_b < N) { | |
size_t r = N - curr_b; | |
// fine-grained | |
if(r < p1) { | |
while(1) { | |
curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); | |
if(curr_b >= N) { | |
return; | |
} | |
if(func(curr_b, std::min(curr_b + chunk_size, N))) { | |
return; | |
} | |
} | |
break; | |
} | |
// coarse-grained | |
else { | |
size_t q = static_cast<size_t>(p2 * r); | |
if(q < chunk_size) { | |
q = chunk_size; | |
} | |
//size_t curr_e = (q <= r) ? curr_b + q : N; | |
size_t curr_e = std::min(curr_b + q, N); | |
if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, | |
std::memory_order_relaxed)) { | |
if(func(curr_b, curr_e)) { | |
return; | |
} | |
curr_b = next.load(std::memory_order_relaxed); | |
} | |
} | |
} | |
} | |
}; | |
// ---------------------------------------------------------------------------- | |
// Dynamic Partitioner | |
// ---------------------------------------------------------------------------- | |
/** | |
@class DynamicPartitioner | |
@brief class to construct a dynamic partitioner for scheduling parallel algorithms | |
The partitioner splits iterations into many partitions each of size equal to | |
the given chunk size. | |
Different partitions are distributed dynamically to workers | |
without any specific order. | |
*/ | |
class DynamicPartitioner : public PartitionerBase { | |
public: | |
/** | |
@brief default constructor | |
*/ | |
DynamicPartitioner() : PartitionerBase{1} {}; | |
/** | |
@brief construct a dynamic partitioner with the given chunk size | |
*/ | |
explicit DynamicPartitioner(size_t sz) : PartitionerBase (sz) {} | |
// -------------------------------------------------------------------------- | |
// scheduling methods | |
// -------------------------------------------------------------------------- | |
/** | |
@private | |
*/ | |
template <typename F, | |
std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr | |
> | |
void loop( | |
size_t N, | |
size_t, | |
std::atomic<size_t>& next, | |
F&& func | |
) const { | |
size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; | |
size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); | |
while(curr_b < N) { | |
func(curr_b, std::min(curr_b + chunk_size, N)); | |
curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); | |
} | |
} | |
/** | |
@private | |
*/ | |
template <typename F, | |
std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr | |
> | |
void loop_until( | |
size_t N, | |
size_t, | |
std::atomic<size_t>& next, | |
F&& func | |
) const { | |
size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; | |
size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); | |
while(curr_b < N) { | |
if(func(curr_b, std::min(curr_b + chunk_size, N))) { | |
return; | |
} | |
curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); | |
} | |
} | |
}; | |
// ---------------------------------------------------------------------------- | |
// Static Partitioner | |
// ---------------------------------------------------------------------------- | |
/**
@class StaticPartitioner
@brief class to construct a static partitioner for scheduling parallel algorithms
The partitioner divides iterations into chunks and distributes chunks
to workers in order.
If the chunk size is not specified (default @c 0), the partitioner resorts to a chunk size
that equally distributes iterations into workers.
@code{.cpp}
std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
taskflow.for_each(
  data.begin(), data.end(), [](int i){}, StaticPartitioner(0)
);
executor.run(taskflow).wait();
@endcode
*/
class StaticPartitioner : public PartitionerBase { | |
public: | |
/** | |
@brief default constructor | |
*/ | |
StaticPartitioner() : PartitionerBase{0} {}; | |
/** | |
@brief construct a dynamic partitioner with the given chunk size | |
*/ | |
explicit StaticPartitioner(size_t sz) : PartitionerBase(sz) {} | |
/** | |
@brief queries the adjusted chunk size | |
Returns the given chunk size if it is not zero, or returns | |
<tt>N/W + (w < N%W)</tt>, where @c N is the number of iterations, | |
@c W is the number of workers, and @c w is the worker ID. | |
*/ | |
size_t adjusted_chunk_size(size_t N, size_t W, size_t w) const { | |
return _chunk_size ? _chunk_size : N/W + (w < N%W); | |
} | |
// -------------------------------------------------------------------------- | |
// scheduling methods | |
// -------------------------------------------------------------------------- | |
/** | |
@private | |
*/ | |
template <typename F, | |
std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr | |
> | |
void loop( | |
size_t N, | |
size_t W, | |
size_t curr_b, | |
size_t chunk_size, | |
F&& func | |
) { | |
size_t stride = W * chunk_size; | |
while(curr_b < N) { | |
size_t curr_e = std::min(curr_b + chunk_size, N); | |
func(curr_b, curr_e); | |
curr_b += stride; | |
} | |
} | |
/** | |
@private | |
*/ | |
template <typename F, | |
std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr | |
> | |
void loop_until( | |
size_t N, | |
size_t W, | |
size_t curr_b, | |
size_t chunk_size, | |
F&& func | |
) { | |
size_t stride = W * chunk_size; | |
while(curr_b < N) { | |
size_t curr_e = std::min(curr_b + chunk_size, N); | |
if(func(curr_b, curr_e)) { | |
return; | |
} | |
curr_b += stride; | |
} | |
} | |
}; | |
// ---------------------------------------------------------------------------- | |
// RandomPartitioner | |
// ---------------------------------------------------------------------------- | |
/** | |
@class RandomPartitioner | |
@brief class to construct a random partitioner for scheduling parallel algorithms | |
Similar to tf::DynamicPartitioner, | |
the partitioner splits iterations into many partitions but each with a random | |
chunk size in the range, <tt>c = [alpha * N * W, beta * N * W]</tt>. | |
By default, @c alpha is <tt>0.01</tt> and @c beta is <tt>0.5</tt>, respectively. | |
*/ | |
class RandomPartitioner : public PartitionerBase {

  public:

  /**
  @brief default constructor
  */
  RandomPartitioner() = default;

  /**
  @brief constructs a random partitioner
  */
  // NOTE(review): not marked explicit, unlike the other partitioners'
  // single-argument constructors — confirm whether implicit conversion
  // from size_t is intended before changing it.
  RandomPartitioner(size_t cz) : PartitionerBase(cz) {}

  /**
  @brief constructs a random partitioner with the given parameters
  */
  RandomPartitioner(float alpha, float beta) : _alpha {alpha}, _beta {beta} {}

  /**
  @brief queries the @c alpha value
  */
  float alpha() const { return _alpha; }

  /**
  @brief queries the @c beta value
  */
  float beta() const { return _beta; }

  /**
  @brief queries the range of chunk size

  @param N number of iterations
  @param W number of workers
  */
  std::pair<size_t, size_t> chunk_size_range(size_t N, size_t W) const {
    // bounds scale with _alpha*N*W and _beta*N*W, then are clamped so
    // that b1 >= 1 and b2 > b1
    size_t b1 = static_cast<size_t>(_alpha * N * W);
    size_t b2 = static_cast<size_t>(_beta  * N * W);
    if(b1 > b2) {
      std::swap(b1, b2);
    }
    b1 = std::max(b1, size_t{1});
    b2 = std::max(b2, b1 + 1);
    return {b1, b2};
  }

  // --------------------------------------------------------------------------
  // scheduling methods
  // --------------------------------------------------------------------------

  /**
  @private
  Runs func(beg, end) over [0, N); each chunk claimed from `next` has a
  fresh random size drawn uniformly from chunk_size_range(N, W).
  */
  template <typename F,
    std::enable_if_t<std::is_invocable_r_v<void, F, size_t, size_t>, void>* = nullptr
  >
  void loop(
    size_t N,
    size_t W,
    std::atomic<size_t>& next,
    F&& func
  ) const {

    auto [b1, b2] = chunk_size_range(N, W);

    // nondeterministically seeded per-call engine
    std::default_random_engine engine {std::random_device{}()};
    std::uniform_int_distribution<size_t> dist(b1, b2);

    size_t chunk_size = dist(engine);
    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);

    while(curr_b < N) {
      func(curr_b, std::min(curr_b + chunk_size, N));
      chunk_size = dist(engine);
      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
    }
  }

  /**
  @private
  Same scheduling as loop(), but func returns bool; iteration stops
  early as soon as func returns true.
  */
  template <typename F,
    std::enable_if_t<std::is_invocable_r_v<bool, F, size_t, size_t>, void>* = nullptr
  >
  void loop_until(
    size_t N,
    size_t W,
    std::atomic<size_t>& next,
    F&& func
  ) const {

    auto [b1, b2] = chunk_size_range(N, W);

    std::default_random_engine engine {std::random_device{}()};
    std::uniform_int_distribution<size_t> dist(b1, b2);

    size_t chunk_size = dist(engine);
    size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);

    while(curr_b < N) {
      if(func(curr_b, std::min(curr_b + chunk_size, N))){
        return;
      }
      chunk_size = dist(engine);
      curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed);
    }
  }

  private:

  float _alpha {0.01f};   // lower-bound scale factor for the chunk size
  float _beta  {0.5f};    // upper-bound scale factor for the chunk size
};
/**
@brief default partitioner set to tf::GuidedPartitioner

Guided partitioner can achieve decent performance for most parallel algorithms,
especially for those with irregular and unbalanced workload per iteration.
*/
using DefaultPartitioner = GuidedPartitioner;
/** | |
@brief determines if a type is a partitioner | |
A partitioner is a derived type from tf::PartitionerBase. | |
*/ | |
template <typename C> | |
inline constexpr bool is_partitioner_v = std::is_base_of<PartitionerBase, C>::value; | |
} // end of namespace tf ----------------------------------------------------- | |
/** | |
@file flow_builder.hpp | |
@brief flow builder include file | |
*/ | |
namespace tf { | |
/** | |
@class FlowBuilder | |
@brief class to build a task dependency graph | |
The class provides essential methods to construct a task dependency graph | |
from which tf::Taskflow and tf::Subflow are derived. | |
*/ | |
class FlowBuilder { | |
friend class Executor; | |
public: | |
/** | |
@brief constructs a flow builder with a graph | |
*/ | |
FlowBuilder(Graph& graph); | |
/** | |
@brief creates a static task | |
@tparam C callable type constructible from std::function<void()> | |
@param callable callable to construct a static task | |
@return a tf::Task handle | |
The following example creates a static task. | |
@code{.cpp} | |
tf::Task static_task = taskflow.emplace([](){}); | |
@endcode | |
Please refer to @ref StaticTasking for details. | |
*/ | |
template <typename C, | |
std::enable_if_t<is_static_task_v<C>, void>* = nullptr | |
> | |
Task emplace(C&& callable); | |
/** | |
@brief creates a dynamic task | |
@tparam C callable type constructible from std::function<void(tf::Subflow&)> | |
@param callable callable to construct a dynamic task | |
@return a tf::Task handle | |
The following example creates a dynamic task (tf::Subflow) | |
that spawns two static tasks. | |
@code{.cpp} | |
tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ | |
tf::Task static_task1 = sf.emplace([](){}); | |
tf::Task static_task2 = sf.emplace([](){}); | |
}); | |
@endcode | |
Please refer to @ref DynamicTasking for details. | |
*/ | |
template <typename C, | |
std::enable_if_t<is_dynamic_task_v<C>, void>* = nullptr | |
> | |
Task emplace(C&& callable); | |
/** | |
@brief creates a condition task | |
@tparam C callable type constructible from std::function<int()> | |
@param callable callable to construct a condition task | |
@return a tf::Task handle | |
The following example creates an if-else block using one condition task | |
and three static tasks. | |
@code{.cpp} | |
tf::Taskflow taskflow; | |
auto [init, cond, yes, no] = taskflow.emplace( | |
[] () { }, | |
[] () { return 0; }, | |
[] () { std::cout << "yes\n"; }, | |
[] () { std::cout << "no\n"; } | |
); | |
// executes yes if cond returns 0, or no if cond returns 1 | |
cond.precede(yes, no); | |
cond.succeed(init); | |
@endcode | |
Please refer to @ref ConditionalTasking for details. | |
*/ | |
template <typename C, | |
std::enable_if_t<is_condition_task_v<C>, void>* = nullptr | |
> | |
Task emplace(C&& callable); | |
/** | |
@brief creates a multi-condition task | |
@tparam C callable type constructible from | |
std::function<tf::SmallVector<int>()> | |
@param callable callable to construct a multi-condition task | |
@return a tf::Task handle | |
The following example creates a multi-condition task that selectively | |
jumps to two successor tasks. | |
@code{.cpp} | |
tf::Taskflow taskflow; | |
auto [init, cond, branch1, branch2, branch3] = taskflow.emplace( | |
[] () { }, | |
[] () { return tf::SmallVector{0, 2}; }, | |
[] () { std::cout << "branch1\n"; }, | |
[] () { std::cout << "branch2\n"; }, | |
[] () { std::cout << "branch3\n"; } | |
); | |
// executes branch1 and branch3 when cond returns 0 and 2 | |
cond.precede(branch1, branch2, branch3); | |
cond.succeed(init); | |
@endcode | |
Please refer to @ref ConditionalTasking for details. | |
*/ | |
template <typename C, | |
std::enable_if_t<is_multi_condition_task_v<C>, void>* = nullptr | |
> | |
Task emplace(C&& callable); | |
/** | |
@brief creates multiple tasks from a list of callable objects | |
@tparam C callable types | |
@param callables one or multiple callable objects constructible from each task category | |
@return a tf::Task handle | |
The method returns a tuple of tasks each corresponding to the given | |
callable target. You can use structured binding to get the return tasks | |
one by one. | |
The following example creates four static tasks and assign them to | |
@c A, @c B, @c C, and @c D using structured binding. | |
@code{.cpp} | |
auto [A, B, C, D] = taskflow.emplace( | |
[] () { std::cout << "A"; }, | |
[] () { std::cout << "B"; }, | |
[] () { std::cout << "C"; }, | |
[] () { std::cout << "D"; } | |
); | |
@endcode | |
*/ | |
template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>* = nullptr> | |
auto emplace(C&&... callables); | |
/** | |
@brief removes a task from a taskflow | |
@param task task to remove | |
Removes a task and its input and output dependencies from the graph | |
associated with the flow builder. | |
If the task does not belong to the graph, nothing will happen. | |
@code{.cpp} | |
tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); | |
tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); | |
tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); | |
tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); | |
A.precede(B, C, D); | |
// erase A from the taskflow and its dependencies to B, C, and D | |
taskflow.erase(A); | |
@endcode | |
*/ | |
void erase(Task task); | |
/** | |
@brief creates a module task for the target object | |
@tparam T target object type | |
@param object a custom object that defines the method @c T::graph() | |
@return a tf::Task handle | |
The example below demonstrates a taskflow composition using | |
the @c composed_of method. | |
@code{.cpp} | |
tf::Taskflow t1, t2; | |
t1.emplace([](){ std::cout << "t1"; }); | |
// t2 is partially composed of t1 | |
tf::Task comp = t2.composed_of(t1); | |
tf::Task init = t2.emplace([](){ std::cout << "t2"; }); | |
init.precede(comp); | |
@endcode | |
The taskflow object @c t2 is composed of another taskflow object @c t1, | |
preceded by another static task @c init. | |
When taskflow @c t2 is submitted to an executor, | |
@c init will run first and then @c comp which spawns its definition
in taskflow @c t1. | |
The target @c object being composed must define the method | |
<tt>T::graph()</tt> that returns a reference to a graph object of | |
type tf::Graph such that it can interact with the executor. | |
For example: | |
@code{.cpp} | |
// custom struct
struct MyObj {
  tf::Graph _graph;
  MyObj() {
    tf::FlowBuilder builder(_graph);
    tf::Task task = builder.emplace([](){
      std::cout << "a task\n";  // static task
    });
  }
  tf::Graph& graph() { return _graph; }
};
MyObj obj; | |
tf::Task comp = taskflow.composed_of(obj); | |
@endcode | |
Please refer to @ref ComposableTasking for details. | |
*/ | |
template <typename T> | |
Task composed_of(T& object); | |
/** | |
@brief creates a placeholder task | |
@return a tf::Task handle | |
A placeholder task maps to a node in the taskflow graph, but | |
it does not have any callable work assigned yet. | |
A placeholder task is different from an empty task handle that | |
does not point to any node in a graph. | |
@code{.cpp} | |
// create a placeholder task with no callable target assigned | |
tf::Task placeholder = taskflow.placeholder(); | |
assert(placeholder.empty() == false && placeholder.has_work() == false); | |
// create an empty task handle | |
tf::Task task; | |
assert(task.empty() == true); | |
// assign the task handle to the placeholder task | |
task = placeholder; | |
assert(task.empty() == false && task.has_work() == false); | |
@endcode | |
*/ | |
Task placeholder(); | |
/** | |
@brief adds adjacent dependency links to a linear list of tasks | |
@param tasks a vector of tasks | |
This member function creates linear dependencies over a vector of tasks. | |
@code{.cpp} | |
tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); | |
tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); | |
tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); | |
tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); | |
std::vector<tf::Task> tasks {A, B, C, D};
taskflow.linearize(tasks); // A->B->C->D | |
@endcode | |
*/ | |
void linearize(std::vector<Task>& tasks); | |
/** | |
@brief adds adjacent dependency links to a linear list of tasks | |
@param tasks an initializer list of tasks | |
This member function creates linear dependencies over a list of tasks. | |
@code{.cpp} | |
tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); | |
tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); | |
tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); | |
tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); | |
taskflow.linearize({A, B, C, D}); // A->B->C->D | |
@endcode | |
*/ | |
void linearize(std::initializer_list<Task> tasks); | |
// ------------------------------------------------------------------------ | |
// parallel iterations | |
// ------------------------------------------------------------------------ | |
/** | |
@brief constructs an STL-styled parallel-for task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam C callable type | |
@tparam P partitioner type (default tf::GuidedPartitioner) | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@param callable callable object to apply to the dereferenced iterator | |
@param part partitioning algorithm to schedule parallel iterations | |
@return a tf::Task handle | |
The task spawns asynchronous tasks that applies the callable object to each object | |
obtained by dereferencing every iterator in the range <tt>[first, last)</tt>. | |
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
for(auto itr=first; itr!=last; itr++) { | |
callable(*itr); | |
} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
The callable needs to take a single argument of | |
the dereferenced iterator type. | |
Please refer to @ref ParallelIterations for details. | |
*/ | |
template <typename B, typename E, typename C, typename P = GuidedPartitioner> | |
Task for_each(B first, E last, C callable, P&& part = P()); | |
/** | |
@brief constructs an STL-styled index-based parallel-for task | |
@tparam B beginning index type (must be integral) | |
@tparam E ending index type (must be integral) | |
@tparam S step type (must be integral) | |
@tparam C callable type | |
@tparam P partitioner type (default tf::GuidedPartitioner) | |
@param first index of the beginning (inclusive) | |
@param last index of the end (exclusive) | |
@param step step size | |
@param callable callable object to apply to each valid index | |
@param part partitioning algorithm to schedule parallel iterations | |
@return a tf::Task handle | |
The task spawns asynchronous tasks that applies the callable object to each index | |
in the range <tt>[first, last)</tt> with the step size. | |
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
// case 1: step size is positive | |
for(auto i=first; i<last; i+=step) { | |
callable(i); | |
} | |
// case 2: step size is negative | |
for(auto i=first; i>last; i+=step) {
callable(i); | |
} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
The callable needs to take a single argument of the integral index type. | |
Please refer to @ref ParallelIterations for details. | |
*/ | |
template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner> | |
Task for_each_index( | |
B first, E last, S step, C callable, P&& part = P() | |
); | |
// ------------------------------------------------------------------------ | |
// transform | |
// ------------------------------------------------------------------------ | |
/** | |
@brief constructs a parallel-transform task | |
@tparam B beginning input iterator type | |
@tparam E ending input iterator type | |
@tparam O output iterator type | |
@tparam C callable type | |
@tparam P partitioner type (default tf::GuidedPartitioner) | |
@param first1 iterator to the beginning of the first range | |
@param last1 iterator to the end of the first range | |
@param d_first iterator to the beginning of the output range | |
@param c an unary callable to apply to dereferenced input elements | |
@param part partitioning algorithm to schedule parallel iterations | |
@return a tf::Task handle | |
The task spawns asynchronous tasks that applies the callable object to an | |
input range and stores the result in another output range. | |
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
while (first1 != last1) { | |
*d_first++ = c(*first1++); | |
} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
The callable needs to take a single argument of the dereferenced | |
iterator type. | |
Please refer to @ref ParallelTransforms for details. | |
*/ | |
template < | |
typename B, typename E, typename O, typename C, typename P = GuidedPartitioner | |
> | |
Task transform(B first1, E last1, O d_first, C c, P&& part = P()); | |
/** | |
@brief constructs a parallel-transform task | |
@tparam B1 beginning input iterator type for the first input range | |
@tparam E1 ending input iterator type for the first input range | |
@tparam B2 beginning input iterator type for the second input range
@tparam O output iterator type | |
@tparam C callable type | |
@tparam P partitioner type (default tf::GuidedPartitioner) | |
@param first1 iterator to the beginning of the first input range | |
@param last1 iterator to the end of the first input range | |
@param first2 iterator to the beginning of the second input range | |
@param d_first iterator to the beginning of the output range | |
@param c a binary operator to apply to dereferenced input elements | |
@param part partitioning algorithm to schedule parallel iterations | |
@return a tf::Task handle | |
The task spawns asynchronous tasks that applies the callable object to two | |
input ranges and stores the result in another output range. | |
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
while (first1 != last1) { | |
*d_first++ = c(*first1++, *first2++); | |
} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
The callable needs to take two arguments of dereferenced elements | |
from the two input ranges. | |
Please refer to @ref ParallelTransforms for details. | |
*/ | |
template < | |
typename B1, typename E1, typename B2, typename O, typename C, typename P=GuidedPartitioner, | |
std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr | |
> | |
Task transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P()); | |
// ------------------------------------------------------------------------ | |
// reduction | |
// ------------------------------------------------------------------------ | |
/** | |
@brief constructs an STL-styled parallel-reduce task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam T result type | |
@tparam O binary reducer type | |
@tparam P partitioner type (default tf::GuidedPartitioner) | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@param init initial value of the reduction and the storage for the reduced result | |
@param bop binary operator that will be applied | |
@param part partitioning algorithm to schedule parallel iterations | |
@return a tf::Task handle | |
The task spawns asynchronous tasks to perform parallel reduction over @c init | |
and the elements in the range <tt>[first, last)</tt>. | |
The reduced result is stored in @c init.
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
for(auto itr=first; itr!=last; itr++) { | |
init = bop(init, *itr); | |
} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelReduction for details. | |
*/ | |
template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner> | |
Task reduce(B first, E last, T& init, O bop, P&& part = P()); | |
// ------------------------------------------------------------------------ | |
// transform and reduction
// ------------------------------------------------------------------------ | |
/** | |
@brief constructs an STL-styled parallel transform-reduce task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam T result type | |
@tparam BOP binary reducer type | |
@tparam UOP unary transformion type | |
@tparam P partitioner type (default tf::GuidedPartitioner) | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@param init initial value of the reduction and the storage for the reduced result | |
@param bop binary operator that will be applied in unspecified order to the results of @c uop | |
@param uop unary operator that will be applied to transform each element in the range to the result type | |
@param part partitioning algorithm to schedule parallel iterations | |
@return a tf::Task handle | |
The task spawns asynchronous tasks to perform parallel reduction over @c init and | |
the transformed elements in the range <tt>[first, last)</tt>. | |
The reduced result is stored in @c init.
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
for(auto itr=first; itr!=last; itr++) { | |
init = bop(init, uop(*itr)); | |
} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelReduction for details. | |
*/ | |
template < | |
typename B, typename E, typename T, typename BOP, typename UOP, typename P = GuidedPartitioner | |
> | |
Task transform_reduce(B first, E last, T& init, BOP bop, UOP uop, P&& part = P()); | |
// ------------------------------------------------------------------------ | |
// scan | |
// ------------------------------------------------------------------------ | |
/** | |
@brief creates an STL-styled parallel inclusive-scan task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam D destination iterator type | |
@tparam BOP summation operator type | |
@param first start of input range | |
@param last end of input range | |
@param d_first start of output range (may be the same as input range) | |
@param bop function to perform summation | |
Performs the cumulative sum (aka prefix sum, aka scan) of the input range | |
and writes the result to the output range. | |
Each element of the output range contains the | |
running total of all earlier elements using the given binary operator | |
for summation. | |
This function generates an @em inclusive scan, meaning that the N-th element | |
of the output range is the sum of the first N input elements, | |
so the N-th input element is included. | |
@code{.cpp} | |
std::vector<int> input = {1, 2, 3, 4, 5}; | |
taskflow.inclusive_scan( | |
input.begin(), input.end(), input.begin(), std::plus<int>{} | |
); | |
executor.run(taskflow).wait(); | |
// input is {1, 3, 6, 10, 15} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelScan for details. | |
*/ | |
template <typename B, typename E, typename D, typename BOP> | |
Task inclusive_scan(B first, E last, D d_first, BOP bop); | |
/** | |
@brief creates an STL-styled parallel inclusive-scan task with an initial value | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam D destination iterator type | |
@tparam BOP summation operator type | |
@tparam T initial value type | |
@param first start of input range | |
@param last end of input range | |
@param d_first start of output range (may be the same as input range) | |
@param bop function to perform summation | |
@param init initial value | |
Performs the cumulative sum (aka prefix sum, aka scan) of the input range | |
and writes the result to the output range. | |
Each element of the output range contains the | |
running total of all earlier elements (and the initial value) | |
using the given binary operator for summation. | |
This function generates an @em inclusive scan, meaning the N-th element | |
of the output range is the sum of the first N input elements, | |
so the N-th input element is included. | |
@code{.cpp} | |
std::vector<int> input = {1, 2, 3, 4, 5}; | |
taskflow.inclusive_scan( | |
input.begin(), input.end(), input.begin(), std::plus<int>{}, -1 | |
); | |
executor.run(taskflow).wait(); | |
// input is {0, 2, 5, 9, 14} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelScan for details. | |
*/ | |
template <typename B, typename E, typename D, typename BOP, typename T> | |
Task inclusive_scan(B first, E last, D d_first, BOP bop, T init); | |
/** | |
@brief creates an STL-styled parallel exclusive-scan task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam D destination iterator type | |
@tparam T initial value type | |
@tparam BOP summation operator type | |
@param first start of input range | |
@param last end of input range | |
@param d_first start of output range (may be the same as input range) | |
@param init initial value | |
@param bop function to perform summation | |
Performs the cumulative sum (aka prefix sum, aka scan) of the input range | |
and writes the result to the output range. | |
Each element of the output range contains the | |
running total of all earlier elements (and the initial value) | |
using the given binary operator for summation. | |
This function generates an @em exclusive scan, meaning the N-th element | |
of the output range is the sum of the first N-1 input elements, | |
so the N-th input element is not included. | |
@code{.cpp} | |
std::vector<int> input = {1, 2, 3, 4, 5}; | |
taskflow.exclusive_scan( | |
input.begin(), input.end(), input.begin(), -1, std::plus<int>{} | |
); | |
executor.run(taskflow).wait(); | |
// input is {-1, 0, 2, 5, 9} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelScan for details. | |
*/ | |
template <typename B, typename E, typename D, typename T, typename BOP> | |
Task exclusive_scan(B first, E last, D d_first, T init, BOP bop); | |
// ------------------------------------------------------------------------ | |
// transform scan | |
// ------------------------------------------------------------------------ | |
/** | |
@brief creates an STL-styled parallel transform-inclusive scan task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam D destination iterator type | |
@tparam BOP summation operator type | |
@tparam UOP transform operator type | |
@param first start of input range | |
@param last end of input range | |
@param d_first start of output range (may be the same as input range) | |
@param bop function to perform summation | |
@param uop function to transform elements of the input range | |
Write the cumulative sum (aka prefix sum, aka scan) of the input range | |
to the output range. Each element of the output range contains the | |
running total of all earlier elements | |
using @c uop to transform the input elements | |
and using @c bop for summation. | |
This function generates an @em inclusive scan, meaning the Nth element | |
of the output range is the sum of the first N input elements, | |
so the Nth input element is included. | |
@code{.cpp} | |
std::vector<int> input = {1, 2, 3, 4, 5}; | |
taskflow.transform_inclusive_scan( | |
input.begin(), input.end(), input.begin(), std::plus<int>{}, | |
[] (int item) { return -item; } | |
); | |
executor.run(taskflow).wait(); | |
// input is {-1, -3, -6, -10, -15} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelScan for details. | |
*/ | |
template <typename B, typename E, typename D, typename BOP, typename UOP> | |
Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop); | |
/** | |
@brief creates an STL-styled parallel transform-inclusive scan task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam D destination iterator type | |
@tparam BOP summation operator type | |
@tparam UOP transform operator type | |
@tparam T initial value type | |
@param first start of input range | |
@param last end of input range | |
@param d_first start of output range (may be the same as input range) | |
@param bop function to perform summation | |
@param uop function to transform elements of the input range | |
@param init initial value | |
Write the cumulative sum (aka prefix sum, aka scan) of the input range | |
to the output range. Each element of the output range contains the | |
running total of all earlier elements (including an initial value) | |
using @c uop to transform the input elements | |
and using @c bop for summation. | |
This function generates an @em inclusive scan, meaning the Nth element | |
of the output range is the sum of the first N input elements, | |
so the Nth input element is included. | |
@code{.cpp} | |
std::vector<int> input = {1, 2, 3, 4, 5}; | |
taskflow.transform_inclusive_scan( | |
input.begin(), input.end(), input.begin(), std::plus<int>{}, | |
[] (int item) { return -item; }, | |
-1 | |
); | |
executor.run(taskflow).wait(); | |
// input is {-2, -4, -7, -11, -16} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelScan for details. | |
*/ | |
template <typename B, typename E, typename D, typename BOP, typename UOP, typename T> | |
Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init); | |
/** | |
@brief creates an STL-styled parallel transform-exclusive scan task | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam D destination iterator type | |
@tparam BOP summation operator type | |
@tparam UOP transform operator type | |
@tparam T initial value type | |
@param first start of input range | |
@param last end of input range | |
@param d_first start of output range (may be the same as input range) | |
@param bop function to perform summation | |
@param uop function to transform elements of the input range | |
@param init initial value | |
Write the cumulative sum (aka prefix sum, aka scan) of the input range | |
to the output range. Each element of the output range contains the | |
running total of all earlier elements (including an initial value) | |
using @c uop to transform the input elements | |
and using @c bop for summation. | |
This function generates an @em exclusive scan, meaning the Nth element | |
of the output range is the sum of the first N-1 input elements, | |
so the Nth input element is not included. | |
@code{.cpp} | |
std::vector<int> input = {1, 2, 3, 4, 5}; | |
taskflow.transform_exclusive_scan( | |
input.begin(), input.end(), input.begin(), -1, std::plus<int>{}, | |
[](int item) { return -item; } | |
); | |
executor.run(taskflow).wait(); | |
// input is {-1, -2, -4, -7, -11} | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
Please refer to @ref ParallelScan for details. | |
*/ | |
template <typename B, typename E, typename D, typename T, typename BOP, typename UOP> | |
Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop); | |
// ------------------------------------------------------------------------ | |
// find | |
// ------------------------------------------------------------------------ | |
/** | |
@brief constructs a task to perform STL-styled find-if algorithm | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam T resulting iterator type | |
@tparam UOP unary predicate type | |
@tparam P partitioner type | |
@param first start of the input range | |
@param last end of the input range | |
@param result resulting iterator to the found element in the input range | |
@param predicate unary predicate which returns @c true for the required element | |
@param part partitioning algorithm (default tf::GuidedPartitioner) | |
Returns an iterator to the first element in the range <tt>[first, last)</tt> | |
that satisfies the given criteria (or last if there is no such iterator). | |
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
auto find_if(InputIt first, InputIt last, UnaryPredicate predicate) {
for (; first != last; ++first) { | |
if (predicate(*first)){ | |
return first; | |
} | |
} | |
return last; | |
} | |
@endcode | |
For example, the code below find the element that satisfies the given | |
criteria (value plus one is equal to 23) from an input range of 10 elements: | |
@code{.cpp} | |
std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8, 9, 11}; | |
std::vector<int>::iterator result; | |
taskflow.find_if(
  input.begin(), input.end(), result, [](int i){ return i+1 == 23; }
);
executor.run(taskflow).wait(); | |
assert(*result == 22); | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
*/ | |
template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner> | |
Task find_if(B first, E last, T& result, UOP predicate, P&& part = P()); | |
/** | |
@brief constructs a task to perform STL-styled find-if-not algorithm | |
@tparam B beginning iterator type | |
@tparam E ending iterator type | |
@tparam T resulting iterator type | |
@tparam UOP unary predicate type | |
@tparam P partitioner type | |
@param first start of the input range | |
@param last end of the input range | |
@param result resulting iterator to the found element in the input range | |
@param predicate unary predicate which returns @c false for the required element | |
@param part partitioning algorithm (default tf::GuidedPartitioner) | |
Returns an iterator to the first element in the range <tt>[first, last)</tt> | |
that satisfies the given criteria (or last if there is no such iterator). | |
This method is equivalent to the parallel execution of the following loop: | |
@code{.cpp} | |
auto find_if_not(InputIt first, InputIt last, UnaryPredicate predicate) {
for (; first != last; ++first) { | |
if (!predicate(*first)){ | |
return first; | |
} | |
} | |
return last; | |
} | |
@endcode | |
For example, the code below find the element that satisfies the given | |
criteria (value is not equal to 1) from an input range of 10 elements: | |
@code{.cpp} | |
std::vector<int> input = {1, 1, 1, 1, 22, 1, 1, 1, 1, 1}; | |
std::vector<int>::iterator result; | |
taskflow.find_if_not(
  input.begin(), input.end(), result, [](int i){ return i == 1; }
);
executor.run(taskflow).wait(); | |
assert(*result == 22); | |
@endcode | |
Iterators are templated to enable stateful range using std::reference_wrapper. | |
*/ | |
template <typename B, typename E, typename T, typename UOP,typename P = GuidedPartitioner> | |
Task find_if_not(B first, E last, T& result, UOP predicate, P&& part = P()); | |
/**
@brief constructs a task to perform STL-styled min-element algorithm
@tparam B beginning iterator type
@tparam E ending iterator type
@tparam T resulting iterator type
@tparam C comparator type
@tparam P partitioner type
@param first start of the input range
@param last end of the input range
@param result resulting iterator to the found element in the input range
@param comp comparison function object
@param part partitioning algorithm (e.g., tf::GuidedPartitioner); unlike
      tf::FlowBuilder::find_if_not, this overload declares no default
      partitioner, so the argument must be supplied explicitly
Finds the smallest element in the <tt>[first, last)</tt>
using the given comparison function object.
The iterator to that smallest element is stored in @c result.
This method is equivalent to the parallel execution of the following loop:
@code{.cpp}
if (first == last) {
  return last;
}
auto smallest = first;
++first;
for (; first != last; ++first) {
  if (comp(*first, *smallest)) {
    smallest = first;
  }
}
return smallest;
@endcode
For example, the code below finds the smallest element from an input
range of 10 elements.
@code{.cpp}
std::vector<int> input = {1, 1, 1, 1, 1, -1, 1, 1, 1, 1};
std::vector<int>::iterator result;
taskflow.min_element(
  input.begin(), input.end(), std::less<int>(), result
);
executor.run(taskflow).wait();
assert(*result == -1);
@endcode
Iterators are templated to enable stateful range using std::reference_wrapper.
*/
template <typename B, typename E, typename T, typename C, typename P>
Task min_element(B first, E last, T& result, C comp, P&& part);
/**
@brief constructs a task to perform STL-styled max-element algorithm
@tparam B beginning iterator type
@tparam E ending iterator type
@tparam T resulting iterator type
@tparam C comparator type
@tparam P partitioner type
@param first start of the input range
@param last end of the input range
@param result resulting iterator to the found element in the input range
@param comp comparison function object
@param part partitioning algorithm (e.g., tf::GuidedPartitioner); unlike
      tf::FlowBuilder::find_if_not, this overload declares no default
      partitioner, so the argument must be supplied explicitly
Finds the largest element in the <tt>[first, last)</tt>
using the given comparison function object.
The iterator to that largest element is stored in @c result.
Note that, as with std::max_element, the comparator is invoked as
<tt>comp(*largest, *first)</tt> (arguments reversed relative to min_element).
This method is equivalent to the parallel execution of the following loop:
@code{.cpp}
if (first == last){
  return last;
}
auto largest = first;
++first;
for (; first != last; ++first) {
  if (comp(*largest, *first)) {
    largest = first;
  }
}
return largest;
@endcode
For example, the code below finds the largest element from an input
range of 10 elements.
@code{.cpp}
std::vector<int> input = {1, 1, 1, 1, 1, 2, 1, 1, 1, 1};
std::vector<int>::iterator result;
taskflow.max_element(
  input.begin(), input.end(), std::less<int>(), result
);
executor.run(taskflow).wait();
assert(*result == 2);
@endcode
Iterators are templated to enable stateful range using std::reference_wrapper.
*/
template <typename B, typename E, typename T, typename C, typename P>
Task max_element(B first, E last, T& result, C comp, P&& part);
// ------------------------------------------------------------------------
// sort
// ------------------------------------------------------------------------
/**
@brief constructs a dynamic task to perform STL-styled parallel sort
@tparam B beginning iterator type (random-accessible)
@tparam E ending iterator type (random-accessible)
@tparam C comparator type
@param first iterator to the beginning (inclusive)
@param last iterator to the end (exclusive)
@param cmp comparison operator
The task spawns asynchronous tasks to sort elements in the range
<tt>[first, last)</tt> in parallel.
Iterators are templated to enable stateful range using std::reference_wrapper.
Please refer to @ref ParallelSort for details.
*/
template <typename B, typename E, typename C>
Task sort(B first, E last, C cmp);
/**
@brief constructs a dynamic task to perform STL-styled parallel sort using
       the @c std::less<T> comparator, where @c T is the element type
@tparam B beginning iterator type (random-accessible)
@tparam E ending iterator type (random-accessible)
@param first iterator to the beginning (inclusive)
@param last iterator to the end (exclusive)
The task spawns asynchronous tasks to sort elements in the range
<tt>[first, last)</tt> in parallel using the @c std::less<T> comparator,
where @c T is the dereferenced iterator type.
Iterators are templated to enable stateful range using std::reference_wrapper.
Please refer to @ref ParallelSort for details.
*/
template <typename B, typename E>
Task sort(B first, E last);
protected:
/**
@brief associated graph object
*/
Graph& _graph;
private:
// chains each element of the given task container to its successor;
// shared implementation behind the public linearize overloads
template <typename L>
void _linearize(L&);
};
// Constructor
// Binds this builder to the graph object that stores all tasks it creates.
inline FlowBuilder::FlowBuilder(Graph& graph) : _graph(graph) {
}
// Function: emplace | |
template <typename C, std::enable_if_t<is_static_task_v<C>, void>*> | |
Task FlowBuilder::emplace(C&& c) { | |
return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::Static>{}, std::forward<C>(c) | |
)); | |
} | |
// Function: emplace | |
template <typename C, std::enable_if_t<is_dynamic_task_v<C>, void>*> | |
Task FlowBuilder::emplace(C&& c) { | |
return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::Dynamic>{}, std::forward<C>(c) | |
)); | |
} | |
// Function: emplace | |
template <typename C, std::enable_if_t<is_condition_task_v<C>, void>*> | |
Task FlowBuilder::emplace(C&& c) { | |
return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::Condition>{}, std::forward<C>(c) | |
)); | |
} | |
// Function: emplace | |
template <typename C, std::enable_if_t<is_multi_condition_task_v<C>, void>*> | |
Task FlowBuilder::emplace(C&& c) { | |
return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::MultiCondition>{}, std::forward<C>(c) | |
)); | |
} | |
// Function: emplace | |
template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>*> | |
auto FlowBuilder::emplace(C&&... cs) { | |
return std::make_tuple(emplace(std::forward<C>(cs))...); | |
} | |
// Function: erase
// Removes the given task from the graph: all edges referencing it are
// unlinked from its neighbors' adjacency lists before the node is destroyed.
inline void FlowBuilder::erase(Task task) {
  Node* victim = task._node;
  // an empty task handle refers to no node; nothing to erase
  if(victim == nullptr) {
    return;
  }
  // drops the first edge to `victim` found in the given adjacency list
  auto unlink = [victim] (auto& edges) {
    auto itr = std::find(edges.begin(), edges.end(), victim);
    if(itr != edges.end()) {
      edges.erase(itr);
    }
  };
  // each dependent no longer lists victim as a successor
  task.for_each_dependent([&] (Task dependent) {
    unlink(dependent._node->_successors);
  });
  // each successor no longer lists victim as a dependent
  task.for_each_successor([&] (Task successor) {
    unlink(successor._node->_dependents);
  });
  _graph._erase(victim);
}
// Function: composed_of | |
template <typename T> | |
Task FlowBuilder::composed_of(T& object) { | |
auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::Module>{}, object | |
); | |
return Task(node); | |
} | |
// Function: placeholder
// Creates an empty node with no work attached; the callable can be
// assigned to the returned task later.
inline Task FlowBuilder::placeholder() {
  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
    std::in_place_type_t<Node::Placeholder>{}
  ));
}
// Procedure: _linearize | |
template <typename L> | |
void FlowBuilder::_linearize(L& keys) { | |
auto itr = keys.begin(); | |
auto end = keys.end(); | |
if(itr == end) { | |
return; | |
} | |
auto nxt = itr; | |
for(++nxt; nxt != end; ++nxt, ++itr) { | |
itr->_node->_precede(nxt->_node); | |
} | |
} | |
// Procedure: linearize
// Chains the tasks in the vector so each runs before the next.
inline void FlowBuilder::linearize(std::vector<Task>& keys) {
  _linearize(keys);
}
// Procedure: linearize
// Overload accepting a braced list of tasks, e.g. linearize({A, B, C}).
inline void FlowBuilder::linearize(std::initializer_list<Task> keys) {
  _linearize(keys);
}
// ---------------------------------------------------------------------------- | |
/** | |
@class Subflow | |
@brief class to construct a subflow graph from the execution of a dynamic task | |
tf::Subflow is a derived class from tf::Runtime with a specialized mechanism | |
to manage the execution of a child graph. | |
By default, a subflow automatically @em joins its parent node. | |
You may explicitly join or detach a subflow by calling tf::Subflow::join | |
or tf::Subflow::detach, respectively. | |
The following example creates a taskflow graph that spawns a subflow from | |
the execution of task @c B, and the subflow contains three tasks, @c B1, | |
@c B2, and @c B3, where @c B3 runs after @c B1 and @c B2. | |
@code{.cpp} | |
// create three static tasks | |
tf::Task A = taskflow.emplace([](){}).name("A"); | |
tf::Task C = taskflow.emplace([](){}).name("C"); | |
tf::Task D = taskflow.emplace([](){}).name("D"); | |
// create a subflow graph (dynamic tasking) | |
tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) { | |
tf::Task B1 = subflow.emplace([](){}).name("B1"); | |
tf::Task B2 = subflow.emplace([](){}).name("B2"); | |
tf::Task B3 = subflow.emplace([](){}).name("B3"); | |
B1.precede(B3); | |
B2.precede(B3); | |
}).name("B"); | |
A.precede(B); // B runs after A | |
A.precede(C); // C runs after A | |
B.precede(D); // D runs after B | |
C.precede(D); // D runs after C | |
@endcode | |
*/ | |
class Subflow : public FlowBuilder,
                public Runtime {
  friend class Executor;
  friend class FlowBuilder;
  friend class Runtime;
  public:
    /**
    @brief enables the subflow to join its parent task
    Performs an immediate action to join the subflow. Once the subflow is joined,
    it is considered finished and you may not modify the subflow anymore.
    @code{.cpp}
    taskflow.emplace([](tf::Subflow& sf){
      sf.emplace([](){});
      sf.join();  // join the subflow of one task
    });
    @endcode
    Only the worker that spawns this subflow can join it.
    */
    void join();
    /**
    @brief enables the subflow to detach from its parent task
    Performs an immediate action to detach the subflow. Once the subflow is detached,
    it is considered finished and you may not modify the subflow anymore.
    @code{.cpp}
    taskflow.emplace([](tf::Subflow& sf){
      sf.emplace([](){});
      sf.detach();
    });
    @endcode
    Only the worker that spawns this subflow can detach it.
    */
    void detach();
    /**
    @brief resets the subflow to a joinable state
    @param clear_graph specifies whether to clear the associated graph (default @c true)
    Clears the underlying task graph depending on the
    given variable @c clear_graph (default @c true) and then
    updates the subflow to a joinable state.
    */
    void reset(bool clear_graph = true);
    /**
    @brief queries if the subflow is joinable
    This member function queries if the subflow is joinable.
    When a subflow is joined or detached, it becomes not joinable.
    @code{.cpp}
    taskflow.emplace([](tf::Subflow& sf){
      sf.emplace([](){});
      std::cout << sf.joinable() << '\n';  // true
      sf.join();
      std::cout << sf.joinable() << '\n';  // false
    });
    @endcode
    */
    bool joinable() const noexcept;
  private:
    // true until the subflow is joined or detached; reset() restores it
    bool _joinable {true};
    // only the executor machinery may construct a subflow
    Subflow(Executor&, Worker&, Node*, Graph&);
};
// Constructor
// Wires the subflow to its executor/worker context (Runtime base) and to
// the child graph it will populate (FlowBuilder base).
inline Subflow::Subflow(
  Executor& executor, Worker& worker, Node* parent, Graph& graph
) :
  FlowBuilder {graph},
  Runtime {executor, worker, parent} {
  // assert(_parent != nullptr);
}
// Function: joinable
inline bool Subflow::joinable() const noexcept {
  return _joinable;
}
// Procedure: reset | |
inline void Subflow::reset(bool clear_graph) { | |
if(clear_graph) { | |
_graph._clear(); | |
} | |
_joinable = true; | |
} | |
} // end of namespace tf. --------------------------------------------------- | |
/** | |
@file taskflow/core/taskflow.hpp | |
@brief taskflow include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
/** | |
@class Taskflow | |
@brief class to create a taskflow object | |
A %taskflow manages a task dependency graph where each task represents a | |
callable object (e.g., @std_lambda, @std_function) and an edge represents a | |
dependency between two tasks. A task is one of the following types: | |
1. static task : the callable constructible from | |
@c std::function<void()> | |
2. dynamic task : the callable constructible from | |
@c std::function<void(tf::Subflow&)> | |
3. condition task : the callable constructible from | |
@c std::function<int()> | |
4. multi-condition task: the callable constructible from | |
@c %std::function<tf::SmallVector<int>()> | |
5. module task : the task constructed from tf::Taskflow::composed_of | |
@c std::function<void(tf::Runtime&)> | |
Each task is a basic computation unit and is run by one worker thread | |
from an executor. | |
The following example creates a simple taskflow graph of four static tasks, | |
@c A, @c B, @c C, and @c D, where | |
@c A runs before @c B and @c C and | |
@c D runs after @c B and @c C. | |
@code{.cpp} | |
tf::Executor executor; | |
tf::Taskflow taskflow("simple"); | |
tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; }); | |
tf::Task B = taskflow.emplace([](){ std::cout << "TaskB\n"; }); | |
tf::Task C = taskflow.emplace([](){ std::cout << "TaskC\n"; }); | |
tf::Task D = taskflow.emplace([](){ std::cout << "TaskD\n"; }); | |
A.precede(B, C); // A runs before B and C | |
D.succeed(B, C); // D runs after B and C | |
executor.run(taskflow).wait(); | |
@endcode | |
The taskflow object itself is NOT thread-safe. You should not
modify the graph while it is running, for example by
adding new tasks, adding new dependencies, or moving
the taskflow to another object.
To minimize the overhead of task creation, | |
our runtime leverages a global object pool to recycle | |
tasks in a thread-safe manner. | |
Please refer to @ref Cookbook to learn more about each task type | |
and how to submit a taskflow to an executor. | |
*/ | |
class Taskflow : public FlowBuilder {
  friend class Topology;
  friend class Executor;
  friend class FlowBuilder;
  // iterative DFS state used by the _dump helpers: `stack` holds
  // (parent node, graph) pairs still to print and `visited` assigns each
  // printed graph a stable id so module references can point at it
  struct Dumper {
    size_t id;
    std::stack<std::pair<const Node*, const Graph*>> stack;
    std::unordered_map<const Graph*, size_t> visited;
  };
  public:
    /**
    @brief constructs a taskflow with the given name
    @code{.cpp}
    tf::Taskflow taskflow("My Taskflow");
    std::cout << taskflow.name();   // "My Taskflow"
    @endcode
    */
    Taskflow(const std::string& name);
    /**
    @brief constructs a taskflow
    */
    Taskflow();
    /**
    @brief constructs a taskflow from a moved taskflow
    Constructing a taskflow @c taskflow1 from a moved taskflow @c taskflow2 will
    migrate the graph of @c taskflow2 to @c taskflow1.
    After the move, @c taskflow2 will become empty.
    @code{.cpp}
    tf::Taskflow taskflow1(std::move(taskflow2));
    assert(taskflow2.empty());
    @endcode
    Notice that @c taskflow2 should not be running in an executor
    during the move operation, or the behavior is undefined.
    */
    Taskflow(Taskflow&& rhs);
    /**
    @brief move assignment operator
    Moving a taskflow @c taskflow2 to another taskflow @c taskflow1 will destroy
    the existing graph of @c taskflow1 and assign it the graph of @c taskflow2.
    After the move, @c taskflow2 will become empty.
    @code{.cpp}
    taskflow1 = std::move(taskflow2);
    assert(taskflow2.empty());
    @endcode
    Notice that both @c taskflow1 and @c taskflow2 should not be running
    in an executor during the move operation, or the behavior is undefined.
    */
    Taskflow& operator = (Taskflow&& rhs);
    /**
    @brief default destructor
    When the destructor is called, all tasks and their associated data
    (e.g., captured data) will be destroyed.
    It is your responsibility to ensure all submitted execution of this
    taskflow have completed before destroying it.
    For instance, the following code results in undefined behavior
    since the executor may still be running the taskflow while
    it is destroyed after the block.
    @code{.cpp}
    {
      tf::Taskflow taskflow;
      executor.run(taskflow);
    }
    @endcode
    To fix the problem, we must wait for the execution to complete
    before destroying the taskflow.
    @code{.cpp}
    {
      tf::Taskflow taskflow;
      executor.run(taskflow).wait();
    }
    @endcode
    */
    ~Taskflow() = default;
    /**
    @brief dumps the taskflow to a DOT format through a std::ostream target
    @code{.cpp}
    taskflow.dump(std::cout);  // dump the graph to the standard output
    std::ofstream ofs("output.dot");
    taskflow.dump(ofs);        // dump the graph to the file output.dot
    @endcode
    For dynamically spawned tasks, such as module tasks, subflow tasks,
    and GPU tasks, you need to run the taskflow first before you can
    dump the entire graph.
    @code{.cpp}
    tf::Task parent = taskflow.emplace([](tf::Subflow& sf){
      sf.emplace([](){ std::cout << "child\n"; });
    });
    taskflow.dump(std::cout);       // this dumps only the parent tasks
    executor.run(taskflow).wait();
    taskflow.dump(std::cout);       // this dumps both parent and child tasks
    @endcode
    */
    void dump(std::ostream& ostream) const;
    /**
    @brief dumps the taskflow to a std::string of DOT format
    This method is similar to tf::Taskflow::dump(std::ostream& ostream),
    but returning a string of the graph in DOT format.
    */
    std::string dump() const;
    /**
    @brief queries the number of tasks
    */
    size_t num_tasks() const;
    /**
    @brief queries the emptiness of the taskflow
    An empty taskflow has no tasks. That is, the return of
    tf::Taskflow::num_tasks is zero.
    */
    bool empty() const;
    /**
    @brief assigns a name to the taskflow
    @code{.cpp}
    taskflow.name("assign another name");
    @endcode
    */
    void name(const std::string&);
    /**
    @brief queries the name of the taskflow
    @code{.cpp}
    std::cout << "my name is: " << taskflow.name();
    @endcode
    */
    const std::string& name() const;
    /**
    @brief clears the associated task dependency graph
    When you clear a taskflow, all tasks and their associated data
    (e.g., captured data in task callables) will be destroyed.
    The behavior of clearing a running taskflow is undefined.
    */
    void clear();
    /**
    @brief applies a visitor to each task in the taskflow
    A visitor is a callable that takes an argument of type tf::Task
    and returns nothing. The following example iterates each task in a
    taskflow and prints its name:
    @code{.cpp}
    taskflow.for_each_task([](tf::Task task){
      std::cout << task.name() << '\n';
    });
    @endcode
    */
    template <typename V>
    void for_each_task(V&& visitor) const;
    /**
    @brief returns a reference to the underlying graph object
    A graph object (of type tf::Graph) is the ultimate storage for the
    task dependency graph and should only be used as an opaque
    data structure to interact with the executor (e.g., composition).
    */
    Graph& graph();
  private:
    // guards _name/_graph/_topologies/_satellite against concurrent
    // access from executor threads (mutable so const dumps could lock)
    mutable std::mutex _mutex;
    std::string _name;
    Graph _graph;
    // pending/running submissions of this taskflow, managed by the executor
    std::queue<std::shared_ptr<Topology>> _topologies;
    // when set, position of this taskflow inside an executor-owned list;
    // presumably used for executor-managed lifetime -- see executor code
    std::optional<std::list<Taskflow>::iterator> _satellite;
    void _dump(std::ostream&, const Graph*) const;
    void _dump(std::ostream&, const Node*, Dumper&) const;
    void _dump(std::ostream&, const Graph*, Dumper&) const;
};
// Constructor
// NOTE: passing _graph to the FlowBuilder base before _graph itself is
// constructed is safe here -- only its address is bound to the reference.
inline Taskflow::Taskflow(const std::string& name) :
  FlowBuilder {_graph},
  _name {name} {
}
// Constructor
inline Taskflow::Taskflow() : FlowBuilder{_graph} {
}
// Move constructor
// Locks only rhs: `this` is under construction and not yet visible to
// other threads. The satellite slot is copied then cleared on rhs so
// ownership of the executor-list position transfers.
inline Taskflow::Taskflow(Taskflow&& rhs) : FlowBuilder{_graph} {
  std::scoped_lock<std::mutex> lock(rhs._mutex);
  _name = std::move(rhs._name);
  _graph = std::move(rhs._graph);
  _topologies = std::move(rhs._topologies);
  _satellite = rhs._satellite;
  rhs._satellite.reset();
}
// Move assignment
// Self-assignment guard is required: scoped_lock would deadlock-free lock
// the same mutex twice otherwise. Both mutexes are acquired atomically.
inline Taskflow& Taskflow::operator = (Taskflow&& rhs) {
  if(this != &rhs) {
    std::scoped_lock<std::mutex, std::mutex> lock(_mutex, rhs._mutex);
    _name = std::move(rhs._name);
    _graph = std::move(rhs._graph);
    _topologies = std::move(rhs._topologies);
    _satellite = rhs._satellite;
    rhs._satellite.reset();
  }
  return *this;
}
// Procedure: clear
inline void Taskflow::clear() {
  _graph._clear();
}
// Function: num_tasks
inline size_t Taskflow::num_tasks() const {
  return _graph.size();
}
// Function: empty
inline bool Taskflow::empty() const {
  return _graph.empty();
}
// Function: name (setter)
inline void Taskflow::name(const std::string &name) {
  _name = name;
}
// Function: name (getter)
inline const std::string& Taskflow::name() const {
  return _name;
}
// Function: graph
inline Graph& Taskflow::graph() {
  return _graph;
}
// Function: for_each_task | |
template <typename V> | |
void Taskflow::for_each_task(V&& visitor) const { | |
for(size_t i=0; i<_graph._nodes.size(); ++i) { | |
visitor(Task(_graph._nodes[i])); | |
} | |
} | |
// Procedure: dump
// Convenience overload: renders the DOT output into a string by
// delegating to the stream-based dump.
inline std::string Taskflow::dump() const {
  std::ostringstream oss;
  dump(oss);
  return oss.str();
}
// Function: dump
// Emits the top-level DOT digraph wrapper and lets _dump fill in the body.
inline void Taskflow::dump(std::ostream& os) const {
  os << "digraph Taskflow {\n";
  _dump(os, &_graph);
  os << "}\n";
}
// Procedure: _dump
// Iterative worklist traversal over all reachable graphs (the top-level
// graph plus any module graphs discovered while printing). Each graph is
// emitted as a DOT subgraph cluster; `visited` both de-duplicates graphs
// and assigns the id used in module labels (m0, m1, ...).
inline void Taskflow::_dump(std::ostream& os, const Graph* top) const {
  Dumper dumper;
  dumper.id = 0;
  dumper.stack.push({nullptr, top});
  dumper.visited[top] = dumper.id++;
  while(!dumper.stack.empty()) {
    auto [p, f] = dumper.stack.top();
    dumper.stack.pop();
    os << "subgraph cluster_p" << f << " {\nlabel=\"";
    // n-level module
    if(p) {
      os << 'm' << dumper.visited[f];
    }
    // top-level taskflow graph: label with the name, or the object address
    // when no name was assigned
    else {
      os << "Taskflow: ";
      if(_name.empty()) os << 'p' << this;
      else os << _name;
    }
    os << "\";\n";
    _dump(os, f, dumper);
    os << "}\n";
  }
}
// Procedure: _dump
// Emits one node plus its outgoing edges in DOT. Nodes are identified by
// their address prefixed with 'p'; unnamed nodes fall back to that address
// as their label.
inline void Taskflow::_dump(
  std::ostream& os, const Node* node, Dumper& dumper
) const {
  os << 'p' << node << "[label=\"";
  if(node->_name.empty()) os << 'p' << node;
  else os << node->_name;
  os << "\" ";
  // shape for node: conditioners are drawn as filled diamonds
  switch(node->_handle.index()) {
    case Node::CONDITION:
    case Node::MULTI_CONDITION:
      os << "shape=diamond color=black fillcolor=aquamarine style=filled";
    break;
    default:
    break;
  }
  os << "];\n";
  for(size_t s=0; s<node->_successors.size(); ++s) {
    if(node->_is_conditioner()) {
      // case edge is dashed and labeled with the branch index the
      // condition callable must return to take it
      os << 'p' << node << " -> p" << node->_successors[s]
         << " [style=dashed label=\"" << s << "\"];\n";
    } else {
      os << 'p' << node << " -> p" << node->_successors[s] << ";\n";
    }
  }
  // subflow join node: a leaf of a dynamic task's subgraph implicitly
  // joins back to its parent, so draw that edge explicitly
  if(node->_parent && node->_parent->_handle.index() == Node::DYNAMIC &&
     node->_successors.size() == 0
  ) {
    os << 'p' << node << " -> p" << node->_parent << ";\n";
  }
  // node info: a dynamic task that has already spawned a subflow gets its
  // subgraph rendered inline as a blue cluster (empty until first run)
  switch(node->_handle.index()) {
    case Node::DYNAMIC: {
      auto& sbg = std::get_if<Node::Dynamic>(&node->_handle)->subgraph;
      if(!sbg.empty()) {
        os << "subgraph cluster_p" << node << " {\nlabel=\"Subflow: ";
        if(node->_name.empty()) os << 'p' << node;
        else os << node->_name;
        os << "\";\n" << "color=blue\n";
        _dump(os, &sbg, dumper);
        os << "}\n";
      }
    }
    break;
    default:
    break;
  }
}
// Procedure: _dump
// Emits every node of one graph. Regular nodes are delegated to the
// node-level _dump; module nodes are drawn as box3d references and their
// underlying graphs are queued on the dumper stack (once each) so the
// top-level traversal prints them later.
inline void Taskflow::_dump(
  std::ostream& os, const Graph* graph, Dumper& dumper
) const {
  for(const auto& n : graph->_nodes) {
    // regular task
    if(n->_handle.index() != Node::MODULE) {
      _dump(os, n, dumper);
    }
    // module task
    else {
      //auto module = &(std::get_if<Node::Module>(&n->_handle)->module);
      auto module = &(std::get_if<Node::Module>(&n->_handle)->graph);
      os << 'p' << n << "[shape=box3d, color=blue, label=\"";
      if(n->_name.empty()) os << 'p' << n;
      else os << n->_name;
      // first time we see this module graph: assign it an id and schedule
      // it for printing
      if(dumper.visited.find(module) == dumper.visited.end()) {
        dumper.visited[module] = dumper.id++;
        dumper.stack.push({n, module});
      }
      os << " [m" << dumper.visited[module] << "]\"];\n";
      for(const auto s : n->_successors) {
        os << 'p' << n << "->" << 'p' << s << ";\n";
      }
    }
  }
}
// ---------------------------------------------------------------------------- | |
// class definition: Future | |
// ---------------------------------------------------------------------------- | |
/** | |
@class Future | |
@brief class to access the result of an execution | |
tf::Future is a derived class from std::future that will eventually hold the | |
execution result of a submitted taskflow (tf::Executor::run) | |
or an asynchronous task (tf::Executor::async, tf::Executor::silent_async). | |
In addition to the base methods inherited from std::future, | |
you can call tf::Future::cancel to cancel the execution of the running taskflow | |
associated with this future object. | |
The following example cancels a submission of a taskflow that contains | |
1000 tasks each running one second. | |
@code{.cpp} | |
tf::Executor executor; | |
tf::Taskflow taskflow; | |
for(int i=0; i<1000; i++) { | |
taskflow.emplace([](){ | |
std::this_thread::sleep_for(std::chrono::seconds(1)); | |
}); | |
} | |
// submit the taskflow | |
tf::Future fu = executor.run(taskflow); | |
// request to cancel the submitted execution above | |
fu.cancel(); | |
// wait until the cancellation finishes | |
fu.get(); | |
@endcode | |
*/ | |
template <typename T>
class Future : public std::future<T> {
  friend class Executor;
  friend class Subflow;
  friend class Runtime;
  // either no associated topology (monostate, e.g. default-constructed)
  // or a weak reference to the submitted run's topology; weak so the
  // future never extends the topology's lifetime
  using handle_t = std::variant<
    std::monostate, std::weak_ptr<Topology>
  >;
  public:
    /**
    @brief default constructor
    */
    Future() = default;
    /**
    @brief disabled copy constructor
    */
    Future(const Future&) = delete;
    /**
    @brief default move constructor
    */
    Future(Future&&) = default;
    /**
    @brief disabled copy assignment
    */
    Future& operator = (const Future&) = delete;
    /**
    @brief default move assignment
    */
    Future& operator = (Future&&) = default;
    /**
    @brief cancels the execution of the running taskflow associated with
           this future object
    @return @c true if the execution can be cancelled or
            @c false if the execution has already completed
    When you request a cancellation, the executor will stop scheduling
    any tasks onwards. Tasks that are already running will continue to finish
    (non-preemptive).
    You can call tf::Future::wait to wait for the cancellation to complete.
    */
    bool cancel();
  private:
    handle_t _handle;
    // executors construct futures directly from a std::future plus the
    // associated topology handle
    template <typename P>
    Future(std::future<T>&&, P&&);
};
// Constructor
// Adopts the underlying std::future and records the topology handle used
// later by cancel().
template <typename T>
template <typename P>
Future<T>::Future(std::future<T>&& fu, P&& p) :
  std::future<T> {std::move(fu)},
  _handle {std::forward<P>(p)} {
}
// Function: cancel | |
template <typename T> | |
bool Future<T>::cancel() { | |
return std::visit([](auto&& arg){ | |
using P = std::decay_t<decltype(arg)>; | |
if constexpr(std::is_same_v<P, std::monostate>) { | |
return false; | |
} | |
else { | |
auto ptr = arg.lock(); | |
if(ptr) { | |
ptr->_is_cancelled.store(true, std::memory_order_relaxed); | |
return true; | |
} | |
return false; | |
} | |
}, _handle); | |
} | |
} // end of namespace tf. --------------------------------------------------- | |
/** | |
@file async_task.hpp | |
@brief asynchronous task include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// AsyncTask | |
// ---------------------------------------------------------------------------- | |
/** | |
@brief class to create a dependent asynchronous task | |
A tf::AsyncTask is a lightweight handle that retains @em shared ownership | |
of a dependent async task created by an executor. | |
This shared ownership ensures that the async task remains alive when | |
adding it to the dependency list of another async task, | |
thus avoiding the classical [ABA problem](https://en.wikipedia.org/wiki/ABA_problem). | |
@code{.cpp} | |
// main thread retains shared ownership of async task A | |
tf::AsyncTask A = executor.silent_dependent_async([](){}); | |
// task A remains alive (i.e., at least one ref count by the main thread) | |
// when being added to the dependency list of async task B | |
tf::AsyncTask B = executor.silent_dependent_async([](){}, A); | |
@endcode | |
Currently, tf::AsyncTask is implemented based on C++ smart pointer std::shared_ptr and | |
is considered cheap to copy or move as long as only a handful of objects | |
own it. | |
When a worker completes an async task, it will remove the task from the executor, | |
decrementing the number of shared owners by one. | |
If that counter reaches zero, the task is destroyed. | |
*/ | |
class AsyncTask {
  friend class FlowBuilder;
  friend class Runtime;
  friend class Taskflow;
  friend class TaskView;
  friend class Executor;
  public:
    /**
    @brief constructs an empty task handle
    */
    AsyncTask() = default;
    /**
    @brief destroys the managed asynchronous task if this is the last owner
    */
    ~AsyncTask() = default;
    /**
    @brief constructs a task that shares ownership of @c rhs
    */
    AsyncTask(const AsyncTask& rhs) = default;
    /**
    @brief move-constructs a task from @c rhs
    */
    AsyncTask(AsyncTask&& rhs) = default;
    /**
    @brief shares ownership of the task managed by @c rhs
    */
    AsyncTask& operator = (const AsyncTask& rhs) = default;
    /**
    @brief move-assigns the task from @c rhs
    */
    AsyncTask& operator = (AsyncTask&& rhs) = default;
    /**
    @brief queries whether this handle is empty, i.e., it stores a null
           shared pointer and manages no task
    */
    bool empty() const;
    /**
    @brief release the ownership
    */
    void reset();
    /**
    @brief obtains a hash value of the underlying node
    */
    size_t hash_value() const;
  private:
    // only friends (e.g., the executor) create handles bound to a node
    AsyncTask(std::shared_ptr<Node>);
    std::shared_ptr<Node> _node;
};
// Constructor | |
inline AsyncTask::AsyncTask(std::shared_ptr<Node> ptr) : _node {std::move(ptr)} { | |
} | |
// Function: empty | |
inline bool AsyncTask::empty() const { | |
return _node == nullptr; | |
} | |
// Function: reset | |
inline void AsyncTask::reset() { | |
_node.reset(); | |
} | |
// Function: hash_value | |
inline size_t AsyncTask::hash_value() const { | |
return std::hash<std::shared_ptr<Node>>{}(_node); | |
} | |
} // end of namespace tf ---------------------------------------------------- | |
/** | |
@file executor.hpp | |
@brief executor include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Executor Definition | |
// ---------------------------------------------------------------------------- | |
/** @class Executor | |
@brief class to create an executor for running a taskflow graph | |
An executor manages a set of worker threads to run one or multiple taskflows | |
using an efficient work-stealing scheduling algorithm. | |
@code{.cpp} | |
// Declare an executor and a taskflow | |
tf::Executor executor; | |
tf::Taskflow taskflow; | |
// Add three tasks into the taskflow | |
tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); | |
tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); | |
tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); | |
// Build precedence between tasks | |
A.precede(B, C); | |
tf::Future<void> fu = executor.run(taskflow); | |
fu.wait(); // block until the execution completes | |
executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); | |
executor.run_n(taskflow, 4); | |
executor.wait_for_all(); // block until all associated executions finish | |
executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); | |
executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); | |
@endcode | |
All the @c run methods are @em thread-safe. You can submit multiple | |
taskflows at the same time to an executor from different threads. | |
*/ | |
class Executor { | |
friend class FlowBuilder; | |
friend class Subflow; | |
friend class Runtime; | |
public: | |
/** | |
@brief constructs the executor with @c N worker threads | |
@param N number of workers (default std::thread::hardware_concurrency) | |
@param wix worker interface class to alter worker (thread) behaviors | |
The constructor spawns @c N worker threads to run tasks in a | |
work-stealing loop. The number of workers must be greater than zero | |
or an exception will be thrown. | |
By default, the number of worker threads is equal to the maximum | |
hardware concurrency returned by std::thread::hardware_concurrency. | |
Users can alter the worker behavior, such as changing thread affinity, | |
via deriving an instance from tf::WorkerInterface. | |
*/ | |
explicit Executor( | |
size_t N = std::thread::hardware_concurrency(), | |
std::shared_ptr<WorkerInterface> wix = nullptr | |
); | |
/** | |
@brief destructs the executor | |
The destructor calls Executor::wait_for_all to wait for all submitted | |
taskflows to complete and then notifies all worker threads to stop | |
and join these threads. | |
*/ | |
~Executor(); | |
/** | |
@brief runs a taskflow once | |
@param taskflow a tf::Taskflow object | |
@return a tf::Future that holds the result of the execution | |
This member function executes the given taskflow once and returns a tf::Future | |
object that eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run(taskflow); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
@attention | |
The executor does not own the given taskflow. It is your responsibility to | |
ensure the taskflow remains alive during its execution. | |
*/ | |
tf::Future<void> run(Taskflow& taskflow); | |
/** | |
@brief runs a moved taskflow once | |
@param taskflow a moved tf::Taskflow object | |
@return a tf::Future that holds the result of the execution | |
This member function executes a moved taskflow once and returns a tf::Future | |
object that eventually holds the result of the execution. | |
The executor will take care of the lifetime of the moved taskflow. | |
@code{.cpp} | |
tf::Future<void> future = executor.run(std::move(taskflow)); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
tf::Future<void> run(Taskflow&& taskflow); | |
/** | |
@brief runs a taskflow once and invoke a callback upon completion | |
@param taskflow a tf::Taskflow object | |
@param callable a callable object to be invoked after this run | |
@return a tf::Future that holds the result of the execution | |
This member function executes the given taskflow once and invokes the given | |
callable when the execution completes. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run(taskflow, [](){ std::cout << "done"; }); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
@attention | |
The executor does not own the given taskflow. It is your responsibility to | |
ensure the taskflow remains alive during its execution. | |
*/ | |
template<typename C> | |
tf::Future<void> run(Taskflow& taskflow, C&& callable); | |
/** | |
@brief runs a moved taskflow once and invoke a callback upon completion | |
@param taskflow a moved tf::Taskflow object | |
@param callable a callable object to be invoked after this run | |
@return a tf::Future that holds the result of the execution | |
This member function executes a moved taskflow once and invokes the given | |
callable when the execution completes. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
The executor will take care of the lifetime of the moved taskflow. | |
@code{.cpp} | |
tf::Future<void> future = executor.run( | |
std::move(taskflow), [](){ std::cout << "done"; } | |
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template<typename C> | |
tf::Future<void> run(Taskflow&& taskflow, C&& callable); | |
/** | |
@brief runs a taskflow for @c N times | |
@param taskflow a tf::Taskflow object | |
@param N number of runs | |
@return a tf::Future that holds the result of the execution | |
This member function executes the given taskflow @c N times and returns a tf::Future | |
object that eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_n(taskflow, 2); // run taskflow 2 times | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
@attention | |
The executor does not own the given taskflow. It is your responsibility to | |
ensure the taskflow remains alive during its execution. | |
*/ | |
tf::Future<void> run_n(Taskflow& taskflow, size_t N); | |
/** | |
@brief runs a moved taskflow for @c N times | |
@param taskflow a moved tf::Taskflow object | |
@param N number of runs | |
@return a tf::Future that holds the result of the execution | |
This member function executes a moved taskflow @c N times and returns a tf::Future | |
object that eventually holds the result of the execution. | |
The executor will take care of the lifetime of the moved taskflow. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_n( | |
std::move(taskflow), 2 // run the moved taskflow 2 times | |
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
tf::Future<void> run_n(Taskflow&& taskflow, size_t N); | |
/** | |
@brief runs a taskflow for @c N times and then invokes a callback | |
@param taskflow a tf::Taskflow | |
@param N number of runs | |
@param callable a callable object to be invoked after this run | |
@return a tf::Future that holds the result of the execution | |
This member function executes the given taskflow @c N times and invokes the given | |
callable when the execution completes. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run( | |
taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke | |
// the lambda to print "done" | |
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
@attention | |
The executor does not own the given taskflow. It is your responsibility to | |
ensure the taskflow remains alive during its execution. | |
*/ | |
template<typename C> | |
tf::Future<void> run_n(Taskflow& taskflow, size_t N, C&& callable); | |
/** | |
@brief runs a moved taskflow for @c N times and then invokes a callback | |
@param taskflow a moved tf::Taskflow | |
@param N number of runs | |
@param callable a callable object to be invoked after this run | |
@return a tf::Future that holds the result of the execution | |
This member function executes a moved taskflow @c N times and invokes the given | |
callable when the execution completes. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_n( | |
// run the moved taskflow 2 times and invoke the lambda to print "done" | |
std::move(taskflow), 2, [](){ std::cout << "done"; } | |
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template<typename C> | |
tf::Future<void> run_n(Taskflow&& taskflow, size_t N, C&& callable); | |
/** | |
@brief runs a taskflow multiple times until the predicate becomes true | |
@param taskflow a tf::Taskflow | |
@param pred a boolean predicate to return @c true for stop | |
@return a tf::Future that holds the result of the execution | |
This member function executes the given taskflow multiple times until | |
the predicate returns @c true. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_until( | |
taskflow, [](){ return rand()%10 == 0; }
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
@attention | |
The executor does not own the given taskflow. It is your responsibility to | |
ensure the taskflow remains alive during its execution. | |
*/ | |
template<typename P> | |
tf::Future<void> run_until(Taskflow& taskflow, P&& pred); | |
/** | |
@brief runs a moved taskflow and keeps running it | |
until the predicate becomes true | |
@param taskflow a moved tf::Taskflow object | |
@param pred a boolean predicate to return @c true for stop | |
@return a tf::Future that holds the result of the execution | |
This member function executes a moved taskflow multiple times until | |
the predicate returns @c true. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
The executor will take care of the lifetime of the moved taskflow. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_until( | |
std::move(taskflow), [](){ return rand()%10 == 0; }
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template<typename P> | |
tf::Future<void> run_until(Taskflow&& taskflow, P&& pred); | |
/** | |
@brief runs a taskflow multiple times until the predicate becomes true and | |
then invokes the callback | |
@param taskflow a tf::Taskflow | |
@param pred a boolean predicate to return @c true for stop | |
@param callable a callable object to be invoked after this run completes | |
@return a tf::Future that holds the result of the execution | |
This member function executes the given taskflow multiple times until | |
the predicate returns @c true and then invokes the given callable when | |
the execution completes. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_until( | |
taskflow, [](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
@attention | |
The executor does not own the given taskflow. It is your responsibility to | |
ensure the taskflow remains alive during its execution. | |
*/ | |
template<typename P, typename C> | |
tf::Future<void> run_until(Taskflow& taskflow, P&& pred, C&& callable); | |
/** | |
@brief runs a moved taskflow and keeps running | |
it until the predicate becomes true and then invokes the callback | |
@param taskflow a moved tf::Taskflow | |
@param pred a boolean predicate to return @c true for stop | |
@param callable a callable object to be invoked after this run completes | |
@return a tf::Future that holds the result of the execution | |
This member function executes a moved taskflow multiple times until | |
the predicate returns @c true and then invokes the given callable when | |
the execution completes. | |
This member function returns a tf::Future object that | |
eventually holds the result of the execution. | |
The executor will take care of the lifetime of the moved taskflow. | |
@code{.cpp} | |
tf::Future<void> future = executor.run_until( | |
std::move(taskflow), | |
[](){ return rand()%10 == 0; }, [](){ std::cout << "done"; }
); | |
// do something else | |
future.wait(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template<typename P, typename C> | |
tf::Future<void> run_until(Taskflow&& taskflow, P&& pred, C&& callable); | |
/** | |
@brief runs a target graph and waits until it completes using | |
an internal worker of this executor | |
@tparam T target type which has `tf::Graph& T::graph()` defined | |
@param target the target task graph object | |
The method runs a target graph which has `tf::Graph& T::graph()` defined | |
and waits until the execution completes. | |
Unlike the typical flow of calling `tf::Executor::run` series | |
plus waiting on the result, this method must be called by an internal | |
worker of this executor. The caller worker will participate in | |
the work-stealing loop of the scheduler, thereby avoiding potential
deadlock caused by blocked waiting. | |
@code{.cpp} | |
tf::Executor executor(2); | |
tf::Taskflow taskflow; | |
std::array<tf::Taskflow, 1000> others; | |
std::atomic<size_t> counter{0}; | |
for(size_t n=0; n<1000; n++) { | |
for(size_t i=0; i<1000; i++) { | |
others[n].emplace([&](){ counter++; }); | |
} | |
taskflow.emplace([&executor, &tf=others[n]](){ | |
executor.corun(tf); | |
//executor.run(tf).wait(); <- blocking the worker without doing anything | |
// will introduce deadlock | |
}); | |
} | |
executor.run(taskflow).wait(); | |
@endcode | |
The method is thread-safe as long as the target is not concurrently
run by two or more threads.
@attention | |
You must call tf::Executor::corun from a worker of the calling executor | |
or an exception will be thrown. | |
*/ | |
template <typename T> | |
void corun(T& target); | |
/** | |
@brief keeps running the work-stealing loop until the predicate becomes true | |
@tparam P predicate type | |
@param predicate a boolean predicate to indicate when to stop the loop | |
The method keeps the caller worker running in the work-stealing loop | |
until the stop predicate becomes true. | |
@code{.cpp} | |
taskflow.emplace([&](){ | |
std::future<void> fu = std::async([](){ std::this_thread::sleep_for(100s); });
executor.corun_until([](){ | |
return fu.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
}); | |
}); | |
@endcode | |
@attention | |
You must call tf::Executor::corun_until from a worker of the calling executor | |
or an exception will be thrown. | |
*/ | |
template <typename P> | |
void corun_until(P&& predicate); | |
/** | |
@brief waits for all tasks to complete | |
This member function waits until all submitted tasks | |
(e.g., taskflows, asynchronous tasks) to finish. | |
@code{.cpp} | |
executor.run(taskflow1); | |
executor.run_n(taskflow2, 10); | |
executor.run_n(taskflow3, 100); | |
executor.wait_for_all(); // wait until the above submitted taskflows finish | |
@endcode | |
*/ | |
void wait_for_all(); | |
/** | |
@brief queries the number of worker threads | |
Each worker represents one unique thread spawned by an executor | |
upon its construction time. | |
@code{.cpp} | |
tf::Executor executor(4); | |
std::cout << executor.num_workers(); // 4 | |
@endcode | |
*/ | |
size_t num_workers() const noexcept; | |
/** | |
@brief queries the number of running topologies at the time of this call | |
When a taskflow is submitted to an executor, a topology is created to store | |
runtime metadata of the running taskflow. | |
When the execution of the submitted taskflow finishes, | |
its corresponding topology will be removed from the executor. | |
@code{.cpp} | |
executor.run(taskflow); | |
std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) | |
@endcode | |
*/ | |
size_t num_topologies() const; | |
/** | |
@brief queries the number of running taskflows with moved ownership | |
@code{.cpp} | |
executor.run(std::move(taskflow)); | |
std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) | |
@endcode | |
*/ | |
size_t num_taskflows() const; | |
/** | |
@brief queries the id of the caller thread in this executor | |
Each worker has a unique id in the range of @c 0 to @c N-1 associated with
its parent executor. | |
If the caller thread does not belong to the executor, @c -1 is returned. | |
@code{.cpp} | |
tf::Executor executor(4); // 4 workers in the executor | |
executor.this_worker_id(); // -1 (main thread is not a worker) | |
taskflow.emplace([&](){ | |
std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 | |
}); | |
executor.run(taskflow); | |
@endcode | |
*/ | |
int this_worker_id() const; | |
// -------------------------------------------------------------------------- | |
// Observer methods | |
// -------------------------------------------------------------------------- | |
/** | |
@brief constructs an observer to inspect the activities of worker threads | |
@tparam Observer observer type derived from tf::ObserverInterface | |
@tparam ArgsT argument parameter pack | |
@param args arguments to forward to the constructor of the observer | |
@return a shared pointer to the created observer | |
Each executor manages a list of observers with shared ownership with callers. | |
For each of these observers, the two member functions, | |
tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit | |
will be called before and after the execution of a task. | |
This member function is not thread-safe. | |
*/ | |
template <typename Observer, typename... ArgsT> | |
std::shared_ptr<Observer> make_observer(ArgsT&&... args); | |
/** | |
@brief removes an observer from the executor | |
This member function is not thread-safe. | |
*/ | |
template <typename Observer> | |
void remove_observer(std::shared_ptr<Observer> observer); | |
/** | |
@brief queries the number of observers | |
*/ | |
size_t num_observers() const noexcept; | |
// -------------------------------------------------------------------------- | |
// Async Task Methods | |
// -------------------------------------------------------------------------- | |
/** | |
@brief runs a given function asynchronously | |
@tparam F callable type | |
@param func callable object | |
@return a @std_future that will hold the result of the execution | |
The method creates an asynchronous task to run the given function | |
and return a @std_future object that eventually will hold the result | |
of the return value. | |
@code{.cpp} | |
std::future<int> future = executor.async([](){ | |
std::cout << "create an asynchronous task and returns 1\n"; | |
return 1; | |
}); | |
future.get(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F> | |
auto async(F&& func); | |
/** | |
@brief runs a given function asynchronously and gives a name to this task | |
@tparam F callable type | |
@param name name of the asynchronous task | |
@param func callable object | |
@return a @std_future that will hold the result of the execution | |
The method creates and assigns a name to an asynchronous task | |
to run the given function, | |
returning a @std_future object that eventually will hold the result.
Assigned task names will appear in the observers of the executor. | |
@code{.cpp} | |
std::future<int> future = executor.async("name", [](){ | |
std::cout << "create an asynchronous task with a name and returns 1\n"; | |
return 1; | |
}); | |
future.get(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F> | |
auto async(const std::string& name, F&& func); | |
/** | |
@brief similar to tf::Executor::async but does not return a future object | |
@tparam F callable type | |
@param func callable object | |
This member function is more efficient than tf::Executor::async | |
and is encouraged to use when you do not want a @std_future to | |
acquire the result or synchronize the execution. | |
@code{.cpp} | |
executor.silent_async([](){ | |
std::cout << "create an asynchronous task with no return\n"; | |
}); | |
executor.wait_for_all(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F> | |
void silent_async(F&& func); | |
/** | |
@brief similar to tf::Executor::async but does not return a future object | |
@tparam F callable type | |
@param name assigned name to the task | |
@param func callable object | |
This member function is more efficient than tf::Executor::async | |
and is encouraged to use when you do not want a @std_future to | |
acquire the result or synchronize the execution. | |
Assigned task names will appear in the observers of the executor. | |
@code{.cpp} | |
executor.silent_async("name", [](){ | |
std::cout << "create an asynchronous task with a name and no return\n"; | |
}); | |
executor.wait_for_all(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F> | |
void silent_async(const std::string& name, F&& func); | |
// -------------------------------------------------------------------------- | |
// Silent Dependent Async Methods | |
// -------------------------------------------------------------------------- | |
/** | |
@brief runs the given function asynchronously | |
when the given dependents finish | |
@tparam F callable type | |
@tparam Tasks task types convertible to tf::AsyncTask | |
@param func callable object | |
@param tasks asynchronous tasks on which this execution depends | |
@return a tf::AsyncTask handle | |
This member function is more efficient than tf::Executor::dependent_async | |
and is encouraged to use when you do not want a @std_future to | |
acquire the result or synchronize the execution. | |
The example below creates three asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
@code{.cpp} | |
tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); | |
tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); | |
executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B); | |
executor.wait_for_all(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename... Tasks, | |
std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr | |
> | |
tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks); | |
/** | |
@brief names and runs the given function asynchronously | |
when the given dependents finish | |
@tparam F callable type | |
@tparam Tasks task types convertible to tf::AsyncTask | |
@param name assigned name to the task | |
@param func callable object | |
@param tasks asynchronous tasks on which this execution depends | |
@return a tf::AsyncTask handle | |
This member function is more efficient than tf::Executor::dependent_async | |
and is encouraged to use when you do not want a @std_future to | |
acquire the result or synchronize the execution. | |
The example below creates three asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
Assigned task names will appear in the observers of the executor. | |
@code{.cpp} | |
tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); | |
tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); | |
executor.silent_dependent_async( | |
"C", [](){ printf("C runs after A and B\n"); }, A, B | |
); | |
executor.wait_for_all(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename... Tasks, | |
std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr | |
> | |
tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, Tasks&&... tasks); | |
/** | |
@brief runs the given function asynchronously | |
when the given range of dependents finish | |
@tparam F callable type | |
@tparam I iterator type | |
@param func callable object | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@return a tf::AsyncTask handle | |
This member function is more efficient than tf::Executor::dependent_async | |
and is encouraged to use when you do not want a @std_future to | |
acquire the result or synchronize the execution. | |
The example below creates three asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
@code{.cpp} | |
std::array<tf::AsyncTask, 2> array { | |
executor.silent_dependent_async([](){ printf("A\n"); }), | |
executor.silent_dependent_async([](){ printf("B\n"); }) | |
}; | |
executor.silent_dependent_async( | |
[](){ printf("C runs after A and B\n"); }, array.begin(), array.end() | |
); | |
executor.wait_for_all(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename I, | |
std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr | |
> | |
tf::AsyncTask silent_dependent_async(F&& func, I first, I last); | |
/** | |
@brief names and runs the given function asynchronously | |
when the given range of dependents finish | |
@tparam F callable type | |
@tparam I iterator type | |
@param name assigned name to the task | |
@param func callable object | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@return a tf::AsyncTask handle | |
This member function is more efficient than tf::Executor::dependent_async | |
and is encouraged to use when you do not want a @std_future to | |
acquire the result or synchronize the execution. | |
The example below creates three asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
Assigned task names will appear in the observers of the executor. | |
@code{.cpp} | |
std::array<tf::AsyncTask, 2> array { | |
executor.silent_dependent_async("A", [](){ printf("A\n"); }), | |
executor.silent_dependent_async("B", [](){ printf("B\n"); }) | |
}; | |
executor.silent_dependent_async( | |
"C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() | |
); | |
executor.wait_for_all(); | |
@endcode | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename I, | |
std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr | |
> | |
tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, I first, I last); | |
// -------------------------------------------------------------------------- | |
// Dependent Async Methods | |
// -------------------------------------------------------------------------- | |
/** | |
@brief runs the given function asynchronously | |
when the given dependents finish | |
@tparam F callable type | |
@tparam Tasks task types convertible to tf::AsyncTask | |
@param func callable object | |
@param tasks asynchronous tasks on which this execution depends | |
@return a pair of a tf::AsyncTask handle and | |
a @std_future that holds the result of the execution | |
The example below creates three asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> | |
that eventually will hold the result of the execution. | |
@code{.cpp} | |
tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); | |
tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); | |
auto [C, fuC] = executor.dependent_async( | |
[](){ | |
printf("C runs after A and B\n"); | |
return 1; | |
}, | |
A, B | |
); | |
fuC.get(); // C finishes, which in turn means both A and B finish
@endcode | |
You can mix the use of tf::AsyncTask handles
returned by Executor::dependent_async and Executor::silent_dependent_async | |
when specifying task dependencies. | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename... Tasks, | |
std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr | |
> | |
auto dependent_async(F&& func, Tasks&&... tasks); | |
/** | |
@brief names and runs the given function asynchronously | |
when the given dependents finish | |
@tparam F callable type | |
@tparam Tasks task types convertible to tf::AsyncTask | |
@param name assigned name to the task | |
@param func callable object | |
@param tasks asynchronous tasks on which this execution depends | |
@return a pair of a tf::AsyncTask handle and | |
a @std_future that holds the result of the execution | |
The example below creates three named asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> | |
that eventually will hold the result of the execution. | |
Assigned task names will appear in the observers of the executor. | |
@code{.cpp} | |
tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); | |
tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); | |
auto [C, fuC] = executor.dependent_async( | |
"C", | |
[](){ | |
printf("C runs after A and B\n"); | |
return 1; | |
}, | |
A, B | |
); | |
assert(fuC.get()==1); // C finishes, which in turn means both A and B finish
@endcode | |
You can mix the use of tf::AsyncTask handles
returned by Executor::dependent_async and Executor::silent_dependent_async | |
when specifying task dependencies. | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename... Tasks, | |
std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* = nullptr | |
> | |
auto dependent_async(const std::string& name, F&& func, Tasks&&... tasks); | |
/** | |
@brief runs the given function asynchronously | |
when the given range of dependents finish | |
@tparam F callable type | |
@tparam I iterator type | |
@param func callable object | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@return a pair of a tf::AsyncTask handle and | |
a @std_future that holds the result of the execution | |
The example below creates three asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> | |
that eventually will hold the result of the execution. | |
@code{.cpp} | |
std::array<tf::AsyncTask, 2> array { | |
executor.silent_dependent_async([](){ printf("A\n"); }), | |
executor.silent_dependent_async([](){ printf("B\n"); }) | |
}; | |
auto [C, fuC] = executor.dependent_async( | |
[](){ | |
printf("C runs after A and B\n"); | |
return 1; | |
}, | |
array.begin(), array.end() | |
); | |
assert(fuC.get()==1); // C finishes, which in turn means both A and B finish
@endcode | |
You can mix the use of tf::AsyncTask handles
returned by Executor::dependent_async and Executor::silent_dependent_async | |
when specifying task dependencies. | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename I, | |
std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr | |
> | |
auto dependent_async(F&& func, I first, I last); | |
/** | |
@brief names and runs the given function asynchronously | |
when the given range of dependents finish | |
@tparam F callable type | |
@tparam I iterator type | |
@param name assigned name to the task | |
@param func callable object | |
@param first iterator to the beginning (inclusive) | |
@param last iterator to the end (exclusive) | |
@return a pair of a tf::AsyncTask handle and | |
a @std_future that holds the result of the execution | |
The example below creates three named asynchronous tasks, @c A, @c B, and @c C, | |
in which task @c C runs after task @c A and task @c B. | |
Task @c C returns a pair of its tf::AsyncTask handle and a std::future<int> | |
that eventually will hold the result of the execution. | |
Assigned task names will appear in the observers of the executor. | |
@code{.cpp} | |
std::array<tf::AsyncTask, 2> array { | |
executor.silent_dependent_async("A", [](){ printf("A\n"); }), | |
executor.silent_dependent_async("B", [](){ printf("B\n"); }) | |
}; | |
auto [C, fuC] = executor.dependent_async( | |
"C", | |
[](){ | |
printf("C runs after A and B\n"); | |
return 1; | |
}, | |
array.begin(), array.end() | |
); | |
assert(fuC.get()==1); // C finishes, which in turn means both A and B finish
@endcode | |
You can mix the use of tf::AsyncTask handles
returned by Executor::dependent_async and Executor::silent_dependent_async | |
when specifying task dependencies. | |
This member function is thread-safe. | |
*/ | |
template <typename F, typename I, | |
std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* = nullptr | |
> | |
auto dependent_async(const std::string& name, F&& func, I first, I last); | |
private: | |
const size_t _MAX_STEALS; | |
std::condition_variable _topology_cv; | |
std::mutex _taskflows_mutex; | |
std::mutex _topology_mutex; | |
std::mutex _wsq_mutex; | |
std::mutex _asyncs_mutex; | |
size_t _num_topologies {0}; | |
std::unordered_map<std::thread::id, size_t> _wids; | |
std::vector<std::thread> _threads; | |
std::vector<Worker> _workers; | |
std::list<Taskflow> _taskflows; | |
std::unordered_set<std::shared_ptr<Node>> _asyncs; | |
Notifier _notifier; | |
TaskQueue<Node*> _wsq; | |
std::atomic<bool> _done {0}; | |
std::shared_ptr<WorkerInterface> _worker_interface; | |
std::unordered_set<std::shared_ptr<ObserverInterface>> _observers; | |
Worker* _this_worker(); | |
bool _wait_for_task(Worker&, Node*&); | |
void _observer_prologue(Worker&, Node*); | |
void _observer_epilogue(Worker&, Node*); | |
void _spawn(size_t); | |
void _exploit_task(Worker&, Node*&); | |
void _explore_task(Worker&, Node*&); | |
void _schedule(Worker&, Node*); | |
void _schedule(Node*); | |
void _schedule(Worker&, const SmallVector<Node*>&); | |
void _schedule(const SmallVector<Node*>&); | |
void _set_up_topology(Worker*, Topology*); | |
void _tear_down_topology(Worker&, Topology*); | |
void _tear_down_async(Node*); | |
void _tear_down_dependent_async(Worker&, Node*); | |
void _tear_down_invoke(Worker&, Node*); | |
void _increment_topology(); | |
void _decrement_topology(); | |
void _decrement_topology_and_notify(); | |
void _invoke(Worker&, Node*); | |
void _invoke_static_task(Worker&, Node*); | |
void _invoke_dynamic_task(Worker&, Node*); | |
void _consume_graph(Worker&, Node*, Graph&); | |
void _detach_dynamic_task(Worker&, Node*, Graph&); | |
void _invoke_condition_task(Worker&, Node*, SmallVector<int>&); | |
void _invoke_multi_condition_task(Worker&, Node*, SmallVector<int>&); | |
void _invoke_module_task(Worker&, Node*); | |
void _invoke_async_task(Worker&, Node*); | |
void _invoke_dependent_async_task(Worker&, Node*); | |
void _process_async_dependent(Node*, tf::AsyncTask&, size_t&); | |
void _schedule_async_task(Node*); | |
template <typename P> | |
void _corun_until(Worker&, P&&); | |
template <typename R, typename F> | |
auto _make_promised_async(std::promise<R>&&, F&&); | |
}; | |
// Constructor
//
// Creates an executor with N worker threads. The optional worker
// interface supplies prologue/epilogue hooks that bracket each worker's
// scheduling loop (see _spawn). Throws when N is zero. When the
// TF_ENABLE_PROFILER environment variable is set, a default profiling
// observer is created and handed to the profiler manager.
inline Executor::Executor(size_t N, std::shared_ptr<WorkerInterface> wix) :
  // steal bound is 2*(N+1); workers back off after exceeding it
  _MAX_STEALS {((N+1) << 1)},
  _threads    {N},
  _workers    {N},
  _notifier   {N},
  _worker_interface {std::move(wix)} {
  if(N == 0) {
    TF_THROW("no cpu workers to execute taskflows");
  }
  _spawn(N);
  // instantiate the default observer if requested
  if(has_env(TF_ENABLE_PROFILER)) {
    TFProfManager::get()._manage(make_observer<TFProfObserver>());
  }
}
// Destructor
//
// Blocks until every submitted topology has completed, then signals
// termination, wakes all sleeping workers, and joins the worker threads.
inline Executor::~Executor() {

  // wait for all topologies to complete
  wait_for_all();

  // shut down the scheduler: flag first, then wake everyone
  _done = true;
  _notifier.notify(true);

  // reclaim every worker thread
  for(size_t i=0; i<_threads.size(); ++i) {
    _threads[i].join();
  }
}
// Function: num_workers | |
inline size_t Executor::num_workers() const noexcept { | |
return _workers.size(); | |
} | |
// Function: num_topologies | |
inline size_t Executor::num_topologies() const { | |
return _num_topologies; | |
} | |
// Function: num_taskflows | |
inline size_t Executor::num_taskflows() const { | |
return _taskflows.size(); | |
} | |
// Function: _this_worker
// Maps the calling thread to its Worker object, or nullptr when the
// caller is not one of this executor's worker threads.
inline Worker* Executor::_this_worker() {
  auto itr = _wids.find(std::this_thread::get_id());
  if(itr == _wids.end()) {
    return nullptr;
  }
  return &_workers[itr->second];
}
// Function: this_worker_id | |
inline int Executor::this_worker_id() const { | |
auto i = _wids.find(std::this_thread::get_id()); | |
return i == _wids.end() ? -1 : static_cast<int>(_workers[i->second]._id); | |
} | |
// Procedure: _spawn
//
// Launches N worker threads, each running the work-stealing scheduling
// loop until shutdown. The caller blocks at the bottom until every
// worker has registered its thread id in _wids, so the local rendezvous
// state (mutex/cond/n), though stack-allocated, outlives all uses by
// the spawned lambdas.
inline void Executor::_spawn(size_t N) {

  // rendezvous state shared (by reference) with each spawned thread
  std::mutex mutex;
  std::condition_variable cond;
  size_t n=0;

  for(size_t id=0; id<N; ++id) {

    // wire up per-worker bookkeeping before the thread starts
    _workers[id]._id = id;
    _workers[id]._vtm = id;
    _workers[id]._executor = this;
    _workers[id]._waiter = &_notifier._waiters[id];

    _threads[id] = std::thread([this] (
      Worker& w, std::mutex& mutex, std::condition_variable& cond, size_t& n
    ) -> void {

      // assign the thread
      w._thread = &_threads[w._id];

      // enables the thread-id -> worker mapping; the last worker to
      // register wakes the spawning thread
      {
        std::scoped_lock lock(mutex);
        _wids[std::this_thread::get_id()] = w._id;
        if(n++; n == num_workers()) {
          cond.notify_one();
        }
      }

      Node* t = nullptr;

      // before entering the scheduler (work-stealing loop),
      // call the user-specified prologue function
      if(_worker_interface) {
        _worker_interface->scheduler_prologue(w);
      }

      // must use 1 as condition instead of !done because
      // the previous worker may stop while the following workers
      // are still preparing for entering the scheduling loop
      std::exception_ptr ptr{nullptr};
      try {
        while(1) {

          // execute the tasks.
          _exploit_task(w, t);

          // wait for tasks; false means the executor is shutting down
          if(_wait_for_task(w, t) == false) {
            break;
          }
        }
      }
      catch(...) {
        // capture the exception and hand it to the epilogue hook
        ptr = std::current_exception();
      }

      // call the user-specified epilogue function
      if(_worker_interface) {
        _worker_interface->scheduler_epilogue(w, ptr);
      }

    }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n));

    // POSIX-like system can use the following to affine threads to cores
    //cpu_set_t cpuset;
    //CPU_ZERO(&cpuset);
    //CPU_SET(id, &cpuset);
    //pthread_setaffinity_np(
    //  _threads[id].native_handle(), sizeof(cpu_set_t), &cpuset
    //);
  }

  // block until all N workers have registered themselves
  std::unique_lock<std::mutex> lock(mutex);
  cond.wait(lock, [&](){ return n==N; });
}
// Function: _corun_until
//
// Work-stealing loop for a worker that is blocked inside a running task
// (e.g., joining a subflow or consuming a module graph): keeps executing
// tasks from its own queue, the executor-wide queue, or a victim
// worker's queue until the stop predicate becomes true.
template <typename P>
void Executor::_corun_until(Worker& w, P&& stop_predicate) {

  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);

  exploit:

  while(!stop_predicate()) {

    //exploit:

    // drain the worker's own queue first
    if(auto t = w._wsq.pop(); t) {
      _invoke(w, t);
    }
    else {
      size_t num_steals = 0;

      explore:

      // victim == self means stealing from the executor-wide queue
      t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();

      if(t) {
        _invoke(w, t);
        goto exploit;
      }
      else if(!stop_predicate()) {
        // back off after too many failed steals, then retry with a
        // randomly chosen victim
        if(num_steals++ > _MAX_STEALS) {
          std::this_thread::yield();
        }
        w._vtm = rdvtm(w._rdgen);
        goto explore;
      }
      else {
        break;
      }
    }
  }
}
// Function: _explore_task
//
// Attempts to steal one task into t, alternating between the
// executor-wide queue and randomly chosen victim workers. Gives up
// after bounded yields or when the executor is shutting down; t stays
// null on failure.
inline void Executor::_explore_task(Worker& w, Node*& t) {

  //assert(_workers[w].wsq.empty());
  //assert(!t);

  size_t num_steals = 0;
  size_t num_yields = 0;

  std::uniform_int_distribution<size_t> rdvtm(0, _workers.size()-1);

  // Here, we write do-while to make the worker steal at once
  // from the assigned victim.
  do {
    // victim == self means stealing from the executor-wide queue
    t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal();

    if(t) {
      break;
    }

    // after _MAX_STEALS misses start yielding; after 100 yields give up
    if(num_steals++ > _MAX_STEALS) {
      std::this_thread::yield();
      if(num_yields++ > 100) {
        break;
      }
    }

    // pick a new random victim for the next attempt
    w._vtm = rdvtm(w._rdgen);
  } while(!_done);

}
// Procedure: _exploit_task
// Drains the worker's local queue: invokes the given task and keeps
// popping/invoking until the local queue runs dry (t becomes null).
inline void Executor::_exploit_task(Worker& w, Node*& t) {
  for(; t; t = w._wsq.pop()) {
    _invoke(w, t);
  }
}
// Function: _wait_for_task
//
// Puts a worker through the steal/sleep protocol. Returns true with a
// stolen task in t, or false when the executor is shutting down. Uses
// a two-phase-commit wait (prepare/cancel/commit) on the notifier so a
// wakeup cannot be missed between the emptiness checks and the sleep.
inline bool Executor::_wait_for_task(Worker& worker, Node*& t) {

  explore_task:

  _explore_task(worker, t);

  // The last thief who successfully stole a task will wake up
  // another thief worker to avoid starvation.
  if(t) {
    _notifier.notify(false);
    return true;
  }

  // ---- 2PC guard ----
  _notifier.prepare_wait(worker._waiter);

  // recheck the shared queue after announcing intent to sleep
  if(!_wsq.empty()) {
    _notifier.cancel_wait(worker._waiter);
    worker._vtm = worker._id;
    goto explore_task;
  }

  if(_done) {
    _notifier.cancel_wait(worker._waiter);
    // wake everyone so all workers observe shutdown
    _notifier.notify(true);
    return false;
  }

  // We need to use index-based scanning to avoid data race
  // with _spawn which may initialize a worker at the same time.
  for(size_t vtm=0; vtm<_workers.size(); vtm++) {
    if(!_workers[vtm]._wsq.empty()) {
      _notifier.cancel_wait(worker._waiter);
      worker._vtm = vtm;
      goto explore_task;
    }
  }

  // Now I really need to relinquish myself to others
  _notifier.commit_wait(worker._waiter);

  goto explore_task;
}
// Function: make_observer | |
template<typename Observer, typename... ArgsT> | |
std::shared_ptr<Observer> Executor::make_observer(ArgsT&&... args) { | |
static_assert( | |
std::is_base_of_v<ObserverInterface, Observer>, | |
"Observer must be derived from ObserverInterface" | |
); | |
// use a local variable to mimic the constructor | |
auto ptr = std::make_shared<Observer>(std::forward<ArgsT>(args)...); | |
ptr->set_up(_workers.size()); | |
_observers.emplace(std::static_pointer_cast<ObserverInterface>(ptr)); | |
return ptr; | |
} | |
// Procedure: remove_observer | |
template <typename Observer> | |
void Executor::remove_observer(std::shared_ptr<Observer> ptr) { | |
static_assert( | |
std::is_base_of_v<ObserverInterface, Observer>, | |
"Observer must be derived from ObserverInterface" | |
); | |
_observers.erase(std::static_pointer_cast<ObserverInterface>(ptr)); | |
} | |
// Function: num_observers | |
inline size_t Executor::num_observers() const noexcept { | |
return _observers.size(); | |
} | |
// Procedure: _schedule
//
// Schedules one node. A caller that is a worker of this executor pushes
// into its own local queue; any other caller pushes into the
// executor-wide queue under the queue mutex. One sleeper is notified
// either way.
inline void Executor::_schedule(Worker& worker, Node* node) {

  // We need to fetch p before the release such that the read
  // operation is synchronized properly with other thread to
  // avoid data race.
  auto p = node->_priority;

  node->_state.fetch_or(Node::READY, std::memory_order_release);

  // caller is a worker to this pool - starting at v3.5 we do not use
  // any complicated notification mechanism as the experimental result
  // has shown no significant advantage.
  if(worker._executor == this) {
    worker._wsq.push(node, p);
    _notifier.notify(false);
    return;
  }

  {
    std::lock_guard<std::mutex> lock(_wsq_mutex);
    _wsq.push(node, p);
  }

  _notifier.notify(false);
}
// Procedure: _schedule
//
// Schedules one node from a non-worker (external) thread: the node
// always goes through the executor-wide queue under the queue mutex,
// then one sleeper is notified.
inline void Executor::_schedule(Node* node) {

  // We need to fetch p before the release such that the read
  // operation is synchronized properly with other thread to
  // avoid data race.
  auto p = node->_priority;

  node->_state.fetch_or(Node::READY, std::memory_order_release);

  {
    std::lock_guard<std::mutex> lock(_wsq_mutex);
    _wsq.push(node, p);
  }

  _notifier.notify(false);
}
// Procedure: _schedule
//
// Schedules a batch of nodes. A worker of this executor pushes them
// into its local queue, notifying one sleeper per node; an external
// caller pushes all of them into the executor-wide queue under the lock
// and notifies num_nodes sleepers at once.
inline void Executor::_schedule(Worker& worker, const SmallVector<Node*>& nodes) {

  // We need to catch the node count to avoid accessing the nodes
  // vector while the parent topology is removed!
  const auto num_nodes = nodes.size();

  if(num_nodes == 0) {
    return;
  }

  // caller is a worker to this pool - starting at v3.5 we do not use
  // any complicated notification mechanism as the experimental result
  // has shown no significant advantage.
  if(worker._executor == this) {
    for(size_t i=0; i<num_nodes; ++i) {
      // We need to fetch p before the release such that the read
      // operation is synchronized properly with other thread to
      // avoid data race.
      auto p = nodes[i]->_priority;
      nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
      worker._wsq.push(nodes[i], p);
      _notifier.notify(false);
    }
    return;
  }

  {
    std::lock_guard<std::mutex> lock(_wsq_mutex);
    for(size_t k=0; k<num_nodes; ++k) {
      auto p = nodes[k]->_priority;
      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
      _wsq.push(nodes[k], p);
    }
  }

  _notifier.notify_n(num_nodes);
}
// Procedure: _schedule
//
// Schedules a batch of nodes from a non-worker (external) thread: all
// nodes go through the executor-wide queue under the lock, then
// num_nodes sleepers are notified at once.
inline void Executor::_schedule(const SmallVector<Node*>& nodes) {

  // capture the count first - the parent topology may be removed!
  const auto num_nodes = nodes.size();

  if(num_nodes == 0) {
    return;
  }

  // We need to fetch p before the release such that the read
  // operation is synchronized properly with other thread to
  // avoid data race.
  {
    std::lock_guard<std::mutex> lock(_wsq_mutex);
    for(size_t k=0; k<num_nodes; ++k) {
      auto p = nodes[k]->_priority;
      nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
      _wsq.push(nodes[k], p);
    }
  }

  _notifier.notify_n(num_nodes);
}
// Procedure: _invoke
//
// Executes one ready node on the given worker: dispatches on the node's
// handle type, handles semaphore acquire/release, resets the join
// counter to support cyclic control flow, and schedules the successors.
// The highest-priority successor is cached on the worker and executed
// directly via goto (tail-recursion elimination) instead of going
// through the task queue.
inline void Executor::_invoke(Worker& worker, Node* node) {

  // synchronize all outstanding memory operations caused by reordering
  while(!(node->_state.load(std::memory_order_acquire) & Node::READY));

  begin_invoke:

  // no need to do other things if the topology is cancelled
  if(node->_is_cancelled()) {
    _tear_down_invoke(worker, node);
    return;
  }

  // if acquiring semaphore(s) exists, acquire them first
  if(node->_semaphores && !node->_semaphores->to_acquire.empty()) {
    SmallVector<Node*> nodes;
    // on failure, reschedule the returned waiters and retry later
    if(!node->_acquire_all(nodes)) {
      _schedule(worker, nodes);
      return;
    }
    node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release);
  }

  // condition task results are collected here
  //int cond = -1;
  SmallVector<int> conds;

  // switch is faster than nested if-else due to jump table
  switch(node->_handle.index()) {
    // static task
    case Node::STATIC:{
      _invoke_static_task(worker, node);
    }
    break;

    // dynamic task
    case Node::DYNAMIC: {
      _invoke_dynamic_task(worker, node);
    }
    break;

    // condition task
    case Node::CONDITION: {
      _invoke_condition_task(worker, node, conds);
    }
    break;

    // multi-condition task
    case Node::MULTI_CONDITION: {
      _invoke_multi_condition_task(worker, node, conds);
    }
    break;

    // module task
    case Node::MODULE: {
      _invoke_module_task(worker, node);
    }
    break;

    // async task - torn down here and returns early (no successors)
    case Node::ASYNC: {
      _invoke_async_task(worker, node);
      _tear_down_async(node);
      return ;
    }
    break;

    // dependent async task - tear-down may cache a successor to run next
    case Node::DEPENDENT_ASYNC: {
      _invoke_dependent_async_task(worker, node);
      _tear_down_dependent_async(worker, node);
      if(worker._cache) {
        node = worker._cache;
        goto begin_invoke;
      }
      return;
    }
    break;

    // monostate (placeholder)
    default:
    break;
  }

  // if releasing semaphores exist, release them
  if(node->_semaphores && !node->_semaphores->to_release.empty()) {
    _schedule(worker, node->_release_all());
  }

  // Reset the join counter to support the cyclic control flow.
  // + We must do this before scheduling the successors to avoid race
  //   condition on _dependents.
  // + We must use fetch_add instead of direct assigning
  //   because the user-space call on "invoke" may explicitly schedule
  //   this task again (e.g., pipeline) which can access the join_counter.
  if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) {
    node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed);
  }
  else {
    node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed);
  }

  // acquire the parent flow counter
  auto& j = (node->_parent) ? node->_parent->_join_counter :
                              node->_topology->_join_counter;

  // Here, we want to cache the latest successor with the highest priority
  worker._cache = nullptr;
  auto max_p = static_cast<unsigned>(TaskPriority::MAX);

  // Invoke the task based on the corresponding type
  switch(node->_handle.index()) {

    // condition and multi-condition tasks: each in-range result selects
    // one successor branch to run
    case Node::CONDITION:
    case Node::MULTI_CONDITION: {
      for(auto cond : conds) {
        if(cond >= 0 && static_cast<size_t>(cond) < node->_successors.size()) {
          auto s = node->_successors[cond];
          // zeroing the join counter for invariant
          s->_join_counter.store(0, std::memory_order_relaxed);
          j.fetch_add(1, std::memory_order_relaxed);
          if(s->_priority <= max_p) {
            if(worker._cache) {
              _schedule(worker, worker._cache);
            }
            worker._cache = s;
            max_p = s->_priority;
          }
          else {
            _schedule(worker, s);
          }
        }
      }
    }
    break;

    // non-condition task: a successor becomes ready when its join
    // counter drops to zero
    default: {
      for(size_t i=0; i<node->_successors.size(); ++i) {
        //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
        if(auto s = node->_successors[i];
          s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
          j.fetch_add(1, std::memory_order_relaxed);
          if(s->_priority <= max_p) {
            if(worker._cache) {
              _schedule(worker, worker._cache);
            }
            worker._cache = s;
            max_p = s->_priority;
          }
          else {
            _schedule(worker, s);
          }
        }
      }
    }
    break;
  }

  // tear_down the invoke
  _tear_down_invoke(worker, node);

  // perform tail recursion elimination for the right-most child to reduce
  // the number of expensive pop/push operations through the task queue
  if(worker._cache) {
    node = worker._cache;
    //node->_state.fetch_or(Node::READY, std::memory_order_release);
    goto begin_invoke;
  }
}
// Procedure: _tear_down_invoke
//
// Finishes the bookkeeping for a completed node. Root-level nodes
// decrement the topology join counter and, if they are the last one,
// tear down the topology; nodes spawned by a joined subflow decrement
// the parent node's join counter instead.
inline void Executor::_tear_down_invoke(Worker& worker, Node* node) {
  // we must check parent first before subtracting the join counter,
  // or it can introduce data race
  if(node->_parent == nullptr) {
    if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
      _tear_down_topology(worker, node->_topology);
    }
  }
  // joined subflow
  else {
    node->_parent->_join_counter.fetch_sub(1, std::memory_order_release);
  }
}
// Procedure: _observer_prologue
// Notifies every registered observer that the worker is about to run
// the given task.
inline void Executor::_observer_prologue(Worker& worker, Node* node) {
  for(const auto& obs : _observers) {
    obs->on_entry(WorkerView(worker), TaskView(*node));
  }
}
// Procedure: _observer_epilogue
// Notifies every registered observer that the worker has finished
// running the given task.
inline void Executor::_observer_epilogue(Worker& worker, Node* node) {
  for(const auto& obs : _observers) {
    obs->on_exit(WorkerView(worker), TaskView(*node));
  }
}
// Procedure: _invoke_static_task
// Runs a static task's work item, bracketed by observer callbacks.
inline void Executor::_invoke_static_task(Worker& worker, Node* node) {
  _observer_prologue(worker, node);
  auto& work = std::get_if<Node::Static>(&node->_handle)->work;
  // index 0: plain callable; index 1: callable taking a tf::Runtime
  if(work.index() == 0) {
    std::get_if<0>(&work)->operator()();
  }
  else {
    Runtime rt(*this, worker, node);
    std::get_if<1>(&work)->operator()(rt);
  }
  _observer_epilogue(worker, node);
}
// Procedure: _invoke_dynamic_task
// Runs a dynamic (subflow) task: rebuilds its subgraph from scratch,
// invokes the user callable with a Subflow, and joins implicitly unless
// the user already joined or detached.
inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) {
  _observer_prologue(w, node);
  auto h = std::get_if<Node::Dynamic>(&node->_handle);
  h->subgraph._clear();   // start from an empty subgraph on every run
  Subflow sflow(*this, w, node, h->subgraph);
  h->work(sflow);
  if(sflow._joinable) {
    _consume_graph(w, node, h->subgraph);
  }
  _observer_epilogue(w, node);
}
// Procedure: _detach_dynamic_task
//
// Moves a detached subflow graph into the parent topology: each node is
// marked DETACHED, re-parented to the topology (parent pointer
// cleared), the graph is merged into the taskflow graph under its
// mutex, and the source nodes (zero dependents) are scheduled.
inline void Executor::_detach_dynamic_task(
  Worker& w, Node* p, Graph& g
) {

  // graph is empty and has no async tasks
  if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) {
    return;
  }

  SmallVector<Node*> src;

  for(auto n : g._nodes) {
    n->_state.store(Node::DETACHED, std::memory_order_relaxed);
    n->_set_up_join_counter();
    n->_topology = p->_topology;
    n->_parent = nullptr;
    if(n->num_dependents() == 0) {
      src.push_back(n);
    }
  }

  // merging must happen under the taskflow mutex
  {
    std::lock_guard<std::mutex> lock(p->_topology->_taskflow._mutex);
    p->_topology->_taskflow._graph._merge(std::move(g));
  }

  p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);

  _schedule(w, src);
}
// Procedure: _consume_graph
//
// Runs graph g as a child of node p on worker w: resets each node's
// state, sets up join counters and parent/topology links, schedules the
// source nodes, and then co-runs (executing or stealing other tasks)
// until p's join counter drops to zero, i.e., the whole subgraph has
// finished.
inline void Executor::_consume_graph(Worker& w, Node* p, Graph& g) {

  // graph is empty and has no async tasks
  if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) {
    return;
  }

  SmallVector<Node*> src;

  for(auto n : g._nodes) {
    n->_state.store(0, std::memory_order_relaxed);
    n->_set_up_join_counter();
    n->_topology = p->_topology;
    n->_parent = p;
    if(n->num_dependents() == 0) {
      src.push_back(n);
    }
  }
  p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed);
  _schedule(w, src);
  // keep this worker productive until the subgraph completes
  _corun_until(w, [p] () -> bool { return p->_join_counter.load(std::memory_order_acquire) == 0; });
}
// Procedure: _invoke_condition_task
// Runs a condition task and stores its single selected branch index
// into conds.
inline void Executor::_invoke_condition_task(
  Worker& worker, Node* node, SmallVector<int>& conds
) {
  _observer_prologue(worker, node);
  auto& work = std::get_if<Node::Condition>(&node->_handle)->work;
  // index 0: plain callable; index 1: callable taking a tf::Runtime
  if(work.index() == 0) {
    conds = { std::get_if<0>(&work)->operator()() };
  }
  else {
    Runtime rt(*this, worker, node);
    conds = { std::get_if<1>(&work)->operator()(rt) };
  }
  _observer_epilogue(worker, node);
}
// Procedure: _invoke_multi_condition_task
// Runs a multi-condition task and stores all selected branch indices
// into conds.
inline void Executor::_invoke_multi_condition_task(
  Worker& worker, Node* node, SmallVector<int>& conds
) {
  _observer_prologue(worker, node);
  auto& work = std::get_if<Node::MultiCondition>(&node->_handle)->work;
  // index 0: plain callable; index 1: callable taking a tf::Runtime
  if(work.index() == 0) {
    conds = std::get_if<0>(&work)->operator()();
  }
  else {
    Runtime rt(*this, worker, node);
    conds = std::get_if<1>(&work)->operator()(rt);
  }
  _observer_epilogue(worker, node);
}
// Procedure: _invoke_module_task
// Runs a composed (module) graph to completion on this worker.
inline void Executor::_invoke_module_task(Worker& w, Node* node) {
  _observer_prologue(w, node);
  auto& graph = std::get_if<Node::Module>(&node->_handle)->graph;
  _consume_graph(w, node, graph);
  _observer_epilogue(w, node);
}
// Procedure: _invoke_async_task
// Runs the work item of a fire-and-forget async task.
inline void Executor::_invoke_async_task(Worker& w, Node* node) {
  _observer_prologue(w, node);
  auto handle = std::get_if<Node::Async>(&node->_handle);
  handle->work();
  _observer_epilogue(w, node);
}
// Procedure: _invoke_dependent_async_task
// Runs the work item of a dependent async task.
inline void Executor::_invoke_dependent_async_task(Worker& w, Node* node) {
  _observer_prologue(w, node);
  auto handle = std::get_if<Node::DependentAsync>(&node->_handle);
  handle->work();
  _observer_epilogue(w, node);
}
// Function: run
// Runs the taskflow once with no completion callback.
inline tf::Future<void> Executor::run(Taskflow& f) {
  return this->run_n(f, size_t{1}, [](){});
}
// Function: run
// Runs a moved taskflow once with no completion callback.
inline tf::Future<void> Executor::run(Taskflow&& f) {
  return this->run_n(std::move(f), size_t{1}, [](){});
}
// Function: run | |
template <typename C> | |
tf::Future<void> Executor::run(Taskflow& f, C&& c) { | |
return run_n(f, 1, std::forward<C>(c)); | |
} | |
// Function: run | |
template <typename C> | |
tf::Future<void> Executor::run(Taskflow&& f, C&& c) { | |
return run_n(std::move(f), 1, std::forward<C>(c)); | |
} | |
// Function: run_n
// Runs the taskflow repeat times with no completion callback.
inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
  return this->run_n(f, repeat, [](){});
}
// Function: run_n
// Runs a moved taskflow repeat times with no completion callback.
inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
  return this->run_n(std::move(f), repeat, [](){});
}
// Function: run_n | |
template <typename C> | |
tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) { | |
return run_until( | |
f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c) | |
); | |
} | |
// Function: run_n | |
template <typename C> | |
tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) { | |
return run_until( | |
std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c) | |
); | |
} | |
// Function: run_until | |
template<typename P> | |
tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) { | |
return run_until(f, std::forward<P>(pred), [](){}); | |
} | |
// Function: run_until | |
template<typename P> | |
tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) { | |
return run_until(std::move(f), std::forward<P>(pred), [](){}); | |
} | |
// Function: run_until
//
// Repeatedly runs taskflow f until pred() returns true, then invokes
// callback c. An empty taskflow or an immediately-true predicate
// short-circuits with an already-satisfied future. The emptiness check
// happens under the taskflow mutex because detached subflows may mutate
// the taskflow concurrently.
template <typename P, typename C>
tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {

  _increment_topology();

  // Need to check the empty under the lock since dynamic task may
  // define detached blocks that modify the taskflow at the same time
  bool empty;
  {
    std::lock_guard<std::mutex> lock(f._mutex);
    empty = f.empty();
  }

  // No need to create a real topology but returns a dummy future
  if(empty || p()) {
    c();
    std::promise<void> promise;
    promise.set_value();
    _decrement_topology_and_notify();
    return tf::Future<void>(promise.get_future(), std::monostate{});
  }

  // create a topology for this run
  auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c));

  // need to create future before the topology got torn down quickly
  tf::Future<void> future(t->_promise.get_future(), t);

  // modifying topology needs to be protected under the lock
  {
    std::lock_guard<std::mutex> lock(f._mutex);
    f._topologies.push(t);
    // start immediately only when no other run of f is in flight;
    // otherwise _tear_down_topology launches it when the predecessor ends
    if(f._topologies.size() == 1) {
      _set_up_topology(_this_worker(), t.get());
    }
  }

  return future;
}
// Function: run_until | |
template <typename P, typename C> | |
tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) { | |
std::list<Taskflow>::iterator itr; | |
{ | |
std::scoped_lock<std::mutex> lock(_taskflows_mutex); | |
itr = _taskflows.emplace(_taskflows.end(), std::move(f)); | |
itr->_satellite = itr; | |
} | |
return run_until(*itr, std::forward<P>(pred), std::forward<C>(c)); | |
} | |
// Function: corun | |
template <typename T> | |
void Executor::corun(T& target) { | |
auto w = _this_worker(); | |
if(w == nullptr) { | |
TF_THROW("corun must be called by a worker of the executor"); | |
} | |
Node parent; // dummy parent | |
_consume_graph(*w, &parent, target.graph()); | |
} | |
// Function: corun_until | |
template <typename P> | |
void Executor::corun_until(P&& predicate) { | |
auto w = _this_worker(); | |
if(w == nullptr) { | |
TF_THROW("corun_until must be called by a worker of the executor"); | |
} | |
_corun_until(*w, std::forward<P>(predicate)); | |
} | |
// Procedure: _increment_topology | |
inline void Executor::_increment_topology() { | |
std::lock_guard<std::mutex> lock(_topology_mutex); | |
++_num_topologies; | |
} | |
// Procedure: _decrement_topology_and_notify | |
inline void Executor::_decrement_topology_and_notify() { | |
std::lock_guard<std::mutex> lock(_topology_mutex); | |
if(--_num_topologies == 0) { | |
_topology_cv.notify_all(); | |
} | |
} | |
// Procedure: _decrement_topology | |
inline void Executor::_decrement_topology() { | |
std::lock_guard<std::mutex> lock(_topology_mutex); | |
--_num_topologies; | |
} | |
// Procedure: wait_for_all | |
inline void Executor::wait_for_all() { | |
std::unique_lock<std::mutex> lock(_topology_mutex); | |
_topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); | |
} | |
// Function: _set_up_topology
//
// Prepares a topology for one run (the caller holds the taskflow
// mutex): links every node to the topology, resets node states and join
// counters, collects the source nodes (zero dependents), and schedules
// them. worker may be null when launched from a non-worker thread.
inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) {

  // ---- under taskflow lock ----

  tpg->_sources.clear();
  tpg->_taskflow._graph._clear_detached();

  // scan each node in the graph and build up the links
  for(auto node : tpg->_taskflow._graph._nodes) {

    node->_topology = tpg;
    node->_parent = nullptr;
    node->_state.store(0, std::memory_order_relaxed);

    if(node->num_dependents() == 0) {
      tpg->_sources.push_back(node);
    }

    node->_set_up_join_counter();
  }

  // the topology completes when all sources (and their successors) do
  tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);

  if(worker) {
    _schedule(*worker, tpg->_sources);
  }
  else {
    _schedule(tpg->_sources);
  }
}
// Function: _tear_down_topology
//
// Invoked by the worker that finishes the last node of a topology.
// Either re-schedules the same topology (the run_until predicate is not
// yet satisfied) or finalizes it: fires the completion callback, sets
// the promise, pops the topology, starts the next queued run if any,
// and erases executor-managed (moved-in) taskflows.
inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) {

  auto &f = tpg->_taskflow;

  //assert(&tpg == &(f._topologies.front()));

  // case 1: we still need to run the topology again
  if(!tpg->_is_cancelled && !tpg->_pred()) {
    //assert(tpg->_join_counter == 0);
    std::lock_guard<std::mutex> lock(f._mutex);
    tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed);
    _schedule(worker, tpg->_sources);
  }
  // case 2: the final run of this topology
  else {

    // TODO: if the topology is cancelled, need to release all semaphores
    if(tpg->_call != nullptr) {
      tpg->_call();
    }

    // If there is another run (interleave between lock)
    if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) {
      //assert(tpg->_join_counter == 0);

      // Set the promise
      tpg->_promise.set_value();
      f._topologies.pop();
      tpg = f._topologies.front().get();

      // decrement the topology but since this is not the last we don't notify
      _decrement_topology();

      // set up topology needs to be under the lock or it can
      // introduce memory order error with pop
      _set_up_topology(&worker, tpg);
    }
    else {
      //assert(f._topologies.size() == 1);

      // Need to back up the promise first here because taskflow might be
      // destroyed soon after calling get
      auto p {std::move(tpg->_promise)};

      // Back up lambda capture in case it has the topology pointer,
      // to avoid it releasing on pop_front ahead of _mutex.unlock &
      // _promise.set_value. Released safely when leaving scope.
      auto c {std::move(tpg->_call)};

      // Get the satellite if any
      auto s {f._satellite};

      // Now we remove the topology from this taskflow
      f._topologies.pop();

      //f._mutex.unlock();
      lock.unlock();

      // We set the promise in the end in case taskflow leaves the scope.
      // After set_value, the caller will return from wait
      p.set_value();

      _decrement_topology_and_notify();

      // remove the taskflow if it is managed by the executor
      // TODO: in the future, we may need to synchronize on wait
      // (which means the following code should be moved before set_value)
      if(s) {
        std::scoped_lock<std::mutex> lock(_taskflows_mutex);
        _taskflows.erase(*s);
      }
    }
  }
}
// ############################################################################ | |
// Forward Declaration: Subflow | |
// ############################################################################ | |
inline void Subflow::join() { | |
// assert(this_worker().worker == &_worker); | |
if(!_joinable) { | |
TF_THROW("subflow not joinable"); | |
} | |
// only the parent worker can join the subflow | |
_executor._consume_graph(_worker, _parent, _graph); | |
_joinable = false; | |
} | |
inline void Subflow::detach() { | |
// assert(this_worker().worker == &_worker); | |
if(!_joinable) { | |
TF_THROW("subflow already joined or detached"); | |
} | |
// only the parent worker can detach the subflow | |
_executor._detach_dynamic_task(_worker, _parent, _graph); | |
_joinable = false; | |
} | |
// ############################################################################ | |
// Forward Declaration: Runtime | |
// ############################################################################ | |
// Procedure: schedule
//
// Immediately schedules an arbitrary task from within a running task.
// The scheduled node's join counter is forced to zero (the scheduler
// requires scheduled tasks to look dependency-free) and the enclosing
// parent/topology join counter is bumped so completion accounting stays
// balanced.
inline void Runtime::schedule(Task task) {

  auto node = task._node;
  // need to keep the invariant: when scheduling a task, the task must have
  // zero dependency (join counter is 0)
  // or we can encounter bug when inserting a nested flow (e.g., module task)
  node->_join_counter.store(0, std::memory_order_relaxed);

  auto& j = node->_parent ? node->_parent->_join_counter :
                            node->_topology->_join_counter;
  j.fetch_add(1, std::memory_order_relaxed);
  _executor._schedule(_worker, node);
}
// Procedure: corun | |
template <typename T> | |
void Runtime::corun(T&& target) { | |
// dynamic task (subflow) | |
if constexpr(is_dynamic_task_v<T>) { | |
Graph graph; | |
Subflow sf(_executor, _worker, _parent, graph); | |
target(sf); | |
if(sf._joinable) { | |
_executor._consume_graph(_worker, _parent, graph); | |
} | |
} | |
// a composable graph object with `tf::Graph& T::graph()` defined | |
else { | |
_executor._consume_graph(_worker, _parent, target.graph()); | |
} | |
} | |
// Procedure: corun_until | |
// Procedure: corun_until
// Keeps the calling worker executing queued tasks (the executor's internal
// co-run loop) until the given predicate returns true. The predicate is
// perfectly forwarded to the executor.
template <typename P>
void Runtime::corun_until(P&& predicate) {
  _executor._corun_until(_worker, std::forward<P>(predicate));
}
// Function: _silent_async | |
template <typename F> | |
void Runtime::_silent_async(Worker& w, const std::string& name, F&& f) { | |
_parent->_join_counter.fetch_add(1, std::memory_order_relaxed); | |
auto node = node_pool.animate( | |
name, 0, _parent->_topology, _parent, 0, | |
std::in_place_type_t<Node::Async>{}, std::forward<F>(f) | |
); | |
_executor._schedule(w, node); | |
} | |
// Function: silent_async | |
// Function: silent_async
// Launches f as an unnamed fire-and-forget task from the calling worker.
template <typename F>
void Runtime::silent_async(F&& f) {
  _silent_async(*_executor._this_worker(), "", std::forward<F>(f));
}
// Function: silent_async | |
// Function: silent_async
// Launches f as a named fire-and-forget task from the calling worker.
template <typename F>
void Runtime::silent_async(const std::string& name, F&& f) {
  _silent_async(*_executor._this_worker(), name, std::forward<F>(f));
}
// Function: silent_async_unchecked | |
// Function: silent_async_unchecked
// Like silent_async, but uses the runtime's associated worker (_worker)
// directly instead of looking up the caller's worker via _this_worker().
template <typename F>
void Runtime::silent_async_unchecked(const std::string& name, F&& f) {
  _silent_async(_worker, name, std::forward<F>(f));
}
// Function: _async | |
template <typename F> | |
auto Runtime::_async(Worker& w, const std::string& name, F&& f) { | |
_parent->_join_counter.fetch_add(1, std::memory_order_relaxed); | |
using R = std::invoke_result_t<std::decay_t<F>>; | |
std::promise<R> p; | |
auto fu{p.get_future()}; | |
auto node = node_pool.animate( | |
name, 0, _parent->_topology, _parent, 0, | |
std::in_place_type_t<Node::Async>{}, | |
[p=make_moc(std::move(p)), f=std::forward<F>(f)] () mutable { | |
if constexpr(std::is_same_v<R, void>) { | |
f(); | |
p.object.set_value(); | |
} | |
else { | |
p.object.set_value(f()); | |
} | |
} | |
); | |
_executor._schedule(w, node); | |
return fu; | |
} | |
// Function: async | |
// Function: async
// Launches f as an unnamed async task from the calling worker and returns
// a std::future for its result.
template <typename F>
auto Runtime::async(F&& f) {
  return _async(*_executor._this_worker(), "", std::forward<F>(f));
}
// Function: async | |
// Function: async
// Launches f as a named async task from the calling worker and returns
// a std::future for its result.
template <typename F>
auto Runtime::async(const std::string& name, F&& f) {
  return _async(*_executor._this_worker(), name, std::forward<F>(f));
}
// Function: join | |
// Function: join
// Co-runs tasks on the calling worker until every task spawned through
// this runtime (tracked by the parent's join counter) has completed.
inline void Runtime::join() {
  corun_until([this] () -> bool {
    return _parent->_join_counter.load(std::memory_order_acquire) == 0;
  });
}
} // end of namespace tf ----------------------------------------------------- | |
// https://hackmd.io/@sysprog/concurrency-atomics | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// Async | |
// ---------------------------------------------------------------------------- | |
// Function: async | |
template <typename F> | |
auto Executor::async(const std::string& name, F&& f) { | |
_increment_topology(); | |
using R = std::invoke_result_t<std::decay_t<F>>; | |
std::promise<R> p; | |
auto fu{p.get_future()}; | |
auto node = node_pool.animate( | |
name, 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::Async>{}, | |
_make_promised_async(std::move(p), std::forward<F>(f)) | |
); | |
_schedule_async_task(node); | |
return fu; | |
} | |
// Function: async | |
// Function: async
// Convenience overload: launches f as an unnamed async task and returns a
// std::future for its result.
template <typename F>
auto Executor::async(F&& f) {
  return async("", std::forward<F>(f));
}
// ---------------------------------------------------------------------------- | |
// Silent Async | |
// ---------------------------------------------------------------------------- | |
// Function: silent_async | |
template <typename F> | |
void Executor::silent_async(const std::string& name, F&& f) { | |
_increment_topology(); | |
auto node = node_pool.animate( | |
name, 0, nullptr, nullptr, 0, | |
std::in_place_type_t<Node::Async>{}, std::forward<F>(f) | |
); | |
_schedule_async_task(node); | |
} | |
// Function: silent_async | |
// Function: silent_async
// Convenience overload: launches f as an unnamed fire-and-forget task.
template <typename F>
void Executor::silent_async(F&& f) {
  silent_async("", std::forward<F>(f));
}
// ---------------------------------------------------------------------------- | |
// Async Helper Methods | |
// ---------------------------------------------------------------------------- | |
// Function: _make_promised_async | |
template <typename R, typename F> | |
auto Executor::_make_promised_async(std::promise<R>&& p, F&& func) { | |
return [p=make_moc(std::move(p)), func=std::forward<F>(func)]() mutable { | |
if constexpr(std::is_same_v<R, void>) { | |
func(); | |
p.object.set_value(); | |
} | |
else { | |
p.object.set_value(func()); | |
} | |
}; | |
} | |
// Procedure: _schedule_async_task | |
// Procedure: _schedule_async_task
// Routes the node to the right scheduling path depending on the caller.
inline void Executor::_schedule_async_task(Node* node) {
  Worker* w = _this_worker();
  // Called from outside any worker thread of this executor: use the
  // external scheduling path.
  if(w == nullptr) {
    _schedule(node);
  }
  // Called from one of this executor's workers: push onto its queue.
  else {
    _schedule(*w, node);
  }
}
// Procedure: _tear_down_async | |
// Procedure: _tear_down_async
// Finalizes a plain async task after it ran and recycles its node.
inline void Executor::_tear_down_async(Node* node) {
  if(node->_parent == nullptr) {
    // Submitted directly through the executor: release its topology ref.
    _decrement_topology_and_notify();
  }
  else {
    // Spawned from a runtime/subflow: notify the owning parent task.
    node->_parent->_join_counter.fetch_sub(1, std::memory_order_release);
  }
  node_pool.recycle(node);
}
// ---------------------------------------------------------------------------- | |
// Silent Dependent Async | |
// ---------------------------------------------------------------------------- | |
// Function: silent_dependent_async | |
// Convenience overload: creates an unnamed dependent-async task that runs
// func once all given predecessor tasks finish.
template <typename F, typename... Tasks,
  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
>
tf::AsyncTask Executor::silent_dependent_async(F&& func, Tasks&&... tasks) {
  return silent_dependent_async("", std::forward<F>(func), std::forward<Tasks>(tasks)...);
}
// Function: silent_dependent_async | |
template <typename F, typename... Tasks, | |
std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>* | |
> | |
tf::AsyncTask Executor::silent_dependent_async( | |
const std::string& name, F&& func, Tasks&&... tasks | |
){ | |
_increment_topology(); | |
size_t num_dependents = sizeof...(Tasks); | |
std::shared_ptr<Node> node( | |
node_pool.animate( | |
name, 0, nullptr, nullptr, num_dependents, | |
std::in_place_type_t<Node::DependentAsync>{}, std::forward<F>(func) | |
), | |
[&](Node* ptr){ node_pool.recycle(ptr); } | |
); | |
{ | |
std::scoped_lock lock(_asyncs_mutex); | |
_asyncs.insert(node); | |
} | |
if constexpr(sizeof...(Tasks) > 0) { | |
(_process_async_dependent(node.get(), tasks, num_dependents), ...); | |
} | |
if(num_dependents == 0) { | |
_schedule_async_task(node.get()); | |
} | |
return AsyncTask(std::move(node)); | |
} | |
// Function: silent_dependent_async | |
// Convenience overload: creates an unnamed dependent-async task whose
// predecessors are the tasks in the iterator range [first, last).
template <typename F, typename I,
  std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>*
>
tf::AsyncTask Executor::silent_dependent_async(F&& func, I first, I last) {
  return silent_dependent_async("", std::forward<F>(func), first, last);
}
// Function: silent_dependent_async | |
template <typename F, typename I, | |
std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* | |
> | |
tf::AsyncTask Executor::silent_dependent_async( | |
const std::string& name, F&& func, I first, I last | |
) { | |
_increment_topology(); | |
size_t num_dependents = std::distance(first, last); | |
std::shared_ptr<Node> node( | |
node_pool.animate( | |
name, 0, nullptr, nullptr, num_dependents, | |
std::in_place_type_t<Node::DependentAsync>{}, std::forward<F>(func) | |
), | |
[&](Node* ptr){ node_pool.recycle(ptr); } | |
); | |
{ | |
std::scoped_lock lock(_asyncs_mutex); | |
_asyncs.insert(node); | |
} | |
for(; first != last; first++){ | |
_process_async_dependent(node.get(), *first, num_dependents); | |
} | |
if(num_dependents == 0) { | |
_schedule_async_task(node.get()); | |
} | |
return AsyncTask(std::move(node)); | |
} | |
// ---------------------------------------------------------------------------- | |
// Dependent Async | |
// ---------------------------------------------------------------------------- | |
// Function: dependent_async | |
// Convenience overload: creates an unnamed dependent-async task and
// returns a pair of (task handle, future for func's result).
template <typename F, typename... Tasks,
  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
>
auto Executor::dependent_async(F&& func, Tasks&&... tasks) {
  return dependent_async("", std::forward<F>(func), std::forward<Tasks>(tasks)...);
}
// Function: dependent_async | |
// Function: dependent_async
// Creates a named dependent-async task that runs func once all given
// predecessor tasks finish. Returns a pair of (task handle, future for
// func's result).
template <typename F, typename... Tasks,
  std::enable_if_t<all_same_v<AsyncTask, std::decay_t<Tasks>...>, void>*
>
auto Executor::dependent_async(
  const std::string& name, F&& func, Tasks&&... tasks
) {
  _increment_topology();
  using R = std::invoke_result_t<std::decay_t<F>>;
  std::promise<R> p;
  auto fu{p.get_future()};
  size_t num_dependents = sizeof...(tasks);
  // The node is co-owned by the executor's bookkeeping set (_asyncs) and
  // the returned handle; the deleter recycles it into the node pool.
  std::shared_ptr<Node> node(
    node_pool.animate(
      name, 0, nullptr, nullptr, num_dependents,
      std::in_place_type_t<Node::DependentAsync>{},
      _make_promised_async(std::move(p), std::forward<F>(func))
    ),
    [&](Node* ptr){ node_pool.recycle(ptr); }
  );
  // Publish the node to _asyncs before wiring dependents so concurrent
  // lookups in _process_async_dependent can find it.
  {
    std::scoped_lock lock(_asyncs_mutex);
    _asyncs.insert(node);
  }
  // Link this node to each dependent; a dependent that already finished
  // decrements num_dependents in place.
  if constexpr(sizeof...(Tasks) > 0) {
    (_process_async_dependent(node.get(), tasks, num_dependents), ...);
  }
  // All dependents already done (or none given): schedule immediately.
  if(num_dependents == 0) {
    _schedule_async_task(node.get());
  }
  return std::make_pair(AsyncTask(std::move(node)), std::move(fu));
}
// Function: dependent_async | |
// Convenience overload: creates an unnamed dependent-async task whose
// predecessors are the tasks in the iterator range [first, last).
template <typename F, typename I,
  std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>*
>
auto Executor::dependent_async(F&& func, I first, I last) {
  return dependent_async("", std::forward<F>(func), first, last);
}
// Function: dependent_async | |
template <typename F, typename I, | |
std::enable_if_t<!std::is_same_v<std::decay_t<I>, AsyncTask>, void>* | |
> | |
auto Executor::dependent_async( | |
const std::string& name, F&& func, I first, I last | |
) { | |
_increment_topology(); | |
using R = std::invoke_result_t<std::decay_t<F>>; | |
std::promise<R> p; | |
auto fu{p.get_future()}; | |
size_t num_dependents = std::distance(first, last); | |
std::shared_ptr<Node> node( | |
node_pool.animate( | |
name, 0, nullptr, nullptr, num_dependents, | |
std::in_place_type_t<Node::DependentAsync>{}, | |
_make_promised_async(std::move(p), std::forward<F>(func)) | |
), | |
[&](Node* ptr){ node_pool.recycle(ptr); } | |
); | |
{ | |
std::scoped_lock lock(_asyncs_mutex); | |
_asyncs.insert(node); | |
} | |
for(; first != last; first++) { | |
_process_async_dependent(node.get(), *first, num_dependents); | |
} | |
if(num_dependents == 0) { | |
_schedule_async_task(node.get()); | |
} | |
return std::make_pair(AsyncTask(std::move(node)), std::move(fu)); | |
} | |
// ---------------------------------------------------------------------------- | |
// Dependent Async Helper Functions | |
// ---------------------------------------------------------------------------- | |
// Procedure: _process_async_dependent | |
// Procedure: _process_async_dependent
// Links `node` as a successor of the dependent `task` if that dependent is
// still alive and unfinished; otherwise decrements node's join counter and
// updates the caller-visible num_dependents in place.
inline void Executor::_process_async_dependent(
  Node* node, tf::AsyncTask& task, size_t& num_dependents
) {
  // Look up the dependent in the executor's bookkeeping set; holding a
  // shared_ptr keeps it alive for the rest of this call.
  std::shared_ptr<Node> dep;
  {
    std::scoped_lock lock(_asyncs_mutex);
    if(auto itr = _asyncs.find(task._node); itr != _asyncs.end()){
      dep = *itr;
    }
  }
  // if the dependent task exists
  if(dep) {
    auto& state = std::get_if<Node::DependentAsync>(&(dep->_handle))->state;
    add_dependent:
    auto target = Node::AsyncState::UNFINISHED;
    // acquires the lock: UNFINISHED -> LOCKED guards the successor list
    if(state.compare_exchange_weak(target, Node::AsyncState::LOCKED,
                                   std::memory_order_acq_rel,
                                   std::memory_order_acquire)) {
      dep->_successors.push_back(node);
      state.store(Node::AsyncState::UNFINISHED, std::memory_order_release);
    }
    // dep's state is FINISHED, which means dep finished its callable already;
    // thus decrement the node's join counter by 1
    else if (target == Node::AsyncState::FINISHED) {
      // acq_rel on the decrement synchronizes with the finishing worker
      num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1;
    }
    // another worker adding an async task that shares the same dependent
    // currently holds LOCKED (or the weak CAS failed spuriously): retry
    else {
      goto add_dependent;
    }
  }
  else {
    // dependent no longer in _asyncs, i.e., it has already been torn down
    num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1;
  }
}
// Procedure: _tear_down_dependent_async | |
// Procedure: _tear_down_dependent_async
// Finalizes a dependent-async node after its callable ran: publishes the
// FINISHED state, schedules successors whose dependencies are resolved,
// and removes the node from the executor's bookkeeping set.
inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node) {
  // this async task comes from Executor
  auto& state = std::get_if<Node::DependentAsync>(&(node->_handle))->state;
  auto target = Node::AsyncState::UNFINISHED;
  // Transition UNFINISHED -> FINISHED; if another thread holds LOCKED
  // (adding a successor), spin until it releases back to UNFINISHED.
  while(!state.compare_exchange_weak(target, Node::AsyncState::FINISHED,
                                     std::memory_order_acq_rel,
                                     std::memory_order_relaxed)) {
    target = Node::AsyncState::UNFINISHED;
  }
  // spawn successors whenever their dependencies are resolved
  worker._cache = nullptr;
  for(size_t i=0; i<node->_successors.size(); ++i) {
    //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
    if(auto s = node->_successors[i];
      s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1
    ) {
      // Keep the most recent ready successor cached for this worker to run
      // next; any previously cached one is scheduled normally.
      if(worker._cache) {
        _schedule(worker, worker._cache);
      }
      worker._cache = s;
    }
  }
  // remove myself from the asyncs using extraction to avoid calling
  // ~Node inside the lock
  typename std::unordered_set<std::shared_ptr<Node>>::node_type extracted;
  {
    // No-op deleter: this shared_ptr is used only as a lookup key.
    std::shared_ptr<Node> ptr(node, [](Node*){});
    std::scoped_lock lock(_asyncs_mutex);
    extracted = _asyncs.extract(ptr);
    // assert(extracted.empty() == false);
  }
  _decrement_topology_and_notify();
}
} // end of namespace tf ----------------------------------------------------- | |
/** | |
@file critical.hpp | |
@brief critical include file | |
*/ | |
namespace tf { | |
// ---------------------------------------------------------------------------- | |
// CriticalSection | |
// ---------------------------------------------------------------------------- | |
/** | |
@class CriticalSection | |
@brief class to create a critical region of limited workers to run tasks | |
tf::CriticalSection is a wrapper over tf::Semaphore and is specialized for
limiting the maximum concurrency over a set of tasks. | |
A critical section starts with an initial count representing that limit. | |
When a task is added to the critical section, | |
the task acquires and releases the semaphore internal to the critical section. | |
This design avoids explicit call of tf::Task::acquire and tf::Task::release. | |
The following example creates a critical section of one worker and adds | |
the five tasks to the critical section. | |
@code{.cpp} | |
tf::Executor executor(8); // create an executor of 8 workers | |
tf::Taskflow taskflow; | |
// create a critical section of 1 worker | |
tf::CriticalSection critical_section(1); | |
tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); | |
tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); | |
tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); | |
tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; }); | |
tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; }); | |
critical_section.add(A, B, C, D, E); | |
executor.run(taskflow).wait(); | |
@endcode | |
*/ | |
class CriticalSection : public Semaphore {
  public:
  /**
  @brief constructs a critical region limiting concurrency to
         @c max_workers workers (defaults to one, i.e., mutual exclusion)
  */
  explicit CriticalSection(size_t max_workers = 1);
  /**
  @brief adds one or more tasks into the critical region; each task will
         acquire the internal semaphore before running and release it after
  */
  template <typename... Tasks>
  void add(Tasks...tasks);
};
// Constructor: delegates the concurrency limit to the underlying Semaphore.
inline CriticalSection::CriticalSection(size_t max_workers) :
  Semaphore {max_workers} {
}
// Registers each task with this critical region: every task acquires the
// semaphore (this CriticalSection) before it runs and releases it after,
// so callers need not invoke Task::acquire/release explicitly.
template <typename... Tasks>
void CriticalSection::add(Tasks... tasks) {
  (tasks.acquire(*this), ...);
  (tasks.release(*this), ...);
}
} // end of namespace tf. --------------------------------------------------- | |
/** | |
@dir taskflow | |
@brief root taskflow include dir | |
*/ | |
/** | |
@dir taskflow/core | |
@brief taskflow core include dir | |
*/ | |
/** | |
@dir taskflow/algorithm | |
@brief taskflow algorithms include dir | |
*/ | |
/** | |
@dir taskflow/cuda | |
@brief taskflow CUDA include dir | |
*/ | |
/** | |
@file taskflow/taskflow.hpp | |
@brief main taskflow include file | |
*/ | |
// TF_VERSION % 100 is the patch level
// TF_VERSION / 100 % 1000 is the minor version
// TF_VERSION / 100000 is the major version
// current version: 3.6.0
#define TF_VERSION 300600
// The component macros are parenthesized so they expand safely inside
// larger expressions (e.g. `100 / TF_MAJOR_VERSION` would otherwise parse
// as `100 / TF_VERSION / 100000` and evaluate to 0).
#define TF_MAJOR_VERSION (TF_VERSION/100000)
#define TF_MINOR_VERSION (TF_VERSION/100%1000)
#define TF_PATCH_VERSION (TF_VERSION%100)
/** | |
@brief taskflow namespace | |
*/ | |
namespace tf { | |
/** | |
@private | |
*/ | |
namespace detail { } | |
/** | |
@brief queries the version information in a string format @c major.minor.patch | |
Release notes are available here: https://taskflow.github.io/taskflow/Releases.html | |
*/ | |
// Returns the human-readable version string; must stay in sync with the
// TF_VERSION macro defined above.
constexpr const char* version() {
  return "3.6.0";
}
} // end of namespace tf ----------------------------------------------------- | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment