From 0a5d5ad807cda53a2276f617d5b1f0a09dfb17c2 Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Wed, 18 Mar 2026 19:02:25 -0700
Subject: [PATCH 1/7] use symmetric transfer when co_await'ing a `task` from
 another `task`

---
 include/exec/any_sender_of.hpp                |   2 +-
 include/exec/task.hpp                         |   6 +-
 include/exec/when_any.hpp                     |   4 +-
 include/nvexec/stream/ensure_started.cuh      |   2 +-
 include/nvexec/stream/split.cuh               |   2 +-
 include/nvexec/stream/when_all.cuh            |   2 +-
 include/stdexec/__detail/__as_awaitable.hpp   | 173 +++---
 .../__detail/__parallel_scheduler_backend.hpp |   2 +-
 include/stdexec/__detail/__task.hpp           | 536 ++++++++++++------
 include/stdexec/__detail/__utility.hpp        |  10 -
 include/stdexec/functional.hpp                | 134 ++++-
 include/stdexec/stop_token.hpp                |   7 +-
 12 files changed, 545 insertions(+), 335 deletions(-)

diff --git a/include/exec/any_sender_of.hpp b/include/exec/any_sender_of.hpp
index ed24f088b..05b5fb794 100644
--- a/include/exec/any_sender_of.hpp
+++ b/include/exec/any_sender_of.hpp
@@ -977,7 +977,7 @@ namespace experimental::execution
       STDEXEC_ATTRIBUTE(no_unique_address) _Receiver __rcvr_;
       STDEXEC::inplace_stop_source __stop_source_{};
       using __stop_callback = typename STDEXEC::stop_token_of_t<
-        STDEXEC::env_of_t<_Receiver>>::template callback_type<STDEXEC::__forward_stop_request>;
+        STDEXEC::env_of_t<_Receiver>>::template callback_type<STDEXEC::__forward_stop_request<>>;
       std::optional<__stop_callback> __on_stop_{};
     };
 
diff --git a/include/exec/task.hpp b/include/exec/task.hpp
index eebf599c1..cc4a31a32 100644
--- a/include/exec/task.hpp
+++ b/include/exec/task.hpp
@@ -186,7 +186,7 @@ namespace experimental::execution
     struct __default_awaiter_context<_ParentPromise>
     {
       using __stop_token_t    = stop_token_of_t<env_of_t<_ParentPromise>>;
-      using __stop_callback_t = __stop_token_t::template callback_type<__forward_stop_request>;
+      using __stop_callback_t = __stop_token_t::template callback_type<__forward_stop_request<>>;
 
       template <__scheduler_affinity _Affinity>
       constexpr explicit __default_awaiter_context(__default_task_context_impl<_Affinity>& __self,
@@ -199,7 +199,7 @@ namespace experimental::execution
       {
         static_assert(std::is_nothrow_constructible_v<__stop_callback_t,
                                                       __stop_token_t,
-                                                      __forward_stop_request>);
+                                                      __forward_stop_request<>>);
         __self.__stop_token_ = __stop_source_.get_token();
       }
 
@@ -251,7 +251,7 @@ namespace experimental::execution
         // stop_source when stop is requested on the parent coroutine's stop
         // token.
         using __stop_token_t    = stop_token_of_t<env_of_t<_ParentPromise>>;
-        using __stop_callback_t = stop_callback_for_t<__stop_token_t, __forward_stop_request>;
+        using __stop_callback_t = stop_callback_for_t<__stop_token_t, __forward_stop_request<>>;
 
         if constexpr (std::same_as<__stop_token_t, inplace_stop_token>)
         {
diff --git a/include/exec/when_any.hpp b/include/exec/when_any.hpp
index fa31f4dd6..8691813ba 100644
--- a/include/exec/when_any.hpp
+++ b/include/exec/when_any.hpp
@@ -95,7 +95,7 @@ namespace experimental::execution
       {}
 
       using __on_stop =
-        stop_callback_for_t<stop_token_of_t<env_of_t<_Receiver>&>, __forward_stop_request>;
+        stop_callback_for_t<stop_token_of_t<env_of_t<_Receiver>&>, __forward_stop_request<>>;
 
       inplace_stop_source      __stop_source_{};
       std::optional<__on_stop> __on_stop_{};
@@ -260,7 +260,7 @@ namespace experimental::execution
                                               __copy_cvref_t<_Self, _Senders>...>)
         -> __opstate_t<_Self, _Receiver>
       {
-        return STDEXEC::__apply(STDEXEC::__construct<__opstate_t<_Self, _Receiver>>{},
+        return STDEXEC::__apply(STDEXEC::__construct_from<__opstate_t<_Self, _Receiver>>{},
                                 static_cast<_Self&&>(__self).__sndrs_,
                                 static_cast<_Receiver&&>(__rcvr));
       }
diff --git a/include/nvexec/stream/ensure_started.cuh b/include/nvexec/stream/ensure_started.cuh
index 385816a86..8844020ae 100644
--- a/include/nvexec/stream/ensure_started.cuh
+++ b/include/nvexec/stream/ensure_started.cuh
@@ -247,7 +247,7 @@ namespace nv::execution::_strm
       , public _strm::opstate_base<Receiver>
     {
       using on_stop_t = std::optional<
-        stop_callback_for_t<stop_token_of_t<env_of_t<Receiver>>, __forward_stop_request>>;
+        stop_callback_for_t<stop_token_of_t<env_of_t<Receiver>>, __forward_stop_request<>>>;
 
       on_stop_t                         on_stop_{};
       __intrusive_ptr<sh_state<Sender>> shared_state_;
diff --git a/include/nvexec/stream/split.cuh b/include/nvexec/stream/split.cuh
index 87234c6f6..a63be1bc4 100644
--- a/include/nvexec/stream/split.cuh
+++ b/include/nvexec/stream/split.cuh
@@ -246,7 +246,7 @@ namespace nv::execution::_strm
       , public _strm::opstate_base<Receiver>
     {
       using on_stop_t = std::optional<
-        stop_callback_for_t<stop_token_of_t<env_of_t<Receiver>>, __forward_stop_request>>;
+        stop_callback_for_t<stop_token_of_t<env_of_t<Receiver>>, __forward_stop_request<>>>;
 
       on_stop_t                         on_stop_{};
       std::shared_ptr<sh_state<Sender>> sh_state_;
diff --git a/include/nvexec/stream/when_all.cuh b/include/nvexec/stream/when_all.cuh
index 68919b875..c76c5a22b 100644
--- a/include/nvexec/stream/when_all.cuh
+++ b/include/nvexec/stream/when_all.cuh
@@ -301,7 +301,7 @@ namespace nv::execution::_strm
       using _indices_t          = __indices_for<Senders...>;
       using _stream_providers_t = std::array<stream_provider, sizeof...(Senders)>;
       using _stop_callback_t =
-        stop_callback_for_t<stop_token_of_t<env_of_t<_receiver_t>>, __forward_stop_request>;
+        stop_callback_for_t<stop_token_of_t<env_of_t<_receiver_t>>, __forward_stop_request<>>;
 
       template <class Sender, std::size_t Index>
       using _child_opstate_t =
diff --git a/include/stdexec/__detail/__as_awaitable.hpp b/include/stdexec/__detail/__as_awaitable.hpp
index d46e3c4ba..dc404e6ad 100644
--- a/include/stdexec/__detail/__as_awaitable.hpp
+++ b/include/stdexec/__detail/__as_awaitable.hpp
@@ -17,6 +17,7 @@
 
 #include "__execution_fwd.hpp"
 
+#include "../functional.hpp"
 #include "__atomic.hpp"
 #include "__awaitable.hpp"
 #include "__completion_signatures_of.hpp"
@@ -40,7 +41,10 @@ STDEXEC_PRAGMA_IGNORE_MSVC(4714)  // marked as __forceinline not inlined
 namespace STDEXEC
 {
 #if !STDEXEC_NO_STDCPP_COROUTINES()
-  namespace __detail
+  /////////////////////////////////////////////////////////////////////////////
+  // STDEXEC::as_awaitable [exec.as.awaitable]
+
+  namespace __as_awaitable
   {
     template <std::size_t _Count>
     extern __q<__decayed_std_tuple> const __as_single;
@@ -74,12 +78,7 @@ namespace STDEXEC
     template <class _Sender>
     using __adapted_sender_t =
       __remove_rvalue_reference_t<__call_result_t<__adapt_completion_t<_Sender>, _Sender>>;
-  }  // namespace __detail
 
-  /////////////////////////////////////////////////////////////////////////////
-  // STDEXEC::as_awaitable [exec.as.awaitable]
-  namespace __as_awaitable
-  {
     struct __void
     {};
 
@@ -90,6 +89,8 @@ namespace STDEXEC
     using __expected_t =
       std::variant<std::monostate, __value_or_void_t<_Value>, std::exception_ptr>;
 
+    using __connect_await::__has_as_awaitable_member;
+
     template <class _Tag, class _Sender, class... _Env>
     concept __completes_inline_for = __never_sends<_Tag, _Sender, _Env...>
                                   || STDEXEC::__completes_inline<_Tag, env_of_t<_Sender>, _Env...>;
@@ -278,19 +279,18 @@ namespace STDEXEC
     };
 
     template <class _Sender, class _Promise>
-    using __sync_receiver_t = __sync_receiver<_Promise, __detail::__value_t<_Sender, _Promise>>;
+    using __sync_receiver_t = __sync_receiver<_Promise, __value_t<_Sender, _Promise>>;
 
     template <class _Sender, class _Promise>
-    using __async_receiver_t = __async_receiver<_Promise, __detail::__value_t<_Sender, _Promise>>;
+    using __async_receiver_t = __async_receiver<_Promise, __value_t<_Sender, _Promise>>;
 
     //////////////////////////////////////////////////////////////////////////////////////
     // __sender_awaitable: awaitable type returned by as_awaitable when given a sender
     // that does not have an as_awaitable member function
-    template <class _Promise, class _Sender>
-    struct __sender_awaitable
-      : __sender_awaitable_base<__detail::__value_t<_Sender, _Promise>, false>
+    template <class _Promise, sender_in<env_of_t<_Promise&>> _Sender>
+    struct __sender_awaitable : __sender_awaitable_base<__value_t<_Sender, _Promise>, false>
     {
-      using __value_t = __detail::__value_t<_Sender, _Promise>;
+      using __value_t = __as_awaitable::__value_t<_Sender, _Promise>;
 
       constexpr explicit __sender_awaitable(_Sender&&                         __sndr,
                                             __std::coroutine_handle<_Promise> __hcoro)
@@ -350,12 +350,12 @@ namespace STDEXEC
 
     // When the sender is known to complete inline, we can connect and start the operation
     // in await_suspend.
-    template <class _Promise, class _Sender>
+    template <class _Promise, sender_in<env_of_t<_Promise&>> _Sender>
       requires __completes_inline<_Sender, env_of_t<_Promise&>>
     struct __sender_awaitable<_Promise, _Sender>
-      : __sender_awaitable_base<__detail::__value_t<_Sender, _Promise>, true>
+      : __sender_awaitable_base<__value_t<_Sender, _Promise>, true>
     {
-      using __value_t = __detail::__value_t<_Sender, _Promise>;
+      using __value_t = __as_awaitable::__value_t<_Sender, _Promise>;
 
       constexpr explicit __sender_awaitable(_Sender&&                         sndr,
                                             __std::coroutine_handle<_Promise> __hcoro)
@@ -404,7 +404,7 @@ namespace STDEXEC
 
     template <class _Sender, class _Promise>
     concept __awaitable_adapted_sender = sender_in<_Sender, env_of_t<_Promise&>>
-                                      && __minvocable_q<__detail::__value_t, _Sender, _Promise>
+                                      && __minvocable_q<__value_t, _Sender, _Promise>
                                       && requires(_Promise& __promise) {
                                            {
                                              __promise.unhandled_stopped()
@@ -412,8 +412,7 @@ namespace STDEXEC
                                          };
 
     template <class _Sender, class _Promise>
-    concept __awaitable_sender =
-      __awaitable_adapted_sender<__detail::__adapted_sender_t<_Sender>, _Promise>;
+    concept __awaitable_sender = __awaitable_adapted_sender<__adapted_sender_t<_Sender>, _Promise>;
 
     struct __unspecified
     {
@@ -426,92 +425,68 @@ namespace STDEXEC
     };
 
     template <class _Sender, class _Promise>
-    concept __incompatible_sender = sender<_Sender>
-                                 && __merror<__detail::__value_t<_Sender, _Promise>>;
-  }  // namespace __as_awaitable
+    concept __incompatible_sender = sender<_Sender> && __merror<__value_t<_Sender, _Promise>>;
 
-  struct as_awaitable_t
-  {
-    template <class _Tp, class _Promise>
-    static consteval auto __get_declfn() noexcept
-    {
-      using namespace __as_awaitable;
-      if constexpr (__connect_await::__has_as_awaitable_member<_Tp, _Promise>)
-      {
-        using __result_t = decltype(__declval<_Tp>().as_awaitable(__declval<_Promise&>()));
-        constexpr bool __is_nothrow = noexcept(
-          __declval<_Tp>().as_awaitable(__declval<_Promise&>()));
-        return __declfn<__result_t, __is_nothrow>();
-      }
-      else if constexpr (__awaitable<_Tp, __unspecified>)  // NOT __awaitable<_Tp, _Promise> !!
-      {                                                    // NOLINT(bugprone-branch-clone)
-        return __declfn<_Tp&&>();
-      }
-      else if constexpr (__awaitable_sender<_Tp, _Promise>)
-      {
-        using __result_t            = decltype(  //
-          __sender_awaitable{__detail::__adapt_sender_for_await(__declval<_Tp>()),
-                             __std::coroutine_handle<_Promise>()});
-        constexpr bool __is_nothrow = noexcept(
-          __sender_awaitable{__detail::__adapt_sender_for_await(__declval<_Tp>()),
-                             __std::coroutine_handle<_Promise>()});
-        return __declfn<__result_t, __is_nothrow>();
-      }
-      else if constexpr (__incompatible_sender<_Tp, _Promise>)
-      {
-        // NOT TO SPEC: It's a sender, but it isn't a sender in the current promise's
-        // environment, so we can return the error type that results from trying to
-        // compute the sender's value type:
-        return __declfn<__detail::__value_t<_Tp, _Promise>>();
-      }
-      else
-      {
-        return __declfn<_Tp&&>();
-      }
-    }
+    template <class _Sender, class _Promise>
+    concept __has_transform_as_awaitable_member =
+      sender_in<_Sender, env_of_t<_Promise>>
+      && __has_as_awaitable_member<transform_sender_result_t<_Sender, env_of_t<_Promise>>,
+                                   _Promise>;
 
-    template <class _Tp, class _Promise, auto _DeclFn = __get_declfn<_Tp, _Promise>()>
-      requires __callable<__mtypeof<_DeclFn>>
-    auto operator()(_Tp&& __t, _Promise& __promise) const noexcept(noexcept(_DeclFn()))
-      -> decltype(_DeclFn())
+    template <class _Sender, class _Promise>
+    concept __awaitable_transform_sender =  //
+      sender_in<_Sender, env_of_t<_Promise>>
+      && __awaitable_sender<transform_sender_result_t<_Sender, env_of_t<_Promise>>, _Promise>;
+
+    inline constexpr auto __with_member =  // __t.as_awaitable(__promise); _Promise is deduced
+      []<class _Promise, __has_as_awaitable_member<_Promise> _Tp>(_Tp&& __t, _Promise& __promise)
+        STDEXEC_AUTO_RETURN(static_cast<_Tp&&>(__t).as_awaitable(__promise));
+
+    inline constexpr auto __with_transform_member =  //
+      []<class _Promise, __has_transform_as_awaitable_member<_Promise> _Tp>(_Tp&&     __t,
+                                                                            _Promise& __promise)
+        STDEXEC_AUTO_RETURN(
+          STDEXEC::transform_sender(static_cast<_Tp&&>(__t), STDEXEC::get_env(__promise))
+            .as_awaitable(__promise));
+
+    inline constexpr auto __with_await =  //
+      []<__awaitable<__unspecified> _Tp>(_Tp&& __t, __ignore)
+        STDEXEC_AUTO_RETURN(static_cast<_Tp&&>(__t));
+
+    inline constexpr auto __with_sender =  //
+      []<class _Promise, __awaitable_transform_sender<_Promise> _Tp>(_Tp&& __t, _Promise& __promise)
+        STDEXEC_AUTO_RETURN(__sender_awaitable{
+          __as_awaitable::__adapt_sender_for_await(
+            STDEXEC::transform_sender(static_cast<_Tp&&>(__t), STDEXEC::get_env(__promise))),
+          __std::coroutine_handle<_Promise>::from_promise(__promise)});
+
+    // NOT TO SPEC: It's a sender, but it isn't a sender in the current promise's
+    // environment, so we can return the error type that results from trying to
+    // compute the sender's value type:
+    inline constexpr auto __with_incompatible_sender =  //
+      []<class _Promise, __incompatible_sender<_Promise> _Tp>(_Tp&&, _Promise&)
     {
-      using namespace __as_awaitable;
-      if constexpr (__connect_await::__has_as_awaitable_member<_Tp, _Promise>)
-      {
-        return static_cast<_Tp&&>(__t).as_awaitable(__promise);
-      }
-      else if constexpr (__awaitable<_Tp, __unspecified>)  // NOT __awaitable<_Tp, _Promise> !!
-      {                                                    // NOLINT(bugprone-branch-clone)
-        return static_cast<_Tp&&>(__t);
-      }
-      else if constexpr (__awaitable_sender<_Tp, _Promise>)
-      {
-        auto __hcoro = __std::coroutine_handle<_Promise>::from_promise(__promise);
-        return __sender_awaitable{__detail::__adapt_sender_for_await(static_cast<_Tp&&>(__t)),
-                                  __hcoro};
-      }
-      else if constexpr (__incompatible_sender<_Tp, _Promise>)
-      {
-        return __detail::__value_t<_Tp, _Promise>();
-      }
-      else
-      {
-        return static_cast<_Tp&&>(__t);
-      }
-    }
+      return __value_t<_Tp, _Promise>{};
+    };
 
-    template <class _Tp, class _Promise, auto _DeclFn = __get_declfn<_Tp, _Promise>()>
-      requires __callable<__mtypeof<_DeclFn>> || __tag_invocable<as_awaitable_t, _Tp, _Promise&>
-    [[deprecated("the use of tag_invoke for as_awaitable is deprecated")]]
-    auto operator()(_Tp&& __t, _Promise& __promise) const
-      noexcept(__nothrow_tag_invocable<as_awaitable_t, _Tp, _Promise&>)
-        -> __tag_invoke_result_t<as_awaitable_t, _Tp, _Promise&>
+    inline constexpr auto __identity =  //
+      []<class _Tp>(_Tp&& __t, __ignore) noexcept -> decltype(auto)
     {
-      using __result_t = __tag_invoke_result_t<as_awaitable_t, _Tp, _Promise&>;
-      static_assert(__awaitable<__result_t, _Promise>);
-      return __tag_invoke(*this, static_cast<_Tp&&>(__t), __promise);
-    }
-  };
+      return static_cast<_Tp&&>(__t);
+    };
+
+    inline constexpr auto __as_awaitable_impl =  //
+      __first_callable{__with_member,
+                       __with_transform_member,
+                       __with_await,
+                       __with_sender,
+                       __with_incompatible_sender,
+                       __identity};
+
+  }  // namespace __as_awaitable
+
+  struct as_awaitable_t : decltype(__as_awaitable::__as_awaitable_impl)
+  {};
 
   inline constexpr as_awaitable_t as_awaitable{};
 #endif
diff --git a/include/stdexec/__detail/__parallel_scheduler_backend.hpp b/include/stdexec/__detail/__parallel_scheduler_backend.hpp
index 502c89264..421f86548 100644
--- a/include/stdexec/__detail/__parallel_scheduler_backend.hpp
+++ b/include/stdexec/__detail/__parallel_scheduler_backend.hpp
@@ -116,7 +116,7 @@ namespace STDEXEC
     template <class _Token>
     struct __stop_callback_for
     {
-      using __callback_t = stop_callback_for_t<_Token, __forward_stop_request>;
+      using __callback_t = stop_callback_for_t<_Token, __forward_stop_request<>>;
 
       bool __register_stop_callback(_Token __token)
       {
diff --git a/include/stdexec/__detail/__task.hpp b/include/stdexec/__detail/__task.hpp
index a3c125cfd..496fd23f2 100644
--- a/include/stdexec/__detail/__task.hpp
+++ b/include/stdexec/__detail/__task.hpp
@@ -15,11 +15,11 @@
  */
 #pragma once
 
+#include "../stop_token.hpp"
 #include "__affine_on.hpp"
 #include "__as_awaitable.hpp"
 #include "__config.hpp"
 #include "__inline_scheduler.hpp"
-#include "__manual_lifetime.hpp"
 #include "__meta.hpp"
 #include "__optional.hpp"
 #include "__schedulers.hpp"
@@ -58,19 +58,6 @@ namespace STDEXEC
       constexpr void return_void() {}
     };
 
-    template <class _StopSource>
-    struct __on_stopped
-    {
-      void operator()() noexcept
-      {
-        __source_.request_stop();
-      }
-      _StopSource& __source_;
-    };
-
-    template <class _StopSource>
-    __on_stopped(_StopSource&) -> __on_stopped<_StopSource>;
-
     constexpr size_t __divmod(size_t __total_size, size_t __chunk_size) noexcept
     {
       return (__total_size / __chunk_size) + (__total_size % __chunk_size != 0);
@@ -135,18 +122,68 @@ namespace STDEXEC
     template <class _Env>
     using __stop_source_type = _Env::stop_source_type;
 
-    template <class _Env, class _Rcvr>
-    using __environment_type = _Env::template env_type<env_of_t<_Rcvr>>;
-
     template <class _Env>
-    using __error_types = __error_types_t<typename _Env::error_types,
-                                          __munique<__qq<completion_signatures>>,
-                                          __mcompose<__qf<set_error_t>, __q1<__decay_t>>>;
-
-    template <class _Rcvr, class _Alloc>
-    concept __has_allocator_compatible_with = requires(_Rcvr& __rcvr) {
-      _Alloc(STDEXEC::get_allocator(STDEXEC::get_env(__rcvr)));
-    } || std::default_initializable<_Alloc>;
+    using __error_types = _Env::error_types;
+
+    template <class _Env, class _EnvProvider>
+    using __environment_type = _Env::template env_type<env_of_t<_EnvProvider>>;
+
+    template <class _EnvProvider, class _Alloc>
+    concept __has_allocator_compatible_with = requires(_EnvProvider const & __has_env) {
+      _Alloc(STDEXEC::get_allocator(STDEXEC::get_env(__has_env)));
+    };
+
+    template <class _EnvProvider, class _Scheduler>
+    concept __has_scheduler_compatible_with = requires(_EnvProvider const & __has_env) {
+      _Scheduler(STDEXEC::get_scheduler(STDEXEC::get_env(__has_env)));
+    };
+
+    template <class _StopSource>
+    using __stop_source_token_t = decltype(__declval<_StopSource>().get_token());
+
+    template <class _StopToken, class _StopSource>
+    struct __stop_callback_box
+    {
+      void __register_callback(__ignore, __ignore) noexcept {}
+      void __reset_callback() noexcept {}
+    };
+
+    template <class _StopToken, class _StopSource>
+      requires __not_same_as<__stop_source_token_t<_StopSource>, _StopToken>
+    struct __stop_callback_box<_StopToken, _StopSource>
+    {
+      using __stop_variant_t  = __variant<_StopSource, __stop_source_token_t<_StopSource>>;
+      using __callback_fn_t   = __forward_stop_request<_StopSource>;
+      using __stop_callback_t = stop_callback_for_t<_StopToken, __callback_fn_t>;
+
+      constexpr __stop_callback_box() {}
+      ~__stop_callback_box() {}  // user-provided on purpose; it does NOT destroy __cb_
+      void __register_callback(auto const & __has_env, __stop_variant_t& __stop)
+        noexcept(__nothrow_constructible_from<__stop_callback_t, _StopToken, _StopSource&>)
+      {
+        std::construct_at(&__cb_, get_stop_token(get_env(__has_env)), __var::__get<0>(__stop));
+      }
+      // Destroys the callback in place; __cb_ must have been built by __register_callback.
+      void __reset_callback() noexcept
+      {
+        std::destroy_at(&__cb_);
+      }
+      // Raw storage: the two functions above start and end the lifetime of __cb_ by hand.
+      union
+      {
+        __stop_callback_t __cb_;
+      };
+    };
+
+    template <class _EnvProvider, class _StopSource>
+    using __stop_callback_box_t =
+      __stop_callback_box<stop_token_of_t<env_of_t<_EnvProvider>>, _StopSource>;
+
+    inline constexpr auto __throw_error = __overload{
+      []<class _Error>(_Error&& __error) { STDEXEC_THROW(static_cast<_Error&&>(__error)); },
+      [](std::error_code __ec) { STDEXEC_THROW(std::system_error(__ec)); },
+      [](std::exception_ptr __eptr) { std::rethrow_exception(__eptr); }};
+
   }  // namespace __task
 
   ////////////////////////////////////////////////////////////////////////////////
@@ -169,6 +206,8 @@ namespace STDEXEC
     struct __promise;
     template <class _Rcvr>
     struct __opstate;
+    template <class _EnvProvider>
+    using __own_env_t = __minvoke_or_q<__task::__environment_type, env<>, _Env, _EnvProvider>;
    public:
     using sender_concept = sender_t;
     using promise_type   = __promise;
@@ -193,9 +232,10 @@ namespace STDEXEC
     template <receiver _Rcvr>
     constexpr auto connect(_Rcvr rcvr) && -> __opstate<_Rcvr>
     {
+      static_assert(__task::__has_allocator_compatible_with<_Rcvr, allocator_type>
+                    || std::default_initializable<allocator_type>);
       STDEXEC_ASSERT(__coro_);
-      static_assert(__task::__has_allocator_compatible_with<_Rcvr, allocator_type>);
-      return __opstate<_Rcvr>(std::exchange(__coro_, {}), static_cast<_Rcvr&&>(rcvr));
+      return __opstate<_Rcvr>(static_cast<task&&>(*this), static_cast<_Rcvr&&>(rcvr));
     }
 
     template <class>
@@ -207,133 +247,98 @@ namespace STDEXEC
     [[nodiscard]]
     constexpr auto get_env() const noexcept
     {
-      return __env{};
+      return __attrs{};
+    }
+
+    template <class _ParentPromise>
+    constexpr auto as_awaitable(_ParentPromise& __parent) && noexcept
+    {
+      return __awaiter<_ParentPromise>(static_cast<task&&>(*this), __parent);
     }
 
    private:
-    using __on_stopped_t = __task::__on_stopped<stop_source_type>;
+    using __on_stopped_t   = __forward_stop_request<stop_source_type>;
+    using __stop_variant_t = __variant<stop_source_type, stop_token_type>;
 
-    using __error_variant_t = __error_types_t<error_types, __q<__variant>, __q1<__decay_t>>;
+    template <class _EnvProvider>
+    using __stop_callback_t =
+      stop_callback_for_t<stop_token_of_t<env_of_t<_EnvProvider>>, __on_stopped_t>;
 
-    using __completions_t = __concat_completion_signatures_t<
-      completion_signatures<__detail::__single_value_sig_t<_Ty>, set_stopped_t()>,
-      error_types>;
-
-    template <class _Rcvr>
-    using __stop_callback_t = stop_callback_for_t<stop_token_of_t<env_of_t<_Rcvr>>, __on_stopped_t>;
+    template <class _EnvProvider>
+    using __stop_callback_box_t = __task::__stop_callback_box_t<_EnvProvider, stop_source_type>;
 
-    template <class _Rcvr>
+    template <class _EnvProvider>
     static constexpr bool __needs_stop_callback =
-      __not_same_as<stop_token_type, stop_token_of_t<env_of_t<_Rcvr>>>;
-
-    struct __env
-    {
-      template <class _Tag>
-      [[nodiscard]]
-      constexpr auto query(__get_completion_behavior_t<_Tag>) const noexcept
-      {
-        return __completion_behavior::__asynchronous_affine
-             | __completion_behavior::__inline_completion;
-      }
-    };
+      __not_same_as<stop_token_type, stop_token_of_t<env_of_t<_EnvProvider>>>;
 
-    struct __opstate_base
-    {
-      constexpr explicit __opstate_base(scheduler_type __sched) noexcept
-        : __sch_(std::move(__sched))
-      {}
-
-      virtual void __completed() noexcept                       = 0;
-      virtual void __canceled() noexcept                        = 0;
-      virtual auto __get_allocator() noexcept -> allocator_type = 0;
-
-      scheduler_type    __sch_;
-      __error_variant_t __errors_{__no_init};
-    };
+    template <class _EnvProvider>
+    static constexpr bool __nothrow_callback_registration = noexcept(
+      __declval<__stop_callback_box_t<_EnvProvider>&>()
+        .__register_callback(__declval<_EnvProvider&>(), __declval<__stop_variant_t&>()));
 
-    constexpr explicit task(__std::coroutine_handle<promise_type> __coro) noexcept
-      : __coro_(std::move(__coro))
-    {}
+    using __error_variant_t = __error_types_t<error_types, __q<__variant>, __q1<__decay_t>>;
 
-    __std::coroutine_handle<promise_type> __coro_;
-  };
+    using __completions_t = __concat_completion_signatures_t<
+      completion_signatures<__detail::__single_value_sig_t<_Ty>, set_stopped_t()>,
+      error_types>;
 
-  ////////////////////////////////////////////////////////////////////////////////////////
-  // task<T,E>::__opstate
-  template <class _Ty, class _Env>
-  template <class _Rcvr>
-  struct STDEXEC_ATTRIBUTE(empty_bases) task<_Ty, _Env>::__opstate final
-    : __opstate_base
-    , __if_c<__needs_stop_callback<_Rcvr>, __manual_lifetime<__stop_callback_t<_Rcvr>>, __empty>
-  {
-   public:
-    using operation_state_concept = operation_state_t;
+    static constexpr void __sink(task) noexcept {}
 
-    explicit __opstate(__std::coroutine_handle<promise_type> __coro, _Rcvr&& __rcvr) noexcept
-      : __opstate_base(__mk_sched(__rcvr))
-      , __coro_(std::move(__coro))
-      , __rcvr_(static_cast<_Rcvr&&>(__rcvr))
-      , __own_env_(__mk_own_env(__rcvr_))
-      , __env_(__mk_env(__rcvr_, __own_env_))
+    template <class _EnvProvider>
+    [[nodiscard]]
+    static auto __mk_alloc(_EnvProvider const & __has_env) noexcept -> allocator_type
     {
-      // Set the promise's state pointer to this operation state, so it can call back into
-      // it when the coroutine completes or is stopped.
-      __coro_.promise().__state_ = this;
-      // Initialize the promise's stop source if translation is needed between the
-      // receiver's stop token and the task's stop token:
-      if constexpr (__needs_stop_callback<_Rcvr>)
+      if constexpr (__task::__has_allocator_compatible_with<_EnvProvider, allocator_type>)
       {
-        __coro_.promise().__stop_.template emplace<0>();
+        return allocator_type(get_allocator(STDEXEC::get_env(__has_env)));
       }
       else
       {
-        __coro_.promise().__stop_.template emplace<1>(get_stop_token(STDEXEC::get_env(__rcvr_)));
+        return allocator_type{};
       }
     }
 
-    ~__opstate()
-    {
-      if (__coro_)
-        __coro_.destroy();
-    }
-
-    void start() & noexcept
+    template <class _EnvProvider>
+    [[nodiscard]]
+    static auto __mk_sched(_EnvProvider const & __has_env) noexcept -> scheduler_type
     {
-      if constexpr (__needs_stop_callback<_Rcvr>)
+      if constexpr (__task::__has_scheduler_compatible_with<_EnvProvider, scheduler_type>)
+      {
+        return scheduler_type(get_scheduler(STDEXEC::get_env(__has_env)));
+      }
+      else
       {
-        // If the receiver's stop token is different from the task's stop token, then we need
-        // to set up a callback to request a stop on the task's stop source when the receiver's
-        // stop token is triggered:
-        __stop_callback().__construct(get_stop_token(STDEXEC::get_env(__rcvr_)),
-                                      __on_stopped_t{__var::__get<0>(__coro_.promise().__stop_)});
+        return scheduler_type{};
       }
-      __coro_.resume();
     }
 
-   private:
-    using __own_env_t = __minvoke_or_q<__task::__environment_type, env<>, _Env, _Rcvr>;
-
-    static auto __mk_own_env(_Rcvr const & __rcvr) noexcept -> __own_env_t
+    template <class _EnvProvider>
+    [[nodiscard]]
+    static auto __mk_own_env(_EnvProvider const & __has_env) noexcept
     {
-      if constexpr (__std::constructible_from<__own_env_t, env_of_t<_Rcvr>>)
+      if constexpr (__std::constructible_from<__own_env_t<_EnvProvider>, env_of_t<_EnvProvider>>)
       {
-        return __own_env_t(STDEXEC::get_env(__rcvr));
+        return __own_env_t<_EnvProvider>(STDEXEC::get_env(__has_env));
       }
       else
       {
-        return __own_env_t{};
+        return __own_env_t<_EnvProvider>{};
       }
     }
 
-    static auto __mk_env(_Rcvr const & __rcvr, __own_env_t const & __own_env) noexcept -> _Env
+    template <class _EnvProvider>
+    [[nodiscard]]
+    static auto
+    __mk_env(_EnvProvider const & __has_env, __own_env_t<_EnvProvider> const & __own_env) noexcept
+      -> _Env
     {
-      if constexpr (__std::constructible_from<_Env, __own_env_t const &>)
+      if constexpr (__std::constructible_from<_Env, __own_env_t<_EnvProvider> const &>)
       {
         return _Env(__own_env);
       }
-      else if constexpr (__std::constructible_from<_Env, env_of_t<_Rcvr>>)
+      else if constexpr (__std::constructible_from<_Env, env_of_t<_EnvProvider>>)
       {
-        return _Env(STDEXEC::get_env(__rcvr));
+        return _Env(STDEXEC::get_env(__has_env));
       }
       else
       {
@@ -341,93 +346,247 @@ namespace STDEXEC
       }
     }
 
-    static auto __mk_sched(_Rcvr const & __rcvr) noexcept -> scheduler_type
+    struct __opstate_base : allocator_type
     {
-      if constexpr (requires { scheduler_type(get_scheduler(STDEXEC::get_env(__rcvr))); })
+      template <class _EnvProvider>
+      constexpr explicit __opstate_base(task&& __task, _EnvProvider const & __has_env) noexcept
+        : allocator_type(__mk_alloc(__has_env))
+        , __sch_(__mk_sched(__has_env))
+        , __task_(static_cast<task&&>(__task))
       {
-        return scheduler_type(get_scheduler(STDEXEC::get_env(__rcvr)));
+        auto& __promise = __task_.__coro_.promise();
+        // Set the promise's state pointer to this operation state, so it can call back into
+        // it when the coroutine completes or is stopped.
+        __promise.__state_ = this;
+
+        // Initialize the promise's stop source if translation is needed between the
+        // receiver's stop token and the task's stop token:
+        if constexpr (__needs_stop_callback<_EnvProvider>)
+        {
+          __promise.__stop_.template emplace<0>();
+        }
+        else
+        {
+          __promise.__stop_.template emplace<1>(get_stop_token(STDEXEC::get_env(__has_env)));
+        }
       }
-      else
+
+      STDEXEC_IMMOVABLE(__opstate_base);
+
+      virtual auto __completed() noexcept -> __std::coroutine_handle<> = 0;
+      virtual auto __canceled() noexcept -> __std::coroutine_handle<>  = 0;
+
+      [[nodiscard]]
+      constexpr auto __get_allocator() const noexcept -> allocator_type
       {
-        return scheduler_type{};
+        return static_cast<allocator_type const &>(*this);
       }
-    }
 
-    auto __stop_callback() noexcept -> __manual_lifetime<__stop_callback_t<_Rcvr>>&
-      requires __needs_stop_callback<_Rcvr>
-    {
-      return *this;
-    }
+      constexpr auto __handle() const noexcept -> __std::coroutine_handle<promise_type>
+      {
+        return __task_.__coro_;
+      }
+
+      scheduler_type    __sch_;
+      task              __task_;
+      __error_variant_t __errors_{__no_init};
+    };
 
-    void __completed() noexcept final
+    template <class _ParentPromise>
+    struct STDEXEC_ATTRIBUTE(empty_bases) __awaiter final
+      : __opstate_base
+      , __stop_callback_box_t<_ParentPromise>
     {
-      if constexpr (__needs_stop_callback<_Rcvr>)
+      constexpr explicit __awaiter(task&& __task, _ParentPromise& __parent) noexcept
+        : __opstate_base(static_cast<task&&>(__task), __parent)
+        , __own_env_(__mk_own_env(__parent))
+        , __env_(__mk_env(__parent, __own_env_))
+        , __parent_(__parent)
+      {}
+
+      static constexpr auto await_ready() noexcept -> bool
       {
-        // If we set up a stop callback on the receiver's stop token, then we need to
-        // disable it when the operation completes:
-        __stop_callback().__destroy();
+        return false;
       }
 
-      if (this->__errors_.index() != __variant_npos)
+      constexpr auto await_suspend(__std::coroutine_handle<_ParentPromise> __h)
+        noexcept(__nothrow_callback_registration<_ParentPromise>) -> __std::coroutine_handle<>
       {
-        std::exchange(__coro_, {}).destroy();
-        __visit(STDEXEC::set_error, std::move(this->__errors_), static_cast<_Rcvr&&>(__rcvr_));
+        auto& __task_promise = this->__handle().promise();
+        // If the following throws, the coroutine is immediately resumed and the exception
+        // is rethrown at the suspension point.
+        this->__register_callback(__h.promise(), __task_promise.__stop_);
+        __task_promise.__state_ = this;
+        __continuation_         = __h;
+        return this->__handle();
       }
-      else if constexpr (__same_as<_Ty, void>)
+
+      constexpr auto await_resume() -> _Ty
       {
-        std::exchange(__coro_, {}).destroy();
-        STDEXEC::set_value(static_cast<_Rcvr&&>(__rcvr_));
+        // Destroy the coroutine after moving the result/error out of it
+        auto __task = std::move(this->__task_);
+        if (!this->__errors_.__is_valueless())
+        {
+          __visit(__task::__throw_error, std::move(this->__errors_));
+        }
+        else if constexpr (__same_as<_Ty, void>)
+        {
+          return;
+        }
+        else
+        {
+          return static_cast<_Ty&&>(*__task.__coro_.promise().__result_);
+        }
+        __std::unreachable();
       }
-      else
+
+      [[nodiscard]]
+      auto __completed() noexcept -> __std::coroutine_handle<> final
       {
-        STDEXEC_TRY
+        this->__reset_callback();
+        return __continuation_;
+      }
+
+      [[nodiscard]]
+      auto __canceled() noexcept -> __std::coroutine_handle<> final
+      {
+        this->__reset_callback();
+        return __parent_.unhandled_stopped();
+      }
+
+      STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS
+      __own_env_t<_ParentPromise> __own_env_;
+      STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS
+      _Env                        __env_;
+      __std::coroutine_handle<>   __continuation_;
+      _ParentPromise&             __parent_;
+    };
+
+    struct __attrs
+    {
+      template <class _Tag, class... _OtherEnv>
+      [[nodiscard]]
+      constexpr auto query(__get_completion_behavior_t<_Tag>, _OtherEnv&&...) const noexcept
+      {
+        using __attrs_t = env_of_t<schedule_result_t<scheduler_type>>;
+
+        if constexpr (__completes_inline<set_value_t, __attrs_t, _OtherEnv...>)
         {
-          // Move the result out of the promise before destroying the coroutine.
-          _Ty __result = static_cast<_Ty&&>(*__coro_.promise().__result_);
-          std::exchange(__coro_, {}).destroy();
-          STDEXEC::set_value(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Ty&&>(__result));
+          return __completion_behavior::__unknown;
         }
-        STDEXEC_CATCH_ALL
+        else
         {
-          if constexpr (!__nothrow_move_constructible<_Ty>)
-          {
-            std::exchange(__coro_, {}).destroy();
-            STDEXEC::set_error(static_cast<_Rcvr&&>(__rcvr_), std::current_exception());
-          }
+          return __completion_behavior::__asynchronous_affine
+               | __completion_behavior::__inline_completion;
         }
       }
-    }
+    };
+
+    constexpr explicit task(__std::coroutine_handle<promise_type> __coro) noexcept
+      : __coro_(std::move(__coro))
+    {}
+
+    __std::coroutine_handle<promise_type> __coro_;
+  };
 
-    void __canceled() noexcept final
+  ////////////////////////////////////////////////////////////////////////////////////////
+  // task<T,E>::__opstate
+  template <class _Ty, class _Env>
+  template <class _Rcvr>
+  struct STDEXEC_ATTRIBUTE(empty_bases) task<_Ty, _Env>::__opstate final
+    : __opstate_base
+    , __stop_callback_box_t<_Rcvr>
+  {
+   public:
+    using operation_state_concept = operation_state_t;
+
+    explicit __opstate(task&& __task, _Rcvr&& __rcvr) noexcept
+      : __opstate_base(static_cast<task&&>(__task), __rcvr)
+      , __own_env_(__mk_own_env(__rcvr))
+      , __env_(__mk_env(__rcvr, __own_env_))
+      , __rcvr_(static_cast<_Rcvr&&>(__rcvr))
+    {}
+
+    void start() & noexcept
     {
-      if constexpr (__needs_stop_callback<_Rcvr>)
+      STDEXEC_TRY
       {
-        __stop_callback().__destroy();
+        // Register a stop callback if needed
+        this->__register_callback(__rcvr_, this->__handle().promise().__stop_);
+        this->__handle().resume();
+      }
+      STDEXEC_CATCH_ALL
+      {
+        if constexpr (__nothrow_callback_registration<_Rcvr>)
+        {
+          // no-op
+        }
+        else if constexpr (__mapply<__mcontains<set_error_t(std::exception_ptr)>,
+                                    error_types>::value)
+        {
+          STDEXEC::set_error(static_cast<_Rcvr&&>(__rcvr_), std::current_exception());
+        }
+        else
+        {
+          STDEXEC::__die("Starting the task failed due to an exception being thrown while "
+                         "registering a stop callback, but the task's error_types does not "
+                         "include std::exception_ptr, so the exception cannot be propagated.");
+        }
       }
-
-      std::exchange(__coro_, {}).destroy();
-      STDEXEC::set_stopped(static_cast<_Rcvr&&>(__rcvr_));
     }
 
-    [[nodiscard]]
-    auto __get_allocator() noexcept -> allocator_type final
+   private:
+    auto __completed() noexcept -> __std::coroutine_handle<> final
     {
-      if constexpr (requires { allocator_type(get_allocator(STDEXEC::get_env(__rcvr_))); })
+      STDEXEC_TRY
       {
-        return allocator_type(get_allocator(STDEXEC::get_env(__rcvr_)));
+        this->__reset_callback();
+
+        if (!this->__errors_.__is_valueless())
+        {
+          // Move the errors out of the promise before destroying the coroutine.
+          auto __errors = std::move(this->__errors_);
+          __sink(static_cast<task&&>(this->__task_));
+          __visit(STDEXEC::set_error, std::move(__errors), static_cast<_Rcvr&&>(__rcvr_));
+        }
+        else if constexpr (__same_as<_Ty, void>)
+        {
+          __sink(static_cast<task&&>(this->__task_));
+          STDEXEC::set_value(static_cast<_Rcvr&&>(__rcvr_));
+        }
+        else
+        {
+          // Move the result out of the promise before destroying the coroutine.
+          _Ty __result = static_cast<_Ty&&>(*this->__handle().promise().__result_);
+          __sink(static_cast<task&&>(this->__task_));
+          STDEXEC::set_value(static_cast<_Rcvr&&>(__rcvr_), static_cast<_Ty&&>(__result));
+        }
       }
-      else
+      STDEXEC_CATCH_ALL
       {
-        return allocator_type{};
+        if constexpr (!__nothrow_move_constructible<_Ty>
+                      || !__nothrow_move_constructible<__error_variant_t>)
+        {
+          __sink(static_cast<task&&>(this->__task_));
+          STDEXEC::set_error(static_cast<_Rcvr&&>(__rcvr_), std::current_exception());
+        }
       }
+      return std::noop_coroutine();
+    }
+
+    auto __canceled() noexcept -> __std::coroutine_handle<> final
+    {
+      this->__reset_callback();
+      __sink(static_cast<task&&>(this->__task_));
+      STDEXEC::set_stopped(static_cast<_Rcvr&&>(__rcvr_));
+      return std::noop_coroutine();
     }
 
-    __std::coroutine_handle<promise_type> __coro_;
-    _Rcvr                                 __rcvr_;
     STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS
-    __own_env_t                           __own_env_;
+    __own_env_t<_Rcvr> __own_env_;
     STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS
-    _Env                                  __env_;
+    _Env               __env_;
+    _Rcvr              __rcvr_;
   };
 
   ////////////////////////////////////////////////////////////////////////////////////////
@@ -450,7 +609,7 @@ namespace STDEXEC
 
     auto final_suspend() noexcept
     {
-      return __completed_awaitable{};
+      return __completed_awaiter{};
     }
 
     void unhandled_exception()
@@ -458,10 +617,8 @@ namespace STDEXEC
       if constexpr (!__mapply<__mcontains<std::exception_ptr>, __error_variant_t>::value)
       {
         STDEXEC::__die("A task threw an exception but does not have std::exception_ptr in its "
-                       "error_types. "
-                       "Either add std::exception_ptr to the task's error_types or ensure that all "
-                       "code called "
-                       "by the task is noexcept.");
+                       "error_types. Either add std::exception_ptr to the task's error_types or "
+                       "ensure that all code called by the task is noexcept.");
       }
       else
       {
@@ -472,13 +629,12 @@ namespace STDEXEC
     [[nodiscard]]
     auto unhandled_stopped() noexcept -> __std::coroutine_handle<>
     {
-      __state_->__canceled();
-      return std::noop_coroutine();
+      return __state_->__canceled();
     }
 
     template <class _Error>
-    constexpr auto
-    yield_value(with_error<_Error> __error) noexcept(__nothrow_decay_copyable<_Error>)
+    constexpr auto yield_value(with_error<_Error> __error)  //
+      noexcept(__nothrow_decay_copyable<_Error>)
     {
       if constexpr (__mapply<__mcontains<__decay_t<_Error>>, __error_variant_t>::value)
       {
@@ -488,13 +644,14 @@ namespace STDEXEC
       {
         static_assert(__mnever<_Error>, "Error type not in task's error_types");
       }
-      return __completed_awaitable{};
+      return __completed_awaiter{};
     }
 
     template <sender _Sender>
     constexpr auto await_transform(_Sender&& __sndr) noexcept
     {
-      if constexpr (__same_as<scheduler_type, STDEXEC::inline_scheduler>)
+      using __schedule_sndr_t = schedule_result_t<scheduler_type>;
+      if constexpr (__completes_inline<set_value_t, env_of_t<__schedule_sndr_t>, __env>)
       {
         return STDEXEC::as_awaitable(static_cast<_Sender&&>(__sndr), *this);
       }
@@ -537,10 +694,10 @@ namespace STDEXEC
         __task::__divmod(sizeof(__task::__any_alloc<__palloc_t>), sizeof(__task::__memblock));
       size_t const __promise_blocks = __task::__divmod(__bytes, sizeof(__task::__memblock));
 
-      __palloc_t        __palloc(__alloc);
-      __pointer_t const __ptr = std::allocator_traits<__palloc_t>::allocate(__palloc,
-                                                                            __promise_blocks
-                                                                              + __alloc_blocks);
+      __palloc_t  __palloc(__alloc);
+      auto* const __ptr = std::allocator_traits<__palloc_t>::allocate(__palloc,
+                                                                      __promise_blocks
+                                                                        + __alloc_blocks);
 
       // construct the allocator in the blocks immediately following the promise object:
       void* const __alloc_loc = __ptr + __promise_blocks;
@@ -557,20 +714,23 @@ namespace STDEXEC
     }
 
    private:
+    template <class>
+    friend struct __awaiter;
     template <class>
     friend struct __opstate;
+    friend struct __opstate_base;
 
-    struct __completed_awaitable
+    struct __completed_awaiter
     {
       static constexpr bool await_ready() noexcept
       {
         return false;
       }
 
-      static constexpr void await_suspend(__std::coroutine_handle<__promise> __coro) noexcept
+      static constexpr auto await_suspend(__std::coroutine_handle<__promise> __coro) noexcept  //
+        -> __std::coroutine_handle<>
       {
-        __promise& __self = __coro.promise();
-        __self.__state_->__completed();
+        return __coro.promise().__state_->__completed();
       }
 
       static constexpr void await_resume() noexcept {}
@@ -606,8 +766,8 @@ namespace STDEXEC
       __promise const * __promise_;
     };
 
-    __variant<stop_source_type, stop_token_type> __stop_{__no_init};
-    __opstate_base*                              __state_ = nullptr;
+    __stop_variant_t __stop_{__no_init};
+    __opstate_base*  __state_ = nullptr;
   };
 #endif  // !STDEXEC_NO_STDCPP_COROUTINES()
 }  // namespace STDEXEC
diff --git a/include/stdexec/__detail/__utility.hpp b/include/stdexec/__detail/__utility.hpp
index c3ff206d8..f19373191 100644
--- a/include/stdexec/__detail/__utility.hpp
+++ b/include/stdexec/__detail/__utility.hpp
@@ -70,16 +70,6 @@ namespace STDEXEC
     auto operator=(__move_only const &) -> __move_only& = delete;
   };
 
-  // Helper to combine multiple function objects into one overload set
-  template <class... _Fns>
-  struct __overload : _Fns...
-  {
-    using _Fns::operator()...;
-  };
-
-  template <class... _Fns>
-  STDEXEC_HOST_DEVICE_DEDUCTION_GUIDE __overload(_Fns...) -> __overload<_Fns...>;
-
   template <class _Fun, class... _As>
   using __call_result_t = decltype(__declval<_Fun>()(__declval<_As>()...));
 
diff --git a/include/stdexec/functional.hpp b/include/stdexec/functional.hpp
index 21f018014..77d5cbf2e 100644
--- a/include/stdexec/functional.hpp
+++ b/include/stdexec/functional.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include "__detail/__config.hpp"
+#include "__detail/__tuple.hpp"
 #include "__detail/__utility.hpp"
 
 #include "concepts.hpp"  // IWYU pragma: keep
@@ -28,32 +29,19 @@ namespace STDEXEC
   template <class _Fun0, class _Fun1>
   struct __composed
   {
-    STDEXEC_ATTRIBUTE(no_unique_address) _Fun0 __t0_;
-    STDEXEC_ATTRIBUTE(no_unique_address) _Fun1 __t1_;
-
-    template <class... _Ts>
-      requires __callable<_Fun1, _Ts...> && __callable<_Fun0, __call_result_t<_Fun1, _Ts...>>
+    template <class _Self, class... _Ts>
+      requires __callable<__copy_cvref_t<_Self, _Fun1>, _Ts...>
+            && __callable<__copy_cvref_t<_Self, _Fun0>,
+                          __call_result_t<__copy_cvref_t<_Self, _Fun1>, _Ts...>>
     STDEXEC_ATTRIBUTE(host, device, always_inline)
-    constexpr auto
-    operator()(_Ts &&...__ts) && noexcept(__callable<_Fun1, _Ts...>
-                                          && __callable<_Fun0, __call_result_t<_Fun1, _Ts...>>)
-      -> __call_result_t<_Fun0, __call_result_t<_Fun1, _Ts...>>
-    {
-      return static_cast<_Fun0 &&>(__t0_)(
-        static_cast<_Fun1 &&>(__t1_)(static_cast<_Ts &&>(__ts)...));
-    }
+    constexpr STDEXEC_EXPLICIT_THIS_BEGIN(auto operator())(this _Self &&__self, _Ts &&...__ts)
+      STDEXEC_AUTO_RETURN(  //
+        static_cast<_Self &&>(__self).__t0_(
+          static_cast<_Self &&>(__self).__t1_(static_cast<_Ts &&>(__ts)...)))  //
+      STDEXEC_EXPLICIT_THIS_END(operator())
 
-    template <class... _Ts>
-      requires __callable<_Fun1 const &, _Ts...>
-            && __callable<_Fun0 const &, __call_result_t<_Fun1 const &, _Ts...>>
-    STDEXEC_ATTRIBUTE(host, device, always_inline)
-    constexpr auto operator()(_Ts &&...__ts) const & noexcept(
-      __callable<_Fun1 const &, _Ts...>
-      && __callable<_Fun0 const &, __call_result_t<_Fun1 const &, _Ts...>>)
-      -> __call_result_t<_Fun0, __call_result_t<_Fun1, _Ts...>>
-    {
-      return __t0_(__t1_(static_cast<_Ts &&>(__ts)...));
-    }
+        STDEXEC_ATTRIBUTE(no_unique_address) _Fun0 __t0_;
+    STDEXEC_ATTRIBUTE(no_unique_address) _Fun1 __t1_;
   };
 
   inline constexpr struct __compose_t
@@ -245,8 +233,8 @@ namespace STDEXEC
 
     template <class... _Ts>
     STDEXEC_ATTRIBUTE(host, device, always_inline)
-    constexpr void
-    operator()(_Ts &&...__ts) const noexcept((__nothrow_callable<_Fn const &, _Ts> && ...))
+    constexpr void operator()(_Ts &&...__ts) const  //
+      noexcept((__nothrow_callable<_Fn const &, _Ts> && ...))
     {
       (static_cast<void>(__fn_(static_cast<_Ts &&>(__ts))), ...);
     }
@@ -277,7 +265,7 @@ namespace STDEXEC
   STDEXEC_HOST_DEVICE_DEDUCTION_GUIDE __always(_Ty) -> __always<std::unwrap_reference_t<_Ty>>;
 
   template <class _Ty>
-  struct __construct
+  struct __construct_from
   {
     template <class... _As>
       requires __std::constructible_from<_Ty, _As...>
@@ -288,4 +276,96 @@ namespace STDEXEC
       return _Ty(static_cast<_As &&>(__as)...);
     }
   };
+
+  //! \brief Helper to combine multiple function objects into one overload set
+  template <class... _Fns>
+  struct __overload : _Fns...
+  {
+    using _Fns::operator()...;
+  };
+
+  template <class... _Fns>
+  STDEXEC_HOST_DEVICE_DEDUCTION_GUIDE __overload(_Fns...) -> __overload<_Fns...>;
+
+  namespace __detail
+  {
+    template <class... _Args>
+    struct __get_1st_fn
+    {
+      STDEXEC_ATTRIBUTE(host, device, always_inline)
+      constexpr void operator()() const noexcept {}
+
+      template <class _Fn0, class... _Fns>
+      STDEXEC_ATTRIBUTE(host, device, always_inline)
+      constexpr auto operator()(_Fn0 &&__fn0, _Fns &&...__fns) const noexcept -> decltype(auto)
+      {
+        if constexpr (__callable<_Fn0, _Args...>)
+        {
+          return static_cast<_Fn0 &&>(__fn0);
+        }
+        else
+        {
+          return (*this)(static_cast<_Fns &&>(__fns)...);
+        }
+      }
+    };
+  }  // namespace __detail
+
+  //! \brief A callable that wraps a set of functions and calls the first one that is
+  //! callable with a given set of arguments.
+  template <class... _Fns>
+  struct __first_callable
+  {
+    //! \brief Alias for the type of the first function that is callable with a given set of arguments.
+    template <class _Self, class... _Args>
+    using __1st_fn_t =
+      __call_result_t<__detail::__get_1st_fn<_Args...>, __copy_cvref_t<_Self, _Fns>...>;
+
+    //! \brief Calls the first function that is callable with a given set of arguments.
+    template <class _Self, class... _Args>
+      requires __callable<__1st_fn_t<_Self, _Args...>, _Args...>
+    constexpr STDEXEC_EXPLICIT_THIS_BEGIN(auto operator())(this _Self &&__self, _Args &&...__args)
+      noexcept(__nothrow_callable<__1st_fn_t<_Self, _Args...>, _Args...>)
+        -> __call_result_t<__1st_fn_t<_Self, _Args...>, _Args...>
+    {
+      return __apply(__detail::__get_1st_fn<_Args...>(),
+                     static_cast<_Self &&>(__self).__fns_)(static_cast<_Args &&>(__args)...);
+    }
+    STDEXEC_EXPLICIT_THIS_END(operator())
+
+    __tuple<_Fns...> __fns_;
+  };
+
+  template <class... _Fns>
+  STDEXEC_HOST_DEVICE_DEDUCTION_GUIDE __first_callable(_Fns...) -> __first_callable<_Fns...>;
+
+  //! \brief Calls \c __fn_ (with the binder's own cv/ref) on the call args, then the bound args.
+  template <class _Fn, class... _BoundArgs>
+  struct __back_binder
+  {
+    template <class _Self, class... _Args>
+      requires __callable<__copy_cvref_t<_Self, _Fn>, _Args...,
+                          __copy_cvref_t<_Self, _BoundArgs>...>
+    STDEXEC_ATTRIBUTE(host, device)
+    constexpr STDEXEC_EXPLICIT_THIS_BEGIN(auto operator())(this _Self &&__self, _Args &&...__args)
+      noexcept(__nothrow_callable<__copy_cvref_t<_Self, _Fn>, _Args...,
+                                  __copy_cvref_t<_Self, _BoundArgs>...>)
+      -> __call_result_t<__copy_cvref_t<_Self, _Fn>, _Args..., __copy_cvref_t<_Self, _BoundArgs>...>
+    {  // __fn_ is invoked with _Self's cv/ref, so the checks above use the same qualification
+      return STDEXEC::__apply(static_cast<_Self &&>(__self).__fn_,
+        static_cast<_Self &&>(__self).__bound_args_, static_cast<_Args &&>(__args)...);
+    }
+    STDEXEC_EXPLICIT_THIS_END(operator())
+    _Fn                    __fn_;
+    __tuple<_BoundArgs...> __bound_args_;
+  };
+
+  //! \brief Builds a \c __back_binder holding a decayed \c __fn and the by-value bound arguments.
+  template <class... _BoundArgs, class _Fn>
+  constexpr auto __bind_back(_Fn &&__fn, _BoundArgs... __bound_args)
+    noexcept(__nothrow_move_constructible<_BoundArgs...> && __nothrow_decay_copyable<_Fn>)
+  {
+    return __back_binder<__decay_t<_Fn>, _BoundArgs...>{
+      static_cast<_Fn &&>(__fn), static_cast<_BoundArgs &&>(__bound_args)...};
+  }
 }  // namespace STDEXEC
diff --git a/include/stdexec/stop_token.hpp b/include/stdexec/stop_token.hpp
index 6669729c2..15a3decab 100644
--- a/include/stdexec/stop_token.hpp
+++ b/include/stdexec/stop_token.hpp
@@ -457,6 +457,7 @@ namespace STDEXEC
     }
   }  // namespace __stok
 
+  template <class _StopSource = inplace_stop_source>
   struct __forward_stop_request
   {
     void operator()() const noexcept
@@ -464,8 +465,12 @@ namespace STDEXEC
       __stop_source_.request_stop();
     }
 
-    inplace_stop_source& __stop_source_;
+    _StopSource& __stop_source_;
   };
+
+  template <class _StopSource>
+  STDEXEC_HOST_DEVICE_DEDUCTION_GUIDE
+  __forward_stop_request(_StopSource&) -> __forward_stop_request<_StopSource>;
 }  // namespace STDEXEC
 
 STDEXEC_PRAGMA_POP()

From 399a3310b863dc3cb6df855d6c85ff5f7e63877f Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Wed, 18 Mar 2026 19:08:41 -0700
Subject: [PATCH 2/7] conformance fix for gcc

---
 include/stdexec/__detail/__as_awaitable.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/stdexec/__detail/__as_awaitable.hpp b/include/stdexec/__detail/__as_awaitable.hpp
index dc404e6ad..8cb841810 100644
--- a/include/stdexec/__detail/__as_awaitable.hpp
+++ b/include/stdexec/__detail/__as_awaitable.hpp
@@ -290,7 +290,7 @@ namespace STDEXEC
     template <class _Promise, sender_in<env_of_t<_Promise&>> _Sender>
     struct __sender_awaitable : __sender_awaitable_base<__value_t<_Sender, _Promise>, false>
     {
-      using __value_t = __value_t<_Sender, _Promise>;
+      using __value_t = __as_awaitable::__value_t<_Sender, _Promise>;
 
       constexpr explicit __sender_awaitable(_Sender&&                         __sndr,
                                             __std::coroutine_handle<_Promise> __hcoro)

From 81b246d179e4010bfb68ba2fb5c226cc651ef2f7 Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Wed, 18 Mar 2026 19:41:12 -0700
Subject: [PATCH 3/7] fix unused parameter warnings

---
 include/stdexec/__detail/__task.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/stdexec/__detail/__task.hpp b/include/stdexec/__detail/__task.hpp
index 496fd23f2..3577c074f 100644
--- a/include/stdexec/__detail/__task.hpp
+++ b/include/stdexec/__detail/__task.hpp
@@ -180,9 +180,9 @@ namespace STDEXEC
       __stop_callback_box<stop_token_of_t<env_of_t<_EnvProvider>>, _StopSource>;
 
     inline constexpr auto __throw_error = __overload{
-      []<class _Error>(_Error&& __error) { STDEXEC_THROW(static_cast<_Error&&>(__error)); },
-      [](std::error_code __ec) { STDEXEC_THROW(std::system_error(__ec)); },
-      [](std::exception_ptr __eptr) { std::rethrow_exception(__eptr); }};
+      []([[maybe_unused]] auto&& __error) { STDEXEC_THROW((decltype(__error)&&) __error); },
+      []([[maybe_unused]] std::error_code __ec) { STDEXEC_THROW(std::system_error(__ec)); },
+      []([[maybe_unused]] std::exception_ptr __eptr) { std::rethrow_exception(__eptr); }};
 
   }  // namespace __task
 

From 05157741ff145b5dd2e585bcb60ce4ef0feeeb01 Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Thu, 19 Mar 2026 09:40:51 -0700
Subject: [PATCH 4/7] try using lld on macos clang build

---
 .github/workflows/ci.cpu.yml | 314 ++++++++++++++++++-----------------
 .github/workflows/ci.gpu.yml | 120 ++++++-------
 2 files changed, 218 insertions(+), 216 deletions(-)

diff --git a/.github/workflows/ci.cpu.yml b/.github/workflows/ci.cpu.yml
index 947577859..af0069e23 100644
--- a/.github/workflows/ci.cpu.yml
+++ b/.github/workflows/ci.cpu.yml
@@ -12,162 +12,162 @@ concurrency:
 
 jobs:
 
-  build-cpu:
-    runs-on: ubuntu-latest
-    name: ${{ matrix.name }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { name: "CPU (clang 16, Debug)",         build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
-          - { name: "CPU (clang 16, Debug, c++23)",  build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "23", cxxflags: "-stdlib=libc++" }
-          - { name: "CPU (clang 16, Debug, TSAN)",   build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-fsanitize=thread" }
-          - { name: "CPU (clang 16, Release)",       build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
-          - { name: "CPU (clang 16, Release, ASAN)", build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++ -fsanitize=address -fsanitize-ignorelist=/home/coder/stdexec/sanitizer-ignorelist.txt" }
-          - { name: "CPU (gcc 11, Debug)",           build: "Debug",   tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
-          - { name: "CPU (gcc 11, Release)",         build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
-          - { name: "CPU (gcc 11, Release, ASAN)",   build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
-          - { name: "CPU (gcc 12, Release, TSAN)",   build: "Release", tag: gcc12-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
-          - { name: "CPU (gcc 13, Debug)",           build: "Debug",   tag: gcc13-cuda12.9,  cxxstd: "20", cxxflags: "", }
-          - { name: "CPU (gcc 14, Debug)",           build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "", }
-          - { name: "CPU (gcc 14, Debug, ASAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
-          - { name: "CPU (gcc 14, Debug, TSAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
-          - { name: "CPU (gcc 14, Release, LEAK)",   build: "Release", tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=leak", }
-          - { name: "CPU (gcc 14, Release, c++23)",  build: "Release", tag: gcc14-cuda12.9,  cxxstd: "23", cxxflags: "", }
-    container:
-      options: -u root
-      image: rapidsai/devcontainers:26.02-cpp-${{ matrix.tag }}
-    permissions:
-      id-token: write # This is required for configure-aws-credentials
-      contents: read  # This is required for actions/checkout
-    defaults:
-      run:
-        shell: su coder {0}
-        working-directory: /home/coder
-    steps:
-      - name: Checkout stdexec
-        uses: actions/checkout@v4
-        with:
-          path: stdexec
-          persist-credentials: false
-      - name: Setup environment
-        run: |
-          echo "ARTIFACT_PREFIX=${{runner.os}}-${{matrix.tag}}-amd64" >> "${GITHUB_ENV}"
-          echo "ARTIFACT_SUFFIX=${{github.run_id}}-${{github.run_attempt}}-$RANDOM" >> "${GITHUB_ENV}"
-      - if: github.repository_owner == 'NVIDIA'
-        name: Get AWS credentials for sccache bucket
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-region: us-east-2
-          role-duration-seconds: 28800 # 8 hours
-          role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
-      - name: Build and test CPU schedulers
-        env:
-          ASAN_OPTIONS: alloc_dealloc_mismatch=0
-          NVCC_APPEND_FLAGS: "-t=100"
-          SCCACHE_BUCKET: "rapids-sccache-devs"
-          SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
-          SCCACHE_DIST_URL: "https://amd64.linux.sccache.rapids.nvidia.com"
-          SCCACHE_IDLE_TIMEOUT: "0"
-          SCCACHE_REGION: "us-east-2"
-          SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
-          SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
-          SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
-          SCCACHE_SERVER_LOG: "sccache=debug"
-          SCCACHE_SERVER_PORT: "4225"
-        run: |
-          set -e;
-          source /etc/profile
-          set -x;
-
-          devcontainer-utils-install-sccache --version rapids;
-
-          devcontainer-utils-init-sccache-dist                                          \
-              --enable-sccache-dist - <<< "                                             \
-              --auth-type 'token'                                                       \
-              --auth-token '$(                                                          \
-                curl -fsSL -H "Authorization: Bearer $(                                 \
-                  curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
-                    "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
-                | jq -r '.value'                                                        \
-                )" https://token.rapids.nvidia.com/gh/token/exchange                    \
-              | jq -r '.token')'                                                        \
-          ";
-
-          # Copy source folder into ~/stdexec
-          cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
-          chown -R coder:coder ~/stdexec;
-          cd ~/stdexec;
-
-          # Configure
-          cmake -S . -B build -GNinja \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-            -DCMAKE_CXX_FLAGS="${{ matrix.cxxflags }}" \
-            -DSTDEXEC_ENABLE_TBB:BOOL=${{ !contains(matrix.cxxflags, '-fsanitize') }} \
-            -DSTDEXEC_ENABLE_ASIO:BOOL=TRUE \
-            -DSTDEXEC_ASIO_IMPLEMENTATION:STRING=boost \
-            -DCMAKE_CXX_STANDARD=${{ matrix.cxxstd }} \
-            -DCMAKE_CXX_EXTENSIONS=OFF \
-            ;
-
-          # Compile
-          cmake --build build -v -j 512;
-
-          # Print sccache stats
-          sccache -s;
-
-          # Tests
-          SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
-          ctest --test-dir build --verbose --output-on-failure --timeout 60;
-      - if: ${{ !cancelled() }}
-        name: Upload sccache logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
-          path: /tmp/sccache*.log
-          compression-level: 9
-
-  ci-cpu:
-    runs-on: ubuntu-latest
-    name: CI (CPU)
-    needs:
-      - build-cpu
-    steps:
-      - run: echo "CI (CPU) success"
-
-  build-cpu-windows:
-    runs-on: windows-2022
-    name: ${{ matrix.name }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - { compiler: "cl",       build: "Debug",   name: "CPU (Windows) (msvc, Debug)" }
-          - { compiler: "cl",       build: "Release", name: "CPU (Windows) (msvc, Release)" }
-          #- { compiler: "clang++",  build: "Debug",   name: "CPU (Windows) (clang, Debug)" }
-          #- { compiler: "clang++",  build: "Release", name: "CPU (Windows) (clang, Release)" }
-          #- { compiler: "clang-cl", build: "Debug",   name: "CPU (Windows) (clang-cl, Debug)" }
-          #- { compiler: "clang-cl", build: "Release", name: "CPU (Windows) (clang-cl, Release)" }
-
-    steps:
-      - name: Checkout stdexec (Windows)
-        uses: actions/checkout@v4
-        with:
-          persist-credentials: false
-
-      - name: Build and test CPU schedulers (Windows)
-        shell: pwsh
-        run: |
-          docker pull rapidsai/devcontainers:26.02-cuda12.9-cl14.43
-          docker run --isolation=process -v "$(pwd):C:/stdexec" rapidsai/devcontainers:26.02-cuda12.9-cl14.43 powershell C:/stdexec/.github/workflows/test-windows.ps1 -Compiler '${{ matrix.compiler }}' -Config '${{ matrix.build }}'
-
-  ci-cpu-windows:
-    runs-on: windows-latest
-    name: CI (CPU) (Windows)
-    needs:
-      - build-cpu-windows
-    steps:
-      - run: echo "CI (CPU) (Windows) success"
+  # build-cpu:
+  #   runs-on: ubuntu-latest
+  #   name: ${{ matrix.name }}
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       include:
+  #         - { name: "CPU (clang 16, Debug)",         build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
+  #         - { name: "CPU (clang 16, Debug, c++23)",  build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "23", cxxflags: "-stdlib=libc++" }
+  #         - { name: "CPU (clang 16, Debug, TSAN)",   build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-fsanitize=thread" }
+  #         - { name: "CPU (clang 16, Release)",       build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
+  #         - { name: "CPU (clang 16, Release, ASAN)", build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++ -fsanitize=address -fsanitize-ignorelist=/home/coder/stdexec/sanitizer-ignorelist.txt" }
+  #         - { name: "CPU (gcc 11, Debug)",           build: "Debug",   tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
+  #         - { name: "CPU (gcc 11, Release)",         build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
+  #         - { name: "CPU (gcc 11, Release, ASAN)",   build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
+  #         - { name: "CPU (gcc 12, Release, TSAN)",   build: "Release", tag: gcc12-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
+  #         - { name: "CPU (gcc 13, Debug)",           build: "Debug",   tag: gcc13-cuda12.9,  cxxstd: "20", cxxflags: "", }
+  #         - { name: "CPU (gcc 14, Debug)",           build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "", }
+  #         - { name: "CPU (gcc 14, Debug, ASAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
+  #         - { name: "CPU (gcc 14, Debug, TSAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
+  #         - { name: "CPU (gcc 14, Release, LEAK)",   build: "Release", tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=leak", }
+  #         - { name: "CPU (gcc 14, Release, c++23)",  build: "Release", tag: gcc14-cuda12.9,  cxxstd: "23", cxxflags: "", }
+  #   container:
+  #     options: -u root
+  #     image: rapidsai/devcontainers:26.02-cpp-${{ matrix.tag }}
+  #   permissions:
+  #     id-token: write # This is required for configure-aws-credentials
+  #     contents: read  # This is required for actions/checkout
+  #   defaults:
+  #     run:
+  #       shell: su coder {0}
+  #       working-directory: /home/coder
+  #   steps:
+  #     - name: Checkout stdexec
+  #       uses: actions/checkout@v4
+  #       with:
+  #         path: stdexec
+  #         persist-credentials: false
+  #     - name: Setup environment
+  #       run: |
+  #         echo "ARTIFACT_PREFIX=${{runner.os}}-${{matrix.tag}}-amd64" >> "${GITHUB_ENV}"
+  #         echo "ARTIFACT_SUFFIX=${{github.run_id}}-${{github.run_attempt}}-$RANDOM" >> "${GITHUB_ENV}"
+  #     - if: github.repository_owner == 'NVIDIA'
+  #       name: Get AWS credentials for sccache bucket
+  #       uses: aws-actions/configure-aws-credentials@v4
+  #       with:
+  #         aws-region: us-east-2
+  #         role-duration-seconds: 28800 # 8 hours
+  #         role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
+  #     - name: Build and test CPU schedulers
+  #       env:
+  #         ASAN_OPTIONS: alloc_dealloc_mismatch=0
+  #         NVCC_APPEND_FLAGS: "-t=100"
+  #         SCCACHE_BUCKET: "rapids-sccache-devs"
+  #         SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
+  #         SCCACHE_DIST_URL: "https://amd64.linux.sccache.rapids.nvidia.com"
+  #         SCCACHE_IDLE_TIMEOUT: "0"
+  #         SCCACHE_REGION: "us-east-2"
+  #         SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
+  #         SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
+  #         SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
+  #         SCCACHE_SERVER_LOG: "sccache=debug"
+  #         SCCACHE_SERVER_PORT: "4225"
+  #       run: |
+  #         set -e;
+  #         source /etc/profile
+  #         set -x;
+
+  #         devcontainer-utils-install-sccache --version rapids;
+
+  #         devcontainer-utils-init-sccache-dist                                          \
+  #             --enable-sccache-dist - <<< "                                             \
+  #             --auth-type 'token'                                                       \
+  #             --auth-token '$(                                                          \
+  #               curl -fsSL -H "Authorization: Bearer $(                                 \
+  #                 curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
+  #                   "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
+  #               | jq -r '.value'                                                        \
+  #               )" https://token.rapids.nvidia.com/gh/token/exchange                    \
+  #             | jq -r '.token')'                                                        \
+  #         ";
+
+  #         # Copy source folder into ~/stdexec
+  #         cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
+  #         chown -R coder:coder ~/stdexec;
+  #         cd ~/stdexec;
+
+  #         # Configure
+  #         cmake -S . -B build -GNinja \
+  #           -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+  #           -DCMAKE_CXX_FLAGS="${{ matrix.cxxflags }}" \
+  #           -DSTDEXEC_ENABLE_TBB:BOOL=${{ !contains(matrix.cxxflags, '-fsanitize') }} \
+  #           -DSTDEXEC_ENABLE_ASIO:BOOL=TRUE \
+  #           -DSTDEXEC_ASIO_IMPLEMENTATION:STRING=boost \
+  #           -DCMAKE_CXX_STANDARD=${{ matrix.cxxstd }} \
+  #           -DCMAKE_CXX_EXTENSIONS=OFF \
+  #           ;
+
+  #         # Compile
+  #         cmake --build build -v -j 512;
+
+  #         # Print sccache stats
+  #         sccache -s;
+
+  #         # Tests
+  #         SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
+  #         ctest --test-dir build --verbose --output-on-failure --timeout 60;
+  #     - if: ${{ !cancelled() }}
+  #       name: Upload sccache logs
+  #       uses: actions/upload-artifact@v4
+  #       with:
+  #         name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
+  #         path: /tmp/sccache*.log
+  #         compression-level: 9
+
+  # ci-cpu:
+  #   runs-on: ubuntu-latest
+  #   name: CI (CPU)
+  #   needs:
+  #     - build-cpu
+  #   steps:
+  #     - run: echo "CI (CPU) success"
+
+  # build-cpu-windows:
+  #   runs-on: windows-2022
+  #   name: ${{ matrix.name }}
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       include:
+  #         - { compiler: "cl",       build: "Debug",   name: "CPU (Windows) (msvc, Debug)" }
+  #         - { compiler: "cl",       build: "Release", name: "CPU (Windows) (msvc, Release)" }
+  #         #- { compiler: "clang++",  build: "Debug",   name: "CPU (Windows) (clang, Debug)" }
+  #         #- { compiler: "clang++",  build: "Release", name: "CPU (Windows) (clang, Release)" }
+  #         #- { compiler: "clang-cl", build: "Debug",   name: "CPU (Windows) (clang-cl, Debug)" }
+  #         #- { compiler: "clang-cl", build: "Release", name: "CPU (Windows) (clang-cl, Release)" }
+
+  #   steps:
+  #     - name: Checkout stdexec (Windows)
+  #       uses: actions/checkout@v4
+  #       with:
+  #         persist-credentials: false
+
+  #     - name: Build and test CPU schedulers (Windows)
+  #       shell: pwsh
+  #       run: |
+  #         docker pull rapidsai/devcontainers:26.02-cuda12.9-cl14.43
+  #         docker run --isolation=process -v "$(pwd):C:/stdexec" rapidsai/devcontainers:26.02-cuda12.9-cl14.43 powershell C:/stdexec/.github/workflows/test-windows.ps1 -Compiler '${{ matrix.compiler }}' -Config '${{ matrix.build }}'
+
+  # ci-cpu-windows:
+  #   runs-on: windows-latest
+  #   name: CI (CPU) (Windows)
+  #   needs:
+  #     - build-cpu-windows
+  #   steps:
+  #     - run: echo "CI (CPU) (Windows) success"
 
   build-cpu-macos:
     runs-on: macos-15-large
@@ -192,6 +192,8 @@ jobs:
 
       - name: Build and test CPU schedulers (MacOS)
         shell: bash
+        env:
+          LDFLAGS: "-fuse-ld=lld"
         run: |
           mkdir build
           cmake -S. -Bbuild -GNinja \
diff --git a/.github/workflows/ci.gpu.yml b/.github/workflows/ci.gpu.yml
index 17adc9d56..b80a3ad4b 100644
--- a/.github/workflows/ci.gpu.yml
+++ b/.github/workflows/ci.gpu.yml
@@ -57,74 +57,74 @@ jobs:
           aws-region: us-east-2
           role-duration-seconds: 28800 # 8 hours
           role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
-      - name: Build and test GPU schedulers
-        env:
-          NVCC_APPEND_FLAGS: "-t=100"
-          SCCACHE_BUCKET: "rapids-sccache-devs"
-          SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
-          SCCACHE_DIST_URL: "https://${{ matrix.arch }}.linux.sccache.rapids.nvidia.com"
-          SCCACHE_IDLE_TIMEOUT: "0"
-          SCCACHE_REGION: "us-east-2"
-          SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
-          SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
-          SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
-          SCCACHE_SERVER_LOG: "sccache=debug"
-          SCCACHE_SERVER_PORT: "4225"
-        run: |
-          set -e;
-          source /etc/profile
-          set -x;
+      # - name: Build and test GPU schedulers
+      #   env:
+      #     NVCC_APPEND_FLAGS: "-t=100"
+      #     SCCACHE_BUCKET: "rapids-sccache-devs"
+      #     SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
+      #     SCCACHE_DIST_URL: "https://${{ matrix.arch }}.linux.sccache.rapids.nvidia.com"
+      #     SCCACHE_IDLE_TIMEOUT: "0"
+      #     SCCACHE_REGION: "us-east-2"
+      #     SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
+      #     SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
+      #     SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
+      #     SCCACHE_SERVER_LOG: "sccache=debug"
+      #     SCCACHE_SERVER_PORT: "4225"
+      #   run: |
+      #     set -e;
+      #     source /etc/profile
+      #     set -x;
 
-          devcontainer-utils-install-sccache --version rapids;
+      #     devcontainer-utils-install-sccache --version rapids;
 
-          devcontainer-utils-init-sccache-dist                                          \
-              --enable-sccache-dist - <<< "                                             \
-              --auth-type 'token'                                                       \
-              --auth-token '$(                                                          \
-                curl -fsSL -H "Authorization: Bearer $(                                 \
-                  curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
-                    "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
-                | jq -r '.value'                                                        \
-                )" https://token.rapids.nvidia.com/gh/token/exchange                    \
-              | jq -r '.token')'                                                        \
-          ";
+      #     devcontainer-utils-init-sccache-dist                                          \
+      #         --enable-sccache-dist - <<< "                                             \
+      #         --auth-type 'token'                                                       \
+      #         --auth-token '$(                                                          \
+      #           curl -fsSL -H "Authorization: Bearer $(                                 \
+      #             curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
+      #               "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
+      #           | jq -r '.value'                                                        \
+      #           )" https://token.rapids.nvidia.com/gh/token/exchange                    \
+      #         | jq -r '.token')'                                                        \
+      #     ";
 
-          # Copy source folder into ~/stdexec
-          cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
-          chown -R coder:coder ~/stdexec;
-          cd ~/stdexec;
+      #     # Copy source folder into ~/stdexec
+      #     cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
+      #     chown -R coder:coder ~/stdexec;
+      #     cd ~/stdexec;
 
-          # Configure
-          cmake -S . -B build -GNinja \
-            -DSTDEXEC_ENABLE_CUDA=ON \
-            -DSTDEXEC_ENABLE_IO_URING=OFF \
-            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-            -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
-            -DCMAKE_CUDA_COMPILER=${{ matrix.cxx }} \
-            -DCMAKE_CUDA_ARCHITECTURES=${{ matrix.sm }} \
-            ;
+      #     # Configure
+      #     cmake -S . -B build -GNinja \
+      #       -DSTDEXEC_ENABLE_CUDA=ON \
+      #       -DSTDEXEC_ENABLE_IO_URING=OFF \
+      #       -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+      #       -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
+      #       -DCMAKE_CUDA_COMPILER=${{ matrix.cxx }} \
+      #       -DCMAKE_CUDA_ARCHITECTURES=${{ matrix.sm }} \
+      #       ;
 
-          # Compile
-          cmake --build build -v -j 512;
+      #     # Compile
+      #     cmake --build build -v -j 512;
 
-          # Print sccache stats
-          sccache -s;
+      #     # Print sccache stats
+      #     sccache -s;
 
-          # Tests
-          SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
-          ctest --test-dir build --verbose --output-on-failure --timeout 60;
+      #     # Tests
+      #     SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
+      #     ctest --test-dir build --verbose --output-on-failure --timeout 60;
 
-          # Examples
-          ./build/examples/nvexec/maxwell_cpu_st --iterations=1000 --N=512 --run-cpp --run-inline-scheduler;
-          ./build/examples/nvexec/maxwell_cpu_mt --iterations=1000 --N=512 --run-std --run-stdpar --run-thread-pool-scheduler;
-          ./build/examples/nvexec/maxwell_gpu_s --iterations=1000 --N=512 --run-cuda --run-stdpar --run-stream-scheduler;
-      - if: ${{ !cancelled() }}
-        name: Upload sccache logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
-          path: /tmp/sccache*.log
-          compression-level: 9
+      #     # Examples
+      #     ./build/examples/nvexec/maxwell_cpu_st --iterations=1000 --N=512 --run-cpp --run-inline-scheduler;
+      #     ./build/examples/nvexec/maxwell_cpu_mt --iterations=1000 --N=512 --run-std --run-stdpar --run-thread-pool-scheduler;
+      #     ./build/examples/nvexec/maxwell_gpu_s --iterations=1000 --N=512 --run-cuda --run-stdpar --run-stream-scheduler;
+      # - if: ${{ !cancelled() }}
+      #   name: Upload sccache logs
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
+      #     path: /tmp/sccache*.log
+      #     compression-level: 9
 
   ci-gpu:
     runs-on: ubuntu-latest

From b23163618bb84e68b54397944b5017b32189d06f Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Thu, 19 Mar 2026 09:53:17 -0700
Subject: [PATCH 5/7] try a newer macos

---
 .github/workflows/ci.cpu.yml |  2 +-
 .github/workflows/ci.gpu.yml | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.cpu.yml b/.github/workflows/ci.cpu.yml
index af0069e23..5fd74cfac 100644
--- a/.github/workflows/ci.cpu.yml
+++ b/.github/workflows/ci.cpu.yml
@@ -170,7 +170,7 @@ jobs:
   #     - run: echo "CI (CPU) (Windows) success"
 
   build-cpu-macos:
-    runs-on: macos-15-large
+    runs-on: macos-26-large
     name: macos-${{ matrix.name }}
     strategy:
       fail-fast: false
diff --git a/.github/workflows/ci.gpu.yml b/.github/workflows/ci.gpu.yml
index b80a3ad4b..1f0d6e747 100644
--- a/.github/workflows/ci.gpu.yml
+++ b/.github/workflows/ci.gpu.yml
@@ -20,11 +20,11 @@ jobs:
       matrix:
         include:
           - { name: "clang 21",    cuda: "12.0", cxx: "clang++", build: "Release", tag: "llvm21-cuda12.0",  gpu: "v100", sm: "70", driver: "latest", arch: "amd64" }
-          - { name: "clang 21",    cuda: "12.9", cxx: "clang++", build: "Release", tag: "llvm21-cuda12.9",  gpu: "v100", sm: "70", driver: "latest", arch: "amd64" }
-          - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Release", tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
-          - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
-          - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Release", tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
-          - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          # - { name: "clang 21",    cuda: "12.9", cxx: "clang++", build: "Release", tag: "llvm21-cuda12.9",  gpu: "v100", sm: "70", driver: "latest", arch: "amd64" }
+          # - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Release", tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          # - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          # - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Release", tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          # - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
     runs-on: linux-${{ matrix.arch }}-gpu-${{ matrix.gpu }}-${{ matrix.driver }}-1
     container:
       options: -u root

From cb005604975e20bc9995fa3e620c79462056b0ff Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Thu, 19 Mar 2026 09:55:11 -0700
Subject: [PATCH 6/7] remove -fuse-ld flag

---
 .github/workflows/ci.cpu.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/ci.cpu.yml b/.github/workflows/ci.cpu.yml
index 5fd74cfac..e9706d55f 100644
--- a/.github/workflows/ci.cpu.yml
+++ b/.github/workflows/ci.cpu.yml
@@ -192,8 +192,6 @@ jobs:
 
       - name: Build and test CPU schedulers (MacOS)
         shell: bash
-        env:
-          LDFLAGS: "-fuse-ld=lld"
         run: |
           mkdir build
           cmake -S. -Bbuild -GNinja \

From c359895eda845199019e08adddcbdb389ec6b717 Mon Sep 17 00:00:00 2001
From: Eric Niebler <eniebler@nvidia.com>
Date: Thu, 19 Mar 2026 10:00:16 -0700
Subject: [PATCH 7/7] re-enable other tests in CI

---
 .github/workflows/ci.cpu.yml | 312 +++++++++++++++++------------------
 .github/workflows/ci.gpu.yml | 130 +++++++--------
 2 files changed, 221 insertions(+), 221 deletions(-)

diff --git a/.github/workflows/ci.cpu.yml b/.github/workflows/ci.cpu.yml
index e9706d55f..e4c90f93c 100644
--- a/.github/workflows/ci.cpu.yml
+++ b/.github/workflows/ci.cpu.yml
@@ -12,162 +12,162 @@ concurrency:
 
 jobs:
 
-  # build-cpu:
-  #   runs-on: ubuntu-latest
-  #   name: ${{ matrix.name }}
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - { name: "CPU (clang 16, Debug)",         build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
-  #         - { name: "CPU (clang 16, Debug, c++23)",  build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "23", cxxflags: "-stdlib=libc++" }
-  #         - { name: "CPU (clang 16, Debug, TSAN)",   build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-fsanitize=thread" }
-  #         - { name: "CPU (clang 16, Release)",       build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
-  #         - { name: "CPU (clang 16, Release, ASAN)", build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++ -fsanitize=address -fsanitize-ignorelist=/home/coder/stdexec/sanitizer-ignorelist.txt" }
-  #         - { name: "CPU (gcc 11, Debug)",           build: "Debug",   tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
-  #         - { name: "CPU (gcc 11, Release)",         build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
-  #         - { name: "CPU (gcc 11, Release, ASAN)",   build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
-  #         - { name: "CPU (gcc 12, Release, TSAN)",   build: "Release", tag: gcc12-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
-  #         - { name: "CPU (gcc 13, Debug)",           build: "Debug",   tag: gcc13-cuda12.9,  cxxstd: "20", cxxflags: "", }
-  #         - { name: "CPU (gcc 14, Debug)",           build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "", }
-  #         - { name: "CPU (gcc 14, Debug, ASAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
-  #         - { name: "CPU (gcc 14, Debug, TSAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
-  #         - { name: "CPU (gcc 14, Release, LEAK)",   build: "Release", tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=leak", }
-  #         - { name: "CPU (gcc 14, Release, c++23)",  build: "Release", tag: gcc14-cuda12.9,  cxxstd: "23", cxxflags: "", }
-  #   container:
-  #     options: -u root
-  #     image: rapidsai/devcontainers:26.02-cpp-${{ matrix.tag }}
-  #   permissions:
-  #     id-token: write # This is required for configure-aws-credentials
-  #     contents: read  # This is required for actions/checkout
-  #   defaults:
-  #     run:
-  #       shell: su coder {0}
-  #       working-directory: /home/coder
-  #   steps:
-  #     - name: Checkout stdexec
-  #       uses: actions/checkout@v4
-  #       with:
-  #         path: stdexec
-  #         persist-credentials: false
-  #     - name: Setup environment
-  #       run: |
-  #         echo "ARTIFACT_PREFIX=${{runner.os}}-${{matrix.tag}}-amd64" >> "${GITHUB_ENV}"
-  #         echo "ARTIFACT_SUFFIX=${{github.run_id}}-${{github.run_attempt}}-$RANDOM" >> "${GITHUB_ENV}"
-  #     - if: github.repository_owner == 'NVIDIA'
-  #       name: Get AWS credentials for sccache bucket
-  #       uses: aws-actions/configure-aws-credentials@v4
-  #       with:
-  #         aws-region: us-east-2
-  #         role-duration-seconds: 28800 # 8 hours
-  #         role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
-  #     - name: Build and test CPU schedulers
-  #       env:
-  #         ASAN_OPTIONS: alloc_dealloc_mismatch=0
-  #         NVCC_APPEND_FLAGS: "-t=100"
-  #         SCCACHE_BUCKET: "rapids-sccache-devs"
-  #         SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
-  #         SCCACHE_DIST_URL: "https://amd64.linux.sccache.rapids.nvidia.com"
-  #         SCCACHE_IDLE_TIMEOUT: "0"
-  #         SCCACHE_REGION: "us-east-2"
-  #         SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
-  #         SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
-  #         SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
-  #         SCCACHE_SERVER_LOG: "sccache=debug"
-  #         SCCACHE_SERVER_PORT: "4225"
-  #       run: |
-  #         set -e;
-  #         source /etc/profile
-  #         set -x;
-
-  #         devcontainer-utils-install-sccache --version rapids;
-
-  #         devcontainer-utils-init-sccache-dist                                          \
-  #             --enable-sccache-dist - <<< "                                             \
-  #             --auth-type 'token'                                                       \
-  #             --auth-token '$(                                                          \
-  #               curl -fsSL -H "Authorization: Bearer $(                                 \
-  #                 curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
-  #                   "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
-  #               | jq -r '.value'                                                        \
-  #               )" https://token.rapids.nvidia.com/gh/token/exchange                    \
-  #             | jq -r '.token')'                                                        \
-  #         ";
-
-  #         # Copy source folder into ~/stdexec
-  #         cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
-  #         chown -R coder:coder ~/stdexec;
-  #         cd ~/stdexec;
-
-  #         # Configure
-  #         cmake -S . -B build -GNinja \
-  #           -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-  #           -DCMAKE_CXX_FLAGS="${{ matrix.cxxflags }}" \
-  #           -DSTDEXEC_ENABLE_TBB:BOOL=${{ !contains(matrix.cxxflags, '-fsanitize') }} \
-  #           -DSTDEXEC_ENABLE_ASIO:BOOL=TRUE \
-  #           -DSTDEXEC_ASIO_IMPLEMENTATION:STRING=boost \
-  #           -DCMAKE_CXX_STANDARD=${{ matrix.cxxstd }} \
-  #           -DCMAKE_CXX_EXTENSIONS=OFF \
-  #           ;
-
-  #         # Compile
-  #         cmake --build build -v -j 512;
-
-  #         # Print sccache stats
-  #         sccache -s;
-
-  #         # Tests
-  #         SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
-  #         ctest --test-dir build --verbose --output-on-failure --timeout 60;
-  #     - if: ${{ !cancelled() }}
-  #       name: Upload sccache logs
-  #       uses: actions/upload-artifact@v4
-  #       with:
-  #         name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
-  #         path: /tmp/sccache*.log
-  #         compression-level: 9
-
-  # ci-cpu:
-  #   runs-on: ubuntu-latest
-  #   name: CI (CPU)
-  #   needs:
-  #     - build-cpu
-  #   steps:
-  #     - run: echo "CI (CPU) success"
-
-  # build-cpu-windows:
-  #   runs-on: windows-2022
-  #   name: ${{ matrix.name }}
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       include:
-  #         - { compiler: "cl",       build: "Debug",   name: "CPU (Windows) (msvc, Debug)" }
-  #         - { compiler: "cl",       build: "Release", name: "CPU (Windows) (msvc, Release)" }
-  #         #- { compiler: "clang++",  build: "Debug",   name: "CPU (Windows) (clang, Debug)" }
-  #         #- { compiler: "clang++",  build: "Release", name: "CPU (Windows) (clang, Release)" }
-  #         #- { compiler: "clang-cl", build: "Debug",   name: "CPU (Windows) (clang-cl, Debug)" }
-  #         #- { compiler: "clang-cl", build: "Release", name: "CPU (Windows) (clang-cl, Release)" }
-
-  #   steps:
-  #     - name: Checkout stdexec (Windows)
-  #       uses: actions/checkout@v4
-  #       with:
-  #         persist-credentials: false
-
-  #     - name: Build and test CPU schedulers (Windows)
-  #       shell: pwsh
-  #       run: |
-  #         docker pull rapidsai/devcontainers:26.02-cuda12.9-cl14.43
-  #         docker run --isolation=process -v "$(pwd):C:/stdexec" rapidsai/devcontainers:26.02-cuda12.9-cl14.43 powershell C:/stdexec/.github/workflows/test-windows.ps1 -Compiler '${{ matrix.compiler }}' -Config '${{ matrix.build }}'
-
-  # ci-cpu-windows:
-  #   runs-on: windows-latest
-  #   name: CI (CPU) (Windows)
-  #   needs:
-  #     - build-cpu-windows
-  #   steps:
-  #     - run: echo "CI (CPU) (Windows) success"
+  build-cpu:
+    runs-on: ubuntu-latest
+    name: ${{ matrix.name }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { name: "CPU (clang 16, Debug)",         build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
+          - { name: "CPU (clang 16, Debug, c++23)",  build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "23", cxxflags: "-stdlib=libc++" }
+          - { name: "CPU (clang 16, Debug, TSAN)",   build: "Debug",   tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-fsanitize=thread" }
+          - { name: "CPU (clang 16, Release)",       build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++" }
+          - { name: "CPU (clang 16, Release, ASAN)", build: "Release", tag: llvm16-cuda12.9, cxxstd: "20", cxxflags: "-stdlib=libc++ -fsanitize=address -fsanitize-ignorelist=/home/coder/stdexec/sanitizer-ignorelist.txt" }
+          - { name: "CPU (gcc 11, Debug)",           build: "Debug",   tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
+          - { name: "CPU (gcc 11, Release)",         build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "", }
+          - { name: "CPU (gcc 11, Release, ASAN)",   build: "Release", tag: gcc11-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
+          - { name: "CPU (gcc 12, Release, TSAN)",   build: "Release", tag: gcc12-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
+          - { name: "CPU (gcc 13, Debug)",           build: "Debug",   tag: gcc13-cuda12.9,  cxxstd: "20", cxxflags: "", }
+          - { name: "CPU (gcc 14, Debug)",           build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "", }
+          - { name: "CPU (gcc 14, Debug, ASAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=address" }
+          - { name: "CPU (gcc 14, Debug, TSAN)",     build: "Debug",   tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=thread" }
+          - { name: "CPU (gcc 14, Release, LEAK)",   build: "Release", tag: gcc14-cuda12.9,  cxxstd: "20", cxxflags: "-fsanitize=leak", }
+          - { name: "CPU (gcc 14, Release, c++23)",  build: "Release", tag: gcc14-cuda12.9,  cxxstd: "23", cxxflags: "", }
+    container:
+      options: -u root
+      image: rapidsai/devcontainers:26.02-cpp-${{ matrix.tag }}
+    permissions:
+      id-token: write # This is required for configure-aws-credentials
+      contents: read  # This is required for actions/checkout
+    defaults:
+      run:
+        shell: su coder {0}
+        working-directory: /home/coder
+    steps:
+      - name: Checkout stdexec
+        uses: actions/checkout@v4
+        with:
+          path: stdexec
+          persist-credentials: false
+      - name: Setup environment
+        run: |
+          echo "ARTIFACT_PREFIX=${{runner.os}}-${{matrix.tag}}-amd64" >> "${GITHUB_ENV}"
+          echo "ARTIFACT_SUFFIX=${{github.run_id}}-${{github.run_attempt}}-$RANDOM" >> "${GITHUB_ENV}"
+      - if: github.repository_owner == 'NVIDIA'
+        name: Get AWS credentials for sccache bucket
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: us-east-2
+          role-duration-seconds: 28800 # 8 hours
+          role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
+      - name: Build and test CPU schedulers
+        env:
+          ASAN_OPTIONS: alloc_dealloc_mismatch=0
+          NVCC_APPEND_FLAGS: "-t=100"
+          SCCACHE_BUCKET: "rapids-sccache-devs"
+          SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
+          SCCACHE_DIST_URL: "https://amd64.linux.sccache.rapids.nvidia.com"
+          SCCACHE_IDLE_TIMEOUT: "0"
+          SCCACHE_REGION: "us-east-2"
+          SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
+          SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
+          SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
+          SCCACHE_SERVER_LOG: "sccache=debug"
+          SCCACHE_SERVER_PORT: "4225"
+        run: |
+          set -e;
+          source /etc/profile
+          set -x;
+
+          devcontainer-utils-install-sccache --version rapids;
+
+          devcontainer-utils-init-sccache-dist                                          \
+              --enable-sccache-dist - <<< "                                             \
+              --auth-type 'token'                                                       \
+              --auth-token '$(                                                          \
+                curl -fsSL -H "Authorization: Bearer $(                                 \
+                  curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
+                    "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
+                | jq -r '.value'                                                        \
+                )" https://token.rapids.nvidia.com/gh/token/exchange                    \
+              | jq -r '.token')'                                                        \
+          ";
+
+          # Copy source folder into ~/stdexec
+          cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
+          chown -R coder:coder ~/stdexec;
+          cd ~/stdexec;
+
+          # Configure
+          cmake -S . -B build -GNinja \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+            -DCMAKE_CXX_FLAGS="${{ matrix.cxxflags }}" \
+            -DSTDEXEC_ENABLE_TBB:BOOL=${{ !contains(matrix.cxxflags, '-fsanitize') }} \
+            -DSTDEXEC_ENABLE_ASIO:BOOL=TRUE \
+            -DSTDEXEC_ASIO_IMPLEMENTATION:STRING=boost \
+            -DCMAKE_CXX_STANDARD=${{ matrix.cxxstd }} \
+            -DCMAKE_CXX_EXTENSIONS=OFF \
+            ;
+
+          # Compile
+          cmake --build build -v -j 512;
+
+          # Print sccache stats
+          sccache -s;
+
+          # Tests
+          SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
+          ctest --test-dir build --verbose --output-on-failure --timeout 60;
+      - if: ${{ !cancelled() }}
+        name: Upload sccache logs
+        uses: actions/upload-artifact@v4
+        with:
+          name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
+          path: /tmp/sccache*.log
+          compression-level: 9
+
+  ci-cpu:
+    runs-on: ubuntu-latest
+    name: CI (CPU)
+    needs:
+      - build-cpu
+    steps:
+      - run: echo "CI (CPU) success"
+
+  build-cpu-windows:
+    runs-on: windows-2022
+    name: ${{ matrix.name }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { compiler: "cl",       build: "Debug",   name: "CPU (Windows) (msvc, Debug)" }
+          - { compiler: "cl",       build: "Release", name: "CPU (Windows) (msvc, Release)" }
+          #- { compiler: "clang++",  build: "Debug",   name: "CPU (Windows) (clang, Debug)" }
+          #- { compiler: "clang++",  build: "Release", name: "CPU (Windows) (clang, Release)" }
+          #- { compiler: "clang-cl", build: "Debug",   name: "CPU (Windows) (clang-cl, Debug)" }
+          #- { compiler: "clang-cl", build: "Release", name: "CPU (Windows) (clang-cl, Release)" }
+
+    steps:
+      - name: Checkout stdexec (Windows)
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      - name: Build and test CPU schedulers (Windows)
+        shell: pwsh
+        run: |
+          docker pull rapidsai/devcontainers:26.02-cuda12.9-cl14.43
+          docker run --isolation=process -v "$(pwd):C:/stdexec" rapidsai/devcontainers:26.02-cuda12.9-cl14.43 powershell C:/stdexec/.github/workflows/test-windows.ps1 -Compiler '${{ matrix.compiler }}' -Config '${{ matrix.build }}'
+
+  ci-cpu-windows:
+    runs-on: windows-latest
+    name: CI (CPU) (Windows)
+    needs:
+      - build-cpu-windows
+    steps:
+      - run: echo "CI (CPU) (Windows) success"
 
   build-cpu-macos:
     runs-on: macos-26-large
diff --git a/.github/workflows/ci.gpu.yml b/.github/workflows/ci.gpu.yml
index 1f0d6e747..17adc9d56 100644
--- a/.github/workflows/ci.gpu.yml
+++ b/.github/workflows/ci.gpu.yml
@@ -20,11 +20,11 @@ jobs:
       matrix:
         include:
           - { name: "clang 21",    cuda: "12.0", cxx: "clang++", build: "Release", tag: "llvm21-cuda12.0",  gpu: "v100", sm: "70", driver: "latest", arch: "amd64" }
-          # - { name: "clang 21",    cuda: "12.9", cxx: "clang++", build: "Release", tag: "llvm21-cuda12.9",  gpu: "v100", sm: "70", driver: "latest", arch: "amd64" }
-          # - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Release", tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
-          # - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
-          # - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Release", tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
-          # - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          - { name: "clang 21",    cuda: "12.9", cxx: "clang++", build: "Release", tag: "llvm21-cuda12.9",  gpu: "v100", sm: "70", driver: "latest", arch: "amd64" }
+          - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Release", tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          - { name: "nvc++ 25.9",  cuda: "12.9", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.9",        gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Release", tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
+          - { name: "nvc++ 25.11", cuda: "13.0", cxx: "mpic++",  build: "Debug",   tag: "nvhpc25.11",       gpu: "l4",   sm: "75", driver: "latest", arch: "amd64" }
     runs-on: linux-${{ matrix.arch }}-gpu-${{ matrix.gpu }}-${{ matrix.driver }}-1
     container:
       options: -u root
@@ -57,74 +57,74 @@ jobs:
           aws-region: us-east-2
           role-duration-seconds: 28800 # 8 hours
           role-to-assume: arn:aws:iam::279114543810:role/gha-oidc-NVIDIA
-      # - name: Build and test GPU schedulers
-      #   env:
-      #     NVCC_APPEND_FLAGS: "-t=100"
-      #     SCCACHE_BUCKET: "rapids-sccache-devs"
-      #     SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
-      #     SCCACHE_DIST_URL: "https://${{ matrix.arch }}.linux.sccache.rapids.nvidia.com"
-      #     SCCACHE_IDLE_TIMEOUT: "0"
-      #     SCCACHE_REGION: "us-east-2"
-      #     SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
-      #     SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
-      #     SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
-      #     SCCACHE_SERVER_LOG: "sccache=debug"
-      #     SCCACHE_SERVER_PORT: "4225"
-      #   run: |
-      #     set -e;
-      #     source /etc/profile
-      #     set -x;
+      - name: Build and test GPU schedulers
+        env:
+          NVCC_APPEND_FLAGS: "-t=100"
+          SCCACHE_BUCKET: "rapids-sccache-devs"
+          SCCACHE_DIST_REQUEST_TIMEOUT: "7140"
+          SCCACHE_DIST_URL: "https://${{ matrix.arch }}.linux.sccache.rapids.nvidia.com"
+          SCCACHE_IDLE_TIMEOUT: "0"
+          SCCACHE_REGION: "us-east-2"
+          SCCACHE_S3_KEY_PREFIX: "nvidia-stdexec-dev"
+          SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX: "nvidia-stdexec-dev/preprocessor"
+          SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE: "true"
+          SCCACHE_SERVER_LOG: "sccache=debug"
+          SCCACHE_SERVER_PORT: "4225"
+        run: |
+          set -e;
+          source /etc/profile
+          set -x;
 
-      #     devcontainer-utils-install-sccache --version rapids;
+          devcontainer-utils-install-sccache --version rapids;
 
-      #     devcontainer-utils-init-sccache-dist                                          \
-      #         --enable-sccache-dist - <<< "                                             \
-      #         --auth-type 'token'                                                       \
-      #         --auth-token '$(                                                          \
-      #           curl -fsSL -H "Authorization: Bearer $(                                 \
-      #             curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
-      #               "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
-      #           | jq -r '.value'                                                        \
-      #           )" https://token.rapids.nvidia.com/gh/token/exchange                    \
-      #         | jq -r '.token')'                                                        \
-      #     ";
+          devcontainer-utils-init-sccache-dist                                          \
+              --enable-sccache-dist - <<< "                                             \
+              --auth-type 'token'                                                       \
+              --auth-token '$(                                                          \
+                curl -fsSL -H "Authorization: Bearer $(                                 \
+                  curl -fsSL -H "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
+                    "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=token.rapids.nvidia.com"  \
+                | jq -r '.value'                                                        \
+                )" https://token.rapids.nvidia.com/gh/token/exchange                    \
+              | jq -r '.token')'                                                        \
+          ";
 
-      #     # Copy source folder into ~/stdexec
-      #     cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
-      #     chown -R coder:coder ~/stdexec;
-      #     cd ~/stdexec;
+          # Copy source folder into ~/stdexec
+          cp -r "${GITHUB_WORKSPACE}"/stdexec ~/;
+          chown -R coder:coder ~/stdexec;
+          cd ~/stdexec;
 
-      #     # Configure
-      #     cmake -S . -B build -GNinja \
-      #       -DSTDEXEC_ENABLE_CUDA=ON \
-      #       -DSTDEXEC_ENABLE_IO_URING=OFF \
-      #       -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
-      #       -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
-      #       -DCMAKE_CUDA_COMPILER=${{ matrix.cxx }} \
-      #       -DCMAKE_CUDA_ARCHITECTURES=${{ matrix.sm }} \
-      #       ;
+          # Configure
+          cmake -S . -B build -GNinja \
+            -DSTDEXEC_ENABLE_CUDA=ON \
+            -DSTDEXEC_ENABLE_IO_URING=OFF \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+            -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
+            -DCMAKE_CUDA_COMPILER=${{ matrix.cxx }} \
+            -DCMAKE_CUDA_ARCHITECTURES=${{ matrix.sm }} \
+            ;
 
-      #     # Compile
-      #     cmake --build build -v -j 512;
+          # Compile
+          cmake --build build -v -j 512;
 
-      #     # Print sccache stats
-      #     sccache -s;
+          # Print sccache stats
+          sccache -s;
 
-      #     # Tests
-      #     SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
-      #     ctest --test-dir build --verbose --output-on-failure --timeout 60;
+          # Tests
+          SCCACHE_NO_CACHE=1 SCCACHE_NO_DIST_COMPILE=1 \
+          ctest --test-dir build --verbose --output-on-failure --timeout 60;
 
-      #     # Examples
-      #     ./build/examples/nvexec/maxwell_cpu_st --iterations=1000 --N=512 --run-cpp --run-inline-scheduler;
-      #     ./build/examples/nvexec/maxwell_cpu_mt --iterations=1000 --N=512 --run-std --run-stdpar --run-thread-pool-scheduler;
-      #     ./build/examples/nvexec/maxwell_gpu_s --iterations=1000 --N=512 --run-cuda --run-stdpar --run-stream-scheduler;
-      # - if: ${{ !cancelled() }}
-      #   name: Upload sccache logs
-      #   uses: actions/upload-artifact@v4
-      #   with:
-      #     name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
-      #     path: /tmp/sccache*.log
-      #     compression-level: 9
+          # Examples
+          ./build/examples/nvexec/maxwell_cpu_st --iterations=1000 --N=512 --run-cpp --run-inline-scheduler;
+          ./build/examples/nvexec/maxwell_cpu_mt --iterations=1000 --N=512 --run-std --run-stdpar --run-thread-pool-scheduler;
+          ./build/examples/nvexec/maxwell_gpu_s --iterations=1000 --N=512 --run-cuda --run-stdpar --run-stream-scheduler;
+      - if: ${{ !cancelled() }}
+        name: Upload sccache logs
+        uses: actions/upload-artifact@v4
+        with:
+          name: sccache-client-logs-${{env.ARTIFACT_PREFIX}}-${{env.ARTIFACT_SUFFIX}}
+          path: /tmp/sccache*.log
+          compression-level: 9
 
   ci-gpu:
     runs-on: ubuntu-latest