[libcudacxx] Stable abstraction for Blackwell work-stealing (PTX try_cancel) #3671
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged

Commits (58):
- 803e781 [libcudacxx] Experimental try_cancel exposure (gonzalobg)
- 3740f0f Update documentation (gonzalobg)
- 09c48f8 Update try_cancel_blocks ABI (gonzalobg)
- 6d15626 Update ABI of __cluster_get_dim (gonzalobg)
- 4b9f6a3 Use if target and provide SW fallback (gonzalobg)
- 57dcd42 Use simple license (gonzalobg)
- aba388b Fix if guard (gonzalobg)
- 220cf29 Guard for C++20 or newer (gonzalobg)
- d2baa62 Simplify API (gonzalobg)
- 27a6a52 Add tests (gonzalobg)
- e1f090b Clarify C++20 support in docs (gonzalobg)
- 7e99c76 Test main function should only run in host (gonzalobg)
- 0b62b46 Rename to for_each_cancelled_block and extend docs (gonzalobg)
- 0202b52 Fix typo in docs (gonzalobg)
- a9b46bc [pre-commit.ci] auto code formatting (pre-commit-ci[bot])
- dd6c971 Support C++17, move to different file, improve docs (gonzalobg)
- cbd4885 Fix two typos (gonzalobg)
- bdc1011 Free memory in doc example (gonzalobg)
- ac02abc Fix typos and add suggestions (gonzalobg)
- ad807c1 [pre-commit.ci] auto code formatting (pre-commit-ci[bot])
- 9e75ce2 remove dangling requires clauses (gonzalobg)
- ab5f28b More comments; initial arrive can be relaxed (gonzalobg)
- b3a9933 cancelled 2 cancelled for consistency with PTX (gonzalobg)
- 299e4b3 Add missing invocable include (gonzalobg)
- e20a87a Add missing __detail namespace closing brace (gonzalobg)
- 1ddcc83 [pre-commit.ci] auto code formatting (pre-commit-ci[bot])
- d11cb32 Merge branch 'main' into try_cancel_api (gonzalobg)
- c09a381 Stabilize API for CTK (gonzalobg)
- aba1484 [pre-commit.ci] auto code formatting (pre-commit-ci[bot])
- 69c5313 Update docs (gonzalobg)
- 59cf5e1 Update docs (gonzalobg)
- 9cb8360 Enable tests in C++17 (gonzalobg)
- c3da9b3 Update test (gonzalobg)
- cb16ef3 Update test (gonzalobg)
- 898ae57 Run test on all silicon (gonzalobg)
- ea9f3bb Fix Bernhard suggestions (gonzalobg)
- 709671c Improve docs clarity (gonzalobg)
- 1e5ed5d This needs a cuda compiler (gonzalobg)
- 18d2cec [pre-commit.ci] auto code formatting (pre-commit-ci[bot])
- 57a6918 Use int as the size type (miscco)
- 2654baa Use `assert` in tests (miscco)
- 828fbd1 Use functions from cuda::std (miscco)
- 01bb9ac Reduce includes to necessary ones (miscco)
- e788fb0 Use proper license (miscco)
- f7fc53c Drop unnecessary `__detail` namespace (miscco)
- ca0e92a Cleanup the test a bit more (miscco)
- e03de4e Drop unsupported dialects (miscco)
- 4104076 Move to `<cuda/functional>` (miscco)
- a4b9783 clusterlaunchcontrol.try_cancel requires PTX 8.7 (miscco)
- 0272ffe Add missing include (miscco)
- 7ea70fb Drop superfluous header guard (miscco)
- 284d84b Use `NV_DISPATCH_TARGET` because that is more future proof (miscco)
- 8829e90 document requirement on PTX ISA 8.7 (miscco)
- 0d1991e Add check back ^^ (miscco)
- da83903 Fix namespace (miscco)
- 7068da5 Add pre PTX ISA 8.7 fallback and use invoke to support function pointers (miscco)
- 91f3e4d Move to `<cuda/work_stealing>` (miscco)
- 990bb85 move test file (miscco)
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Extended API index (diff):

.. code:: diff

   @@ -16,3 +16,4 @@ Extended API
      extended_api/streams
      extended_api/memory_resource
      extended_api/math
   +  extended_api/work_stealing

New documentation page (111 lines):
.. _libcudacxx-extended-api-work-stealing:

Work stealing
=============

In header file ``<cuda/for_each_canceled>``:

.. code:: cuda

   namespace cuda {

   template <int ThreadBlockRank = 3, typename UnaryFunction = ..unspecified..>
   __device__ void for_each_canceled_block(UnaryFunction uf);

   } // namespace cuda

On devices with compute capability 10.0 or higher, it may leverage hardware acceleration.

This API is mainly intended for implementing work-stealing at thread-block granularity. Compared with alternative work-distribution techniques, such as `grid-stride loops <https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/>`__, which distribute work statically, or other dynamic work-distribution techniques built on global-memory concurrency, its main advantages are:

- It performs work-stealing dynamically: thread blocks that finish their work sooner may do more work than thread blocks whose work takes longer.
- It may cooperate with the GPU work scheduler to respect work priorities and perform load balancing.
- It may have lower work-stealing latency than global-memory atomics.
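For contrast, here is a sketch of the static alternative mentioned above. A grid-stride loop fixes each thread's share of the work at launch time, so a thread block that finishes early cannot take over work assigned to a slower block:

.. code:: cuda

   // Static work distribution with a grid-stride loop: every thread's share
   // of the work is determined by the launch configuration alone, so there is
   // no opportunity for dynamic load balancing between thread blocks.
   __global__ void vec_add_grid_stride(const int* a, const int* b, int* c, int n) {
     for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n;
          i += blockDim.x * gridDim.x) {
       c[i] += a[i] + b[i];
     }
   }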
For better performance, extract the shared thread-block prologue and epilogue outside the lambda and reuse them across thread-block iterations:

- Prologue: thread-block initialization code and data common to all thread blocks, e.g., ``__shared__`` memory allocations and their initialization.
- Epilogue: thread-block finalization code common to all thread blocks, e.g., writing shared memory back to global memory.
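As a sketch of this guideline, consider a hypothetical per-block sum kernel (assuming a launch with 256 threads per block): the ``__shared__`` prologue is hoisted out of the lambda so it runs once per physical thread block rather than once per stolen block index:

.. code:: cuda

   // Hypothetical example: the shared-memory allocation (prologue) lives
   // outside the lambda; only the per-block body runs once per block index.
   __global__ void sum_per_block(const int* in, int* block_sums, int n) {
     __shared__ int partial[256]; // prologue: allocated once per physical block

     cuda::for_each_canceled_block<1>([&](dim3 block_idx) {
       int idx = threadIdx.x + block_idx.x * blockDim.x;
       partial[threadIdx.x] = (idx < n) ? in[idx] : 0;
       __syncthreads();
       // Tree reduction in shared memory.
       for (int s = blockDim.x / 2; s > 0; s /= 2) {
         if (threadIdx.x < s) {
           partial[threadIdx.x] += partial[threadIdx.x + s];
         }
         __syncthreads();
       }
       if (threadIdx.x == 0) {
         block_sums[block_idx.x] = partial[0];
       }
     });
     // Epilogue (if any) would go here, once per physical thread block.
   }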
**Mandates**:

- ``ThreadBlockRank`` equals the rank of the thread block: ``1``, ``2``, or ``3`` for one-dimensional, two-dimensional, and three-dimensional thread blocks, respectively.
- ``is_invocable_r_v<void, UnaryFunction, dim3>`` is ``true``.

**Preconditions**:

- All threads of the current thread block shall call ``for_each_canceled_block`` **exactly once**.

**Effects**:

- Invokes ``uf`` with ``blockIdx``, then repeatedly attempts to cancel the launch of another thread block in the current grid:

  - on success, it invokes ``uf`` with the cancelled thread block's ``blockIdx`` and repeats;
  - on failure, it returns.
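The effects above can be summarized with a behavioral sketch; ``try_cancel_another_block`` below is a hypothetical placeholder for the hardware cancellation attempt, not a real API:

.. code:: cuda

   // Behavioral sketch only, not the actual implementation (which uses
   // clusterlaunchcontrol.try_cancel PTX instructions on supported hardware).
   template <class UnaryFunction>
   __device__ void for_each_canceled_block_sketch(UnaryFunction uf) {
     dim3 block_idx = blockIdx; // first invocation uses this block's own index
     do {
       uf(block_idx);
       // Hypothetical helper: on success, writes the cancelled block's index
       // into block_idx and returns true; on failure, returns false.
     } while (try_cancel_another_block(&block_idx));
   }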
Example
-------

This example shows how to perform work-stealing at thread-block granularity using this API.
.. code:: cuda

   #include <cuda/for_each_canceled>
   #include <cuda/math>
   #include <iostream>

   __global__ void vec_add(int* a, int* b, int* c, int n) {
     // Extract the common prologue outside the lambda, e.g.,
     // - __shared__ or global (malloc) memory allocation,
     // - common initialization code,
     // - etc.

     cuda::for_each_canceled_block<1>([=](dim3 block_idx) {
       int idx = threadIdx.x + block_idx.x * blockDim.x;
       // assert(block_idx == blockIdx); // May fail!
       if (idx < n) {
         c[idx] += a[idx] + b[idx];
       }
     });
     // Note: calling for_each_canceled_block<1> again from this
     // thread block exhibits undefined behavior.

     // Extract the common epilogue outside the lambda, e.g.,
     // - write shared memory back to global memory,
     // - external synchronization,
     // - global memory deallocation (free),
     // - etc.
   }

   int main() {
     int N = 10000;
     int *a, *b, *c;
     cudaMallocManaged(&a, N * sizeof(int));
     cudaMallocManaged(&b, N * sizeof(int));
     cudaMallocManaged(&c, N * sizeof(int));

     for (int i = 0; i < N; ++i) {
       a[i] = i;
       b[i] = 1;
       c[i] = 0;
     }

     int tpb = 256;
     int bpg = cuda::ceil_div(N, tpb);

     vec_add<<<bpg, tpb>>>(a, b, c, N);
     cudaDeviceSynchronize();

     bool success = true;
     for (int i = 0; i < N; ++i) {
       if (c[i] != (1 + i)) {
         std::cerr << "ERROR " << i << ", " << c[i] << std::endl;
         success = false;
       }
     }
     cudaFree(a);
     cudaFree(b);
     cudaFree(c);

     return success ? 0 : 1;
   }
New header file (240 lines):
// SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef _CUDA_FOR_EACH_CANCELED
#define _CUDA_FOR_EACH_CANCELED

#include <cuda/std/detail/__config>

#include <cuda/std/type_traits> // For cuda::std::is_invocable_r_v
#include <cuda/std/utility> // For cuda::std::move, unreachable

#include <nv/target>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

namespace __detail
{

template <int __I>
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI int __cluster_get_dim(__int128 __result) noexcept
{
  int __r;
  if constexpr (__I == 0)
  {
    asm volatile("clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 %0, %1;"
                 : "=r"(__r)
                 : "q"(__result)
                 : "memory");
  }
  else if constexpr (__I == 1)
  {
    asm volatile("clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 %0, %1;"
                 : "=r"(__r)
                 : "q"(__result)
                 : "memory");
  }
  else if constexpr (__I == 2)
  {
    asm volatile("clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 %0, %1;"
                 : "=r"(__r)
                 : "q"(__result)
                 : "memory");
  }
  else
  {
    _CCCL_UNREACHABLE();
  }
  return __r;
}

/// This API, used to implement work-stealing, repeatedly attempts to cancel the launch of a thread block
/// from the current grid. On success, it invokes the unary function `__uf` before trying again.
/// On failure, it returns.
///
/// This API does not provide any memory synchronization.
/// This API does not guarantee that any thread will invoke `__uf` with the next block index until all
/// invocations of `__uf` for the prior block index have returned.
///
/// Preconditions:
/// - All thread block threads shall call this API exactly once.
/// - Exactly one thread block thread shall call this API with `__is_leader` equal to `true`.
template <int __ThreadBlockRank = 3, typename __UnaryFunction = void>
_CCCL_DEVICE _CCCL_HIDE_FROM_ABI void __for_each_canceled_block(bool __is_leader, __UnaryFunction __uf)
{
  static_assert(__ThreadBlockRank >= 1 && __ThreadBlockRank <= 3, "ThreadBlockRank out-of-range [1, 3].");
  static_assert(::cuda::std::is_invocable_r_v<void, __UnaryFunction, dim3>,
                "__for_each_canceled_block requires a UnaryFunction with signature: void(dim3).\n"
                "For example, call with lambda: __for_each_canceled_block([](dim3 block_idx) { ... });");
  dim3 __block_idx = dim3(blockIdx.x, 1, 1);
  if constexpr (__ThreadBlockRank >= 2)
  {
    __block_idx = dim3(blockIdx.x, blockIdx.y, 1);
  }
  if constexpr (__ThreadBlockRank >= 3)
  {
    __block_idx = dim3(blockIdx.x, blockIdx.y, blockIdx.z);
  }

  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_100,
    (
      __shared__ uint64_t __barrier; // TODO: use 2 barriers and 2 results to avoid last sync threads
      __shared__ __int128 __result;
      bool __phase = false;

      // Initialize barrier and kick-start try_cancel pipeline:
      if (__is_leader) {
        auto __leader_mask = __activemask();
        asm volatile(
          "{\n\t"
          ".reg .pred p;\n\t"
          // elect.sync is a workaround for peeling loop (#nvbug-id)
          "elect.sync _|p, %2;\n\t"
          "@p mbarrier.init.shared::cta.b64 [%1], 1;\n\t"
          // `try_cancel` accesses the mbarrier using the generic proxy, so no cross-proxy fence is required here
          "@p clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];\n\t"
          // This arrive does not order prior memory operations and can be relaxed.
          "@p mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 _, [%1], 16;\n\t"
          "}"
          :
          : "r"((int) __cvta_generic_to_shared(&__result)),
            "r"((int) __cvta_generic_to_shared(&__barrier)),
            "r"(__leader_mask)
          : "memory");
      }

      do {
        __uf(__block_idx); // Invoke unary function.

        if (__is_leader)
        {
          asm volatile(
            "{\n\t"
            ".reg .pred p;\n\t"
            "waitLoop:\n\t\t"
            "mbarrier.try_wait.parity.relaxed.cta.shared.b64 p, [%0], %1;\n\t\t"
            "@!p bra waitLoop;\n\t"
            "}"
            :
            : "r"((int) __cvta_generic_to_shared(&__barrier)), "r"((unsigned) __phase)
            : "memory");
          __phase = !__phase;
        }
        __syncthreads(); // All threads of the prior thread block have "exited".
        // Note: this syncthreads provides the .acquire.cta fence preventing
        // the next query operations from being re-ordered above the poll loop.
        {
          int __success = 0;
          asm volatile(
            "{\n\t"
            ".reg .pred p;\n\t"
            "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 p, %1;\n\t"
            "selp.b32 %0, 1, 0, p;\n\t"
            "}\n\t"
            : "=r"(__success)
            : "q"(__result));
          if (__success != 1)
          {
            // Invalidating the mbarrier and synchronizing before exiting is not
            // required since each thread block calls this API at most once.
            break;
          }
        }

        // Read new thread block dimensions
        dim3 __b(__detail::__cluster_get_dim<0>(__result), 1, 1);
        if constexpr (__ThreadBlockRank >= 2)
        {
          __b.y = __detail::__cluster_get_dim<1>(__result);
        }
        if constexpr (__ThreadBlockRank == 3)
        {
          __b.z = __detail::__cluster_get_dim<2>(__result);
        }
        __block_idx = __b;

        // Wait for all threads to read __result before issuing the next async op.
        // generic->generic synchronization
        __syncthreads();
        // TODO: only the control warp requires this sync; other warps could just arrive
        // TODO: double-buffering result+barrier pairs using the phase would avoid this sync

        if (__is_leader)
        {
          auto __leader_mask = __activemask();
          asm volatile(
            "{\n\t"
            ".reg .pred p;\n\t"
            // elect.sync is a workaround for peeling loop (#nvbug-id)
            "elect.sync _|p, %2;\n\t"
            // generic->async release + acquire synchronization of prior reads:
            // use a bi-directional cross-proxy acq_rel fence instead of uni-directional rel; acq; fences.
            "@p fence.proxy.async.shared::cta;\n\t"
            // try to cancel another thread block
            "@p clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];\n\t"
            "@p mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 _, [%1], 16;\n\t"
            "}"
            :
            : "r"((int) __cvta_generic_to_shared(&__result)),
              "r"((int) __cvta_generic_to_shared(&__barrier)),
              "r"(__leader_mask)
            : "memory");
        }
      } while (true);),
    ( // NV_IF_ELSE_TARGET(NV_PROVIDES_SM_100,
      // SW fallback for lower compute capabilities.
      // TODO: it may make sense to __trap here instead, since lower compute capabilities may want
      // to do something else (grid-stride, atomics, etc.).
      // A higher-level abstraction like for_each should handle that.

      __uf(__block_idx);)) // NV_IF_ELSE_TARGET(NV_PROVIDES_SM_100,
}

} // namespace __detail

/// This API, used to implement work-stealing, repeatedly attempts to cancel the launch of a thread block
/// from the current grid. On success, it invokes the unary function `__uf` before trying again.
/// On failure, it returns.
///
/// This API does not provide any memory synchronization.
/// This API does not guarantee that any thread will invoke `__uf` with the next block index until all
/// invocations of `__uf` for the prior block index have returned.
///
/// Preconditions:
/// - All thread block threads shall call this API exactly once.
template <int __ThreadBlockRank = 3, typename __UnaryFunction = void>
_CCCL_DEVICE _CCCL_HIDE_FROM_ABI void for_each_canceled_block(__UnaryFunction __uf)
{
  static_assert(__ThreadBlockRank >= 1 && __ThreadBlockRank <= 3,
                "for_each_canceled_block<ThreadBlockRank>: ThreadBlockRank out-of-range [1, 3].");
  static_assert(::cuda::std::is_invocable_r_v<void, __UnaryFunction, dim3>,
                "for_each_canceled_block requires a UnaryFunction with signature: void(dim3).\n"
                "For example, call with lambda: for_each_canceled_block([](dim3 block_idx) { ... });");
  if constexpr (__ThreadBlockRank == 1)
  {
    __detail::__for_each_canceled_block<1>(threadIdx.x == 0, ::cuda::std::move(__uf));
  }
  else if constexpr (__ThreadBlockRank == 2)
  {
    __detail::__for_each_canceled_block<2>(threadIdx.x == 0 && threadIdx.y == 0, ::cuda::std::move(__uf));
  }
  else if constexpr (__ThreadBlockRank == 3)
  {
    __detail::__for_each_canceled_block<3>(
      threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0, ::cuda::std::move(__uf));
  }
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CUDA_FOR_EACH_CANCELED