Skip to content
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
803e781
[libcudacxx] Experimental try_cancel exposure
gonzalobg Feb 4, 2025
3740f0f
Update documentation
gonzalobg Feb 4, 2025
09c48f8
Update try_cancel_blocks ABI
gonzalobg Feb 4, 2025
6d15626
Update ABI of __cluster_get_dim
gonzalobg Feb 4, 2025
4b9f6a3
Use if target and provide SW fallback
gonzalobg Feb 4, 2025
57dcd42
Use simple license
gonzalobg Feb 4, 2025
aba388b
Fix if guard
gonzalobg Feb 4, 2025
220cf29
Guard for C++20 or newer
gonzalobg Feb 4, 2025
d2baa62
Simplify API
gonzalobg Feb 6, 2025
27a6a52
Add tests
gonzalobg Feb 6, 2025
e1f090b
Clarify C++20 support in docs
gonzalobg Feb 6, 2025
7e99c76
Test main function should only run in host
gonzalobg Feb 6, 2025
0b62b46
Rename to for_each_cancelled_block and extend docs
gonzalobg Feb 6, 2025
0202b52
Fix typo in docs
gonzalobg Feb 6, 2025
a9b46bc
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Feb 7, 2025
dd6c971
Support C++17, move to different file, improve docs
gonzalobg Feb 18, 2025
cbd4885
Fix two typos
gonzalobg Feb 18, 2025
bdc1011
Free memory in doc example
gonzalobg Feb 18, 2025
ac02abc
Fix typos and add suggestions
gonzalobg Feb 18, 2025
ad807c1
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Feb 18, 2025
9e75ce2
remove dangling requires clauses
gonzalobg Feb 18, 2025
ab5f28b
More comments; initial arrive can be relaxed
gonzalobg Feb 18, 2025
b3a9933
cancelled 2 cancelled for consistency with PTX
gonzalobg Feb 18, 2025
299e4b3
Add missing invocable include
gonzalobg Feb 18, 2025
e20a87a
Add missing __detail namespace closing brace
gonzalobg Feb 18, 2025
1ddcc83
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Feb 18, 2025
d11cb32
Merge branch 'main' into try_cancel_api
gonzalobg Feb 19, 2025
c09a381
Stabilize API for CTK
gonzalobg Feb 19, 2025
aba1484
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Feb 19, 2025
69c5313
Update docs
gonzalobg Feb 19, 2025
59cf5e1
Update docs
gonzalobg Feb 19, 2025
9cb8360
Enable tests in C++17
gonzalobg Feb 19, 2025
c3da9b3
Update test
gonzalobg Feb 19, 2025
cb16ef3
Update test
gonzalobg Feb 19, 2025
898ae57
Run test on all silicon
gonzalobg Feb 19, 2025
ea9f3bb
Fix Bernhard suggestions
gonzalobg Feb 19, 2025
709671c
Improve docs clarity
gonzalobg Feb 19, 2025
1e5ed5d
This needs a cuda compiler
gonzalobg Feb 19, 2025
18d2cec
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Feb 19, 2025
57a6918
Use int as the size type
miscco Feb 19, 2025
2654baa
Use `assert` in tests
miscco Feb 19, 2025
828fbd1
Use functions from cuda::std
miscco Feb 19, 2025
01bb9ac
Reduce includes to necessary ones
miscco Feb 19, 2025
e788fb0
Use proper license
miscco Feb 19, 2025
f7fc53c
Drop unnecessary `__detail` namespace
miscco Feb 19, 2025
ca0e92a
Cleanup the test a bit more
miscco Feb 19, 2025
e03de4e
Drop unsupported dialects
miscco Feb 19, 2025
4104076
Move to `<cuda/functional>`
miscco Feb 19, 2025
a4b9783
clusterlaunchcontrol.try_cancel requires PTX 8.7
miscco Feb 19, 2025
0272ffe
Add missing include
miscco Feb 19, 2025
7ea70fb
Drop superfluous header guard
miscco Feb 19, 2025
284d84b
Use `NV_DISPATCH_TARGET` because that is more future proof
miscco Feb 19, 2025
8829e90
document requirement on PTX ISA 8.7
miscco Feb 19, 2025
0d1991e
Add check back ^^
miscco Feb 19, 2025
da83903
Fix namespace
miscco Feb 19, 2025
7068da5
Add pre PTX ISA 8.7 fallback and use invoke to support function pointers
miscco Feb 19, 2025
91f3e4d
Move to `<cuda/work_stealing>`
miscco Feb 20, 2025
990bb85
move test file
miscco Feb 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/libcudacxx/extended_api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ Extended API
extended_api/streams
extended_api/memory_resource
extended_api/math
extended_api/work_stealing
115 changes: 115 additions & 0 deletions docs/libcudacxx/extended_api/work_stealing.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
.. _libcudacxx-extended-api-work-stealing:

Work stealing
=============

Defined in header ``<cuda/functional>``:

.. code:: cuda

namespace cuda {

template <int ThreadBlockRank = 3, typename UnaryFunction = ..unspecified..>
__device__ void for_each_canceled_block(UnaryFunction uf);

} // namespace cuda

**Note**: On devices with compute capability 10.0 or higher, this function may leverage hardware acceleration.

This API is primarily intended for implementing work-stealing at the thread-block level.


Compared to alternative work distribution techniques, such as `grid-stride loops <https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/>`__, which distribute work statically, or dynamic work distribution methods relying on global memory concurrency, this API offers several advantages:

- It enables dynamic work-stealing: thread blocks that complete their tasks sooner can take on additional work from slower thread blocks.
- It may cooperate with the GPU work scheduler to respect work priorities and improve load balancing.
- It may reduce work-stealing latency compared to global memory atomics.

For better performance, extract the shared thread-block prologue and epilogue outside the lambda and reuse them across thread-block iterations:

- Prologue: Thread-block initialization code and data common to all thread blocks, such as ``__shared__`` memory allocation and initialization.
- Epilogue: Thread-block finalization code common to all thread blocks, such as writing shared memory back to global memory.

**Mandates**:

- ``ThreadBlockRank`` equals the rank of the thread block: ``1``, ``2``, or ``3`` for one-dimensional, two-dimensional, and three-dimensional thread blocks, respectively.
- ``cuda::std::is_invocable_r_v<void, UnaryFunction, dim3>`` is true.

**Preconditions**:

- All threads within a thread block shall call ``for_each_canceled_block`` **exactly once**.

**Effects**:

- Invokes ``uf`` with ``blockIdx`` and then repeatedly attempts to cancel the launch of another thread block within the current grid:

- If successful: invokes ``uf`` with the canceled thread block's ``blockIdx`` and repeats.
- Otherwise, the function returns; it failed to cancel the launch of another thread block.

Example
-------

This example demonstrates work-stealing at thread-block granularity using this API.

.. code:: cuda

// Before:

#include <cuda/cmath>      // cuda::ceil_div
#include <cuda/functional>
#include <iostream>        // std::cerr
__global__ void vec_add(int* a, int* b, int* c, int n) {
// Extract common prologue outside the lambda, e.g.,
// - __shared__ or global (malloc) memory allocation
// - common initialization code
// - etc.

cuda::for_each_canceled_block<1>([=](dim3 block_idx) {
// block_idx may be different than the built-in blockIdx variable, that is:
// assert(block_idx == blockIdx); // may fail!
// so we need to use "block_idx" consistently inside for_each_canceled_block:
int idx = threadIdx.x + block_idx.x * blockDim.x;
if (idx < n) {
c[idx] += a[idx] + b[idx];
}
});
// Note: Calling for_each_canceled_block<1> again from this
// thread block exhibits undefined behavior.

// Extract common epilogue outside the lambda, e.g.,
// - write back shared memory to global memory
// - external synchronization
// - global memory deallocation (free)
// - etc.
}

int main() {
int N = 10000;
int *a, *b, *c;
cudaMallocManaged(&a, N * sizeof(int));
cudaMallocManaged(&b, N * sizeof(int));
cudaMallocManaged(&c, N * sizeof(int));
for (int i = 0; i < N; ++i) {
a[i] = i;
b[i] = 1;
c[i] = 0;
}

const int threads_per_block = 256;
const int blocks_per_grid = cuda::ceil_div(N, threads_per_block);

vec_add<<<blocks_per_grid, threads_per_block>>>(a, b, c, N);
cudaDeviceSynchronize();

bool success = true;
for (int i = 0; i < N; ++i) {
if (c[i] != (1 + i)) {
std::cerr << "ERROR " << i << ", " << c[i] << std::endl;
success = false;
}
}
cudaFree(a);
cudaFree(b);
cudaFree(c);

return success? 0 : 1;
}
276 changes: 276 additions & 0 deletions libcudacxx/include/cuda/__functional/for_each_canceled.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,276 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA__FUNCTIONAL_FOR_EACH_CANCELED_H
#define _CUDA__FUNCTIONAL_FOR_EACH_CANCELED_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__functional/invoke.h>
#include <cuda/std/__utility/move.h>
#include <cuda/std/__utility/unreachable.h>
#include <cuda/std/cstdint>

#include <nv/target>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_HAS_CUDA_COMPILER

# if _CCCL_HAS_INT128()

# if __cccl_ptx_isa >= 870

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

//! Extracts one component of the first-CTA id encoded in the 128-bit
//! response of a `clusterlaunchcontrol.try_cancel` request.
//!
//! @tparam _Index Component to extract: 0 -> x, 1 -> y, 2 -> z.
//!                Any other value is a precondition violation (unreachable).
//! @param __result 128-bit opaque try_cancel result, passed by value
//!                 ("q" constraint requires a b128 register operand).
//! @return The requested component of the canceled thread block's index.
//!
//! NOTE(review): the value is presumably only meaningful when the result
//! indicates a successful cancellation (`query_cancel.is_canceled` true) --
//! confirm against the PTX ISA 8.7 clusterlaunchcontrol specification.
template <int _Index>
_CCCL_NODISCARD _CCCL_DEVICE _CCCL_HIDE_FROM_ABI int __cluster_get_dim(__int128 __result) noexcept
{
  int __r;
  if constexpr (_Index == 0)
  {
    asm volatile("clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 %0, %1;"
                 : "=r"(__r)
                 : "q"(__result)
                 : "memory");
  }
  else if constexpr (_Index == 1)
  {
    asm volatile("clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 %0, %1;"
                 : "=r"(__r)
                 : "q"(__result)
                 : "memory");
  }
  else if constexpr (_Index == 2)
  {
    asm volatile("clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 %0, %1;"
                 : "=r"(__r)
                 : "q"(__result)
                 : "memory");
  }
  else
  {
    _CCCL_UNREACHABLE();
  }
  return __r;
}

//! Hardware-accelerated (SM100+, PTX ISA 8.7) work-stealing implementation:
//! invokes `__uf` with `__block_idx`, then repeatedly attempts to cancel the
//! launch of another thread block from the current grid. On success, it
//! invokes `__uf` with the canceled block's index before trying again.
//! On failure, it returns.
//!
//! This API does not provide any memory synchronization.
//! This API does not guarantee that any thread will invoke `__uf` with the next block index until all
//! invocations of `__uf` for the prior block index have returned.
//!
//! Preconditions:
//! - All thread block threads shall call this API exactly once.
//! - Exactly one thread block thread shall call this API with `__is_leader` equals `true`.
template <int __ThreadBlockRank = 3, typename __UnaryFunction = void>
_CCCL_DEVICE _CCCL_HIDE_FROM_ABI void
__for_each_canceled_block_sm100(dim3 __block_idx, bool __is_leader, __UnaryFunction __uf)
{
  // Single-producer mbarrier + 128-bit result buffer shared by the block.
  __shared__ uint64_t __barrier; // TODO: use 2 barriers and 2 results to avoid last sync threads
  __shared__ __int128 __result;
  // Leader-local mbarrier parity; flips after each completed try_cancel.
  bool __phase = false;

  // Initialize barrier and kick-start try_cancel pipeline:
  if (__is_leader)
  {
    auto __leader_mask = __activemask();
    asm volatile(
      "{\n\t"
      ".reg .pred p;\n\t"
      // elect.sync is a workaround for peeling loop (#nvbug-id)
      "elect.sync _|p, %2;\n\t"
      // mbarrier with an arrival count of 1 (only the elected lane arrives).
      "@p mbarrier.init.shared::cta.b64 [%1], 1;\n\t"
      // `try_cancel` access the mbarrier using generic-proxy, so no cross-proxy fence required here
      "@p clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];\n\t"
      // This arrive does not order prior memory operations and can be relaxed.
      // Expects 16 bytes: the b128 result written by try_cancel.
      "@p mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 _, [%1], 16;\n\t"
      "}"
      :
      : "r"((int) __cvta_generic_to_shared(&__result)),
        "r"((int) __cvta_generic_to_shared(&__barrier)),
        "r"(__leader_mask)
      : "memory");
  }

  do
  {
    __uf(__block_idx); // Invoke unary function.

    // Leader polls the mbarrier until the in-flight try_cancel completes.
    if (__is_leader)
    {
      asm volatile(
        "{\n\t"
        ".reg .pred p;\n\t"
        "waitLoop:\n\t\t"
        "mbarrier.try_wait.parity.relaxed.cta.shared.b64 p, [%0], %1;\n\t\t"
        "@!p bra waitLoop;\n\t"
        "}"
        :
        : "r"((int) __cvta_generic_to_shared(&__barrier)), "r"((unsigned) __phase)
        : "memory");
      __phase = !__phase;
    }
    __syncthreads(); // All threads of prior thread block have "exited".
    // Note: this syncthreads provides the .acquire.cta fence preventing
    // the next query operations from being re-ordered above the poll loop.
    {
      // Every thread queries the shared result; break out if the
      // cancellation attempt failed (no more blocks to steal).
      int __success = 0;
      asm volatile(
        "{\n\t"
        ".reg .pred p;\n\t"
        "clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 p, %1;\n\t"
        "selp.b32 %0, 1, 0, p;\n\t"
        "}\n\t"
        : "=r"(__success)
        : "q"(__result));
      if (__success != 1)
      {
        // Invalidating mbarrier and synchronizing before exiting not
        // required since each thread block calls this API at most once.
        break;
      }
    }

    // Read new thread block dimensions
    // (only the components covered by __ThreadBlockRank; the rest stay 1).
    dim3 __b(::cuda::__cluster_get_dim<0>(__result), 1, 1);
    if constexpr (__ThreadBlockRank >= 2)
    {
      __b.y = ::cuda::__cluster_get_dim<1>(__result);
    }
    if constexpr (__ThreadBlockRank == 3)
    {
      __b.z = ::cuda::__cluster_get_dim<2>(__result);
    }
    __block_idx = __b;

    // Wait for all threads to read __result before issuing next async op.
    // generic->generic synchronization
    __syncthreads();
    // TODO: only control-warp requires sync, other warps can arrive
    // TODO: double-buffering results+barrier pairs using phase to avoids this sync

    // Issue the next try_cancel; mirrors the kick-start above, plus the
    // cross-proxy fence ordering the generic-proxy reads of __result.
    if (__is_leader)
    {
      auto __leader_mask = __activemask();
      asm volatile(
        "{\n\t"
        ".reg .pred p;\n\t"
        // elect.sync is a workaround for peeling loop (#nvbug-id)
        "elect.sync _|p, %2;\n\t"
        // generic->async release + acquire synchronization of prior reads:
        // use bi-directional cross-proxy acq_rel fence instead of uni-dir rel; acq; fences.
        "@p fence.proxy.async.shared::cta;\n\t"
        // try to cancel another thread block
        "@p clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [%0], [%1];\n\t"
        "@p mbarrier.arrive.expect_tx.relaxed.cta.shared::cta.b64 _, [%1], 16;\n\t"
        "}"
        :
        : "r"((int) __cvta_generic_to_shared(&__result)),
          "r"((int) __cvta_generic_to_shared(&__barrier)),
          "r"(__leader_mask)
        : "memory");
    }
  } while (true);
}

# endif // __cccl_ptx_isa >= 870

//! Work-stealing building block: invokes `__uf` with this thread block's
//! index, then repeatedly attempts to cancel the launch of another thread
//! block from the current grid, invoking `__uf` with the canceled block's
//! index after each success. On failure, it returns.
//!
//! This API does not provide any memory synchronization.
//! This API does not guarantee that any thread will invoke `__uf` with the next block index until all
//! invocations of `__uf` for the prior block index have returned.
//!
//! Preconditions:
//! - All thread block threads shall call this API exactly once.
//! - Exactly one thread block thread shall call this API with `__is_leader` equals `true`.
template <int __ThreadBlockRank = 3, typename __UnaryFunction = void>
_CCCL_DEVICE _CCCL_HIDE_FROM_ABI void __for_each_canceled_block(bool __is_leader, __UnaryFunction __uf)
{
  static_assert(__ThreadBlockRank >= 1 && __ThreadBlockRank <= 3, "ThreadBlockRank out-of-range [1, 3].");
  static_assert(_CUDA_VSTD::is_invocable_r_v<void, __UnaryFunction, dim3>,
                "__for_each_canceled_block first argument requires an UnaryFunction with signature: void(dim3).\n"
                "For example, call with lambda: __for_each_canceled_block([](dim3 block_idx) { ... });");
  // Only materialize the block-index components covered by the rank; the
  // remaining components are fixed to 1, matching what the SM100 path
  // produces for stolen blocks.
  dim3 __block_idx = dim3(blockIdx.x, 1, 1);
  if constexpr (__ThreadBlockRank >= 2)
  {
    __block_idx = dim3(blockIdx.x, blockIdx.y, 1);
  }
  if constexpr (__ThreadBlockRank >= 3)
  {
    __block_idx = dim3(blockIdx.x, blockIdx.y, blockIdx.z);
  }

  // Forward __ThreadBlockRank so the SM100 implementation only queries the
  // components it needs. Previously the rank was dropped here, so the
  // hardware path always instantiated the rank-3 default: it issued
  // needless y/z queries for 1D/2D blocks, and stolen block indices had
  // y/z == 0 where the fallback path and the initial iteration use 1.
  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_100,
    (::cuda::__for_each_canceled_block_sm100<__ThreadBlockRank>(__block_idx, __is_leader, _CUDA_VSTD::move(__uf));),
    (__uf(__block_idx);))
}

//! Work-stealing entry point: invokes `__uf` with this thread block's index,
//! then repeatedly attempts to cancel the launch of another thread block
//! from the current grid, invoking `__uf` with each canceled block's index;
//! returns once a cancellation attempt fails.
//!
//! This API does not provide any memory synchronization.
//! This API does not guarantee that any thread will invoke `__uf` with the
//! next block index until all invocations of `__uf` for the prior block
//! index have returned.
//!
//! Preconditions:
//! - All thread block threads shall call this API exactly once.
template <int __ThreadBlockRank = 3, typename __UnaryFunction = void>
_CCCL_DEVICE _CCCL_HIDE_FROM_ABI void for_each_canceled_block(__UnaryFunction __uf)
{
  static_assert(__ThreadBlockRank >= 1 && __ThreadBlockRank <= 3,
                "for_each_canceled_block<ThreadBlockRank>: ThreadBlockRank out-of-range [1, 3].");
  static_assert(_CUDA_VSTD::is_invocable_r_v<void, __UnaryFunction, dim3>,
                "for_each_canceled_block first argument requires an UnaryFunction with signature: void(dim3).\n"
                "For example, call with lambda: for_each_canceled_block([](dim3 block_idx) { ... });");
  // The leader is the thread whose in-block coordinates are zero in every
  // dimension covered by the rank; higher dimensions are ignored. The rank
  // checks are compile-time constants, so each instantiation reduces to the
  // same leader predicate as an explicit per-rank dispatch.
  const bool __is_leader = threadIdx.x == 0 //
                        && (__ThreadBlockRank < 2 || threadIdx.y == 0) //
                        && (__ThreadBlockRank < 3 || threadIdx.z == 0);
  ::cuda::__for_each_canceled_block<__ThreadBlockRank>(__is_leader, _CUDA_VSTD::move(__uf));
}

_LIBCUDACXX_END_NAMESPACE_CUDA

# endif // _CCCL_HAS_INT128()

#endif // _CCCL_HAS_CUDA_COMPILER

#endif // _CUDA__FUNCTIONAL_FOR_EACH_CANCELED_H
Loading
Loading