Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use new cuda::ptx functionality
  • Loading branch information
ahendriksen committed Feb 23, 2024
commit 6876652b65909baab27cf11fa89f6cc8a3a57d1d
196 changes: 68 additions & 128 deletions libcudacxx/include/cuda/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#include "std/barrier"
#include "ptx"

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
Expand Down Expand Up @@ -60,14 +61,10 @@ void cp_async_bulk_global_to_shared(void *__dest, const void *__src, _CUDA_VSTD:
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");

asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"r"(__size),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(::cuda::device::barrier_native_handle(__bar))))
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, __size,
::cuda::device::barrier_native_handle(__bar));
}


Expand All @@ -79,213 +76,156 @@ void cp_async_bulk_shared_to_global(void *__dest, const void * __src, _CUDA_VSTD
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");

asm volatile(
"cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"
:
: "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__dest))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__src))),
"r"(__size)
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 1-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_1d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 2-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_2d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 3-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_3d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 4-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_4d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 5-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_5d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 1-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups (cp_async_bulk_commit_group /
// cp_async_bulk_wait_group_read), not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_1d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 2-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_2d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 3-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_3d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 4-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_4d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 5-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_5d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
// Issues fence.proxy.async for the shared memory space via the cuda::ptx
// exposure (see the membar/fence PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void fence_proxy_async_shared_cta() {
    _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
// Commits all prior uncommitted cp.async.bulk operations into a bulk-async
// group (cp.async.bulk.commit_group), via the cuda::ptx exposure.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_commit_group()
{
    _CUDA_VPTX::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
// Waits until at most __n_prior bulk-async groups have pending reads
// (cp.async.bulk.wait_group.read), via the cuda::ptx exposure. The group
// count is passed as a compile-time n32_t immediate.
template <int __n_prior>
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_wait_group_read()
{
    static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
    _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{});
}

#endif // __cccl_lib_experimental_ctk12_cp_async_exposure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -677,16 +677,11 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");

NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
if (__isShared(__dest) && __isGlobal(__src)) {
asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, static_cast<uint32_t>(__size),
barrier_native_handle(__b));
} else {
// memcpy_async_tx only supports copying from global to shared
// or from shared to remote cluster dsmem. To copy to remote
Expand Down Expand Up @@ -953,14 +948,9 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
if (__g.thread_rank() == 0) {
asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle)))
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, __size, __bar_handle);
}
),(
__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
Expand Down