Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Use new cuda::ptx functionality
  • Loading branch information
ahendriksen committed Feb 23, 2024
commit 6876652b65909baab27cf11fa89f6cc8a3a57d1d
196 changes: 68 additions & 128 deletions libcudacxx/include/cuda/barrier
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#endif // no system header

#include "std/barrier"
#include "ptx"

// Forward-declare CUtensorMap for use in cp_async_bulk_tensor_* PTX wrapping
// functions. These functions take a pointer to CUtensorMap, so do not need to
Expand Down Expand Up @@ -60,14 +61,10 @@ void cp_async_bulk_global_to_shared(void *__dest, const void *__src, _CUDA_VSTD:
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "Destination must be shared memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "Source must be global memory address.");

asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"r"(__size),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(::cuda::device::barrier_native_handle(__bar))))
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, __size,
::cuda::device::barrier_native_handle(__bar));
}


Expand All @@ -79,213 +76,156 @@ void cp_async_bulk_shared_to_global(void *__dest, const void * __src, _CUDA_VSTD
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__dest), "Destination must be global memory address.");
_LIBCUDACXX_DEBUG_ASSERT(__isShared(__src), "Source must be shared memory address.");

asm volatile(
"cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"
:
: "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__dest))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__src))),
"r"(__size)
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
__dest, __src, __size);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 1-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_1d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 2-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_2d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 3-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_3d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 4-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_4d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 5-D tensor tile described by __tensor_map from
// global memory into shared memory at __dest, signaling transfer completion
// on the mbarrier backing __bar. Thin wrapper over the cuda::ptx
// cp.async.bulk.tensor exposure (see PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_5d_global_to_shared(
    void *__dest, const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, ::cuda::barrier<::cuda::thread_scope_block> &__bar)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
        __dest, __tensor_map, __coords,
        ::cuda::device::barrier_native_handle(__bar));
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 1-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups (cp_async_bulk_commit_group /
// cp_async_bulk_wait_group_read), not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_1d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 2-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_2d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 3-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_3d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 4-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_4d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// Asynchronously copies a 5-D tensor tile from shared memory at __src into
// the global-memory tensor described by __tensor_map. Completion is tracked
// via bulk-async groups, not an mbarrier.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_tensor_5d_shared_to_global(
    const CUtensorMap *__tensor_map, int __c0, int __c1, int __c2, int __c3, int __c4, const void *__src)
{
    // The ptx wrapper takes the tensor coordinates as an int32 array.
    const _CUDA_VSTD::int32_t __coords[]{__c0, __c1, __c2, __c3, __c4};

    _CUDA_VPTX::cp_async_bulk_tensor(
        _CUDA_VPTX::space_global, _CUDA_VPTX::space_shared,
        __tensor_map, __coords, __src);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar
// Issues fence.proxy.async for the shared memory space via the cuda::ptx
// exposure (see the membar/fence PTX ISA link above).
inline _LIBCUDACXX_DEVICE
void fence_proxy_async_shared_cta() {
    _CUDA_VPTX::fence_proxy_async(_CUDA_VPTX::space_shared);
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group
// Commits all prior uncommitted cp.async.bulk operations into a bulk-async
// group (cp.async.bulk.commit_group), via the cuda::ptx exposure.
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_commit_group()
{
    _CUDA_VPTX::cp_async_bulk_commit_group();
}

// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group
// Waits until at most __n_prior bulk-async groups have pending reads
// (cp.async.bulk.wait_group.read), via the cuda::ptx exposure. The group
// count is passed as a compile-time n32_t immediate.
template <int __n_prior>
inline _LIBCUDACXX_DEVICE
void cp_async_bulk_wait_group_read()
{
    static_assert(__n_prior <= 63, "cp_async_bulk_wait_group_read: waiting for more than 63 groups is not supported.");
    _CUDA_VPTX::cp_async_bulk_wait_group_read(_CUDA_VPTX::n32_t<__n_prior>{});
}

#endif // __cccl_lib_experimental_ctk12_cp_async_exposure
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -677,16 +677,11 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
_LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");

NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
if (__isShared(__dest) && __isGlobal(__src)) {
asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__bh))
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, static_cast<uint32_t>(__size),
barrier_native_handle(__b));
} else {
// memcpy_async_tx only supports copying from global to shared
// or from shared to remote cluster dsmem. To copy to remote
Expand Down Expand Up @@ -953,14 +948,9 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
if (__g.thread_rank() == 0) {
asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__size)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle)))
: "memory");
_CUDA_VPTX::cp_async_bulk(
_CUDA_VPTX::space_cluster, _CUDA_VPTX::space_global,
__dest, __src, __size, __bar_handle);
}
),(
__cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
Expand Down