// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_
#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#if _CCCL_HAS_CUDA_COMPILER

#  include <cuda/std/cstddef>
#  include <cuda/std/cstdint>

#  include <nv/target>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();

template <_CUDA_VSTD::size_t _Copy_size>
inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src)
{
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async

  // If `if constexpr` is not available, this function gets instantiated even
  // if it is not called. Do not static_assert in that case.
#  if _CCCL_STD_VER >= 2017
  static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16,
                "cp.async.shared.global requires a copy size of 4, 8, or 16.");
#  endif // _CCCL_STD_VER >= 2017

  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_80,
    (asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %2;"
                  :
                  : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
                    "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
                    "n"(_Copy_size)
                  : "memory");),
    (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();));
}

template <>
inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src)
{
  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async

  // When copying 16 bytes, it is possible to skip the L1 cache (.cg).
  NV_IF_ELSE_TARGET(
    NV_PROVIDES_SM_80,
    (asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;"
                  :
                  : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
                    "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
                    "n"(16)
                  : "memory");),
    (__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();));
}

template <_CUDA_VSTD::size_t _Alignment, typename _Group>
inline _CCCL_DEVICE void
__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size)
{
  // If `if constexpr` is not available, this function gets instantiated even
  // if it is not called. Do not static_assert in that case.
#  if _CCCL_STD_VER >= 2017
  static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment");
#  endif // _CCCL_STD_VER >= 2017

  // The maximal copy size is 16 bytes.
  constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment;

  // We use an int offset here, because we are copying to shared memory,
  // which is easily addressable using int.
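  //
  // Illustrative example (the numbers are assumptions, not requirements):
  // with a 32-thread group and __copy_size == 16, thread 0 copies bytes
  // [0, 16), thread 1 copies [16, 32), ..., thread 31 copies [496, 512);
  // the whole group then advances by __stride = 32 * 16 = 512 bytes until
  // __size is covered. Each iteration issues a full __copy_size chunk, so
  // __size is expected to be a multiple of __copy_size.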
  const int __group_size = __g.size();
  const int __group_rank = __g.thread_rank();
  const int __stride     = __group_size * __copy_size;
  for (int __offset = __group_rank * __copy_size; __offset < static_cast<int>(__size); __offset += __stride)
  {
    __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset);
  }
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif // _CCCL_HAS_CUDA_COMPILER

#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_
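
// Example usage (a minimal illustrative sketch, not part of this header):
// staging a tile from global to shared memory through
// __cp_async_shared_global_mechanism. The kernel name, the 512-byte tile,
// and the use of cooperative_groups::thread_block as the group type are
// assumptions; any group type exposing size() and thread_rank() works, and
// __gmem_src is assumed to be 16-byte aligned. Note that cp.async copies
// are asynchronous: they must still be committed and waited on (e.g. via
// the <cuda_pipeline_primitives.h> intrinsics __pipeline_commit() and
// __pipeline_wait_prior()) before the shared-memory data may be read.
//
//   #include <cooperative_groups.h>
//   #include <cuda_pipeline_primitives.h>
//
//   __global__ void __example_kernel(const char* __gmem_src) // 16-byte aligned
//   {
//     __shared__ alignas(16) char __smem_dest[512];
//     auto __block = cooperative_groups::this_thread_block();
//     // Issue one cp.async per 16-byte chunk, distributed over the block.
//     ::cuda::__cp_async_shared_global_mechanism<16>(
//       __block, __smem_dest, __gmem_src, sizeof(__smem_dest));
//     __pipeline_commit();        // commit the outstanding cp.async group
//     __pipeline_wait_prior(0);   // wait for it to complete
//     __syncthreads();
//     // __smem_dest now holds the first 512 bytes of __gmem_src.
//   }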