Fix bare __device__

NVIDIA · miscco · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025
commit fe3bff9c0112734125338ba77222c835c2dcaade
@@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
 template <typename _Group>
-inline __device__ void __cp_async_bulk_shared_global(
+inline _CCCL_DEVICE void __cp_async_bulk_shared_global(
   const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle)
 {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk

@@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
 template <size_t _Copy_size>
-inline __device__ void __cp_async_shared_global(char* __dest, const char* __src)
+inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src)
 {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
 
@@ -55,7 +55,7 @@ inline __device__ void __cp_async_shared_global(char* __dest, const char* __src)
 }
 
 template <>
-inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src)
+inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src)
 {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
   // When copying 16 bytes, it is possible to skip L1 cache (.cg).
@@ -69,7 +69,7 @@ inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __
 }
 
 template <size_t _Alignment, typename _Group>
-inline __device__ void
+inline _CCCL_DEVICE void
 __cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size)
 {
   // If `if constexpr` is not available, this function gets instantiated even