Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Fix bare __device__
  • Loading branch information
miscco committed Feb 11, 2025
commit fe3bff9c0112734125338ba77222c835c2dcaade
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA

extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
template <typename _Group>
-inline __device__ void __cp_async_bulk_shared_global(
+inline _CCCL_DEVICE void __cp_async_bulk_shared_global(
const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA

extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
template <size_t _Copy_size>
-inline __device__ void __cp_async_shared_global(char* __dest, const char* __src)
+inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async

Expand All @@ -55,7 +55,7 @@ inline __device__ void __cp_async_shared_global(char* __dest, const char* __src)
}

template <>
-inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src)
+inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
// When copying 16 bytes, it is possible to skip L1 cache (.cg).
Expand All @@ -69,7 +69,7 @@ inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __
}

template <size_t _Alignment, typename _Group>
-inline __device__ void
+inline _CCCL_DEVICE void
__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size)
{
// If `if constexpr` is not available, this function gets instantiated even
Expand Down
Loading