diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h index 8a6cc2232bb..513bfd387e2 100644 --- a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template -inline __device__ void __cp_async_bulk_shared_global( +inline _CCCL_DEVICE void __cp_async_bulk_shared_global( const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h index 07eb5c84c93..2ae485e4602 100644 --- a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template -inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) +inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async @@ -55,7 +55,7 @@ inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) } template <> -inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src) +inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). @@ -69,7 +69,7 @@ inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __ } template -inline __device__ void +inline _CCCL_DEVICE void __cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) { // If `if constexpr` is not available, this function gets instantiated even diff --git a/libcudacxx/include/cuda/std/__cstdlib/malloc.h b/libcudacxx/include/cuda/std/__cstdlib/malloc.h index 5bacaa4e510..36fc3b14dfb 100644 --- a/libcudacxx/include/cuda/std/__cstdlib/malloc.h +++ b/libcudacxx/include/cuda/std/__cstdlib/malloc.h @@ -35,7 +35,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD using ::free; using ::malloc; -#if _CCCL_HAS_CUDA_COMPILER +// We need to ensure that we not only compile with a cuda compiler but also compile cuda source files +#if _CCCL_HAS_CUDA_COMPILER && (defined(__CUDACC__) || defined(_NVHPC_CUDA)) _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE void* __calloc_device(size_t __n, size_t __size) noexcept { void* __ptr{}; @@ -53,7 +54,7 @@ _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE void* __calloc_device(size_t __ return __ptr; } -#endif // _CCCL_HAS_CUDA_COMPILER +#endif // _CCCL_HAS_CUDA_COMPILER && (defined(__CUDACC__) || defined(_NVHPC_CUDA)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI void* calloc(size_t __n, size_t __size) noexcept {