From 85c11f5269eece051c19a5c84c986c586ce7bf4a Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 11 Feb 2025 17:04:54 +0100 Subject: [PATCH 1/2] Fix `__calloc_device` for non cuda compilation --- libcudacxx/include/cuda/std/__cstdlib/malloc.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/std/__cstdlib/malloc.h b/libcudacxx/include/cuda/std/__cstdlib/malloc.h index 5bacaa4e510..36fc3b14dfb 100644 --- a/libcudacxx/include/cuda/std/__cstdlib/malloc.h +++ b/libcudacxx/include/cuda/std/__cstdlib/malloc.h @@ -35,7 +35,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD using ::free; using ::malloc; -#if _CCCL_HAS_CUDA_COMPILER +// We need to ensure that we not only compile with a cuda compiler but also compile cuda source files +#if _CCCL_HAS_CUDA_COMPILER && (defined(__CUDACC__) || defined(_NVHPC_CUDA)) _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE void* __calloc_device(size_t __n, size_t __size) noexcept { void* __ptr{}; @@ -53,7 +54,7 @@ _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE void* __calloc_device(size_t __ return __ptr; } -#endif // _CCCL_HAS_CUDA_COMPILER +#endif // _CCCL_HAS_CUDA_COMPILER && (defined(__CUDACC__) || defined(_NVHPC_CUDA)) _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI void* calloc(size_t __n, size_t __size) noexcept { From fe3bff9c0112734125338ba77222c835c2dcaade Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Tue, 11 Feb 2025 17:05:07 +0100 Subject: [PATCH 2/2] Fix bare `__device__` --- .../cuda/__memcpy_async/cp_async_bulk_shared_global.h | 2 +- .../include/cuda/__memcpy_async/cp_async_shared_global.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h index 8a6cc2232bb..513bfd387e2 100644 --- a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h @@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__(); template -inline __device__ void __cp_async_bulk_shared_global( +inline _CCCL_DEVICE void __cp_async_bulk_shared_global( const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h index 07eb5c84c93..2ae485e4602 100644 --- a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h +++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h @@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__(); template -inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) +inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async @@ -55,7 +55,7 @@ inline __device__ void __cp_async_shared_global(char* __dest, const char* __src) } template <> -inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src) +inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src) { // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async // When copying 16 bytes, it is possible to skip L1 cache (.cg). @@ -69,7 +69,7 @@ inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __ } template -inline __device__ void +inline _CCCL_DEVICE void __cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size) { // If `if constexpr` is not available, this function gets instantiated even