From 85c11f5269eece051c19a5c84c986c586ce7bf4a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 11 Feb 2025 17:04:54 +0100
Subject: [PATCH 1/2] Fix `__calloc_device` for non-CUDA compilation

---
 libcudacxx/include/cuda/std/__cstdlib/malloc.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/libcudacxx/include/cuda/std/__cstdlib/malloc.h b/libcudacxx/include/cuda/std/__cstdlib/malloc.h
index 5bacaa4e510..36fc3b14dfb 100644
--- a/libcudacxx/include/cuda/std/__cstdlib/malloc.h
+++ b/libcudacxx/include/cuda/std/__cstdlib/malloc.h
@@ -35,7 +35,8 @@ _LIBCUDACXX_BEGIN_NAMESPACE_STD
 using ::free;
 using ::malloc;
 
-#if _CCCL_HAS_CUDA_COMPILER
+// We need to ensure that we are not only using a CUDA compiler but are also compiling a CUDA source file
+#if _CCCL_HAS_CUDA_COMPILER && (defined(__CUDACC__) || defined(_NVHPC_CUDA))
 _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE void* __calloc_device(size_t __n, size_t __size) noexcept
 {
   void* __ptr{};
@@ -53,7 +54,7 @@ _CCCL_NODISCARD _CCCL_HIDE_FROM_ABI _CCCL_DEVICE void* __calloc_device(size_t __
 
   return __ptr;
 }
-#endif // _CCCL_HAS_CUDA_COMPILER
+#endif // _CCCL_HAS_CUDA_COMPILER && (defined(__CUDACC__) || defined(_NVHPC_CUDA))
 
 _CCCL_NODISCARD _LIBCUDACXX_HIDE_FROM_ABI void* calloc(size_t __n, size_t __size) noexcept
 {

From fe3bff9c0112734125338ba77222c835c2dcaade Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 11 Feb 2025 17:05:07 +0100
Subject: [PATCH 2/2] Fix bare `__device__`

---
 .../cuda/__memcpy_async/cp_async_bulk_shared_global.h       | 2 +-
 .../include/cuda/__memcpy_async/cp_async_shared_global.h    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h
index 8a6cc2232bb..513bfd387e2 100644
--- a/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h
+++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_bulk_shared_global.h
@@ -36,7 +36,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
 template <typename _Group>
-inline __device__ void __cp_async_bulk_shared_global(
+inline _CCCL_DEVICE void __cp_async_bulk_shared_global(
   const _Group& __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size, _CUDA_VSTD::uint64_t* __bar_handle)
 {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
diff --git a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h
index 07eb5c84c93..2ae485e4602 100644
--- a/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h
+++ b/libcudacxx/include/cuda/__memcpy_async/cp_async_shared_global.h
@@ -34,7 +34,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA
 
 extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
 template <size_t _Copy_size>
-inline __device__ void __cp_async_shared_global(char* __dest, const char* __src)
+inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src)
 {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
 
@@ -55,7 +55,7 @@ inline __device__ void __cp_async_shared_global(char* __dest, const char* __src)
 }
 
 template <>
-inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __src)
+inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src)
 {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
   // When copying 16 bytes, it is possible to skip L1 cache (.cg).
@@ -69,7 +69,7 @@ inline __device__ void __cp_async_shared_global<16>(char* __dest, const char* __
 }
 
 template <size_t _Alignment, typename _Group>
-inline __device__ void
+inline _CCCL_DEVICE void
 __cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size)
 {
   // If `if constexpr` is not available, this function gets instantiated even