forked from NVIDIA/cccl
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcp_async_shared_global.h
More file actions
98 lines (82 loc) · 3.99 KB
/
cp_async_shared_global.h
File metadata and controls
98 lines (82 loc) · 3.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#ifndef _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_
#define _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_
#include <cuda/std/detail/__config>
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header
#if _CCCL_HAS_CUDA_COMPILER
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/__ptx/ptx_helper_functions.h>
# include <cuda/std/cstdint>
# include <nv/target>
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA
extern "C" _CCCL_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
template <size_t _Copy_size>
inline _CCCL_DEVICE void __cp_async_shared_global(char* __dest, const char* __src)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
// If `if constexpr` is not available, this function gets instantiated even
// if is not called. Do not static_assert in that case.
# if _CCCL_STD_VER >= 2017
static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16,
"cp.async.shared.global requires a copy size of 4, 8, or 16.");
# endif // _CCCL_STD_VER >= 2017
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_80,
(asm volatile("cp.async.ca.shared.global [%0], [%1], %2, %2;" : : "r"(
static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"n"(_Copy_size) : "memory");),
(__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();));
}
template <>
inline _CCCL_DEVICE void __cp_async_shared_global<16>(char* __dest, const char* __src)
{
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
// When copying 16 bytes, it is possible to skip L1 cache (.cg).
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_80,
(asm volatile("cp.async.cg.shared.global [%0], [%1], %2, %2;" : : "r"(
static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
"l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
"n"(16) : "memory");),
(__cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();));
}
template <size_t _Alignment, typename _Group>
inline _CCCL_DEVICE void
__cp_async_shared_global_mechanism(_Group __g, char* __dest, const char* __src, _CUDA_VSTD::size_t __size)
{
// If `if constexpr` is not available, this function gets instantiated even
// if is not called. Do not static_assert in that case.
# if _CCCL_STD_VER >= 2017
static_assert(4 <= _Alignment, "cp.async requires at least 4-byte alignment");
# endif // _CCCL_STD_VER >= 2017
// Maximal copy size is 16.
constexpr int __copy_size = (_Alignment > 16) ? 16 : _Alignment;
// We use an int offset here, because we are copying to shared memory,
// which is easily addressable using int.
const int __group_size = __g.size();
const int __group_rank = __g.thread_rank();
const int __stride = __group_size * __copy_size;
for (int __offset = __group_rank * __copy_size; __offset < static_cast<int>(__size); __offset += __stride)
{
__cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset);
}
}
_LIBCUDACXX_END_NAMESPACE_CUDA
#endif // _CCCL_CUDA_COMPILER
#endif // _CUDA_PTX__MEMCPY_ASYNC_CP_ASYNC_SHARED_GLOBAL_H_