|
14 | 14 | #endif // no system header |
15 | 15 |
|
16 | 16 | #include <cub/detail/uninitialized_copy.cuh> |
17 | | -#include <cub/device/dispatch/tuning/tuning_transform.cuh> |
| 17 | +#include <cub/device/dispatch/kernels/transform.cuh> |
18 | 18 | #include <cub/util_arch.cuh> |
19 | 19 | #include <cub/util_device.cuh> |
20 | 20 | #include <cub/util_math.cuh> |
21 | 21 | #include <cub/util_type.cuh> |
22 | 22 |
|
23 | | -#include <thrust/detail/raw_reference_cast.h> |
24 | 23 | #include <thrust/system/cuda/detail/core/triple_chevron_launch.h> |
25 | | -#include <thrust/type_traits/is_contiguous_iterator.h> |
26 | 24 | #include <thrust/type_traits/is_trivially_relocatable.h> |
27 | 25 | #include <thrust/type_traits/unwrap_contiguous_iterator.h> |
28 | 26 |
|
29 | 27 | #include <cuda/cmath> |
30 | | -#include <cuda/ptx> |
31 | 28 | #include <cuda/std/__algorithm/clamp.h> |
32 | 29 | #include <cuda/std/__algorithm/max.h> |
33 | 30 | #include <cuda/std/__algorithm/min.h> |
@@ -55,354 +52,13 @@ enum class requires_stable_address |
55 | 52 | no, |
56 | 53 | yes |
57 | 54 | }; |
58 | | - |
59 | | -template <typename T> |
60 | | -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment) |
61 | | -{ |
62 | | - _CCCL_ASSERT(::cuda::std::has_single_bit(alignment), ""); |
63 | | - return reinterpret_cast<const char*>( |
64 | | - reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1}); |
65 | | -} |
66 | | - |
67 | | -// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects |
68 | | -// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit |
69 | | -// as we need to create less instructions to prefetch the same amount of data. |
70 | | -template <typename T> |
71 | | -_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr) |
72 | | -{ |
73 | | - // TODO(bgruber): prefetch to L1 may be even better |
74 | | - asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory"); |
75 | | -} |
76 | | - |
77 | | -template <int BlockDim, typename It> |
78 | | -_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int tile_size) |
79 | | -{ |
80 | | - if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>) |
81 | | - { |
82 | | - constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to |
83 | | - // be architecture dependent? |
84 | | - const int tile_size_bytes = tile_size * sizeof(value_t<It>); |
85 | | - // prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling |
86 | | -#pragma unroll 1 |
87 | | - for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes; |
88 | | - offset += BlockDim * prefetch_byte_stride) |
89 | | - { |
90 | | - prefetch(reinterpret_cast<const char*>(::cuda::std::to_address(begin)) + offset); |
91 | | - } |
92 | | - } |
93 | | -} |
94 | | - |
95 | | -// This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in |
96 | | -// global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the |
97 | | -// parameter yields a global memory address. |
98 | | -template <typename PrefetchPolicy, |
99 | | - typename Offset, |
100 | | - typename F, |
101 | | - typename RandomAccessIteratorOut, |
102 | | - typename... RandomAccessIteratorIn> |
103 | | -_CCCL_DEVICE void transform_kernel_impl( |
104 | | - ::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>, |
105 | | - Offset num_items, |
106 | | - int num_elem_per_thread, |
107 | | - F f, |
108 | | - RandomAccessIteratorOut out, |
109 | | - RandomAccessIteratorIn... ins) |
110 | | -{ |
111 | | - constexpr int block_dim = PrefetchPolicy::block_threads; |
112 | | - const int tile_stride = block_dim * num_elem_per_thread; |
113 | | - const Offset offset = static_cast<Offset>(blockIdx.x) * tile_stride; |
114 | | - const int tile_size = static_cast<int>((::cuda::std::min)(num_items - offset, Offset{tile_stride})); |
115 | | - |
116 | | - // move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below |
117 | | - { |
118 | | - (..., (ins += offset)); |
119 | | - out += offset; |
120 | | - } |
121 | | - |
122 | | - (..., prefetch_tile<block_dim>(THRUST_NS_QUALIFIER::raw_reference_cast(ins), tile_size)); |
123 | | - |
124 | | - auto process_tile = [&](auto full_tile, auto... ins2 /* nvcc fails to compile when just using the captured ins */) { |
125 | | - // ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost |
126 | | - // bgruber: but A6000 and H100 show small gains without pragma |
127 | | - //_Pragma("unroll 1") |
128 | | - for (int j = 0; j < num_elem_per_thread; ++j) |
129 | | - { |
130 | | - const int idx = j * block_dim + threadIdx.x; |
131 | | - if (full_tile || idx < tile_size) |
132 | | - { |
133 | | - // we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test) |
134 | | - out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins2[idx])...); |
135 | | - } |
136 | | - } |
137 | | - }; |
138 | | - if (tile_stride == tile_size) |
139 | | - { |
140 | | - process_tile(::cuda::std::true_type{}, ins...); |
141 | | - } |
142 | | - else |
143 | | - { |
144 | | - process_tile(::cuda::std::false_type{}, ins...); |
145 | | - } |
146 | | -} |
147 | | - |
148 | | -// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding |
149 | | -// |
150 | | -// For performance considerations of memcpy_async: |
151 | | -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async |
152 | | -// |
153 | | -// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we |
154 | | -// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower |
155 | | -// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to |
156 | | -// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we |
157 | | -// have to align to 128 bytes instead of 16. |
158 | | -// |
159 | | -// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts: |
160 | | -// * According to the CUDA programming guide |
161 | | -// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable |
162 | | -// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is |
163 | | -// always aligned to at least 256 bytes." |
164 | | -// * Memory protection is usually done on memory page level, which is even larger than 256 bytes for CUDA and 4KiB on |
165 | | -// Intel x86 and 4KiB+ ARM. Front and tail padding thus never leaves the memory page of the input buffer. |
166 | | -// * This should count for device memory, but also for device accessible memory living on the host. |
167 | | -// * The base pointer alignment and size rounding also never leaves the size of a cache line. |
168 | | -// |
169 | | -// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is |
170 | | -// still technically undefined behavior in C++. Also, compute-sanitizer flags at least such reads after the end of a |
171 | | -// buffer. Therefore, we lean on the safer side and protect against out of bounds reads at the beginning and end. |
172 | | - |
173 | | -// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some |
174 | | -// conditions. |
175 | | -// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring |
176 | | -// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of |
177 | | -// the element type. |
178 | | - |
179 | | -// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels. |
180 | | -// cg::memcpy_async is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes |
181 | | -// UBLKCP is most efficient when the data is 128-byte aligned and the size a multiple of 16 bytes |
182 | | -template <typename T> // Cannot add alignment to signature, because we need a uniform kernel template instantiation |
183 | | -struct aligned_base_ptr |
184 | | -{ |
185 | | - using value_type = T; |
186 | | - |
187 | | - const char* ptr; // aligned pointer before the original pointer (16-byte or 128-byte). May not be aligned to |
188 | | - // alignof(T). E.g.: array of int3 starting at address 4, ptr == 0 |
189 | | - int head_padding; // byte offset between ptr and the original pointer. Value inside [0;15] or [0;127]. |
190 | | - |
191 | | - _CCCL_HOST_DEVICE const T* ptr_to_elements() const |
192 | | - { |
193 | | - return reinterpret_cast<const T*>(ptr + head_padding); |
194 | | - } |
195 | | - |
196 | | - _CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& a, const aligned_base_ptr& b) |
197 | | - { |
198 | | - return a.ptr == b.ptr && a.head_padding == b.head_padding; |
199 | | - } |
200 | | -}; |
201 | | - |
202 | 55 | template <typename T> |
203 | 56 | _CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr<T> |
204 | 57 | { |
205 | 58 | const char* base_ptr = round_down_ptr(ptr, alignment); |
206 | 59 | return aligned_base_ptr<T>{base_ptr, static_cast<int>(reinterpret_cast<const char*>(ptr) - base_ptr)}; |
207 | 60 | } |
208 | 61 |
|
209 | | -#ifdef _CUB_HAS_TRANSFORM_UBLKCP |
210 | | -_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one() |
211 | | -{ |
212 | | - const ::cuda::std::uint32_t membermask = ~0; |
213 | | - ::cuda::std::uint32_t is_elected; |
214 | | - asm volatile( |
215 | | - "{\n\t .reg .pred P_OUT; \n\t" |
216 | | - "elect.sync _|P_OUT, %1;\n\t" |
217 | | - "selp.b32 %0, 1, 0, P_OUT; \n" |
218 | | - "}" |
219 | | - : "=r"(is_elected) |
220 | | - : "r"(membermask) |
221 | | - :); |
222 | | - return threadIdx.x < 32 && static_cast<bool>(is_elected); |
223 | | -} |
224 | | - |
225 | | -template <typename Offset, typename T> |
226 | | -_CCCL_DEVICE void bulk_copy_tile_fallback( |
227 | | - int tile_size, |
228 | | - int tile_stride, |
229 | | - char* smem, |
230 | | - int& smem_offset, |
231 | | - Offset global_offset, |
232 | | - const aligned_base_ptr<T>& aligned_ptr) |
233 | | -{ |
234 | | - const T* src = aligned_ptr.ptr_to_elements() + global_offset; |
235 | | - T* dst = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding); |
236 | | - _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, ""); |
237 | | - _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, ""); |
238 | | - |
239 | | - const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size; |
240 | | - cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy); |
241 | | - |
242 | | - // add bulk_copy_alignment to make space for the next tile's head padding |
243 | | - smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment; |
244 | | -} |
245 | | - |
246 | | -template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs> |
247 | | -_CCCL_DEVICE void transform_kernel_ublkcp( |
248 | | - Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr<InTs>... aligned_ptrs) |
249 | | -{ |
250 | | - __shared__ uint64_t bar; |
251 | | - extern __shared__ char __align__(bulk_copy_alignment) smem[]; |
252 | | - |
253 | | - namespace ptx = ::cuda::ptx; |
254 | | - |
255 | | - constexpr int block_dim = BulkCopyPolicy::block_threads; |
256 | | - const int tile_stride = block_dim * num_elem_per_thread; |
257 | | - const Offset offset = static_cast<Offset>(blockIdx.x) * tile_stride; |
258 | | - const int tile_size = (::cuda::std::min)(num_items - offset, Offset{tile_stride}); |
259 | | - |
260 | | - const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x; |
261 | | - if (inner_blocks) |
262 | | - { |
263 | | - // use one thread to setup the entire bulk copy |
264 | | - if (elect_one()) |
265 | | - { |
266 | | - ptx::mbarrier_init(&bar, 1); |
267 | | - ptx::fence_proxy_async(ptx::space_shared); |
268 | | - |
269 | | - int smem_offset = 0; |
270 | | - ::cuda::std::uint32_t total_copied = 0; |
271 | | - |
272 | | - auto bulk_copy_tile = [&](auto aligned_ptr) { |
273 | | - using T = typename decltype(aligned_ptr)::value_type; |
274 | | - static_assert(alignof(T) <= bulk_copy_alignment, ""); |
275 | | - |
276 | | - const char* src = aligned_ptr.ptr + offset * sizeof(T); |
277 | | - char* dst = smem + smem_offset; |
278 | | - _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, ""); |
279 | | - _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, ""); |
280 | | - |
281 | | - // TODO(bgruber): we could precompute bytes_to_copy on the host |
282 | | - const int bytes_to_copy = round_up_to_po2_multiple( |
283 | | - aligned_ptr.head_padding + static_cast<int>(sizeof(T)) * tile_stride, bulk_copy_size_multiple); |
284 | | - |
285 | | - ::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar); |
286 | | - total_copied += bytes_to_copy; |
287 | | - |
288 | | - // add bulk_copy_alignment to make space for the next tile's head padding |
289 | | - smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment; |
290 | | - }; |
291 | | - |
292 | | - // Order of evaluation is left-to-right |
293 | | - (..., bulk_copy_tile(aligned_ptrs)); |
294 | | - |
295 | | - // TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet |
296 | | - ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied); |
297 | | - } |
298 | | - |
299 | | - // all threads wait for bulk copy |
300 | | - __syncthreads(); |
301 | | - while (!ptx::mbarrier_try_wait_parity(&bar, 0)) |
302 | | - ; |
303 | | - } |
304 | | - else |
305 | | - { |
306 | | - // use all threads to schedule an async_memcpy |
307 | | - int smem_offset = 0; |
308 | | - |
309 | | - auto bulk_copy_tile_fallback = [&](auto aligned_ptr) { |
310 | | - using T = typename decltype(aligned_ptr)::value_type; |
311 | | - const T* src = aligned_ptr.ptr_to_elements() + offset; |
312 | | - T* dst = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding); |
313 | | - _CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, ""); |
314 | | - _CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, ""); |
315 | | - |
316 | | - const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size; |
317 | | - cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy); |
318 | | - |
319 | | - // add bulk_copy_alignment to make space for the next tile's head padding |
320 | | - smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment; |
321 | | - }; |
322 | | - |
323 | | - // Order of evaluation is left-to-right |
324 | | - (..., bulk_copy_tile_fallback(aligned_ptrs)); |
325 | | - |
326 | | - cooperative_groups::wait(cooperative_groups::this_thread_block()); |
327 | | - } |
328 | | - |
329 | | - // move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below |
330 | | - out += offset; |
331 | | - |
332 | | - auto process_tile = [&](auto full_tile) { |
333 | | - // Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark) |
334 | | - _CCCL_PRAGMA(unroll 1) |
335 | | - for (int j = 0; j < num_elem_per_thread; ++j) |
336 | | - { |
337 | | - const int idx = j * block_dim + threadIdx.x; |
338 | | - if (full_tile || idx < tile_size) |
339 | | - { |
340 | | - int smem_offset = 0; |
341 | | - auto fetch_operand = [&](auto aligned_ptr) { |
342 | | - using T = typename decltype(aligned_ptr)::value_type; |
343 | | - const T* smem_operand_tile_base = reinterpret_cast<const T*>(smem + smem_offset + aligned_ptr.head_padding); |
344 | | - smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment; |
345 | | - return smem_operand_tile_base[idx]; |
346 | | - }; |
347 | | - |
348 | | - // need to expand into a tuple for guaranteed order of evaluation |
349 | | - out[idx] = ::cuda::std::apply( |
350 | | - [&](auto... values) { |
351 | | - return f(values...); |
352 | | - }, |
353 | | - ::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...}); |
354 | | - } |
355 | | - } |
356 | | - }; |
357 | | - // explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice |
358 | | - if (tile_stride == tile_size) |
359 | | - { |
360 | | - process_tile(::cuda::std::true_type{}); |
361 | | - } |
362 | | - else |
363 | | - { |
364 | | - process_tile(::cuda::std::false_type{}); |
365 | | - } |
366 | | -} |
367 | | - |
368 | | -template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs> |
369 | | -_CCCL_DEVICE void transform_kernel_impl( |
370 | | - ::cuda::std::integral_constant<Algorithm, Algorithm::ublkcp>, |
371 | | - Offset num_items, |
372 | | - int num_elem_per_thread, |
373 | | - F f, |
374 | | - RandomAccessIteratorOut out, |
375 | | - aligned_base_ptr<InTs>... aligned_ptrs) |
376 | | -{ |
377 | | - // only call the real kernel for sm90 and later |
378 | | - NV_IF_TARGET(NV_PROVIDES_SM_90, |
379 | | - (transform_kernel_ublkcp<BulkCopyPolicy>(num_items, num_elem_per_thread, f, out, aligned_ptrs...);)); |
380 | | -} |
381 | | -#endif // _CUB_HAS_TRANSFORM_UBLKCP |
382 | | - |
383 | | -template <typename It> |
384 | | -union kernel_arg |
385 | | -{ |
386 | | - aligned_base_ptr<value_t<It>> aligned_ptr; // first member is trivial |
387 | | - It iterator; // may not be trivially [default|copy]-constructible |
388 | | - |
389 | | - static_assert(::cuda::std::is_trivial_v<decltype(aligned_ptr)>, ""); |
390 | | - |
391 | | - // Sometimes It is not trivially [default|copy]-constructible (e.g. |
392 | | - // thrust::normal_iterator<thrust::device_pointer<T>>), so because of |
393 | | - // https://eel.is/c++draft/class.union#general-note-3, kernel_args's special members are deleted. We work around it by |
394 | | - // explicitly defining them. |
395 | | - _CCCL_HOST_DEVICE kernel_arg() noexcept {} |
396 | | - _CCCL_HOST_DEVICE ~kernel_arg() noexcept {} |
397 | | - |
398 | | - _CCCL_HOST_DEVICE kernel_arg(const kernel_arg& other) |
399 | | - { |
400 | | - // since we use kernel_arg only to pass data to the device, the contained data is semantically trivially copyable, |
401 | | - // even if the type system is telling us otherwise. |
402 | | - ::cuda::std::memcpy(reinterpret_cast<char*>(this), reinterpret_cast<const char*>(&other), sizeof(kernel_arg)); |
403 | | - } |
404 | | -}; |
405 | | - |
406 | 62 | template <typename It> |
407 | 63 | _CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg<It> |
408 | 64 | { |
@@ -470,9 +126,6 @@ __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads) |
470 | 126 | select_kernel_arg(alg, ::cuda::std::move(ins))...); |
471 | 127 | } |
472 | 128 |
|
473 | | -template <typename T> |
474 | | -using cuda_expected = ::cuda::std::expected<T, cudaError_t>; |
475 | | - |
476 | 129 | // TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this. |
477 | 130 | _CCCL_HOST_DEVICE inline cuda_expected<int> get_max_shared_memory() |
478 | 131 | { |
|
0 commit comments