Skip to content

Commit f2fa182

Browse files
committed
Move transform kernels to nvrtc compilable header
1 parent b6d6d3b commit f2fa182

File tree

5 files changed

+389
-352
lines changed

5 files changed

+389
-352
lines changed

cub/cub/device/dispatch/dispatch_transform.cuh

Lines changed: 1 addition & 348 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,17 @@
1414
#endif // no system header
1515

1616
#include <cub/detail/uninitialized_copy.cuh>
17-
#include <cub/device/dispatch/tuning/tuning_transform.cuh>
17+
#include <cub/device/dispatch/kernels/transform.cuh>
1818
#include <cub/util_arch.cuh>
1919
#include <cub/util_device.cuh>
2020
#include <cub/util_math.cuh>
2121
#include <cub/util_type.cuh>
2222

23-
#include <thrust/detail/raw_reference_cast.h>
2423
#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
25-
#include <thrust/type_traits/is_contiguous_iterator.h>
2624
#include <thrust/type_traits/is_trivially_relocatable.h>
2725
#include <thrust/type_traits/unwrap_contiguous_iterator.h>
2826

2927
#include <cuda/cmath>
30-
#include <cuda/ptx>
3128
#include <cuda/std/__algorithm/clamp.h>
3229
#include <cuda/std/__algorithm/max.h>
3330
#include <cuda/std/__algorithm/min.h>
@@ -55,354 +52,13 @@ enum class requires_stable_address
5552
no,
5653
yes
5754
};
58-
59-
template <typename T>
60-
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE const char* round_down_ptr(const T* ptr, unsigned alignment)
61-
{
62-
_CCCL_ASSERT(::cuda::std::has_single_bit(alignment), "");
63-
return reinterpret_cast<const char*>(
64-
reinterpret_cast<::cuda::std::uintptr_t>(ptr) & ~::cuda::std::uintptr_t{alignment - 1});
65-
}
66-
67-
// Prefetches (at least on Hopper) a 128 byte cache line. Prefetching out-of-bounds addresses has no side effects
68-
// TODO(bgruber): there is also the cp.async.bulk.prefetch instruction available on Hopper. May improve perf a tiny bit
69-
// as we need to create less instructions to prefetch the same amount of data.
70-
template <typename T>
71-
_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch(const T* addr)
72-
{
73-
// TODO(bgruber): prefetch to L1 may be even better
74-
asm volatile("prefetch.global.L2 [%0];" : : "l"(__cvta_generic_to_global(addr)) : "memory");
75-
}
76-
77-
template <int BlockDim, typename It>
78-
_CCCL_DEVICE _CCCL_FORCEINLINE void prefetch_tile(It begin, int tile_size)
79-
{
80-
if constexpr (THRUST_NS_QUALIFIER::is_contiguous_iterator_v<It>)
81-
{
82-
constexpr int prefetch_byte_stride = 128; // TODO(bgruber): should correspond to cache line size. Does this need to
83-
// be architecture dependent?
84-
const int tile_size_bytes = tile_size * sizeof(value_t<It>);
85-
// prefetch does not stall and unrolling just generates a lot of unnecessary computations and predicate handling
86-
#pragma unroll 1
87-
for (int offset = threadIdx.x * prefetch_byte_stride; offset < tile_size_bytes;
88-
offset += BlockDim * prefetch_byte_stride)
89-
{
90-
prefetch(reinterpret_cast<const char*>(::cuda::std::to_address(begin)) + offset);
91-
}
92-
}
93-
}
94-
95-
// This kernel guarantees that objects passed as arguments to the user-provided transformation function f reside in
96-
// global memory. No intermediate copies are taken. If the parameter type of f is a reference, taking the address of the
97-
// parameter yields a global memory address.
98-
template <typename PrefetchPolicy,
99-
typename Offset,
100-
typename F,
101-
typename RandomAccessIteratorOut,
102-
typename... RandomAccessIteratorIn>
103-
_CCCL_DEVICE void transform_kernel_impl(
104-
::cuda::std::integral_constant<Algorithm, Algorithm::prefetch>,
105-
Offset num_items,
106-
int num_elem_per_thread,
107-
F f,
108-
RandomAccessIteratorOut out,
109-
RandomAccessIteratorIn... ins)
110-
{
111-
constexpr int block_dim = PrefetchPolicy::block_threads;
112-
const int tile_stride = block_dim * num_elem_per_thread;
113-
const Offset offset = static_cast<Offset>(blockIdx.x) * tile_stride;
114-
const int tile_size = static_cast<int>((::cuda::std::min)(num_items - offset, Offset{tile_stride}));
115-
116-
// move index and iterator domain to the block/thread index, to reduce arithmetic in the loops below
117-
{
118-
(..., (ins += offset));
119-
out += offset;
120-
}
121-
122-
(..., prefetch_tile<block_dim>(THRUST_NS_QUALIFIER::raw_reference_cast(ins), tile_size));
123-
124-
auto process_tile = [&](auto full_tile, auto... ins2 /* nvcc fails to compile when just using the captured ins */) {
125-
// ahendriksen: various unrolling yields less <1% gains at much higher compile-time cost
126-
// bgruber: but A6000 and H100 show small gains without pragma
127-
//_Pragma("unroll 1")
128-
for (int j = 0; j < num_elem_per_thread; ++j)
129-
{
130-
const int idx = j * block_dim + threadIdx.x;
131-
if (full_tile || idx < tile_size)
132-
{
133-
// we have to unwrap Thrust's proxy references here for backward compatibility (try zip_iterator.cu test)
134-
out[idx] = f(THRUST_NS_QUALIFIER::raw_reference_cast(ins2[idx])...);
135-
}
136-
}
137-
};
138-
if (tile_stride == tile_size)
139-
{
140-
process_tile(::cuda::std::true_type{}, ins...);
141-
}
142-
else
143-
{
144-
process_tile(::cuda::std::false_type{}, ins...);
145-
}
146-
}
147-
148-
// Implementation notes on memcpy_async and UBLKCP kernels regarding copy alignment and padding
149-
//
150-
// For performance considerations of memcpy_async:
151-
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidance-for-memcpy-async
152-
//
153-
// We basically have to align the base pointer to 16 bytes, and copy a multiple of 16 bytes. To achieve this, when we
154-
// copy a tile of data from an input buffer, we round down the pointer to the start of the tile to the next lower
155-
// address that is a multiple of 16 bytes. This introduces head padding. We also round up the total number of bytes to
156-
// copy (including head padding) to a multiple of 16 bytes, which introduces tail padding. For the bulk copy kernel, we
157-
// have to align to 128 bytes instead of 16.
158-
//
159-
// However, padding memory copies like that may access the input buffer out-of-bounds. Here are some thoughts:
160-
// * According to the CUDA programming guide
161-
// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory-accesses), "any address of a variable
162-
// residing in global memory or returned by one of the memory allocation routines from the driver or runtime API is
163-
// always aligned to at least 256 bytes."
164-
// * Memory protection is usually done on memory page level, which is even larger than 256 bytes for CUDA and 4KiB on
165-
// Intel x86 and 4KiB+ ARM. Front and tail padding thus never leaves the memory page of the input buffer.
166-
// * This should count for device memory, but also for device accessible memory living on the host.
167-
// * The base pointer alignment and size rounding also never leaves the size of a cache line.
168-
//
169-
// Copying larger data blocks with head and tail padding should thus be legal. Nevertheless, an out-of-bounds read is
170-
// still technically undefined behavior in C++. Also, compute-sanitizer flags at least such reads after the end of a
171-
// buffer. Therefore, we lean on the safer side and protect against out of bounds reads at the beginning and end.
172-
173-
// A note on size and alignment: The size of a type is at least as large as its alignment. We rely on this fact in some
174-
// conditions.
175-
// This is guaranteed by the C++ standard, and follows from the definition of arrays: the difference between neighboring
176-
// array element addresses is sizeof element type and each array element needs to fulfill the alignment requirement of
177-
// the element type.
178-
179-
// Pointer with metadata to describe readonly input memory for memcpy_async and UBLKCP kernels.
180-
// cg::memcpy_async is most efficient when the data is 16-byte aligned and the size a multiple of 16 bytes
181-
// UBLKCP is most efficient when the data is 128-byte aligned and the size a multiple of 16 bytes
182-
template <typename T> // Cannot add alignment to signature, because we need a uniform kernel template instantiation
183-
struct aligned_base_ptr
184-
{
185-
using value_type = T;
186-
187-
const char* ptr; // aligned pointer before the original pointer (16-byte or 128-byte). May not be aligned to
188-
// alignof(T). E.g.: array of int3 starting at address 4, ptr == 0
189-
int head_padding; // byte offset between ptr and the original pointer. Value inside [0;15] or [0;127].
190-
191-
_CCCL_HOST_DEVICE const T* ptr_to_elements() const
192-
{
193-
return reinterpret_cast<const T*>(ptr + head_padding);
194-
}
195-
196-
_CCCL_HOST_DEVICE friend bool operator==(const aligned_base_ptr& a, const aligned_base_ptr& b)
197-
{
198-
return a.ptr == b.ptr && a.head_padding == b.head_padding;
199-
}
200-
};
201-
20255
template <typename T>
20356
_CCCL_HOST_DEVICE auto make_aligned_base_ptr(const T* ptr, int alignment) -> aligned_base_ptr<T>
20457
{
20558
const char* base_ptr = round_down_ptr(ptr, alignment);
20659
return aligned_base_ptr<T>{base_ptr, static_cast<int>(reinterpret_cast<const char*>(ptr) - base_ptr)};
20760
}
20861

209-
#ifdef _CUB_HAS_TRANSFORM_UBLKCP
210-
_CCCL_DEVICE _CCCL_FORCEINLINE static bool elect_one()
211-
{
212-
const ::cuda::std::uint32_t membermask = ~0;
213-
::cuda::std::uint32_t is_elected;
214-
asm volatile(
215-
"{\n\t .reg .pred P_OUT; \n\t"
216-
"elect.sync _|P_OUT, %1;\n\t"
217-
"selp.b32 %0, 1, 0, P_OUT; \n"
218-
"}"
219-
: "=r"(is_elected)
220-
: "r"(membermask)
221-
:);
222-
return threadIdx.x < 32 && static_cast<bool>(is_elected);
223-
}
224-
225-
template <typename Offset, typename T>
226-
_CCCL_DEVICE void bulk_copy_tile_fallback(
227-
int tile_size,
228-
int tile_stride,
229-
char* smem,
230-
int& smem_offset,
231-
Offset global_offset,
232-
const aligned_base_ptr<T>& aligned_ptr)
233-
{
234-
const T* src = aligned_ptr.ptr_to_elements() + global_offset;
235-
T* dst = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding);
236-
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
237-
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");
238-
239-
const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size;
240-
cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy);
241-
242-
// add bulk_copy_alignment to make space for the next tile's head padding
243-
smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
244-
}
245-
246-
template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs>
247-
_CCCL_DEVICE void transform_kernel_ublkcp(
248-
Offset num_items, int num_elem_per_thread, F f, RandomAccessIteratorOut out, aligned_base_ptr<InTs>... aligned_ptrs)
249-
{
250-
__shared__ uint64_t bar;
251-
extern __shared__ char __align__(bulk_copy_alignment) smem[];
252-
253-
namespace ptx = ::cuda::ptx;
254-
255-
constexpr int block_dim = BulkCopyPolicy::block_threads;
256-
const int tile_stride = block_dim * num_elem_per_thread;
257-
const Offset offset = static_cast<Offset>(blockIdx.x) * tile_stride;
258-
const int tile_size = (::cuda::std::min)(num_items - offset, Offset{tile_stride});
259-
260-
const bool inner_blocks = 0 < blockIdx.x && blockIdx.x + 2 < gridDim.x;
261-
if (inner_blocks)
262-
{
263-
// use one thread to setup the entire bulk copy
264-
if (elect_one())
265-
{
266-
ptx::mbarrier_init(&bar, 1);
267-
ptx::fence_proxy_async(ptx::space_shared);
268-
269-
int smem_offset = 0;
270-
::cuda::std::uint32_t total_copied = 0;
271-
272-
auto bulk_copy_tile = [&](auto aligned_ptr) {
273-
using T = typename decltype(aligned_ptr)::value_type;
274-
static_assert(alignof(T) <= bulk_copy_alignment, "");
275-
276-
const char* src = aligned_ptr.ptr + offset * sizeof(T);
277-
char* dst = smem + smem_offset;
278-
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % bulk_copy_alignment == 0, "");
279-
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % bulk_copy_alignment == 0, "");
280-
281-
// TODO(bgruber): we could precompute bytes_to_copy on the host
282-
const int bytes_to_copy = round_up_to_po2_multiple(
283-
aligned_ptr.head_padding + static_cast<int>(sizeof(T)) * tile_stride, bulk_copy_size_multiple);
284-
285-
::cuda::ptx::cp_async_bulk(::cuda::ptx::space_cluster, ::cuda::ptx::space_global, dst, src, bytes_to_copy, &bar);
286-
total_copied += bytes_to_copy;
287-
288-
// add bulk_copy_alignment to make space for the next tile's head padding
289-
smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
290-
};
291-
292-
// Order of evaluation is left-to-right
293-
(..., bulk_copy_tile(aligned_ptrs));
294-
295-
// TODO(ahendriksen): this could only have ptx::sem_relaxed, but this is not available yet
296-
ptx::mbarrier_arrive_expect_tx(ptx::sem_release, ptx::scope_cta, ptx::space_shared, &bar, total_copied);
297-
}
298-
299-
// all threads wait for bulk copy
300-
__syncthreads();
301-
while (!ptx::mbarrier_try_wait_parity(&bar, 0))
302-
;
303-
}
304-
else
305-
{
306-
// use all threads to schedule an async_memcpy
307-
int smem_offset = 0;
308-
309-
auto bulk_copy_tile_fallback = [&](auto aligned_ptr) {
310-
using T = typename decltype(aligned_ptr)::value_type;
311-
const T* src = aligned_ptr.ptr_to_elements() + offset;
312-
T* dst = reinterpret_cast<T*>(smem + smem_offset + aligned_ptr.head_padding);
313-
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(src) % alignof(T) == 0, "");
314-
_CCCL_ASSERT(reinterpret_cast<uintptr_t>(dst) % alignof(T) == 0, "");
315-
316-
const int bytes_to_copy = static_cast<int>(sizeof(T)) * tile_size;
317-
cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(), dst, src, bytes_to_copy);
318-
319-
// add bulk_copy_alignment to make space for the next tile's head padding
320-
smem_offset += static_cast<int>(sizeof(T)) * tile_stride + bulk_copy_alignment;
321-
};
322-
323-
// Order of evaluation is left-to-right
324-
(..., bulk_copy_tile_fallback(aligned_ptrs));
325-
326-
cooperative_groups::wait(cooperative_groups::this_thread_block());
327-
}
328-
329-
// move the whole index and iterator to the block/thread index, to reduce arithmetic in the loops below
330-
out += offset;
331-
332-
auto process_tile = [&](auto full_tile) {
333-
// Unroll 1 tends to improve performance, especially for smaller data types (confirmed by benchmark)
334-
_CCCL_PRAGMA(unroll 1)
335-
for (int j = 0; j < num_elem_per_thread; ++j)
336-
{
337-
const int idx = j * block_dim + threadIdx.x;
338-
if (full_tile || idx < tile_size)
339-
{
340-
int smem_offset = 0;
341-
auto fetch_operand = [&](auto aligned_ptr) {
342-
using T = typename decltype(aligned_ptr)::value_type;
343-
const T* smem_operand_tile_base = reinterpret_cast<const T*>(smem + smem_offset + aligned_ptr.head_padding);
344-
smem_offset += int{sizeof(T)} * tile_stride + bulk_copy_alignment;
345-
return smem_operand_tile_base[idx];
346-
};
347-
348-
// need to expand into a tuple for guaranteed order of evaluation
349-
out[idx] = ::cuda::std::apply(
350-
[&](auto... values) {
351-
return f(values...);
352-
},
353-
::cuda::std::tuple<InTs...>{fetch_operand(aligned_ptrs)...});
354-
}
355-
}
356-
};
357-
// explicitly calling the lambda on literal true/false lets the compiler emit the lambda twice
358-
if (tile_stride == tile_size)
359-
{
360-
process_tile(::cuda::std::true_type{});
361-
}
362-
else
363-
{
364-
process_tile(::cuda::std::false_type{});
365-
}
366-
}
367-
368-
template <typename BulkCopyPolicy, typename Offset, typename F, typename RandomAccessIteratorOut, typename... InTs>
369-
_CCCL_DEVICE void transform_kernel_impl(
370-
::cuda::std::integral_constant<Algorithm, Algorithm::ublkcp>,
371-
Offset num_items,
372-
int num_elem_per_thread,
373-
F f,
374-
RandomAccessIteratorOut out,
375-
aligned_base_ptr<InTs>... aligned_ptrs)
376-
{
377-
// only call the real kernel for sm90 and later
378-
NV_IF_TARGET(NV_PROVIDES_SM_90,
379-
(transform_kernel_ublkcp<BulkCopyPolicy>(num_items, num_elem_per_thread, f, out, aligned_ptrs...);));
380-
}
381-
#endif // _CUB_HAS_TRANSFORM_UBLKCP
382-
383-
template <typename It>
384-
union kernel_arg
385-
{
386-
aligned_base_ptr<value_t<It>> aligned_ptr; // first member is trivial
387-
It iterator; // may not be trivially [default|copy]-constructible
388-
389-
static_assert(::cuda::std::is_trivial_v<decltype(aligned_ptr)>, "");
390-
391-
// Sometimes It is not trivially [default|copy]-constructible (e.g.
392-
// thrust::normal_iterator<thrust::device_pointer<T>>), so because of
393-
// https://eel.is/c++draft/class.union#general-note-3, kernel_args's special members are deleted. We work around it by
394-
// explicitly defining them.
395-
_CCCL_HOST_DEVICE kernel_arg() noexcept {}
396-
_CCCL_HOST_DEVICE ~kernel_arg() noexcept {}
397-
398-
_CCCL_HOST_DEVICE kernel_arg(const kernel_arg& other)
399-
{
400-
// since we use kernel_arg only to pass data to the device, the contained data is semantically trivially copyable,
401-
// even if the type system is telling us otherwise.
402-
::cuda::std::memcpy(reinterpret_cast<char*>(this), reinterpret_cast<const char*>(&other), sizeof(kernel_arg));
403-
}
404-
};
405-
40662
template <typename It>
40763
_CCCL_HOST_DEVICE auto make_iterator_kernel_arg(It it) -> kernel_arg<It>
40864
{
@@ -470,9 +126,6 @@ __launch_bounds__(MaxPolicy::ActivePolicy::algo_policy::block_threads)
470126
select_kernel_arg(alg, ::cuda::std::move(ins))...);
471127
}
472128

473-
template <typename T>
474-
using cuda_expected = ::cuda::std::expected<T, cudaError_t>;
475-
476129
// TODO(bgruber): this is very similar to thrust::cuda_cub::core::get_max_shared_memory_per_block. We should unify this.
477130
_CCCL_HOST_DEVICE inline cuda_expected<int> get_max_shared_memory()
478131
{

0 commit comments

Comments
 (0)