Skip to content

Commit a52e626

Browse files
Replace CUB macros in tunings and benchmarks (#3931)
1 parent 7fbbd24 commit a52e626

File tree

9 files changed

+41
-31
lines changed

c/parallel/src/reduce.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <cub/grid/grid_even_share.cuh>
1515
#include <cub/util_device.cuh>
1616

17+
#include <cuda/std/__algorithm_>
1718
#include <cuda/std/cstdint>
1819
#include <cuda/std/functional> // ::cuda::std::identity
1920
#include <cuda/std/variant>
@@ -97,8 +98,8 @@ reduce_runtime_tuning_policy get_policy(int cc, cccl_type_info accumulator_type)
9798
auto [_, block_size, items_per_thread, vector_load_length] = find_tuning(cc, chain);
9899

99100
// Implement part of MemBoundScaling
100-
items_per_thread = CUB_MAX(1, CUB_MIN(items_per_thread * 4 / accumulator_type.size, items_per_thread * 2));
101-
block_size = CUB_MIN(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
101+
items_per_thread = cuda::std::clamp(items_per_thread * 4 / accumulator_type.size, 1, items_per_thread * 2);
102+
block_size = _CUDA_VSTD::min(block_size, (((1024 * 48) / (accumulator_type.size * items_per_thread)) + 31) / 32 * 32);
102103

103104
return {block_size, items_per_thread, vector_load_length};
104105
}

cub/benchmarks/bench/partition/flagged.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
#include <thrust/count.h>
3131

32+
#include <cuda/std/__algorithm_>
3233
#include <cuda/std/type_traits>
3334

3435
#include <look_back_helper.cuh>
@@ -63,7 +64,7 @@ struct policy_hub_t
6364
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
6465

6566
static constexpr int ITEMS_PER_THREAD =
66-
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
67+
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
6768

6869
using SelectIfPolicyT =
6970
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

cub/benchmarks/bench/partition/if.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
#include <thrust/count.h>
3131

32+
#include <cuda/std/__algorithm_>
3233
#include <cuda/std/type_traits>
3334

3435
#include <look_back_helper.cuh>
@@ -63,7 +64,7 @@ struct policy_hub_t
6364
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
6465

6566
static constexpr int ITEMS_PER_THREAD =
66-
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
67+
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
6768

6869
using SelectIfPolicyT =
6970
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

cub/benchmarks/bench/select/flagged.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
#include <thrust/count.h>
3131

32+
#include <cuda/std/__algorithm_>
33+
3234
#include <look_back_helper.cuh>
3335
#include <nvbench_helper.cuh>
3436

@@ -61,7 +63,7 @@ struct policy_hub_t
6163
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
6264

6365
static constexpr int ITEMS_PER_THREAD =
64-
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
66+
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
6567

6668
using SelectIfPolicyT =
6769
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

cub/benchmarks/bench/select/if.cu

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929

3030
#include <thrust/count.h>
3131

32+
#include <cuda/std/__algorithm_>
33+
3234
#include <limits>
3335

3436
#include <look_back_helper.cuh>
@@ -63,7 +65,7 @@ struct policy_hub_t
6365
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
6466

6567
static constexpr int ITEMS_PER_THREAD =
66-
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
68+
_CUDA_VSTD::clamp(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT), 1, NOMINAL_4B_ITEMS_PER_THREAD);
6769

6870
using SelectIfPolicyT =
6971
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

cub/benchmarks/bench/select/unique.cu

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
#include <cub/device/device_select.cuh>
55

6+
#include <cuda/std/__algorithm_>
7+
68
#include <limits>
79

810
#include <look_back_helper.cuh>
@@ -36,8 +38,8 @@ struct policy_hub_t
3638
{
3739
static constexpr int NOMINAL_4B_ITEMS_PER_THREAD = TUNE_ITEMS_PER_THREAD;
3840

39-
static constexpr int ITEMS_PER_THREAD =
40-
CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
41+
static constexpr int ITEMS_PER_THREAD = _CUDA_VSTD::min(
42+
NOMINAL_4B_ITEMS_PER_THREAD, _CUDA_VSTD::max(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(InputT))));
4143

4244
using SelectIfPolicyT =
4345
cub::AgentSelectIfPolicy<TUNE_THREADS_PER_BLOCK,

cub/cub/device/dispatch/tuning/tuning_run_length_encode.cuh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@
4646
#include <cub/util_device.cuh>
4747
#include <cub/util_type.cuh>
4848

49-
#include <cuda/std/__algorithm/max.h>
49+
#include <cuda/cmath>
50+
#include <cuda/std/__algorithm_>
5051

5152
CUB_NAMESPACE_BEGIN
5253

@@ -315,9 +316,8 @@ struct policy_hub
315316
static constexpr int items =
316317
(max_input_bytes <= 8)
317318
? 6
318-
// TODO(bgruber): use clamp() and ceil_div in C++14
319-
: CUB_MIN(nominal_4B_items_per_thread,
320-
CUB_MAX(1, ((nominal_4B_items_per_thread * 8) + combined_input_bytes - 1) / combined_input_bytes));
319+
: ::cuda::std::clamp(
320+
::cuda::ceil_div(nominal_4B_items_per_thread * 8, combined_input_bytes), 1, nominal_4B_items_per_thread);
321321
using ReduceByKeyPolicyT =
322322
AgentReduceByKeyPolicy<128,
323323
items,
@@ -603,7 +603,7 @@ struct policy_hub
603603
static constexpr int nominal_4B_items_per_thread = 15;
604604
// Scale the nominal 4-byte item count by the key size and clamp it to [1, nominal_4B_items_per_thread]
605605
static constexpr int ITEMS_PER_THREAD =
606-
CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(KeyT))));
606+
_CUDA_VSTD::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(KeyT)}, 1, nominal_4B_items_per_thread);
607607
using RleSweepPolicyT =
608608
AgentRlePolicy<96,
609609
ITEMS_PER_THREAD,

cub/cub/device/dispatch/tuning/tuning_select_if.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545
#include <cub/util_math.cuh>
4646
#include <cub/util_type.cuh>
4747

48+
#include <cuda/std/__algorithm_>
49+
4850
CUB_NAMESPACE_BEGIN
4951

5052
namespace detail
@@ -1498,9 +1500,8 @@ struct policy_hub
14981500
struct DefaultPolicy
14991501
{
15001502
static constexpr int nominal_4B_items_per_thread = 10;
1501-
// TODO(bgruber): use cuda::std::clamp() in C++14
15021503
static constexpr int items_per_thread =
1503-
CUB_MIN(nominal_4B_items_per_thread, CUB_MAX(1, (nominal_4B_items_per_thread * 4 / sizeof(InputT))));
1504+
::cuda::std::clamp(nominal_4B_items_per_thread * 4 / int{sizeof(InputT)}, 1, nominal_4B_items_per_thread);
15041505
using SelectIfPolicyT =
15051506
AgentSelectIfPolicy<128,
15061507
items_per_thread,

thrust/testing/zip_iterator.cu

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -332,25 +332,25 @@ struct TestZipIteratorTransform
332332
device_vector<T> d_result(n);
333333

334334
// Tuples with 2 elements
335-
transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
336-
make_zip_iterator(h_data0.end(), h_data1.end()),
337-
h_result.begin(),
338-
SumTwoTuple());
339-
transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
340-
make_zip_iterator(d_data0.end(), d_data1.end()),
341-
d_result.begin(),
342-
SumTwoTuple());
335+
thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin()),
336+
make_zip_iterator(h_data0.end(), h_data1.end()),
337+
h_result.begin(),
338+
SumTwoTuple());
339+
thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin()),
340+
make_zip_iterator(d_data0.end(), d_data1.end()),
341+
d_result.begin(),
342+
SumTwoTuple());
343343
ASSERT_EQUAL(h_result, d_result);
344344

345345
// Tuples with 3 elements
346-
transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
347-
make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
348-
h_result.begin(),
349-
SumThreeTuple());
350-
transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
351-
make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
352-
d_result.begin(),
353-
SumThreeTuple());
346+
thrust::transform(make_zip_iterator(h_data0.begin(), h_data1.begin(), h_data2.begin()),
347+
make_zip_iterator(h_data0.end(), h_data1.end(), h_data2.end()),
348+
h_result.begin(),
349+
SumThreeTuple());
350+
thrust::transform(make_zip_iterator(d_data0.begin(), d_data1.begin(), d_data2.begin()),
351+
make_zip_iterator(d_data0.end(), d_data1.end(), d_data2.end()),
352+
d_result.begin(),
353+
SumThreeTuple());
354354
ASSERT_EQUAL(h_result, d_result);
355355
}
356356
};

0 commit comments

Comments
 (0)