Skip to content

Commit de21bc7

Browse files
Extend CUB reduce benchmarks
* Rename max.cu to custom.cu, since it uses a custom operator * Extend types covered my min.cu to all fundamental types * Add some notes on how to collect tuning parameters Fixes: #3283
1 parent 048b2bd commit de21bc7

File tree

5 files changed

+34
-13
lines changed

5 files changed

+34
-13
lines changed

cub/benchmarks/bench/reduce/base.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
103103
});
104104
}
105105

106-
NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
106+
NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
107107
.set_name("base")
108108
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
109109
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,18 @@
2525
*
2626
******************************************************************************/
2727

28+
// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific
29+
// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path.
30+
31+
// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. It's main
32+
// use is to detect regressions.
33+
2834
#include <nvbench_helper.cuh>
2935

3036
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
3137
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
3238
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
3339

34-
using op_t = max_t;
40+
using value_types = all_types;
41+
using op_t = max_t;
3542
#include "base.cuh"

cub/benchmarks/bench/reduce/min.cu

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,22 @@
2424
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2525
*
2626
******************************************************************************/
27-
// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
28-
// It specifically uses cuda::minimum<> instead of a user-defined operator.
29-
#define TUNE_T int16_t
27+
28+
// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<>
29+
// instead of a user-defined operator, which CUB recognizes to select an optimized code path.
30+
31+
// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<>
32+
// Tuning parameters found for signed integer types apply equally for unsigned integer types
33+
// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+?
34+
35+
#include <cuda/functional>
36+
3037
#include <nvbench_helper.cuh>
3138

3239
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
3340
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
3441
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
3542

36-
using op_t = ::cuda::minimum<>;
43+
using value_types = push_back_t<fundamental_types /*, __half, __nv_bfloat16*/>;
44+
using op_t = ::cuda::minimum<>;
3745
#include "base.cuh"

cub/benchmarks/bench/reduce/sum.cu

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,17 @@
2525
*
2626
******************************************************************************/
2727

28+
// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses
29+
// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path.
30+
31+
// Tuning parameters found for signed integer types apply equally for unsigned integer types
32+
2833
#include <nvbench_helper.cuh>
2934

3035
// %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
3136
// %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
3237
// %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
3338

34-
using op_t = ::cuda::std::plus<>;
39+
using value_types = push_back_t<all_types /*, __half, __nv_bfloat16*/>;
40+
using op_t = ::cuda::std::plus<>;
3541
#include "base.cuh"

cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
5252
namespace detail
5353
{
5454

55-
template <class T, class List>
55+
template <class List, class... Ts>
5656
struct push_back
5757
{};
5858

59-
template <class T, class... As>
60-
struct push_back<T, nvbench::type_list<As...>>
59+
template <class... As, class... Ts>
60+
struct push_back<nvbench::type_list<As...>, Ts...>
6161
{
62-
using type = nvbench::type_list<As..., T>;
62+
using type = nvbench::type_list<As..., Ts...>;
6363
};
6464

6565
} // namespace detail
6666

67-
template <class T, class List>
68-
using push_back_t = typename detail::push_back<T, List>::type;
67+
template <class List, class... Ts>
68+
using push_back_t = typename detail::push_back<List, Ts...>::type;
6969

7070
#ifdef TUNE_OffsetT
7171
using offset_types = nvbench::type_list<TUNE_OffsetT>;

0 commit comments

Comments
 (0)