Extend CUB reduce benchmarks

bernhardmgruber · bernhardmgruber · commit de21bc743313 · 2025-01-15T17:00:21.000+01:00
* Rename max.cu to custom.cu, since it uses a custom operator * Extend types covered my min.cu to all fundamental types * Add some notes on how to collect tuning parameters Fixes: #3283
diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
   });
 }
 
-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
+NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))
   .set_name("base")
   .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
diff --git a/cub/benchmarks/bench/reduce/custom.cu b/cub/benchmarks/bench/reduce/custom.cu
@@ -25,11 +25,18 @@
  *
  ******************************************************************************/
 
+// This benchmark uses a custom reduction operation, max_t, which is not known to CUB, so no operator specific
+// optimizations (e.g. using redux or DPX instructions) are performed. This benchmark covers the unoptimized code path.
+
+// Because CUB cannot detect this operator, we cannot add any tunings based on the results of this benchmark. It's main
+// use is to detect regressions.
+
 #include <nvbench_helper.cuh>
 
 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
 
-using op_t = max_t;
+using value_types = all_types;
+using op_t        = max_t;
 #include "base.cuh"
diff --git a/cub/benchmarks/bench/reduce/min.cu b/cub/benchmarks/bench/reduce/min.cu
@@ -24,14 +24,22 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  ******************************************************************************/
-// NOTE: this benchmark is intended to cover DPX instructions on Hopper+ architectures.
-//       It specifically uses cuda::minimum<> instead of a user-defined operator.
-#define TUNE_T int16_t
+
+// This benchmark is intended to cover DPX instructions on Hopper+ architectures. It specifically uses cuda::minimum<>
+// instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for ::cuda::minimum<> apply equally for ::cuda::maximum<>
+// Tuning parameters found for signed integer types apply equally for unsigned integer types
+// TODO(bgruber): do tuning parameters found for int16_t apply equally for __half or __nv_bfloat16 on SM90+?
+
+#include <cuda/functional>
+
 #include <nvbench_helper.cuh>
 
 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
 
-using op_t = ::cuda::minimum<>;
+using value_types = push_back_t<fundamental_types /*, __half, __nv_bfloat16*/>;
+using op_t        = ::cuda::minimum<>;
 #include "base.cuh"
diff --git a/cub/benchmarks/bench/reduce/sum.cu b/cub/benchmarks/bench/reduce/sum.cu
@@ -25,11 +25,17 @@
  *
  ******************************************************************************/
 
+// This benchmark is intended to cover redux instructions on Ampere+ architectures. It specifically uses
+// cuda::std::plus<> instead of a user-defined operator, which CUB recognizes to select an optimized code path.
+
+// Tuning parameters found for signed integer types apply equally for unsigned integer types
+
 #include <nvbench_helper.cuh>
 
 // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
 // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
 // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
 
-using op_t = ::cuda::std::plus<>;
+using value_types = push_back_t<all_types /*, __half, __nv_bfloat16*/>;
+using op_t        = ::cuda::std::plus<>;
 #include "base.cuh"
diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh
@@ -52,20 +52,20 @@ struct nvbench::type_strings<::cuda::std::integral_constant<T, I>>
 namespace detail
 {
 
-template <class T, class List>
+template <class List, class... Ts>
 struct push_back
 {};
 
-template <class T, class... As>
-struct push_back<T, nvbench::type_list<As...>>
+template <class... As, class... Ts>
+struct push_back<nvbench::type_list<As...>, Ts...>
 {
-  using type = nvbench::type_list<As..., T>;
+  using type = nvbench::type_list<As..., Ts...>;
 };
 
 } // namespace detail
 
-template <class T, class List>
-using push_back_t = typename detail::push_back<T, List>::type;
+template <class List, class... Ts>
+using push_back_t = typename detail::push_back<List, Ts...>::type;
 
 #ifdef TUNE_OffsetT
 using offset_types = nvbench::type_list<TUNE_OffsetT>;

Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)`
`103`	`103`	`});`
`104`	`104`	`}`
`105`	`105`
`106`		`-NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))`
	`106`	`+NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(value_types, offset_types))`
`107`	`107`	`.set_name("base")`
`108`	`108`	`.set_type_axes_names({"T{ct}", "OffsetT{ct}"})`
`109`	`109`	`.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));`