Skip to content

Commit 3a12ff7

Browse files
Fix SM100 histogram tunings (NVIDIA#3691)
The tuning data member names did not match the ones used when selecting tunings, so all SM100 tunings were SFINAE-ed out. Also drop tunings with no benefit.
1 parent 14eab18 commit 3a12ff7

File tree

1 file changed

+7
-119
lines changed

1 file changed

+7
-119
lines changed

cub/cub/device/dispatch/tuning/tuning_histogram.cuh

Lines changed: 7 additions & 119 deletions
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,7 @@ struct sm100_tuning;
138138

139139
// even
140140
template <class SampleT>
141-
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
141+
struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
142142
{
143143
// ipt_12.tpb_928.rle_0.ws_0.mem_1.ld_2.laid_0.vec_2 1.033332 0.940517 1.031835 1.195876
144144
static constexpr int items = 12;
@@ -148,30 +148,14 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s
148148
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
149149
static constexpr CacheLoadModifier load_modifier = LOAD_CA;
150150
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
151-
static constexpr int tune_vec_size = 1 << 2;
151+
static constexpr int vec_size = 1 << 2;
152152
};
153153

154-
// same as base
155-
template <class SampleT>
156-
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
157-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
158-
{};
159-
160-
// same as base
161-
template <class SampleT>
162-
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
163-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
164-
{};
165-
166-
// same as base
167-
template <class SampleT>
168-
struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
169-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
170-
{};
154+
// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks
171155

172156
// range
173157
template <class SampleT>
174-
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
158+
struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1>
175159
{
176160
// ipt_12.tpb_448.rle_0.ws_0.mem_1.ld_1.laid_0.vec_2 1.078987 0.985542 1.085118 1.175637
177161
static constexpr int items = 12;
@@ -181,108 +165,12 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s
181165
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
182166
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
183167
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
184-
static constexpr int tune_vec_size = 1 << 2;
185-
};
186-
187-
// same as base
188-
template <class SampleT>
189-
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
190-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
191-
{};
192-
193-
template <class SampleT>
194-
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
195-
{
196-
// ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104
197-
static constexpr int items = 9;
198-
static constexpr int threads = 1024;
199-
static constexpr bool rle_compress = true;
200-
static constexpr bool work_stealing = false;
201-
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
202-
static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
203-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
204-
static constexpr int tune_vec_size = 1 << 0;
205-
};
206-
207-
template <class SampleT>
208-
struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
209-
{
210-
// ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657
211-
static constexpr int items = 7;
212-
static constexpr int threads = 544;
213-
static constexpr bool rle_compress = true;
214-
static constexpr bool work_stealing = false;
215-
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
216-
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
217-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
218-
static constexpr int tune_vec_size = 1 << 0;
219-
};
220-
221-
// multi.even
222-
template <class SampleT>
223-
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1>
224-
{
225-
// ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504
226-
static constexpr int items = 9;
227-
static constexpr int threads = 1024;
228-
static constexpr bool rle_compress = false;
229-
static constexpr bool work_stealing = false;
230-
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
231-
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
232-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
233-
static constexpr int tune_vec_size = 1 << 0;
234-
};
235-
236-
// same as base
237-
template <class SampleT>
238-
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2>
239-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
240-
{};
241-
242-
// same as base
243-
template <class SampleT>
244-
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4>
245-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
246-
{};
247-
248-
// same as base
249-
template <class SampleT>
250-
struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8>
251-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
252-
{};
253-
254-
// multi.range
255-
template <class SampleT>
256-
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1>
257-
{
258-
// ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584
259-
static constexpr int items = 7;
260-
static constexpr int threads = 160;
261-
static constexpr bool rle_compress = false;
262-
static constexpr bool work_stealing = false;
263-
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
264-
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
265-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
266-
static constexpr int tune_vec_size = 1 << 1;
168+
static constexpr int vec_size = 1 << 2;
267169
};
268170

269-
// same as base
270-
template <class SampleT>
271-
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2>
272-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2>
273-
{};
274-
275-
// same as base
276-
template <class SampleT>
277-
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4>
278-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
279-
{};
171+
// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks
280172

281-
// same as base
282-
template <class SampleT>
283-
struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8>
284-
: sm90_tuning<SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
285-
{};
173+
// multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks
286174

287175
template <class SampleT, class CounterT, int NumChannels, int NumActiveChannels, bool IsEven>
288176
struct policy_hub

0 commit comments

Comments
 (0)