Skip to content

Commit 30f5b21

Browse files
Drop tunings with no benefit
1 parent 66cbb57 commit 30f5b21

File tree

1 file changed

+3 -97 lines changed

1 file changed

+3 -97 lines changed

cub/cub/device/dispatch/tuning/tuning_histogram.cuh

Lines changed: 3 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -151,17 +151,7 @@ struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes
151151
static constexpr int vec_size = 1 << 2;
152152
};
153153

154-
// same as SM90
155-
// template <class SampleT>
156-
// struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
157-
158-
// same as SM90
159-
// template <class SampleT>
160-
// struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> {};
161-
162-
// same as SM90
163-
// template <class SampleT>
164-
// struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> {};
154+
// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks
165155

166156
// range
167157
template <class SampleT>
@@ -178,93 +168,9 @@ struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::ye
178168
static constexpr int vec_size = 1 << 2;
179169
};
180170

181-
// same as SM90
182-
// template <class SampleT>
183-
// struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
184-
185-
// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled
186-
// template <class SampleT>
187-
// struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
188-
// {
189-
// // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104
190-
// static constexpr int items = 9;
191-
// static constexpr int threads = 1024;
192-
// static constexpr bool rle_compress = true;
193-
// static constexpr bool work_stealing = false;
194-
// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
195-
// static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
196-
// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
197-
// static constexpr int vec_size = 1 << 0;
198-
// };
199-
200-
// TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled
201-
// template <class SampleT>
202-
// struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
203-
// {
204-
// // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657
205-
// static constexpr int items = 7;
206-
// static constexpr int threads = 544;
207-
// static constexpr bool rle_compress = true;
208-
// static constexpr bool work_stealing = false;
209-
// static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
210-
// static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
211-
// static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
212-
// static constexpr int vec_size = 1 << 0;
213-
// };
214-
215-
// multi.even
216-
template <class SampleT>
217-
struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1>
218-
{
219-
// ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504
220-
static constexpr int items = 9;
221-
static constexpr int threads = 1024;
222-
static constexpr bool rle_compress = false;
223-
static constexpr bool work_stealing = false;
224-
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
225-
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
226-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
227-
static constexpr int vec_size = 1 << 0;
228-
};
229-
230-
// same as SM90
231-
// template <class SampleT>
232-
// struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
233-
234-
// same as SM90
235-
// template <class SampleT>
236-
// struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {};
237-
238-
// same as SM90
239-
// template <class SampleT>
240-
// struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {};
241-
242-
// multi.range
243-
template <class SampleT>
244-
struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1>
245-
{
246-
// ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584
247-
static constexpr int items = 7;
248-
static constexpr int threads = 160;
249-
static constexpr bool rle_compress = false;
250-
static constexpr bool work_stealing = false;
251-
static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
252-
static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
253-
static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
254-
static constexpr int vec_size = 1 << 1;
255-
};
256-
257-
// same as SM90
258-
// template <class SampleT>
259-
// struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
260-
261-
// same as SM90
262-
// template <class SampleT>
263-
// struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {};
171+
// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks
264172

265-
// same as SM90
266-
// template <class SampleT>
267-
// struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {};
173+
// multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks
268174

269175
template <class SampleT, class CounterT, int NumChannels, int NumActiveChannels, bool IsEven>
270176
struct policy_hub

0 commit comments

Comments (0)