@@ -151,17 +151,7 @@ struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes
151151 static constexpr int vec_size = 1 << 2 ;
152152};
153153
154- // same as SM90
155- // template <class SampleT>
156- // struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
157-
158- // same as SM90
159- // template <class SampleT>
160- // struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> {};
161-
162- // same as SM90
163- // template <class SampleT>
164- // struct sm100_tuning<true, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> {};
154+ // sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks
165155
166156// range
167157template <class SampleT >
@@ -178,93 +168,9 @@ struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::ye
178168 static constexpr int vec_size = 1 << 2 ;
179169};
180170
181- // same as SM90
182- // template <class SampleT>
183- // struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
184-
185- // TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled
186- // template <class SampleT>
187- // struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4>
188- // {
189- // // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104
190- // static constexpr int items = 9;
191- // static constexpr int threads = 1024;
192- // static constexpr bool rle_compress = true;
193- // static constexpr bool work_stealing = false;
194- // static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
195- // static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
196- // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
197- // static constexpr int vec_size = 1 << 0;
198- // };
199-
200- // TODO(gonidelis): we found the below tuning but the verification benchmark showed regressions, so it's disabled
201- // template <class SampleT>
202- // struct sm100_tuning<false, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8>
203- // {
204- // // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657
205- // static constexpr int items = 7;
206- // static constexpr int threads = 544;
207- // static constexpr bool rle_compress = true;
208- // static constexpr bool work_stealing = false;
209- // static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
210- // static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
211- // static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
212- // static constexpr int vec_size = 1 << 0;
213- // };
214-
215- // multi.even
216- template <class SampleT >
217- struct sm100_tuning <true , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_1>
218- {
219- // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504
220- static constexpr int items = 9 ;
221- static constexpr int threads = 1024 ;
222- static constexpr bool rle_compress = false ;
223- static constexpr bool work_stealing = false ;
224- static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
225- static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
226- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
227- static constexpr int vec_size = 1 << 0 ;
228- };
229-
230- // same as SM90
231- // template <class SampleT>
232- // struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
233-
234- // same as SM90
235- // template <class SampleT>
236- // struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {};
237-
238- // same as SM90
239- // template <class SampleT>
240- // struct sm100_tuning<true, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {};
241-
242- // multi.range
243- template <class SampleT >
244- struct sm100_tuning <false , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_1>
245- {
246- // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584
247- static constexpr int items = 7 ;
248- static constexpr int threads = 160 ;
249- static constexpr bool rle_compress = false ;
250- static constexpr bool work_stealing = false ;
251- static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
252- static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
253- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
254- static constexpr int vec_size = 1 << 1 ;
255- };
256-
257- // same as SM90
258- // template <class SampleT>
259- // struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> {};
260-
261- // same as SM90
262- // template <class SampleT>
263- // struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> {};
171+ // sample_size 2: same as SM90; the tunings found for sample_size 4/8 regressed in verification benchmarks
264172
265- // same as SM90
266- // template <class SampleT>
267- // struct sm100_tuning<false, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> {};
173+ // multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks
268174
269175template <class SampleT , class CounterT , int NumChannels, int NumActiveChannels, bool IsEven>
270176struct policy_hub
0 commit comments