@@ -138,7 +138,7 @@ struct sm100_tuning;
138138
139139// even
140140template <class SampleT >
141- struct sm100_tuning <1 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_1>
141+ struct sm100_tuning <true , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_1>
142142{
143143 // ipt_12.tpb_928.rle_0.ws_0.mem_1.ld_2.laid_0.vec_2 1.033332 0.940517 1.031835 1.195876
144144 static constexpr int items = 12 ;
@@ -148,30 +148,14 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s
148148 static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
149149 static constexpr CacheLoadModifier load_modifier = LOAD_CA;
150150 static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
151- static constexpr int tune_vec_size = 1 << 2 ;
151+ static constexpr int vec_size = 1 << 2 ;
152152};
153153
154- // same as base
155- template <class SampleT >
156- struct sm100_tuning <1 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_2>
157- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_2>
158- {};
159-
160- // same as base
161- template <class SampleT >
162- struct sm100_tuning <1 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_4>
163- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_4>
164- {};
165-
166- // same as base
167- template <class SampleT >
168- struct sm100_tuning <1 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_8>
169- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_8>
170- {};
154+ // sample_size 2/4/8: no SM100-specific tuning showed a benefit over the SM90 tuning during verification benchmarks
171155
172156// range
173157template <class SampleT >
174- struct sm100_tuning <0 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_1>
158+ struct sm100_tuning <false , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_1>
175159{
176160 // ipt_12.tpb_448.rle_0.ws_0.mem_1.ld_1.laid_0.vec_2 1.078987 0.985542 1.085118 1.175637
177161 static constexpr int items = 12 ;
@@ -181,108 +165,12 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s
181165 static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
182166 static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
183167 static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
184- static constexpr int tune_vec_size = 1 << 2 ;
185- };
186-
187- // same as base
188- template <class SampleT >
189- struct sm100_tuning <0 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_2>
190- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_2>
191- {};
192-
193- template <class SampleT >
194- struct sm100_tuning <0 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_4>
195- {
196- // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104
197- static constexpr int items = 9 ;
198- static constexpr int threads = 1024 ;
199- static constexpr bool rle_compress = true ;
200- static constexpr bool work_stealing = false ;
201- static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
202- static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT;
203- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
204- static constexpr int tune_vec_size = 1 << 0 ;
205- };
206-
207- template <class SampleT >
208- struct sm100_tuning <0 , SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_8>
209- {
210- // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657
211- static constexpr int items = 7 ;
212- static constexpr int threads = 544 ;
213- static constexpr bool rle_compress = true ;
214- static constexpr bool work_stealing = false ;
215- static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
216- static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
217- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT;
218- static constexpr int tune_vec_size = 1 << 0 ;
219- };
220-
221- // multi.even
222- template <class SampleT >
223- struct sm100_tuning <1 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_1>
224- {
225- // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504
226- static constexpr int items = 9 ;
227- static constexpr int threads = 1024 ;
228- static constexpr bool rle_compress = false ;
229- static constexpr bool work_stealing = false ;
230- static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
231- static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
232- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
233- static constexpr int tune_vec_size = 1 << 0 ;
234- };
235-
236- // same as base
237- template <class SampleT >
238- struct sm100_tuning <1 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_2>
239- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_2>
240- {};
241-
242- // same as base
243- template <class SampleT >
244- struct sm100_tuning <1 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_4>
245- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_4>
246- {};
247-
248- // same as base
249- template <class SampleT >
250- struct sm100_tuning <1 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_8>
251- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_8>
252- {};
253-
254- // multi.range
255- template <class SampleT >
256- struct sm100_tuning <0 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_1>
257- {
258- // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584
259- static constexpr int items = 7 ;
260- static constexpr int threads = 160 ;
261- static constexpr bool rle_compress = false ;
262- static constexpr bool work_stealing = false ;
263- static constexpr BlockHistogramMemoryPreference mem_preference = SMEM;
264- static constexpr CacheLoadModifier load_modifier = LOAD_LDG;
265- static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE;
266- static constexpr int tune_vec_size = 1 << 1 ;
168+ static constexpr int vec_size = 1 << 2 ;
267169};
268170
269- // same as base
270- template <class SampleT >
271- struct sm100_tuning <0 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_2>
272- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_2>
273- {};
274-
275- // same as base
276- template <class SampleT >
277- struct sm100_tuning <0 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_4>
278- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_4>
279- {};
171+ // sample_size 2/4/8: no SM100-specific tuning showed a benefit over the SM90 tuning during verification benchmarks
280172
281- // same as base
282- template <class SampleT >
283- struct sm100_tuning <0 , SampleT, 4 , 3 , counter_size::_4, primitive_sample::yes, sample_size::_8>
284- : sm90_tuning<SampleT, 1 , 1 , counter_size::_4, primitive_sample::yes, sample_size::_8>
285- {};
173+ // multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks
286174
287175template <class SampleT , class CounterT , int NumChannels, int NumActiveChannels, bool IsEven>
288176struct policy_hub
0 commit comments