
Commit b63ccdb

add FLAGS instead of max_partition_size

1 parent f445a7a · commit b63ccdb

24 files changed, +6 -58 lines changed
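The change in one sentence: max_partition_size is no longer threaded as an argument through every attention kernel signature and call site; each implementation now obtains the value at runtime from a new get_max_partition_size() helper, matching the commit message's switch to a FLAGS-controlled setting. The helper's definition lives in one of the changed files not shown in this excerpt, so the sketch below is only a plausible reconstruction; the flag name FLAGS_max_partition_size and the 32768 fallback are assumptions, not taken from the diff.

```cpp
// Hypothetical sketch of the helper the new code calls; the real definition is
// not part of this excerpt. Assumes the flag is delivered as an environment
// variable named FLAGS_max_partition_size (assumed name) with an assumed
// fallback of 32768.
#include <cstdint>
#include <cstdlib>

inline uint32_t get_max_partition_size() {
  // Read the environment once and cache the result for all later calls.
  static const uint32_t cached = [] {
    const char* env = std::getenv("FLAGS_max_partition_size");  // assumed name
    return env != nullptr
               ? static_cast<uint32_t>(std::strtoul(env, nullptr, 10))
               : 32768u;  // assumed default
  }();
  return cached;
}
```

Under that reading, the partition size becomes a per-process tuning knob (for example, FLAGS_max_partition_size=16384 set in the environment before launch) instead of a per-call argument, which is why all 24 files can shed the parameter.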

csrc/gpu/append_attention.cu

Lines changed: 0 additions & 14 deletions

@@ -61,7 +61,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     const float out_linear_in_scale,
     const int encoder_block_shape_q,
     const int decoder_block_shape_q,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -209,7 +208,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     out_linear_in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -248,7 +246,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     out_linear_in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -292,7 +289,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     out_linear_in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -440,7 +436,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     out_linear_in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -479,7 +474,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     out_linear_in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -524,7 +518,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     out_linear_in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -585,7 +578,6 @@ std::vector<paddle::Tensor> AppendAttention(
     const float out_linear_in_scale,
     const int encoder_block_shape_q,
     const int decoder_block_shape_q,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -650,7 +642,6 @@ std::vector<paddle::Tensor> AppendAttention(
     out_linear_in_scale,
     encoder_block_shape_q,
     decoder_block_shape_q,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -700,7 +691,6 @@ std::vector<paddle::Tensor> AppendAttention(
     out_linear_in_scale,
     encoder_block_shape_q,
     decoder_block_shape_q,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -751,7 +741,6 @@ std::vector<paddle::Tensor> AppendAttention(
     out_linear_in_scale,
     encoder_block_shape_q,
     decoder_block_shape_q,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -800,7 +789,6 @@ std::vector<paddle::Tensor> AppendAttention(
     out_linear_in_scale,
     encoder_block_shape_q,
     decoder_block_shape_q,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -905,7 +893,6 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
     const float out_linear_in_scale,
     const int encoder_block_shape_q,
     const int decoder_block_shape_q,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -985,7 +972,6 @@ PD_BUILD_OP(append_attention)
     "out_linear_in_scale: float",
     "encoder_block_shape_q: int",
     "decoder_block_shape_q: int",
-    "max_partition_size: int",
     "encoder_max_partition_size: int",
     "speculate_max_draft_token_num: int",
     "causal: bool",

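Note that dropping "max_partition_size: int" from the PD_BUILD_OP(append_attention) attribute list also changes the custom op's Python-visible signature: callers must stop passing the argument, since the value now comes from the runtime flag rather than from each call.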
csrc/gpu/append_attn/append_attention_c16_impl.cuh

Lines changed: 2 additions & 5 deletions

@@ -786,7 +786,6 @@ void MultiQueryAppendAttention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool is_decoder,
@@ -839,7 +838,7 @@ void MultiQueryAppendAttention(
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  static uint32_t chunk_size = get_max_partition_size();
   if (!is_decoder) {
     chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
   }
@@ -1058,7 +1057,7 @@ void MultiQueryAppendAttention(
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  static uint32_t chunk_size = get_max_partition_size();
   if (!is_decoder) {
     chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
   }
@@ -1301,7 +1300,6 @@ void CascadeAppendAttentionC16Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -1363,7 +1361,6 @@ void CascadeAppendAttentionC16Kernel(
     quant_max_bound,
     quant_min_bound,
     in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     is_decoder,
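One C++ detail worth noting in the replacement lines (the same pattern recurs in the c4 and c8 implementations below): chunk_size is now a static local, so get_max_partition_size() runs only on the first call to the enclosing function, while the assignment in the !is_decoder branch executes on every call and mutates that same static. As a consequence, a decoder call that follows an encoder call in the same process sees the encoder value rather than the flag value. The standalone sketch below (toy names, not the kernel code) demonstrates that init-once, assign-every-call behavior.

```cpp
// Standalone illustration of the static-local pattern introduced by the diff;
// flag_value() stands in for get_max_partition_size().
#include <cstdint>
#include <cstdio>

uint32_t flag_value() { return 32768; }

void dispatch(bool is_decoder, uint32_t encoder_max_partition_size) {
  static uint32_t chunk_size = flag_value();  // initializer runs on first call only
  if (!is_decoder) {
    chunk_size = encoder_max_partition_size;  // mutates the static; persists
  }
  std::printf("chunk_size = %u\n", chunk_size);
}

int main() {
  dispatch(true, 65536);   // decoder first: prints 32768 (the flag value)
  dispatch(false, 65536);  // encoder: prints 65536
  dispatch(true, 65536);   // decoder again: still 65536, not re-read
  return 0;
}
```

Whether that carry-over matters here depends on how the decoder and encoder paths share these functions; it is simply a property of static locals worth keeping in mind when reading the diff.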

csrc/gpu/append_attn/append_attention_c4_impl.cuh

Lines changed: 2 additions & 5 deletions

@@ -973,7 +973,6 @@ void MultiQueryAppendC4Attention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool is_decoder,
@@ -1036,7 +1035,7 @@ void MultiQueryAppendC4Attention(
   const float ratio = static_cast<float>(num_blocks_need) /
                       static_cast<float>(num_blocks_per_wave);

-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  static uint32_t chunk_size = get_max_partition_size();
   if (!is_decoder) {
     chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
   }
@@ -1282,7 +1281,7 @@ void MultiQueryAppendC4Attention(
                       static_cast<float>(num_blocks_per_wave);


-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  static uint32_t chunk_size = get_max_partition_size();
   if (!is_decoder) {
     chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
   }
@@ -1538,7 +1537,6 @@ void CascadeAppendAttentionC4Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -1604,7 +1602,6 @@ void CascadeAppendAttentionC4Kernel(
     quant_max_bound,
     quant_min_bound,
     in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     is_decoder,

csrc/gpu/append_attn/append_attention_c8_impl.cuh

Lines changed: 2 additions & 5 deletions

@@ -860,7 +860,6 @@ void MultiQueryAppendC8Attention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool is_decoder,
@@ -914,7 +913,7 @@ void MultiQueryAppendC8Attention(
   const int dev_id = 0;
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  static uint32_t chunk_size = get_max_partition_size();
   if (!is_decoder) {
     chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
   }
@@ -1136,7 +1135,7 @@ void MultiQueryAppendC8Attention(
   const int dev_id = 0;
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  static uint32_t chunk_size = get_max_partition_size();
   if (!is_decoder) {
     chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
   }
@@ -1377,7 +1376,6 @@ void CascadeAppendAttentionC8Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -1441,7 +1439,6 @@ void CascadeAppendAttentionC8Kernel(
     quant_max_bound,
     quant_min_bound,
     in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     is_decoder,

csrc/gpu/append_attn/append_attention_kernel.h

Lines changed: 0 additions & 7 deletions

@@ -52,7 +52,6 @@ void CascadeAppendAttentionC16Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -97,7 +96,6 @@ void CascadeAppendAttentionC8Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -142,7 +140,6 @@ void CascadeAppendAttentionC4Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -188,7 +185,6 @@ void CascadeAppendAttentionKernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
@@ -223,7 +219,6 @@ void CascadeAppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -258,7 +253,6 @@ void CascadeAppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,
@@ -293,7 +287,6 @@ void CascadeAppendAttentionKernel(
     quant_max_bound,
     quant_min_bound,
     in_scale,
-    max_partition_size,
     encoder_max_partition_size,
     speculate_max_draft_token_num,
     causal,

csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_bfloat16_kernel.cu

Lines changed: 0 additions & 1 deletion

@@ -49,7 +49,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::bfloat16
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,

csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_fp8_kernel.cu

Lines changed: 0 additions & 1 deletion

@@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::float8_e
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,

csrc/gpu/append_attn/template_instantiation/append_attention_c16_bfloat16_int8_kernel.cu

Lines changed: 0 additions & 1 deletion

@@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, int8_t>(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,

csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_float16_kernel.cu

Lines changed: 0 additions & 1 deletion

@@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float16>(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,

csrc/gpu/append_attn/template_instantiation/append_attention_c16_float16_fp8_kernel.cu

Lines changed: 0 additions & 1 deletion

@@ -48,7 +48,6 @@ template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float8_e4
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
     const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
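The five template_instantiation files above (together with their siblings among the 24 changed files) all shrink by the same line because an explicit instantiation must repeat the template's exact current parameter list: once max_partition_size leaves the declarations in append_attention_kernel.h, any instantiation that still names it fails to compile. A minimal, self-contained illustration of that rule (toy names, not the project's code):

```cpp
#include <cstdio>

// Toy analogue of the *_kernel.cu pattern: a function template plus an
// explicit instantiation that must match the template's current signature.
template <typename T>
void Kernel(const float in_scale, const int encoder_max_partition_size) {
  std::printf("in_scale=%f encoder_max=%d\n",
              in_scale, encoder_max_partition_size);
}

// Compiles: the parameter list matches the template above.
template void Kernel<float>(const float, const int);

// Would fail to compile if it still listed the removed parameter:
// template void Kernel<float>(const float, const int, const int);
```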
