@@ -59,10 +59,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float out_linear_in_scale,
-    const int encoder_block_shape_q,
-    const int decoder_block_shape_q,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool speculate_decoder) {
@@ -76,7 +72,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
   int max_enc_len_this_time_data = max_enc_len_this_time.data<int>()[0];
   int max_dec_len_this_time_data = max_dec_len_this_time.data<int>()[0];
   int max_len_kv_data = max_len_kv.data<int>()[0];
-
+  const int encoder_block_shape_q = get_encoder_block_shape_q();
+  const int decoder_block_shape_q = get_decoder_block_shape_q();
   auto main_stream = qkv.stream();
   static cudaEvent_t main_event;
   static cudaEvent_t decoder_event;
@@ -209,8 +206,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         false,
@@ -248,8 +243,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         false,
@@ -292,8 +285,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         false,
@@ -440,8 +431,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         !speculate_decoder,
@@ -479,8 +468,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         !speculate_decoder,
@@ -524,8 +511,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         !speculate_decoder,
@@ -583,10 +568,6 @@ std::vector<paddle::Tensor> AppendAttention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float out_linear_in_scale,
-    const int encoder_block_shape_q,
-    const int decoder_block_shape_q,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool speculate_decoder) {
@@ -648,10 +629,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -698,10 +675,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -749,10 +722,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -798,10 +767,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -903,10 +868,6 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
     const float quant_max_bound,
     const float quant_min_bound,
     const float out_linear_in_scale,
-    const int encoder_block_shape_q,
-    const int decoder_block_shape_q,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool speculate_decoder) {
@@ -983,10 +944,6 @@ PD_BUILD_OP(append_attention)
983944 " quant_max_bound: float" ,
984945 " quant_min_bound: float" ,
985946 " out_linear_in_scale: float" ,
986- " encoder_block_shape_q: int" ,
987- " decoder_block_shape_q: int" ,
988- " max_partition_size: int" ,
989- " encoder_max_partition_size: int" ,
990947 " speculate_max_draft_token_num: int" ,
991948 " causal: bool" ,
992949 " speculate_decoder: bool" })