File tree Expand file tree Collapse file tree 2 files changed +20
-0
lines changed
Expand file tree Collapse file tree 2 files changed +20
-0
lines changed Original file line number Diff line number Diff line change @@ -299,4 +299,18 @@ inline uint32_t get_max_partition_size(int bsz) {
299299 static const uint32_t max_partition_size =
300300 max_partition_size_env == nullptr ? 0 : std::stoul (std::string (max_partition_size_env));
301301 return (max_partition_size != 0 ? max_partition_size : (bsz == 1 ? 128 : 512 ));
302+ }
303+
/// Returns the q block size for the Append Attention decoder computation.
///
/// The value is taken from the `FLAGS_dec_block_shape_q` environment variable.
/// The variable is read and parsed only once, on the first call; the result is
/// cached in a function-local static, so later changes to the environment are
/// not observed.
///
/// @return The parsed value when the variable is set to a positive integer;
///         16 when the variable is unset or holds a non-positive integer.
/// @throws std::invalid_argument / std::out_of_range (from std::stoi) when the
///         variable is set but is not a parsable `int`.
inline uint32_t get_decoder_block_shape_q() {
  static constexpr uint32_t kDefaultBlockShapeQ = 16;
  // The name must match the documented flag exactly (no surrounding
  // whitespace), otherwise the lookup never finds the user's setting.
  static const char* decoder_block_shape_q_env =
      std::getenv("FLAGS_dec_block_shape_q");
  static const uint32_t decoder_block_shape_q = []() -> uint32_t {
    if (decoder_block_shape_q_env == nullptr) {
      return kDefaultBlockShapeQ;
    }
    const int parsed = std::stoi(std::string(decoder_block_shape_q_env));
    // A zero or negative value would otherwise wrap to a huge unsigned number
    // when converted to uint32_t; treat it as "not configured".
    return parsed > 0 ? static_cast<uint32_t>(parsed) : kDefaultBlockShapeQ;
  }();
  return decoder_block_shape_q;
}
310+
/// Returns the q block size for the Append Attention encoder computation.
///
/// The value is taken from the `FLAGS_enc_block_shape_q` environment variable.
/// The variable is read and parsed only once, on the first call; the result is
/// cached in a function-local static, so later changes to the environment are
/// not observed.
///
/// @return The parsed value when the variable is set to a positive integer;
///         64 when the variable is unset or holds a non-positive integer.
/// @throws std::invalid_argument / std::out_of_range (from std::stoi) when the
///         variable is set but is not a parsable `int`.
inline uint32_t get_encoder_block_shape_q() {
  static constexpr uint32_t kDefaultBlockShapeQ = 64;
  // The name must match the documented flag exactly (no surrounding
  // whitespace), otherwise the lookup never finds the user's setting.
  static const char* encoder_block_shape_q_env =
      std::getenv("FLAGS_enc_block_shape_q");
  static const uint32_t encoder_block_shape_q = []() -> uint32_t {
    if (encoder_block_shape_q_env == nullptr) {
      return kDefaultBlockShapeQ;
    }
    const int parsed = std::stoi(std::string(encoder_block_shape_q_env));
    // A zero or negative value would otherwise wrap to a huge unsigned number
    // when converted to uint32_t; treat it as "not configured".
    return parsed > 0 ? static_cast<uint32_t>(parsed) : kDefaultBlockShapeQ;
  }();
  return encoder_block_shape_q;
}
Original file line number Diff line number Diff line change @@ -22,3 +22,9 @@ PaddleNLP 提供了多种环境变量,用于优化推理性能和资源使用
2222- ` FLAGS_fraction_of_gpu_memory_to_use ` :GPU 显存使用率,默认值为0.9。设置为0.9即可。
2323
2424- ` FLAGS_gemm_use_half_precision_compute_type ` :是否使用半精度浮点数计算,默认值为0。设置为0即可。
25+
26+ ** Append Attention 优化**
27+
28+ - ` FLAGS_cascade_attention_max_partition_size ` :Append Attention decoder计算时对cache_kv进行分chunk的chunk大小,batchsize为1时默认值为128,batchsize大于1时默认值为512。显式设置时不区分batchsize。
29+ - ` FLAGS_dec_block_shape_q ` :Append Attention decoder计算时对q进行分块的分块大小,默认值为16。设置为16即可。
30+ - ` FLAGS_enc_block_shape_q ` :Append Attention encoder计算时对q进行分块的分块大小,默认值为64。设置为64即可。
You can’t perform that action at this time.
0 commit comments