3030| 模型名称| 精度| MTP| 节点数| 静态图下载 model_name|
3131| :------| :-:| :-:| :-:| :-:|
3232| deepseek-ai/DeepSeek-R1 | weight_only_int4| 否| 1| deepseek-ai/DeepSeek-R1/weight_only_int4 |
33- | deepseek-ai/DeepSeek-R1 | weight_only_int4| 是| 1| deepseek-ai/DeepSeek-R1-MTP/weight-only-int4 |
33+ | deepseek-ai/DeepSeek-R1 | weight_only_int4| 是| 1| deepseek-ai/DeepSeek-R1-MTP/weight_only_int4 |
3434| deepseek-ai/DeepSeek-R1 | weight_only_int8| 否| 2| deepseek-ai/DeepSeek-R1-2nodes/weight_only_int8 |
35- | deepseek-ai/DeepSeek-R1 | weight_only_int8| 是| 2| deepseek-ai/DeepSeek-R1-MTP-2nodes/weight-only-int8 |
35+ | deepseek-ai/DeepSeek-R1 | weight_only_int8| 是| 2| deepseek-ai/DeepSeek-R1-MTP-2nodes/weight_only_int8 |
3636| deepseek-ai/DeepSeek-R1 | a8w8_fp8| 否| 2| deepseek-ai/DeepSeek-R1-2nodes/a8w8_fp8|
3737| deepseek-ai/DeepSeek-R1 | a8w8_fp8| 是| 2| deepseek-ai/DeepSeek-R1-MTP-2nodes/a8w8_fp8|
3838| deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B | weight_only_int8| -| -| deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/weight_only_int8 |
@@ -54,7 +54,7 @@ export MODEL_PATH=${MODEL_PATH:-$PWD}
5454export model_name=${model_name:- " deepseek-ai/DeepSeek-R1/weight_only_int4" }
5555docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
5656-v $MODEL_PATH :/models -e " model_name=${model_name} " \
57- -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
57+ -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
5858-c -ex ' export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=8 && start_server $model_name && tail -f /dev/null'
5959```
6060
@@ -80,15 +80,15 @@ export MODEL_PATH=${MODEL_PATH:-$PWD}
8080export model_name=${model_name:- " deepseek-ai/DeepSeek-R1-2nodes/weight_only_int8" }
8181docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
8282-v $MODEL_PATH :/models -e " model_name=${model_name} " \
83- -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
83+ -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
8484-c -ex ' export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
8585
8686# node2
8787export MODEL_PATH=${MODEL_PATH:- $PWD }
8888export model_name=${model_name:- " deepseek-ai/DeepSeek-R1-2nodes/weight_only_int8" }
8989docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
9090-v $MODEL_PATH :/models -e " model_name=${model_name} " \
91- -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
91+ -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
9292-c -ex ' export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
9393```
9494
@@ -100,15 +100,15 @@ export MODEL_PATH=${MODEL_PATH:-$PWD}
100100export model_name=${model_name:- " deepseek-ai/DeepSeek-R1-2nodes/a8w8_fp8" }
101101docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
102102-v $MODEL_PATH :/models -e " model_name=${model_name} " \
103- -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
103+ -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
104104-c -ex ' export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
105105
106106# node2
107107export MODEL_PATH=${MODEL_PATH:- $PWD }
108108export model_name=${model_name:- " deepseek-ai/DeepSeek-R1-2nodes/a8w8_fp8" }
109109docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
110110-v $MODEL_PATH :/models -e " model_name=${model_name} " \
111- -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
111+ -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
112112-c -ex ' export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
113113```
114114
@@ -117,11 +117,12 @@ docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_P
117117### deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
118118
119119``` shell
120+ export MODEL_PATH=${MODEL_PATH:- $PWD }
121+ export model_name=${model_name:- " deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/weight_only_int8" }
120122docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
121- -v /PATH_TO_MODEL/:/models \
122- -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
123- -c -ex ' model_name=${model_name:-"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/weight_only_int8"} && cd /opt/output/Serving && bash start_server.sh $model_name && tail -f /dev/null' \
124- && docker exec -it $( docker ps -lq) sh -c " while [ ! -f /opt/output/Serving/log/workerlog.0 ]; do sleep 1; done; tail -f /opt/output/Serving/log/workerlog.0"
123+ -v $MODEL_PATH :/models -e " model_name=${model_name} " \
124+ -dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
125+ -c -ex ' bash start_server.sh $model_name && tail -f /dev/null'
125126```
126127
127128### 请求服务化
@@ -235,17 +236,26 @@ python -m paddle.distributed.launch \
235236```
236237
237238两机 WINT8-TP16 推理
239+
238240``` shell
239- # 动态图推理
241+ # 启动2机推理前,需要保证2台机器节点可以互相 ping 通
242+ # 在第二个节点(slave)上 ping 第一个节点(master)
243+ ping 192.168.0.1
244+ # 在第一个节点(master)上 ping 第二个节点(slave)
245+ ping 192.168.0.2
246+ ```
247+
248+ ``` shell
249+ # 动态图推理 node1和node2命令均相同
240250export MODEL_TAG=deepseek-ai/DeepSeek-R1
241251export QUANT_MODE=weight_only_int8
242252export TOTAL_MAX_LENGTH=8192
243253export MAX_DEC_LEN=4096
244254export FLAGS_mla_use_tensorcore=1 # only support Hopper, Ampere should be 0
245255export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
246256export CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7"
247- mpirun python -m paddle.distributed.launch \
248- --gpus ${CUDA_VISIBLE_DEVICES} \
257+ python -m paddle.distributed.launch \
258+ --gpus ${CUDA_VISIBLE_DEVICES} --ips " 192.168.0.1,192.168.0.2 " \
249259 predictor.py \
250260 --model_name_or_path ${MODEL_TAG} \
251261 --dtype bfloat16 \
@@ -258,13 +268,13 @@ mpirun python -m paddle.distributed.launch \
258268 --mla_use_matrix_absorption 1
259269
260270
261- # 动转静导出模型
271+ # 动转静导出模型 node1和node2命令均相同
262272export MODEL_TAG=deepseek-ai/DeepSeek-R1
263273export OUTPUT_PATH=/path/to/exported_model
264274export QUANT_MODE=weight_only_int8
265275export TOTAL_MAX_LENGTH=8192
266276export CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7"
267- mpirun python -m paddle.distributed.launch \
277+ python -m paddle.distributed.launch --ips " 192.168.0.1,192.168.0.2 " \
268278 --gpus ${CUDA_VISIBLE_DEVICES} \
269279 export_model.py \
270280 --model_name_or_path ${MODEL_TAG} \
@@ -277,15 +287,15 @@ mpirun python -m paddle.distributed.launch \
277287 --mla_use_matrix_absorption 1
278288
279289
280- # 静态图推理
290+ # 静态图推理 node1和node2命令均相同
281291export OUTPUT_PATH=/path/to/exported_model
282292export QUANT_MODE=weight_only_int8
283293export TOTAL_MAX_LENGTH=8192
284294export MAX_DEC_LEN=4096
285295export FLAGS_mla_use_tensorcore=1 # only support Hopper, Ampere should be 0
286296export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
287297export CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7"
288- mpirun python -m paddle.distributed.launch \
298+ python -m paddle.distributed.launch --ips " 192.168.0.1,192.168.0.2 " \
289299 --gpus ${CUDA_VISIBLE_DEVICES} \
290300 predictor.py \
291301 --model_name_or_path ${OUTPUT_PATH} \
@@ -300,16 +310,25 @@ mpirun python -m paddle.distributed.launch \
300310```
301311
302312两机 FP8-TP16 推理
313+
303314``` shell
304- # 动态图推理
315+ # 启动2机推理前,需要保证2台机器节点可以互相 ping 通
316+ # 在第二个节点(slave)上 ping 第一个节点(master)
317+ ping 192.168.0.1
318+ # 在第一个节点(master)上 ping 第二个节点(slave)
319+ ping 192.168.0.2
320+ ```
321+
322+ ``` shell
323+ # 动态图推理 node1和node2命令均相同
305324export MODEL_TAG=deepseek-ai/DeepSeek-R1-FP8
306325export QUANT_MODE=a8w8_fp8
307326export TOTAL_MAX_LENGTH=8192
308327export MAX_DEC_LEN=4096
309328export FLAGS_mla_use_tensorcore=1 # only support Hopper, Ampere should be 0
310329export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
311330export CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7"
312- mpirun python -m paddle.distributed.launch \
331+ python -m paddle.distributed.launch --ips " 192.168.0.1,192.168.0.2 " \
313332 --gpus ${CUDA_VISIBLE_DEVICES} \
314333 predictor.py \
315334 --model_name_or_path ${MODEL_TAG} \
@@ -324,13 +343,13 @@ mpirun python -m paddle.distributed.launch \
324343 --weight_block_size 128 128
325344
326345
327- # 动转静导出模型
346+ # 动转静导出模型 node1和node2命令均相同
328347export MODEL_TAG=deepseek-ai/DeepSeek-R1-FP8
329348export OUTPUT_PATH=/path/to/exported_model
330349export QUANT_MODE=a8w8_fp8
331350export TOTAL_MAX_LENGTH=8192
332351export CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7"
333- mpirun python -m paddle.distributed.launch \
352+ python -m paddle.distributed.launch --ips " 192.168.0.1,192.168.0.2 " \
334353 --gpus ${CUDA_VISIBLE_DEVICES} \
335354 export_model.py \
336355 --model_name_or_path ${MODEL_TAG} \
@@ -344,15 +363,15 @@ mpirun python -m paddle.distributed.launch \
344363 --weight_block_size 128 128
345364
346365
347- # 静态图推理
366+ # 静态图推理 node1和node2命令均相同
348367export OUTPUT_PATH=/path/to/exported_model
349368export QUANT_MODE=a8w8_fp8
350369export TOTAL_MAX_LENGTH=8192
351370export MAX_DEC_LEN=4096
352371export FLAGS_mla_use_tensorcore=1 # only support Hopper, Ampere should be 0
353372export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
354373export CUDA_VISIBLE_DEVICES=" 0,1,2,3,4,5,6,7"
355- mpirun python -m paddle.distributed.launch \
374+ python -m paddle.distributed.launch --ips " 192.168.0.1,192.168.0.2 " \
356375 --gpus ${CUDA_VISIBLE_DEVICES} \
357376 predictor.py \
358377 --model_name_or_path ${OUTPUT_PATH} \