Skip to content

Commit 6d45260

Browse files
authored
update doc (#10043)
1 parent 15a0675 commit 6d45260

File tree

5 files changed

+96
-126
lines changed

5 files changed

+96
-126
lines changed

csrc/setup.py

Lines changed: 0 additions & 73 deletions
This file was deleted.

llm/docs/predict/deepseek.md

Lines changed: 43 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
|模型名称|精度|MTP|节点数|静态图下载 model_name|
3131
|:------|:-:|:-:|:-:|:-:|
3232
| deepseek-ai/DeepSeek-R1 |weight_only_int4||1| deepseek-ai/DeepSeek-R1/weight_only_int4 |
33-
| deepseek-ai/DeepSeek-R1 |weight_only_int4||1| deepseek-ai/DeepSeek-R1-MTP/weight-only-int4 |
33+
| deepseek-ai/DeepSeek-R1 |weight_only_int4||1| deepseek-ai/DeepSeek-R1-MTP/weight_only_int4 |
3434
| deepseek-ai/DeepSeek-R1 |weight_only_int8||2| deepseek-ai/DeepSeek-R1-2nodes/weight_only_int8 |
35-
| deepseek-ai/DeepSeek-R1 |weight_only_int8||2| deepseek-ai/DeepSeek-R1-MTP-2nodes/weight-only-int8 |
35+
| deepseek-ai/DeepSeek-R1 |weight_only_int8||2| deepseek-ai/DeepSeek-R1-MTP-2nodes/weight_only_int8 |
3636
| deepseek-ai/DeepSeek-R1 |a8w8_fp8||2| deepseek-ai/DeepSeek-R1-2nodes/a8w8_fp8|
3737
| deepseek-ai/DeepSeek-R1 |a8w8_fp8||2| deepseek-ai/DeepSeek-R1-MTP-2nodes/a8w8_fp8|
3838
| deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B |weight_only_int8|-|-| deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/weight_only_int8 |
@@ -54,7 +54,7 @@ export MODEL_PATH=${MODEL_PATH:-$PWD}
5454
export model_name=${model_name:-"deepseek-ai/DeepSeek-R1/weight_only_int4"}
5555
docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
5656
-v $MODEL_PATH:/models -e "model_name=${model_name}" \
57-
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
57+
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
5858
-c -ex 'export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=8 && start_server $model_name && tail -f /dev/null'
5959
```
6060

@@ -80,15 +80,15 @@ export MODEL_PATH=${MODEL_PATH:-$PWD}
8080
export model_name=${model_name:-"deepseek-ai/DeepSeek-R1-2nodes/weight_only_int8"}
8181
docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
8282
-v $MODEL_PATH:/models -e "model_name=${model_name}" \
83-
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
83+
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
8484
-c -ex 'export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
8585

8686
# node2
8787
export MODEL_PATH=${MODEL_PATH:-$PWD}
8888
export model_name=${model_name:-"deepseek-ai/DeepSeek-R1-2nodes/weight_only_int8"}
8989
docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
9090
-v $MODEL_PATH:/models -e "model_name=${model_name}"\
91-
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
91+
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
9292
-c -ex 'export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
9393
```
9494

@@ -100,15 +100,15 @@ export MODEL_PATH=${MODEL_PATH:-$PWD}
100100
export model_name=${model_name:-"deepseek-ai/DeepSeek-R1-2nodes/a8w8_fp8"}
101101
docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
102102
-v $MODEL_PATH:/models -e "model_name=${model_name}" \
103-
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
103+
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
104104
-c -ex 'export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
105105

106106
# node2
107107
export MODEL_PATH=${MODEL_PATH:-$PWD}
108108
export model_name=${model_name:-"deepseek-ai/DeepSeek-R1-2nodes/a8w8_fp8"}
109109
docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
110110
-v $MODEL_PATH:/models -e "model_name=${model_name}" \
111-
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
111+
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
112112
-c -ex 'export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 && export MP_NUM=16 && export MP_NNODE=2 && export POD_0_IP=192.168.0.1 && export POD_IPS=192.168.0.1,192.168.0.2 && start_server $model_name && tail -f /dev/null'
113113
```
114114

@@ -117,11 +117,12 @@ docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_P
117117
### deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
118118

119119
```shell
120+
export MODEL_PATH=${MODEL_PATH:-$PWD}
121+
export model_name=${model_name:-"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/weight_only_int8"}
120122
docker run --gpus all --shm-size 32G --network=host --privileged --cap-add=SYS_PTRACE \
121-
-v /PATH_TO_MODEL/:/models \
122-
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v1.0 /bin/bash \
123-
-c -ex 'model_name=${model_name:-"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/weight_only_int8"} && cd /opt/output/Serving && bash start_server.sh $model_name && tail -f /dev/null'\
124-
&& docker exec -it $(docker ps -lq) sh -c "while [ ! -f /opt/output/Serving/log/workerlog.0 ]; do sleep 1; done; tail -f /opt/output/Serving/log/workerlog.0"
123+
-v /MODEL_PATH/:/models -e "model_name=${model_name}"\
124+
-dit ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlenlp:llm-serving-cuda124-cudnn9-v2.1 /bin/bash \
125+
-c -ex 'bash start_server.sh $model_name && tail -f /dev/null'
125126
```
126127

127128
### 请求服务化
@@ -235,17 +236,26 @@ python -m paddle.distributed.launch \
235236
```
236237

237238
两机 WINT8-TP16 推理
239+
238240
```shell
239-
# 动态图推理
241+
启动2机推理 需要保证2机器节点可以互相ping通
242+
# 第一个节点(master)
243+
ping 192.168.0.1
244+
# 第二个节点(slave)
245+
ping 192.168.0.2
246+
```
247+
248+
```shell
249+
# 动态图推理 node1和node2命令均相同
240250
export MODEL_TAG=deepseek-ai/DeepSeek-R1
241251
export QUANT_MODE=weight_only_int8
242252
export TOTAL_MAX_LENGTH=8192
243253
export MAX_DEC_LEN=4096
244254
export FLAGS_mla_use_tensorcore=1 # only supported on Hopper; on Ampere this should be 0
245255
export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
246256
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
247-
mpirun python -m paddle.distributed.launch \
248-
--gpus ${CUDA_VISIBLE_DEVICES} \
257+
python -m paddle.distributed.launch \
258+
--gpus ${CUDA_VISIBLE_DEVICES} --ips "192.168.0.1,192.168.0.2"\
249259
predictor.py \
250260
--model_name_or_path ${MODEL_TAG} \
251261
--dtype bfloat16 \
@@ -258,13 +268,13 @@ mpirun python -m paddle.distributed.launch \
258268
--mla_use_matrix_absorption 1
259269

260270

261-
# 动转静导出模型
271+
# 动转静导出模型 node1和node2命令均相同
262272
export MODEL_TAG=deepseek-ai/DeepSeek-R1
263273
export OUTPUT_PATH=/path/to/exported_model
264274
export QUANT_MODE=weight_only_int8
265275
export TOTAL_MAX_LENGTH=8192
266276
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
267-
mpirun python -m paddle.distributed.launch \
277+
python -m paddle.distributed.launch --ips "192.168.0.1,192.168.0.2"\
268278
--gpus ${CUDA_VISIBLE_DEVICES} \
269279
export_model.py \
270280
--model_name_or_path ${MODEL_TAG} \
@@ -277,15 +287,15 @@ mpirun python -m paddle.distributed.launch \
277287
--mla_use_matrix_absorption 1
278288

279289

280-
# 静态图推理
290+
# 静态图推理 node1和node2命令均相同
281291
export OUTPUT_PATH=/path/to/exported_model
282292
export QUANT_MODE=weight_only_int8
283293
export TOTAL_MAX_LENGTH=8192
284294
export MAX_DEC_LEN=4096
285295
export FLAGS_mla_use_tensorcore=1 # only supported on Hopper; on Ampere this should be 0
286296
export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
287297
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
288-
mpirun python -m paddle.distributed.launch \
298+
python -m paddle.distributed.launch --ips "192.168.0.1,192.168.0.2"\
289299
--gpus ${CUDA_VISIBLE_DEVICES} \
290300
predictor.py \
291301
--model_name_or_path ${OUTPUT_PATH} \
@@ -300,16 +310,25 @@ mpirun python -m paddle.distributed.launch \
300310
```
301311

302312
两机 FP8-TP16 推理
313+
303314
```shell
304-
# 动态图推理
315+
启动2机推理 需要保证2机器节点可以互相ping通
316+
# 第一个节点(master)
317+
ping 192.168.0.1
318+
# 第二个节点(slave)
319+
ping 192.168.0.2
320+
```
321+
322+
```shell
323+
# 动态图推理 node1和node2命令均相同
305324
export MODEL_TAG=deepseek-ai/DeepSeek-R1-FP8
306325
export QUANT_MODE=a8w8_fp8
307326
export TOTAL_MAX_LENGTH=8192
308327
export MAX_DEC_LEN=4096
309328
export FLAGS_mla_use_tensorcore=1 # only supported on Hopper; on Ampere this should be 0
310329
export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
311330
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
312-
mpirun python -m paddle.distributed.launch \
331+
python -m paddle.distributed.launch --ips "192.168.0.1,192.168.0.2"\
313332
--gpus ${CUDA_VISIBLE_DEVICES} \
314333
predictor.py \
315334
--model_name_or_path ${MODEL_TAG} \
@@ -324,13 +343,13 @@ mpirun python -m paddle.distributed.launch \
324343
--weight_block_size 128 128
325344

326345

327-
# 动转静导出模型
346+
# 动转静导出模型 node1和node2命令均相同
328347
export MODEL_TAG=deepseek-ai/DeepSeek-R1-FP8
329348
export OUTPUT_PATH=/path/to/exported_model
330349
export QUANT_MODE=a8w8_fp8
331350
export TOTAL_MAX_LENGTH=8192
332351
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
333-
mpirun python -m paddle.distributed.launch \
352+
python -m paddle.distributed.launch --ips "192.168.0.1,192.168.0.2"\
334353
--gpus ${CUDA_VISIBLE_DEVICES} \
335354
export_model.py \
336355
--model_name_or_path ${MODEL_TAG} \
@@ -344,15 +363,15 @@ mpirun python -m paddle.distributed.launch \
344363
--weight_block_size 128 128
345364

346365

347-
# 静态图推理
366+
# 静态图推理 node1和node2命令均相同
348367
export OUTPUT_PATH=/path/to/exported_model
349368
export QUANT_MODE=a8w8_fp8
350369
export TOTAL_MAX_LENGTH=8192
351370
export MAX_DEC_LEN=4096
352371
export FLAGS_mla_use_tensorcore=1 # only supported on Hopper; on Ampere this should be 0
353372
export FLAGS_cascade_attention_max_partition_size=${TOTAL_MAX_LENGTH}
354373
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
355-
mpirun python -m paddle.distributed.launch \
374+
python -m paddle.distributed.launch --ips "192.168.0.1,192.168.0.2"\
356375
--gpus ${CUDA_VISIBLE_DEVICES} \
357376
predictor.py \
358377
--model_name_or_path ${OUTPUT_PATH} \

0 commit comments

Comments
 (0)