Closed
Commits
51 commits
7f45ff5
Fix bugs (#4616)
Bobholamovic Oct 16, 2025
680a148
Do not use direct links (#4618)
Bobholamovic Oct 16, 2025
e1664fe
fix PaddleOCR-VL name - local (#4617)
zhang-prog Oct 16, 2025
1c2f286
Fix mkdocs.yml (#4619)
Bobholamovic Oct 16, 2025
682c15b
Fix typos (#4621)
Bobholamovic Oct 16, 2025
a2be29d
support concatenate_markdown_pages (#4622)
changdazhou Oct 16, 2025
0e27be0
Bump version to 3.3.1
Bobholamovic Oct 16, 2025
518d72c
PaddleOCR-VL, PP-DocLayoutV2 has been upload to models hosting platform
TingquanGao Oct 16, 2025
b661273
Bump version to 3.2.2
Bobholamovic Oct 16, 2025
f887a30
genai plugin: add wheel package (#4626)
zhang-prog Oct 16, 2025
599daa2
bugfix: map PaddleOCR-VL-0.9B to PaddleOCR-VL
TingquanGao Oct 16, 2025
d442a10
Bump version to 3.3.3
Bobholamovic Oct 16, 2025
5c8b02f
[cherry-pick] use FlashAttention 2.8.2 (#4631)
zhang-prog Oct 17, 2025
d82d091
Fix HPS bugs (#4633)
Bobholamovic Oct 17, 2025
2ffd6c7
[cherry-pick] fix typo (#4634)
zhang-prog Oct 17, 2025
c8d21e6
Cap langchain version
Bobholamovic Oct 20, 2025
0d397f5
[Cherry-Pick] #4643 #4645 #4648 (#4649)
Bobholamovic Oct 20, 2025
9824644
Merge branch 'develop' into release/3.3
Bobholamovic Oct 20, 2025
6622b3b
Bump version to 3.3.4
Bobholamovic Oct 20, 2025
677ea06
Fix assemble script (#4650)
Bobholamovic Oct 20, 2025
5955254
bugfix: fix PaddleOCR-VL downloading from AIStudio
TingquanGao Oct 23, 2025
c1ca660
fix: use cv2.imdecode to support reading files with Chinese character…
TingquanGao Oct 23, 2025
32fe2f7
support set max_new_tokens
changdazhou Oct 23, 2025
f6bb816
Remove broken quantization_config logic (#4654)
Bobholamovic Oct 23, 2025
803bdd1
PaddleOCR-VL supports FP32 (#4658)
Bobholamovic Oct 23, 2025
b2ebed2
Bump version to 3.3.5
Bobholamovic Oct 23, 2025
406d84d
PaddleOCR-VL supports CPU and CUDA 11 (#4666)
Bobholamovic Oct 24, 2025
7905c55
update docs
changdazhou Oct 24, 2025
61932c3
compatible with python3.9
changdazhou Oct 24, 2025
eaa32c1
support print parsing_res_list
changdazhou Oct 27, 2025
9579f20
update for new chat_template (#4672)
zhang-prog Oct 27, 2025
0af6510
[cherry-pick]mv crop formula from gen_ai_client to pipeline (#4679)
changdazhou Oct 28, 2025
e0c509e
use model cache files when network is unavailable (#4676)
TingquanGao Oct 28, 2025
1da53a1
[Feat] Add genai-vllm-server Dockerfile and build script (#4680)
Bobholamovic Oct 28, 2025
802629c
Bump version to 3.3.6
Bobholamovic Oct 28, 2025
89d37a2
Merge branch 'develop' into release/3.3
Bobholamovic Nov 5, 2025
2348ac0
Bump version to 3.3.7
Bobholamovic Nov 5, 2025
56078fe
Fix bugs (#4707)
Bobholamovic Nov 5, 2025
ddacf07
Bump version to 3.3.8
Bobholamovic Nov 5, 2025
f89f8c7
Fix bugs (#4708)
Bobholamovic Nov 5, 2025
8cb7434
Fix bug (#4709)
Bobholamovic Nov 5, 2025
a88b267
disable mkldnn by default for PP-DocLayoutV2
TingquanGao Nov 10, 2025
54baddb
[Feat] Support vLLM deployment on DCUs (#4710)
Bobholamovic Nov 10, 2025
24acf03
Bump FD version from 2.3.0rc0 to 2.3.0 (#4721)
Bobholamovic Nov 10, 2025
2526aad
Bump version to 3.3.9
Bobholamovic Nov 10, 2025
acab8aa
Replace naive eager attention with SDPA (#4725)
Bobholamovic Nov 13, 2025
dc0075e
HPI Supports paddle 3.2 (#4754)
Bobholamovic Nov 21, 2025
d8719aa
update fd config (#4760)
zhang-prog Nov 24, 2025
1bec5c2
Bump version to 3.3.10
Bobholamovic Nov 24, 2025
acdc053
Fix: Update imports to resolve ModuleNotFoundError for 'langchain.doc…
Yugsolanki Nov 26, 2025
d50bb5f
Refactor: Eliminate langchain_classic dependency using core langchain…
Yugsolanki Nov 26, 2025
5 changes: 3 additions & 2 deletions deploy/genai_vllm_server_docker/Dockerfile
@@ -13,9 +13,10 @@ RUN python -m pip install "paddlex${PADDLEX_VERSION}"

 ARG BUILD_FOR_SM120=false
 RUN if [ "${BUILD_FOR_SM120}" = 'true' ]; then \
-        python -m pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.14/flash_attn-2.8.3+cu128torch2.8-cp310-cp310-linux_x86_64.whl \
+        python -m pip install torch==2.8.0 https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.11/flash_attn-2.8.3%2Bcu128torch2.8-cp310-cp310-linux_x86_64.whl; \
     else \
-        python -m pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.14/flash_attn-2.8.2+cu128torch2.8-cp310-cp310-linux_x86_64.whl \
+        python -m pip install torch==2.8.0 https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.14/flash_attn-2.8.2+cu128torch2.8-cp310-cp310-linux_x86_64.whl; \
     fi \
     && paddlex --install genai-vllm-server

 EXPOSE 8080
4 changes: 2 additions & 2 deletions deploy/genai_vllm_server_docker/build.sh
@@ -21,8 +21,8 @@ while [[ $# -gt 0 ]]; do
            shift
            ;;
        *)
-           echo "Unknown option: $1"
-           exit 1
+           echo "Unknown option: $1" >&2
+           exit 2
            ;;
    esac
done
1 change: 1 addition & 0 deletions deploy/hps/server_env/Dockerfile
@@ -46,6 +46,7 @@ ENV PYTHONUNBUFFERED=1
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PIP_INDEX_URL=${PIP_INDEX_URL}

+RUN python -m pip install pip==25.2

 # Requirement collection
 FROM base AS rc
2 changes: 1 addition & 1 deletion deploy/hps/server_env/cpu_version.txt
@@ -1 +1 @@
-0.3.9
+0.3.10
2 changes: 1 addition & 1 deletion deploy/hps/server_env/gpu_version.txt
@@ -1 +1 @@
-0.3.10
+0.3.11
1 change: 1 addition & 0 deletions deploy/hps/server_env/requirements/app.in
@@ -4,4 +4,5 @@ numpy >= 1.24
 opencv-contrib-python == 4.10.0.84
 pycocotools >= 2
 pydantic >= 2
+safetensors @ https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
 typing-extensions >= 4.11
2 changes: 1 addition & 1 deletion deploy/hps/server_env/requirements/cpu.in
@@ -1 +1 @@
-paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.1.1-cp310-cp310-linux_x86_64.whl
+paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.2.1-cp310-cp310-linux_x86_64.whl
13 changes: 10 additions & 3 deletions deploy/hps/server_env/requirements/cpu.txt
@@ -171,6 +171,7 @@ lxml==5.3.1
     # via
     #   paddlex (../../../setup.py)
     #   premailer
+    #   python-docx
 markupsafe==3.0.2
     # via jinja2
 marshmallow==3.26.1
@@ -238,7 +239,7 @@ packaging==24.2
     #   matplotlib
     #   paddlex (../../../setup.py)
     #   scikit-image
-paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.1.1-cp310-cp310-linux_x86_64.whl
+paddlepaddle @ https://paddle-whl.bj.bcebos.com/stable/cpu/paddlepaddle/paddlepaddle-3.2.1-cp310-cp310-linux_x86_64.whl
     # via -r requirements/cpu.in
 pandas==1.3.5
     # via paddlex (../../../setup.py)
@@ -295,6 +296,8 @@ python-dateutil==2.9.0.post0
     # via
     #   matplotlib
     #   pandas
+python-docx==1.2.0
+    # via paddlex (../../../setup.py)
 pytz==2025.1
     # via pandas
 pyyaml==6.0.2
@@ -326,8 +329,11 @@ ruamel-yaml==0.18.10
     # via paddlex (../../../setup.py)
 ruamel-yaml-clib==0.2.12
     # via ruamel-yaml
-safetensors==0.6.2
-    # via paddlex (../../../setup.py)
+safetensors @ https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
+    # via
+    #   -r requirements/app.in
+    #   paddlepaddle
+    #   paddlex (../../../setup.py)
 scikit-image==0.24.0
     # via paddlex (../../../setup.py)
 scikit-learn==1.6.1
@@ -396,6 +402,7 @@ typing-extensions==4.12.2
     #   paddlex (../../../setup.py)
     #   pydantic
     #   pydantic-core
+    #   python-docx
     #   sqlalchemy
     #   typing-inspect
     #   uvicorn
2 changes: 1 addition & 1 deletion deploy/hps/server_env/requirements/gpu.in
@@ -1 +1 @@
-paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.1.1%2Bfc-cp310-cp310-linux_x86_64.whl
+paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.2.1%2Bfc-cp310-cp310-linux_x86_64.whl
13 changes: 10 additions & 3 deletions deploy/hps/server_env/requirements/gpu.txt
@@ -171,6 +171,7 @@ lxml==5.3.1
     # via
     #   paddlex (../../../setup.py)
     #   premailer
+    #   python-docx
 markupsafe==3.0.2
     # via jinja2
 marshmallow==3.26.1
@@ -238,7 +239,7 @@ packaging==24.2
     #   matplotlib
     #   paddlex (../../../setup.py)
     #   scikit-image
-paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.1.1%2Bfc-cp310-cp310-linux_x86_64.whl
+paddlepaddle-gpu @ https://paddle-model-ecology.bj.bcebos.com/paddlex/PaddleX3.0/deploy/deps/paddlepaddle/paddlepaddle_gpu-3.2.1%2Bfc-cp310-cp310-linux_x86_64.whl
     # via -r requirements/gpu.in
 pandas==1.3.5
     # via paddlex (../../../setup.py)
@@ -295,6 +296,8 @@ python-dateutil==2.9.0.post0
     # via
     #   matplotlib
     #   pandas
+python-docx==1.2.0
+    # via paddlex (../../../setup.py)
 pytz==2025.1
     # via pandas
 pyyaml==6.0.2
@@ -326,8 +329,11 @@ ruamel-yaml==0.18.10
     # via paddlex (../../../setup.py)
 ruamel-yaml-clib==0.2.12
     # via ruamel-yaml
-safetensors==0.6.2
-    # via paddlex (../../../setup.py)
+safetensors @ https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
+    # via
+    #   -r requirements/app.in
+    #   paddlepaddle-gpu
+    #   paddlex (../../../setup.py)
 scikit-image==0.24.0
     # via paddlex (../../../setup.py)
 scikit-learn==1.6.1
@@ -396,6 +402,7 @@ typing-extensions==4.12.2
     #   paddlex (../../../setup.py)
     #   pydantic
     #   pydantic-core
+    #   python-docx
     #   sqlalchemy
     #   starlette
     #   typing-inspect
4 changes: 2 additions & 2 deletions docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md
@@ -1038,8 +1038,8 @@ paddlex --get_pipeline_config PaddleOCR-VL
 VLRecognition:
   ...
   genai_config:
-    backend: vllm-server
-    server_url: http://127.0.0.1:8118/v1
+    backend: vllm
+    server_url: http://127.0.0.1:8118
 ```

 Afterwards, the pipeline can be invoked with the modified configuration file, for example via the CLI:
2 changes: 1 addition & 1 deletion paddlex/.version
@@ -1 +1 @@
-3.3.0
+3.3.10
11 changes: 11 additions & 0 deletions paddlex/inference/genai/backends/vllm.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from ....utils import logging
 from ....utils.deps import is_genai_engine_plugin_available, require_genai_engine_plugin
 from ..configs.utils import (
     backend_config_to_args,
@@ -61,6 +62,16 @@ def run_vllm_server(host, port, model_name, model_dir, config, chat_template_pat
         },
     )

+    import torch
+
+    if torch.version.hip is not None and torch.version.cuda is None:
+        # For DCU
+        if "api-server-count" in config:
+            logging.warning(
+                "Key 'api-server-count' will be popped as it is not supported"
+            )
+            config.pop("api-server-count")
+
     args = backend_config_to_args(config)
     args = parser.parse_args(args)
     validate_parsed_serve_args(args)
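The DCU branch above strips an option the backend cannot handle before the config is turned into CLI args. The guard can be isolated as a small framework-free sketch; `drop_unsupported_keys` is a hypothetical helper name, not PaddleX API:

```python
import logging


def drop_unsupported_keys(config, unsupported=("api-server-count",)):
    """Remove config keys a backend cannot handle, warning about each drop.

    `config` is a plain dict of CLI-style option names, mirroring the backend
    config that the plugin later converts to argv.
    """
    config = dict(config)  # do not mutate the caller's dict
    for key in unsupported:
        if key in config:
            logging.warning("Key %r will be popped as it is not supported", key)
            config.pop(key)
    return config


cfg = drop_unsupported_keys({"api-server-count": 2, "max-model-len": 16384})
```

Copying the dict keeps the caller's config intact, which the in-tree code does not need because it owns the dict it mutates.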
2 changes: 1 addition & 1 deletion paddlex/inference/genai/configs/paddleocr_vl_09b.py
@@ -20,7 +20,7 @@ def get_config(backend):
             "max-model-len": 16384,
             "max-num-batched-tokens": 16384,
             "max-num-seqs": 256,
-            "workers": 2,
+            "workers": 4,
             "graph-optimization-config": '{"graph_opt_level":0, "use_cudagraph":true}',
         }
     elif backend == "vllm":
@@ -296,7 +296,7 @@ def forward(self, hidden_states):
         3. Scale by learned weight parameter
         - Maintains original dtype for numerical stability during computation
         """
-        if self.config.fuse_rms_norm:
+        if hidden_states.dtype != paddle.float16 and self.config.fuse_rms_norm:
             return fused_rms_norm_ext(
                 hidden_states, self.weight, self.variance_epsilon
             )[0].astype(self.weight.dtype)
@@ -854,8 +854,15 @@ def core_attn(
         v = tensor.transpose(x=v, perm=perm)

         replicate = self.config.num_attention_heads // self.config.num_key_value_heads
+        is_float16 = k.dtype == paddle.float16
+        if is_float16:
+            k = k.cast(paddle.float32)
+            v = v.cast(paddle.float32)
         k = paddle.repeat_interleave(k, replicate, axis=1)
         v = paddle.repeat_interleave(v, replicate, axis=1)
+        if is_float16:
+            k = k.cast(paddle.float16)
+            v = v.cast(paddle.float16)

         scale_qk_coeff = self.config.scale_qk_coeff * self.head_dim**0.5
         product = paddle.matmul(x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True)
70 changes: 46 additions & 24 deletions paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py
@@ -42,6 +42,7 @@
 import paddle.nn as nn
 import paddle.nn.functional as F

+from ......utils.env import get_gpu_compute_capability
 from ....common.vlm.activations import ACT2FN
 from ....common.vlm.transformers import PretrainedModel
 from ....common.vlm.transformers.model_outputs import (
@@ -100,15 +101,22 @@ def eager_attention_forward(
     dropout: float = 0.0,
     **kwargs,
 ):
-    attn_weights = paddle.matmul(query, key.transpose((0, 1, 3, 2))) * scaling
+    origin_dtype = query.dtype
+
+    attn_weights = paddle.matmul(x=query.scale(scaling), y=key, transpose_y=True)
+    attn_weights = attn_weights.cast(paddle.float32)
+
     if attention_mask is not None:
+        attention_mask = attention_mask.cast(paddle.float32)
         attn_weights = attn_weights + attention_mask

-    attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query.dtype)
+    attn_weights = F.softmax(attn_weights, axis=-1)
+    attn_weights = attn_weights.cast(origin_dtype)
+
     attn_weights = F.dropout(attn_weights, p=dropout, training=module.training)

     attn_output = paddle.matmul(attn_weights, value)
-    attn_output = attn_output.transpose((0, 2, 1, 3)).contiguous()
+    attn_output = attn_output.transpose((0, 2, 1, 3))

     return attn_output, attn_weights

@@ -130,6 +138,9 @@ def __init__(self, config):
         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

+        cap = get_gpu_compute_capability()
+        self._supports_sdpa = cap >= (8, 0) if cap is not None else False
+
     def forward(
         self,
         hidden_states: paddle.Tensor,  # [B, L, D]
@@ -138,44 +149,55 @@ def forward(
         cu_seqlens: Optional[List[paddle.Tensor]] = None,
         rope_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]] = None,  # (cos, sin)
     ):
+        if output_attentions:
+            raise NotImplementedError
+
         B, L, D = hidden_states.shape

         q = self.q_proj(hidden_states)
         k = self.k_proj(hidden_states)
         v = self.v_proj(hidden_states)

-        # [B, L, H, Dh]
+
         q = q.reshape([B, L, self.num_heads, self.head_dim])
         k = k.reshape([B, L, self.num_heads, self.head_dim])
         v = v.reshape([B, L, self.num_heads, self.head_dim])
         if rope_emb is not None:
             cos, sin = rope_emb
             q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)

-        # → [B, H, L, Dh]
-        q = q.transpose([0, 2, 1, 3])
-        k = k.transpose([0, 2, 1, 3])
-        v = v.transpose([0, 2, 1, 3])
-
-        attn_output, attn_weights = eager_attention_forward(
-            self,
-            q,
-            k,
-            v,
-            attention_mask,
-            is_causal=self.is_causal,
-            scaling=self.scale,
-            dropout=0.0 if not self.training else self.dropout,
-        )
-        attn_output = attn_output.reshape([B, L, D]).contiguous()
+        if not self._supports_sdpa or q.dtype == paddle.float32:
+            # → [B, H, L, Dh]
+            q = q.transpose([0, 2, 1, 3])
+            k = k.transpose([0, 2, 1, 3])
+            v = v.transpose([0, 2, 1, 3])
+
+            attn_output, _ = eager_attention_forward(
+                self,
+                q,
+                k,
+                v,
+                attention_mask,
+                is_causal=self.is_causal,
+                scaling=self.scale,
+                dropout=0.0 if not self.training else self.dropout,
+            )
+            attn_output = attn_output.reshape([B, L, D])
+        else:
+            attn_output = paddle.nn.functional.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attention_mask,
+                dropout_p=self.dropout,
+                is_causal=self.is_causal,
+                training=self.training,
+            )
+            attn_output = attn_output.reshape([B, L, D])

         attn_output = self.out_proj(attn_output)

-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights
+        return attn_output, None


 class SiglipVisionEmbeddings(nn.Layer):
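The SDPA branch and the eager fallback above compute the same function, softmax(QK^T * scaling)V; only the kernel differs. A dependency-free sketch of that function for a single query vector (the helpers here are hypothetical, not PaddleX API):

```python
import math


def softmax(xs):
    # Numerically stable softmax over a list of floats.
    m = max(xs)
    es = [math.exp(x - m) for x in xs]
    s = sum(es)
    return [e / s for e in es]


def eager_attention(q, k, v, scaling):
    """Toy single-query attention: softmax(q . K^T * scaling) . V.

    `q` is one query vector; `k` and `v` are lists of key/value vectors.
    Mirrors the eager path's math (scale, matmul, softmax, weighted sum),
    without the batching, heads, masking, or dropout of the real code.
    """
    scores = [scaling * sum(qi * ki for qi, ki in zip(q, kv)) for kv in k]
    weights = softmax(scores)
    dim = len(v[0])
    return [sum(w * vec[d] for w, vec in zip(weights, v)) for d in range(dim)]
```

When all scores are equal, the weights are uniform and the output is the mean of the value vectors, which is a handy sanity check for either branch.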
9 changes: 7 additions & 2 deletions paddlex/inference/models/doc_vlm/predictor.py
@@ -29,7 +29,7 @@
 from ....utils.deps import require_genai_client_plugin
 from ....utils.device import TemporaryDeviceChanger
 from ...common.batch_sampler import DocVLMBatchSampler
-from ...utils.misc import is_bfloat16_available
+from ...utils.misc import is_bfloat16_available, is_float16_available
 from ..base import BasePredictor
 from .result import DocVLMResult

@@ -54,7 +54,12 @@ def __init__(self, *args, **kwargs):

         if self._use_local_model:
             self.device = kwargs.get("device", None)
-            self.dtype = "bfloat16" if is_bfloat16_available(self.device) else "float32"
+            if is_bfloat16_available(self.device):
+                self.dtype = "bfloat16"
+            elif is_float16_available(self.device):
+                self.dtype = "float16"
+            else:
+                self.dtype = "float32"

         self.infer, self.processor = self._build(**kwargs)
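The predictor change replaces a two-way dtype choice with a three-way fallback chain. Pulled out as a pure function (hypothetical name; the real code queries device capabilities via `is_bfloat16_available`/`is_float16_available`), the selection logic is just:

```python
def select_dtype(bf16_available: bool, fp16_available: bool) -> str:
    """Pick the best-supported reduced-precision dtype.

    Mirrors the predictor's fallback order: prefer bfloat16, then float16,
    and only fall back to full float32 when neither is available.
    """
    if bf16_available:
        return "bfloat16"
    if fp16_available:
        return "float16"
    return "float32"
```

Together with the float16 guards added to the modeling code (RMSNorm and `repeat_interleave`), this is what lets PaddleOCR-VL run on devices without bfloat16 support.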
4 changes: 2 additions & 2 deletions paddlex/inference/pipelines/components/retriever/base.py
@@ -22,8 +22,8 @@
 from .....utils.subclass_register import AutoRegisterABCMetaClass

 if is_dep_available("langchain"):
-    from langchain.docstore.document import Document
-    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_core.documents.base import Document
+    from langchain_text_splitters.character import RecursiveCharacterTextSplitter
 if is_dep_available("langchain-community"):
     from langchain_community import vectorstores
     from langchain_community.vectorstores import FAISS
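This diff moves the imports from the deprecated `langchain.docstore`/`langchain.text_splitter` paths to their `langchain_core`/`langchain_text_splitters` homes, resolving the ModuleNotFoundError from the commit messages above. A generic sketch of the compatibility pattern, trying the new module path first and falling back to the old one (`import_attr` is a hypothetical helper; the example uses stdlib modules to stand in for the langchain paths):

```python
import importlib


def import_attr(module_candidates, attr):
    """Resolve `attr` from the first importable module in `module_candidates`."""
    last_exc = None
    for mod_name in module_candidates:
        try:
            module = importlib.import_module(mod_name)
        except ImportError as exc:
            last_exc = exc
            continue
        return getattr(module, attr)
    raise ImportError(
        f"none of {module_candidates!r} provide {attr!r}"
    ) from last_exc


# Stdlib stand-in: the first candidate fails to import, the second succeeds.
sqrt = import_attr(["definitely_not_a_module_xyz", "math"], "sqrt")
```

The PR itself pins to the new paths outright rather than probing, which is simpler when the minimum supported langchain version is known.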
9 changes: 9 additions & 0 deletions paddlex/inference/utils/hpi.py
@@ -24,6 +24,7 @@
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated, TypeAlias

+from ...utils import logging
 from ...utils.deps import function_requires_deps, is_paddle2onnx_plugin_available
 from ...utils.env import get_paddle_cuda_version, get_paddle_version
 from ...utils.flags import USE_PIR_TRT
@@ -156,6 +157,14 @@ def suggest_inference_backend_and_config(
         return None, f"Inference backend {repr(hpi_config.backend)} is unavailable."

     paddle_version = get_paddle_version()
+
+    if paddle_version[:3] >= (3, 1, 0):
+        logging.debug(
+            "Paddle version %s is not supported yet. The prior knowledge of Paddle 3.1.1 will be used.",
+            paddle_version,
+        )
+        paddle_version = (3, 1, 1, None)
+
     if (3, 0) <= paddle_version[:2] <= (3, 1) and paddle_version[3] is None:
         if paddle_version[2] == 0:
             paddle_version = f"paddle{paddle_version[0]}{paddle_version[1]}"
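The hpi.py change clamps any Paddle version at or above 3.1.0 down to the 3.1.1 "prior knowledge" entry before backend selection, which is how HPI gains Paddle 3.2 support without a new knowledge table. The clamping step, isolated as a pure function (hypothetical name; the real code also emits the debug log shown in the diff):

```python
def clamp_paddle_version(paddle_version):
    """Map newer Paddle versions onto the newest version with known
    backend prior knowledge, as suggest_inference_backend_and_config does.

    `paddle_version` is a (major, minor, patch, tag) tuple, with `tag`
    set to None for official releases; tuple comparison gives the usual
    version ordering.
    """
    if paddle_version[:3] >= (3, 1, 0):
        return (3, 1, 1, None)
    return paddle_version
```

Note the threshold in the diff is `>= (3, 1, 0)`, so 3.1.0 itself is also remapped onto the 3.1.1 knowledge, not only versions newer than 3.1.1.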