From b87ac5010bba6945a02d7684e83a54c7da552c9c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 18:42:24 +0800
Subject: [PATCH 01/33] feat(pt_expt): forward_common_lower_graph_exportable
 trace target for graph .pt2 export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add make_model.forward_common_lower_graph_exportable: make_fx trace of
  forward_common_lower_graph with edge_vec as the autograd leaf; symbolic
  tracing is made possible by the two data-dependent shape fixes described
  in the edge_transform_output.py item below.
- Add EnergyModel.forward_lower_graph_exportable: wraps the above with a
  second make_fx pass that translates internal keys to public names
  (atom_energy, energy, force, virial, atom_virial).
- Fix edge_transform_output.py: replace int(n_node.sum()) with
  next(iter(fit_ret.values())).shape[0] (static shape attr, trace-safe);
  thread node_capacity through edge_energy_deriv -> edge_force_virial so
  that segment_sum sizes are static under make_fx symbolic mode.
- Add edge_energy_deriv node_capacity param (None = eager fallback).
- Test: source/tests/pt_expt/model/test_graph_export.py — written test-first
  (red -> green); verifies that the traced module reproduces the eager
  energy_redu (rtol=atol=1e-10).
---
 deepmd/pt_expt/model/edge_transform_output.py |  17 ++-
 deepmd/pt_expt/model/ener_model.py            | 103 ++++++++++++++++++
 deepmd/pt_expt/model/make_model.py            |  78 +++++++++++++
 .../tests/pt_expt/model/test_graph_export.py  |  74 +++++++++++++
 4 files changed, 269 insertions(+), 3 deletions(-)
 create mode 100644 source/tests/pt_expt/model/test_graph_export.py

diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 565e155157..1eb3e4363b 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -31,6 +31,7 @@ def edge_energy_deriv(
     n_node: torch.Tensor,
     do_atomic_virial: bool = False,
     create_graph: bool = False,
+    node_capacity: int | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]:
     """Return (force, atom_virial_or_None, virial) from a graph energy.
 
@@ -53,6 +54,10 @@ def edge_energy_deriv(
         whether to materialize the per-atom virial (else ``None`` is returned).
     create_graph
         whether the backward retains a graph (training, for second-order grad).
+    node_capacity
+        Static node-axis size ``N``.  ``None`` (eager default) falls back to
+        ``int(n_node.sum())``.  Pass a static value (e.g. ``atype.shape[0]``)
+        to keep this function trace-safe under ``make_fx``/``torch.export``.
 
     Returns
     -------
@@ -70,7 +75,7 @@ def edge_energy_deriv(
         retain_graph=True,
     )
     force, atom_virial, virial = edge_force_virial(
-        g_e, edge_vec, edge_index, edge_mask, n_node
+        g_e, edge_vec, edge_index, edge_mask, n_node, node_capacity=node_capacity
     )
     return force, (atom_virial if do_atomic_virial else None), virial
 
@@ -132,8 +137,13 @@ def fit_output_to_model_output_graph(
     n_node = graph.n_node
     redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
     nf = int(n_node.shape[0])
-    N = int(n_node.sum())
-    frame_id = frame_id_from_n_node(n_node)  # (N,) int64 frame index per atom
+    # Derive N from the fitting output's leading shape rather than int(n_node.sum()).
+    # shape attributes are always static Python ints (or SymInts in symbolic-mode
+    # tracing) and are trace-safe; reading a tensor VALUE via int() is not.
+    N = next(iter(fit_ret.values())).shape[0]
+    frame_id = frame_id_from_n_node(
+        n_node, n_total=N
+    )  # (N,) int64 frame index per atom
     model_ret: dict[str, torch.Tensor] = dict(fit_ret.items())
     for kk, vv in fit_ret.items():
         vdef = fit_output_def[kk]
@@ -174,6 +184,7 @@ def fit_output_to_model_output_graph(
                 n_node,
                 do_atomic_virial=(vdef.c_differentiable and do_atomic_virial),
                 create_graph=create_graph,
+                node_capacity=N,
             )
             # force (N, 3) -> (N, 1, 3)  [flat; caller unravels at I/O boundary]
             ff_list.append(force.reshape(N, 1, 3))
diff --git a/deepmd/pt_expt/model/ener_model.py b/deepmd/pt_expt/model/ener_model.py
index 6347382135..af9806f083 100644
--- a/deepmd/pt_expt/model/ener_model.py
+++ b/deepmd/pt_expt/model/ener_model.py
@@ -247,3 +247,106 @@ def fn(
         return make_fx(fn, **make_fx_kwargs)(
             extended_coord, extended_atype, nlist, mapping, fparam, aparam, charge_spin
         )
+
+    def forward_lower_graph_exportable(
+        self,
+        atype: torch.Tensor,
+        n_node: torch.Tensor,
+        edge_index: torch.Tensor,
+        edge_vec: torch.Tensor,
+        edge_mask: torch.Tensor,
+        fparam: torch.Tensor | None = None,
+        aparam: torch.Tensor | None = None,
+        do_atomic_virial: bool = False,
+        charge_spin: torch.Tensor | None = None,
+        **make_fx_kwargs: Any,
+    ) -> torch.nn.Module:
+        """Trace ``forward_common_lower_graph`` into an exportable module with
+        public output keys.
+
+        Delegates to ``forward_common_lower_graph_exportable`` for tracing,
+        then translates the internal keys to the ``forward_lower`` convention.
+
+        Parameters
+        ----------
+        atype
+            (N,) flat local atom types, ``N == sum(n_node)``.
+        n_node
+            (nf,) per-frame local atom counts.
+        edge_index
+            (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+        edge_vec
+            (E, 3) neighbor-minus-center edge vectors (sample for tracing).
+        edge_mask
+            (E,) valid-edge mask (sample for tracing).
+        fparam, aparam, do_atomic_virial, charge_spin
+            As in ``forward_lower``.
+        **make_fx_kwargs
+            Extra keyword arguments forwarded to ``make_fx``
+            (e.g. ``tracing_mode="symbolic"``).
+
+        Returns
+        -------
+        torch.nn.Module
+            A traced module whose ``forward`` accepts
+            ``(atype, n_node, edge_index, edge_vec, edge_mask,
+            fparam, aparam, charge_spin)`` and returns a dict with the
+            public keys: ``atom_energy``, ``energy``, ``force``,
+            ``virial``, ``atom_virial``.
+        """
+        traced = self.forward_common_lower_graph_exportable(
+            atype,
+            n_node,
+            edge_index,
+            edge_vec,
+            edge_mask,
+            fparam=fparam,
+            aparam=aparam,
+            charge_spin=charge_spin,
+            do_atomic_virial=do_atomic_virial,
+            **make_fx_kwargs,
+        )
+
+        # Translate internal keys to public convention.
+        # Capture model config at trace time via closures.
+        do_grad_r = self.do_grad_r("energy")
+        do_grad_c = self.do_grad_c("energy")
+
+        def fn(
+            atype: torch.Tensor,
+            n_node: torch.Tensor,
+            edge_index: torch.Tensor,
+            edge_vec: torch.Tensor,
+            edge_mask: torch.Tensor,
+            fparam: torch.Tensor | None,
+            aparam: torch.Tensor | None,
+            charge_spin: torch.Tensor | None,
+        ) -> dict[str, torch.Tensor]:
+            model_ret = traced(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                fparam,
+                aparam,
+                charge_spin,
+            )
+            model_predict: dict[str, torch.Tensor] = {}
+            model_predict["atom_energy"] = model_ret["energy"]
+            model_predict["energy"] = model_ret["energy_redu"]
+            if do_grad_r:
+                model_predict["force"] = model_ret["energy_derv_r"].squeeze(-2)
+            if do_grad_c:
+                model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2)
+                if do_atomic_virial:
+                    model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze(
+                        -2
+                    )
+            if "mask" in model_ret:
+                model_predict["mask"] = model_ret["mask"]
+            return model_predict
+
+        return make_fx(fn, **make_fx_kwargs)(
+            atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin
+        )
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 50ede240e4..cb2689c449 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -630,6 +630,84 @@ def fn(
                 model.need_sorted_nlist_for_lower = _orig_need_sort
             return traced
 
+        def forward_common_lower_graph_exportable(
+            self,
+            atype: torch.Tensor,
+            n_node: torch.Tensor,
+            edge_index: torch.Tensor,
+            edge_vec: torch.Tensor,
+            edge_mask: torch.Tensor,
+            fparam: torch.Tensor | None = None,
+            aparam: torch.Tensor | None = None,
+            do_atomic_virial: bool = False,
+            charge_spin: torch.Tensor | None = None,
+            **make_fx_kwargs: Any,
+        ) -> torch.nn.Module:
+            """make_fx trace of ``forward_common_lower_graph`` with ``edge_vec``
+            as the autograd leaf — the export target for graph-form .pt2 archives.
+
+            Parameters
+            ----------
+            atype
+                (N,) flat local atom types, ``N == sum(n_node)``.
+            n_node
+                (nf,) per-frame local atom counts.
+            edge_index
+                (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+            edge_vec
+                (E, 3) neighbor-minus-center edge vectors (sample for tracing).
+            edge_mask
+                (E,) valid-edge mask (sample for tracing).
+            fparam, aparam, do_atomic_virial, charge_spin
+                As in ``forward_common_lower_graph``.
+            **make_fx_kwargs
+                Extra keyword arguments forwarded to ``make_fx``
+                (e.g. ``tracing_mode="symbolic"``).
+
+            Returns
+            -------
+            torch.nn.Module
+                A traced module whose ``forward`` accepts
+                ``(atype, n_node, edge_index, edge_vec, edge_mask,
+                fparam, aparam, charge_spin)`` and returns a dict with the
+                same internal keys as ``forward_common_lower_graph``.
+            """
+            model = self
+
+            def fn(
+                atype: torch.Tensor,
+                n_node: torch.Tensor,
+                edge_index: torch.Tensor,
+                edge_vec: torch.Tensor,
+                edge_mask: torch.Tensor,
+                fparam: torch.Tensor | None,
+                aparam: torch.Tensor | None,
+                charge_spin: torch.Tensor | None,
+            ) -> dict[str, torch.Tensor]:
+                ev = edge_vec.detach().requires_grad_(True)
+                return model.forward_common_lower_graph(
+                    atype,
+                    n_node,
+                    edge_index,
+                    ev,
+                    edge_mask,
+                    do_atomic_virial=do_atomic_virial,
+                    fparam=fparam,
+                    aparam=aparam,
+                    charge_spin=charge_spin,
+                )
+
+            return make_fx(fn, **make_fx_kwargs)(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                fparam,
+                aparam,
+                charge_spin,
+            )
+
         def forward_common_lower_exportable_with_comm(
             self,
             extended_coord: torch.Tensor,
diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py
new file mode 100644
index 0000000000..7738595dae
--- /dev/null
+++ b/source/tests/pt_expt/model/test_graph_export.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-lower export: forward_common_lower_graph_exportable traces + torch.export."""
+
+import torch
+from deepmd.pt.utils import env
+from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1
+from deepmd.pt_expt.fitting import InvarFitting
+from deepmd.pt_expt.model import EnergyModel
+from deepmd.dpmodel.utils.neighbor_graph import build_neighbor_graph
+from ...seed import GLOBAL_SEED
+
+_RCUT, _NT = 4.0, 2
+
+
+def _model():
+    ds = DescrptDPA1(
+        _RCUT,
+        0.5,
+        20,
+        _NT,
+        neuron=[3, 6],
+        axis_neuron=2,
+        attn_layer=0,
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+    ft = InvarFitting(
+        "energy",
+        _NT,
+        ds.get_dim_out(),
+        1,
+        mixed_types=ds.mixed_types(),
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+    return EnergyModel(ds, ft, type_map=["A", "B"]).to(env.DEVICE)
+
+
+def _graph_inputs(model):
+    rng = torch.Generator(device=env.DEVICE).manual_seed(GLOBAL_SEED)
+    nloc = 6
+    coord = (
+        torch.rand(1, nloc, 3, dtype=torch.float64, device=env.DEVICE, generator=rng)
+        * 3.0
+    )
+    atype = torch.tensor([[0, 1, 0, 1, 0, 1]], dtype=torch.int64, device=env.DEVICE)
+    box = torch.eye(3, dtype=torch.float64, device=env.DEVICE).reshape(1, 9) * 20.0
+    g = build_neighbor_graph(coord, atype, box, model.get_rcut())
+    return (atype.reshape(-1), g.n_node, g.edge_index, g.edge_vec, g.edge_mask)
+
+
+def test_graph_exportable_traces():
+    model = _model().eval()
+    atype, n_node, ei, ev, em = _graph_inputs(model)
+    gm = model.forward_common_lower_graph_exportable(
+        atype,
+        n_node,
+        ei,
+        ev,
+        em,
+        do_atomic_virial=False,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+    )
+    assert isinstance(gm, torch.nn.Module)
+    # the traced module reproduces eager outputs
+    eager = model.forward_common_lower_graph(
+        atype, n_node, ei, ev, em, do_atomic_virial=False
+    )
+    # traced module has placeholders for all 8 fn args (fparam/aparam/charge_spin=None)
+    traced = gm(atype, n_node, ei, ev, em, None, None, None)
+    # traced returns a tuple/dict; compare energy_redu
+    te = traced["energy_redu"] if isinstance(traced, dict) else traced[1]
+    torch.testing.assert_close(te, eager["energy_redu"], rtol=1e-10, atol=1e-10)

From ee8db1b0fb9facc39edaa1c1785ca3c74b1a3bb4 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 19:39:09 +0800
Subject: [PATCH 02/33] =?UTF-8?q?fix(pt=5Fexpt):=20B1.1=20review=20?=
 =?UTF-8?q?=E2=80=94=20test=20forward=5Flower=5Fgraph=5Fexportable=20(both?=
 =?UTF-8?q?=20do=5Fatomic=5Fvirial=20branches),=20dedup=20key-translation,?=
 =?UTF-8?q?=20drop=20redundant=20detach?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepmd/pt_expt/model/ener_model.py            | 84 ++++++++++++-------
 deepmd/pt_expt/model/make_model.py            |  6 +-
 .../tests/pt_expt/model/test_graph_export.py  | 47 +++++++++++
 3 files changed, 106 insertions(+), 31 deletions(-)

diff --git a/deepmd/pt_expt/model/ener_model.py b/deepmd/pt_expt/model/ener_model.py
index af9806f083..140141dd5e 100644
--- a/deepmd/pt_expt/model/ener_model.py
+++ b/deepmd/pt_expt/model/ener_model.py
@@ -32,6 +32,41 @@
 DPEnergyModel_ = make_model(DPEnergyAtomicModel, T_Bases=(BaseModel,))
 
 
+def _translate_energy_keys(
+    model_ret: dict[str, torch.Tensor],
+    *,
+    do_grad_r: bool,
+    do_grad_c: bool,
+    do_atomic_virial: bool,
+    local: bool,
+) -> dict[str, torch.Tensor]:
+    """Map internal fitting keys -> public energy-model keys (shared by the
+    dense and graph ``forward_lower`` export traces).
+
+    Operates on plain dicts (make_fx-safe). ``local=True`` is the GRAPH path
+    (per-node ``N == sum(n_node)`` local atoms, no ghost/extended region) and
+    emits ``force``/``atom_virial``; ``local=False`` is the DENSE extended-region
+    path and emits ``extended_force``/``extended_virial`` (folded to local by
+    ``communicate_extended_output`` at inference).
+    """
+    out: dict[str, torch.Tensor] = {}
+    out["atom_energy"] = model_ret["energy"]
+    out["energy"] = model_ret["energy_redu"]
+    if do_grad_r:
+        out["force" if local else "extended_force"] = model_ret[
+            "energy_derv_r"
+        ].squeeze(-2)
+    if do_grad_c:
+        out["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2)
+        if do_atomic_virial:
+            out["atom_virial" if local else "extended_virial"] = model_ret[
+                "energy_derv_c"
+            ].squeeze(-2)
+    if "mask" in model_ret:
+        out["mask"] = model_ret["mask"]
+    return out
+
+
 @BaseModel.register("ener")
 @BaseModel.register("sezm_ener")
 @BaseModel.register("dpa4_ener")
@@ -229,20 +264,13 @@ def fn(
                 aparam,
                 charge_spin,
             )
-            model_predict: dict[str, torch.Tensor] = {}
-            model_predict["atom_energy"] = model_ret["energy"]
-            model_predict["energy"] = model_ret["energy_redu"]
-            if do_grad_r:
-                model_predict["extended_force"] = model_ret["energy_derv_r"].squeeze(-2)
-            if do_grad_c:
-                model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2)
-                if do_atomic_virial:
-                    model_predict["extended_virial"] = model_ret[
-                        "energy_derv_c"
-                    ].squeeze(-2)
-            if "mask" in model_ret:
-                model_predict["mask"] = model_ret["mask"]
-            return model_predict
+            return _translate_energy_keys(
+                model_ret,
+                do_grad_r=do_grad_r,
+                do_grad_c=do_grad_c,
+                do_atomic_virial=do_atomic_virial,
+                local=False,
+            )
 
         return make_fx(fn, **make_fx_kwargs)(
             extended_coord, extended_atype, nlist, mapping, fparam, aparam, charge_spin
@@ -292,7 +320,12 @@ def forward_lower_graph_exportable(
             ``(atype, n_node, edge_index, edge_vec, edge_mask,
             fparam, aparam, charge_spin)`` and returns a dict with the
             public keys: ``atom_energy``, ``energy``, ``force``,
-            ``virial``, ``atom_virial``.
+            ``virial``, ``atom_virial`` (the last only when
+            ``do_atomic_virial``).  Unlike the dense
+            :meth:`forward_lower_exportable` (which emits ``extended_force`` /
+            ``extended_virial`` over the ghost-padded extended region), the
+            graph path is LOCAL-only (``N == sum(n_node)`` nodes, no ghosts),
+            so it emits ``force`` / ``atom_virial`` directly.
         """
         traced = self.forward_common_lower_graph_exportable(
             atype,
@@ -332,20 +365,13 @@ def fn(
                 aparam,
                 charge_spin,
             )
-            model_predict: dict[str, torch.Tensor] = {}
-            model_predict["atom_energy"] = model_ret["energy"]
-            model_predict["energy"] = model_ret["energy_redu"]
-            if do_grad_r:
-                model_predict["force"] = model_ret["energy_derv_r"].squeeze(-2)
-            if do_grad_c:
-                model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2)
-                if do_atomic_virial:
-                    model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze(
-                        -2
-                    )
-            if "mask" in model_ret:
-                model_predict["mask"] = model_ret["mask"]
-            return model_predict
+            return _translate_energy_keys(
+                model_ret,
+                do_grad_r=do_grad_r,
+                do_grad_c=do_grad_c,
+                do_atomic_virial=do_atomic_virial,
+                local=True,
+            )
 
         return make_fx(fn, **make_fx_kwargs)(
             atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index cb2689c449..3e14ed2d56 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -684,12 +684,14 @@ def fn(
                 aparam: torch.Tensor | None,
                 charge_spin: torch.Tensor | None,
             ) -> dict[str, torch.Tensor]:
-                ev = edge_vec.detach().requires_grad_(True)
+                # forward_common_lower_graph creates the autograd leaf from
+                # edge_vec internally, so no outer detach/requires_grad_ here
+                # (it would only add spurious ops to the traced graph).
                 return model.forward_common_lower_graph(
                     atype,
                     n_node,
                     edge_index,
-                    ev,
+                    edge_vec,
                     edge_mask,
                     do_atomic_virial=do_atomic_virial,
                     fparam=fparam,
diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py
index 7738595dae..56e2d6eb7b 100644
--- a/source/tests/pt_expt/model/test_graph_export.py
+++ b/source/tests/pt_expt/model/test_graph_export.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """Graph-lower export: forward_common_lower_graph_exportable traces + torch.export."""
 
+import pytest
 import torch
 from deepmd.pt.utils import env
 from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1
@@ -72,3 +73,49 @@ def test_graph_exportable_traces():
     # traced returns a tuple/dict; compare energy_redu
     te = traced["energy_redu"] if isinstance(traced, dict) else traced[1]
     torch.testing.assert_close(te, eager["energy_redu"], rtol=1e-10, atol=1e-10)
+
+
+@pytest.mark.parametrize("do_atomic_virial", [False, True])  # both branches of the bool
+def test_forward_lower_graph_exportable_public_keys(do_atomic_virial):
+    """EnergyModel.forward_lower_graph_exportable: traces the public-key path and
+    reproduces eager energy/force; atom_virial present iff do_atomic_virial.
+    """
+    model = _model().eval()
+    atype, n_node, ei, ev, em = _graph_inputs(model)
+    gm = model.forward_lower_graph_exportable(
+        atype,
+        n_node,
+        ei,
+        ev,
+        em,
+        do_atomic_virial=do_atomic_virial,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+    )
+    assert isinstance(gm, torch.nn.Module)
+    out = gm(atype, n_node, ei, ev, em, None, None, None)
+
+    # public key set (graph path is local-only: force/atom_virial, NOT extended_*)
+    assert "atom_energy" in out and "energy" in out and "force" in out
+    assert "virial" in out
+    assert "extended_force" not in out and "extended_virial" not in out
+    # atom_virial appears ONLY when do_atomic_virial=True
+    assert ("atom_virial" in out) == do_atomic_virial
+
+    # values match the eager graph lower
+    eager = model.forward_common_lower_graph(
+        atype, n_node, ei, ev, em, do_atomic_virial=do_atomic_virial
+    )
+    torch.testing.assert_close(
+        out["energy"], eager["energy_redu"], rtol=1e-10, atol=1e-10
+    )
+    torch.testing.assert_close(
+        out["force"], eager["energy_derv_r"].squeeze(-2), rtol=1e-10, atol=1e-10
+    )
+    if do_atomic_virial:
+        torch.testing.assert_close(
+            out["atom_virial"],
+            eager["energy_derv_c"].squeeze(-2),
+            rtol=1e-10,
+            atol=1e-10,
+        )

From 7437d549057d8e5fb5114c51d70e48f8cdc3b17d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 19:44:53 +0800
Subject: [PATCH 03/33] test(dpmodel): codify static edge_capacity contract for
 build_neighbor_graph

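Adds test_graph_static_capacity.py (4 tests): with edge_capacity=64 the
edge_index has static shape (2, 64); the real-edge count matches the dynamic
graph; the real-edge prefix is identical to the dynamic graph and the padded
tail has edge_mask=False; an edge_capacity smaller than the real edge count
raises ValueError.  No source change needed -- builder.py already threads
layout.edge_capacity into pad_and_guard_edges.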
---
 .../dpmodel/test_graph_static_capacity.py     | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_graph_static_capacity.py

diff --git a/source/tests/common/dpmodel/test_graph_static_capacity.py b/source/tests/common/dpmodel/test_graph_static_capacity.py
new file mode 100644
index 0000000000..c9ff982d01
--- /dev/null
+++ b/source/tests/common/dpmodel/test_graph_static_capacity.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Tests for static edge_capacity masked padding in build_neighbor_graph.
+
+Codifies the contract: build_neighbor_graph(..., layout=GraphLayout(edge_capacity=E_max))
+returns a NeighborGraph whose edge_index/edge_vec/edge_mask have a STATIC leading
+edge dim E_max (real edges in the compact prefix, edge_mask=False tail), so export
+sees a fixed E.  Edge overflow must raise ValueError.
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    GraphLayout,
+    build_neighbor_graph,
+)
+
+
+class TestStaticEdgeCapacity:
+    """Tests for static edge_capacity masked padding via build_neighbor_graph."""
+
+    @pytest.fixture()
+    def small_system(self):
+        """6-atom periodic system with a 20 Å box (atoms well within rcut=4 range)."""
+        rng = np.random.default_rng(0)
+        coord = rng.normal(size=(1, 6, 3)) * 1.5
+        atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+        box = np.eye(3).reshape(1, 9) * 20.0
+        return coord, atype, box
+
+    def test_static_edge_capacity_shape(self, small_system):
+        """Static edge_capacity=64 yields edge_index.shape == (2, 64)."""
+        coord, atype, box = small_system
+        cap = build_neighbor_graph(
+            coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64)
+        )
+        assert cap.edge_index.shape == (2, 64)
+        assert cap.edge_vec.shape == (64, 3)
+        assert cap.edge_mask.shape == (64,)
+
+    def test_static_edge_capacity_matches_dynamic(self, small_system):
+        """Static graph has same real-edge count as dynamic graph."""
+        coord, atype, box = small_system
+        dyn = build_neighbor_graph(coord, atype, box, 4.0)
+        cap = build_neighbor_graph(
+            coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64)
+        )
+        assert cap.edge_index.shape == (2, 64)
+        assert int(cap.edge_mask.sum()) == int(dyn.edge_mask.sum())
+
+    def test_static_edge_capacity_real_prefix_matches_dynamic(self, small_system):
+        """The real-edge prefix of the static graph matches the dynamic graph."""
+        coord, atype, box = small_system
+        dyn = build_neighbor_graph(coord, atype, box, 4.0)
+        cap = build_neighbor_graph(
+            coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64)
+        )
+        n_real = int(dyn.edge_mask.sum())
+        # real prefix must match exactly
+        np.testing.assert_array_equal(
+            cap.edge_index[:, :n_real], dyn.edge_index[:, :n_real]
+        )
+        np.testing.assert_allclose(cap.edge_vec[:n_real], dyn.edge_vec[:n_real])
+        # padding suffix must have edge_mask=False
+        assert not np.any(cap.edge_mask[n_real:])
+
+    def test_overflow_raises(self, small_system):
+        """edge_capacity smaller than real edge count must raise ValueError."""
+        coord, atype, box = small_system
+        # capacity=1 is guaranteed to be smaller than the real edge count
+        with pytest.raises(ValueError, match="edge overflow"):
+            build_neighbor_graph(
+                coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=1)
+            )

From 1051a0d9d517b1f2557cfb5b9daf64dcb684db45 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 19:55:36 +0800
Subject: [PATCH 04/33] feat(pt_expt): graph .pt2 export branch +
 lower_input_kind metadata

---
 deepmd/pt_expt/utils/serialization.py         | 282 +++++++++++++++++-
 .../pt_expt/utils/test_graph_pt2_metadata.py  | 104 +++++++
 2 files changed, 380 insertions(+), 6 deletions(-)
 create mode 100644 source/tests/pt_expt/utils/test_graph_pt2_metadata.py

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 9c03783574..8c1f9e0d1e 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -310,6 +310,147 @@ def _make_sample_inputs(
     return ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin
 
 
+def _make_graph_sample_inputs(
+    model: torch.nn.Module,
+    e_max: int,
+    nframes: int = 2,
+    nloc: int = 7,
+) -> tuple[torch.Tensor | None, ...]:
+    """Create sample inputs for tracing ``forward_lower_graph``.
+
+    Builds a small random system, runs the carry-all
+    :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph` with a
+    STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked
+    static edge axis), and returns tensors in the positional order expected by
+    :meth:`forward_lower_graph_exportable`:
+    ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
+    charge_spin)``.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The pt_expt energy model (must expose ``get_rcut``/``get_type_map``/...).
+    e_max : int
+        Static edge capacity ``E`` to pad the edge axis to.
+    nframes : int
+        Number of frames in the sample system.
+    nloc : int
+        Number of local atoms per frame (``N == nframes * nloc``).
+    """
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        GraphLayout,
+        build_neighbor_graph,
+    )
+
+    import deepmd.pt_expt.utils.env as _env
+
+    rcut = model.get_rcut()
+    ntypes = len(model.get_type_map())
+    dim_fparam = model.get_dim_fparam()
+    dim_aparam = model.get_dim_aparam()
+
+    # Box large enough to avoid PBC degeneracy; mirrors _make_sample_inputs.
+    box_size = rcut * 3.0
+    box = np.eye(3, dtype=np.float64) * box_size
+    box_np = box.reshape(1, 9)
+
+    rng = np.random.default_rng(42)
+    coord_np = rng.random((nframes, nloc, 3), dtype=np.float64) * box_size * 0.5
+    coord_np += box_size * 0.25  # center in box
+
+    atype_np = np.zeros((nframes, nloc), dtype=np.int64)
+    for i in range(nloc):
+        atype_np[:, i] = i % ntypes
+
+    graph = build_neighbor_graph(
+        coord_np,
+        atype_np,
+        np.tile(box_np, (nframes, 1)),
+        rcut,
+        layout=GraphLayout(edge_capacity=e_max),
+    )
+
+    atype_t = torch.tensor(atype_np.reshape(-1), dtype=torch.int64, device=_env.DEVICE)
+    n_node_t = torch.tensor(
+        np.asarray(graph.n_node), dtype=torch.int64, device=_env.DEVICE
+    )
+    edge_index_t = torch.tensor(
+        np.asarray(graph.edge_index), dtype=torch.int64, device=_env.DEVICE
+    )
+    edge_vec_t = torch.tensor(
+        np.asarray(graph.edge_vec), dtype=torch.float64, device=_env.DEVICE
+    )
+    edge_mask_t = torch.tensor(
+        np.asarray(graph.edge_mask), dtype=torch.bool, device=_env.DEVICE
+    )
+
+    if dim_fparam > 0:
+        fparam = torch.zeros(
+            nframes, dim_fparam, dtype=torch.float64, device=_env.DEVICE
+        )
+    else:
+        fparam = None
+
+    if dim_aparam > 0:
+        aparam = torch.zeros(
+            nframes, nloc, dim_aparam, dtype=torch.float64, device=_env.DEVICE
+        )
+    else:
+        aparam = None
+
+    dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0
+    if dim_chg_spin > 0:
+        charge_spin = torch.zeros(
+            nframes, dim_chg_spin, dtype=torch.float64, device=_env.DEVICE
+        )
+    else:
+        charge_spin = None
+
+    return (
+        atype_t,
+        n_node_t,
+        edge_index_t,
+        edge_vec_t,
+        edge_mask_t,
+        fparam,
+        aparam,
+        charge_spin,
+    )
+
+
+def _build_graph_dynamic_shapes(
+    *sample_inputs: torch.Tensor | None,
+) -> tuple:
+    """Build dynamic-shape specifications for the graph-form forward_lower export.
+
+    ``nframes`` (the ``n_node`` axis) and ``N`` (the flat node axis) are
+    dynamic dims; the edge axis ``E`` is STATIC (decision #16: the masked
+    ``edge_capacity`` path), expressed by leaving the edge dims unmarked
+    (``None``) so torch.export specialises them to the sample value.
+
+    Parameters
+    ----------
+    *sample_inputs : torch.Tensor | None
+        ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
+        charge_spin)`` — 8 entries matching ``forward_lower_graph_exportable``.
+    """
+    fparam = sample_inputs[5]
+    aparam = sample_inputs[6]
+    charge_spin = sample_inputs[7]
+    nframes_dim = torch.export.Dim("nframes", min=1)
+    n_node_total_dim = torch.export.Dim("n_node_total", min=1)
+    return (
+        {0: n_node_total_dim},  # atype: (N,)
+        {0: nframes_dim},  # n_node: (nf,)
+        None,  # edge_index: (2, E) — E static
+        None,  # edge_vec: (E, 3) — E static
+        None,  # edge_mask: (E,) — E static
+        {0: nframes_dim} if fparam is not None else None,  # fparam: (nf, ndf)
+        {0: nframes_dim} if aparam is not None else None,  # aparam: (nf, nloc, nda)
+        {0: nframes_dim} if charge_spin is not None else None,  # charge_spin
+    )
+
+
 def _build_dynamic_shapes(
     *sample_inputs: torch.Tensor | None,
     has_spin: bool = False,
@@ -416,7 +557,9 @@ def _build_dynamic_shapes(
     return (*base, None, None, None, None, None, None, None, None)
 
 
-def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
+def _collect_metadata(
+    model: torch.nn.Module, is_spin: bool = False, lower_kind: str = "nlist"
+) -> dict:
     """Collect metadata from the model for C++ inference.
 
     This metadata is stored as ``metadata.json`` in both .pt2 and .pte archives.
@@ -528,6 +671,12 @@ def _probe_has_message_passing(obj: object) -> bool | None:
         if result is not None:
             break
     meta["has_message_passing"] = result if result is not None else False
+
+    # Which input schema the compiled AOTI forward consumes:
+    #   "nlist" → dense quartet (extended_coord, extended_atype, nlist, mapping)
+    #   "graph" → NeighborGraph (atype, n_node, edge_index, edge_vec, edge_mask)
+    # The C++ loader branches on this to build the matching inputs.
+    meta["lower_input_kind"] = "graph" if lower_kind == "graph" else "nlist"
     return meta
 
 
@@ -599,6 +748,7 @@ def deserialize_to_file(
     data: dict,
     model_json_override: dict | None = None,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> None:
     """Deserialize a dictionary to a .pte or .pt2 model file.
 
@@ -622,14 +772,22 @@ def deserialize_to_file(
     do_atomic_virial : bool
         If True, export with per-atom virial correction (3 extra backward
         passes, ~2.5x slower).  Default False for best performance.
+    lower_kind : str
+        Which lower-forward schema the compiled AOTI graph consumes:
+        ``"nlist"`` (default) traces the dense quartet
+        (``extended_coord``/``extended_atype``/``nlist``/``mapping``);
+        ``"graph"`` traces the NeighborGraph schema
+        (``atype``/``n_node``/``edge_index``/``edge_vec``/``edge_mask``) with a
+        static edge axis ``E = ceil(1.25 * nloc_sample * nnei)`` (trace-sample
+        nloc).  The schema is recorded as ``lower_input_kind`` in ``metadata.json``.
     """
     if model_file.endswith(".pt2"):
         _deserialize_to_file_pt2(
-            model_file, data, model_json_override, do_atomic_virial
+            model_file, data, model_json_override, do_atomic_virial, lower_kind
         )
     else:
         _deserialize_to_file_pte(
-            model_file, data, model_json_override, do_atomic_virial
+            model_file, data, model_json_override, do_atomic_virial, lower_kind
         )
 
 
@@ -638,6 +796,7 @@ def _trace_and_export(
     model_json_override: dict | None = None,
     with_comm_dict: bool = False,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> tuple:
     """Common logic: build model, trace, export.
 
@@ -663,6 +822,10 @@ def _trace_and_export(
         If True, the traced graph computes per-atom virial (extra
         autograd.grad backward passes); off by default to keep .pt2
         inference fast. Mirrors PR #5407 in upstream master.
+    lower_kind
+        ``"nlist"`` (default) traces the dense quartet forward; ``"graph"``
+        traces ``forward_lower_graph_exportable`` over the NeighborGraph schema
+        with a static edge axis. Recorded as ``lower_input_kind`` in metadata.
 
     Returns
     -------
@@ -700,7 +863,106 @@ def _trace_and_export(
     model.eval()
 
     # 2. Collect metadata
-    metadata = _collect_metadata(model, is_spin=is_spin)
+    metadata = _collect_metadata(model, is_spin=is_spin, lower_kind=lower_kind)
+
+    # 2b. Graph-form export branch (NeighborGraph schema). The graph path is
+    # LOCAL-only (no ghosts), single-rank, energy-model only in PR-A/PR-B; it
+    # traces ``forward_lower_graph_exportable`` with a STATIC edge axis. The
+    # dense (nlist) path below is left byte-unchanged.
+    if lower_kind == "graph":
+        import math
+
+        if is_spin:
+            raise NotImplementedError(
+                "graph-form .pt2 export is not supported for spin models"
+            )
+        if with_comm_dict:
+            raise NotImplementedError(
+                "graph-form .pt2 export does not support the with-comm artifact "
+                "(multi-rank graph message passing is a later PR)"
+            )
+        if not hasattr(model, "forward_lower_graph_exportable"):
+            raise NotImplementedError(
+                f"model {type(model).__name__} has no "
+                "forward_lower_graph_exportable; graph-form .pt2 export "
+                "requires an energy model"
+            )
+
+        # Static export edge capacity E_max = ceil(1.25 * nloc * nnei)
+        # (decision #12 headroom). nloc is the sample-system local-atom count.
+        nloc_sample = 7
+        nnei = sum(model.get_sel())
+        e_max = math.ceil(1.25 * nloc_sample * nnei)
+
+        _orig_device = _env.DEVICE
+        _env.DEVICE = torch.device("cpu")
+        try:
+            sample_inputs = _make_graph_sample_inputs(
+                model, e_max=e_max, nframes=2, nloc=nloc_sample
+            )
+        finally:
+            _env.DEVICE = _orig_device
+
+        (
+            atype_g,
+            n_node_g,
+            edge_index_g,
+            edge_vec_g,
+            edge_mask_g,
+            fparam_g,
+            aparam_g,
+            charge_spin_g,
+        ) = sample_inputs
+
+        # Trace via make_fx on CPU (decomposes autograd.grad into aten ops).
+        traced = model.forward_lower_graph_exportable(
+            atype_g,
+            n_node_g,
+            edge_index_g,
+            edge_vec_g,
+            edge_mask_g,
+            fparam=fparam_g,
+            aparam=aparam_g,
+            do_atomic_virial=do_atomic_virial,
+            charge_spin=charge_spin_g,
+            tracing_mode="symbolic",
+            _allow_non_fake_inputs=True,
+        )
+        sample_out = traced(
+            atype_g,
+            n_node_g,
+            edge_index_g,
+            edge_vec_g,
+            edge_mask_g,
+            fparam_g,
+            aparam_g,
+            charge_spin_g,
+        )
+        output_keys = list(sample_out.keys())
+
+        dynamic_shapes = _build_graph_dynamic_shapes(*sample_inputs)
+        exported = torch.export.export(
+            traced,
+            sample_inputs,
+            dynamic_shapes=dynamic_shapes,
+            strict=False,
+            prefer_deferred_runtime_asserts_over_guards=True,
+        )
+
+        if target_device.type != "cpu":
+            from torch.export.passes import (
+                move_to_device_pass,
+            )
+
+            exported = move_to_device_pass(exported, target_device)
+
+        metadata["do_atomic_virial"] = do_atomic_virial
+
+        json_source = model_json_override if model_json_override is not None else data
+        data_for_json = deepcopy(json_source)
+        data_for_json = _numpy_to_json_serializable(data_for_json)
+
+        return exported, metadata, data_for_json, output_keys
 
     # 3. Create sample inputs on CPU for tracing
     # torch.export's duck-sizing unifies dimensions with the same sample value,
@@ -917,10 +1179,14 @@ def _deserialize_to_file_pte(
     data: dict,
     model_json_override: dict | None = None,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> None:
     """Deserialize a dictionary to a .pte model file."""
     exported, metadata, data_for_json, output_keys = _trace_and_export(
-        data, model_json_override, do_atomic_virial=do_atomic_virial
+        data,
+        model_json_override,
+        do_atomic_virial=do_atomic_virial,
+        lower_kind=lower_kind,
     )
 
     model_def_script = data.get("model_def_script") or {}
@@ -939,6 +1205,7 @@ def _deserialize_to_file_pt2(
     data: dict,
     model_json_override: dict | None = None,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> None:
     """Deserialize a dictionary to a .pt2 model file (AOTInductor).
 
@@ -976,7 +1243,10 @@ def _deserialize_to_file_pt2(
 
     # First artifact: regular (no comm). Always produced.
     exported, metadata, data_for_json, output_keys = _trace_and_export(
-        data, model_json_override, do_atomic_virial=do_atomic_virial
+        data,
+        model_json_override,
+        do_atomic_virial=do_atomic_virial,
+        lower_kind=lower_kind,
     )
     metadata["output_keys"] = output_keys
 
diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
new file mode 100644
index 0000000000..ffbf2ae9e7
--- /dev/null
+++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-form ``.pt2`` export: ``lower_input_kind`` metadata branch.
+
+Covers both branches of the ``lower_kind`` selector on
+``deserialize_to_file``: ``"graph"`` traces ``forward_lower_graph_exportable``
+over the NeighborGraph schema and records ``lower_input_kind == "graph"`` in
+``metadata.json``; the default (``"nlist"``) traces the dense quartet and
+records ``lower_input_kind == "nlist"``.
+"""
+
+import copy
+import json
+import os
+import tempfile
+import zipfile
+
+import pytest
+
+from deepmd.pt_expt.utils.serialization import (
+    deserialize_to_file,
+)
+
+# dpa1 with attn_layer == 0 — the energy model exercised by the graph path.
+DPA1_CONFIG = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "se_atten",
+        "sel": 30,
+        "rcut_smth": 2.0,
+        "rcut": 6.0,
+        "neuron": [2, 4, 8],
+        "axis_neuron": 4,
+        "attn": 5,
+        "attn_layer": 0,
+        "attn_dotr": True,
+        "attn_mask": False,
+        "activation_function": "tanh",
+        "scaling_factor": 1.0,
+        "normalize": True,
+        "temperature": 1.0,
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5, 5],
+        "resnet_dt": True,
+        "seed": 1,
+    },
+}
+
+
+def _build_dpa1_data() -> dict:
+    """Build a serialized dpmodel data dict for a dpa1(attn_layer=0) energy model."""
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    model = get_model(copy.deepcopy(DPA1_CONFIG))
+    return {
+        "model": model.serialize(),
+        "model_def_script": copy.deepcopy(DPA1_CONFIG),
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+
+
+def _read_metadata(pt2_path: str) -> dict:
+    """Read ``model/extra/metadata.json`` from a ``.pt2`` ZIP archive."""
+    with zipfile.ZipFile(pt2_path, "r") as zf:
+        raw = zf.read("model/extra/metadata.json").decode("utf-8")
+    return json.loads(raw)
+
+
+@pytest.fixture(scope="module")
+def dpa1_dpmodel_data() -> dict:
+    return _build_dpa1_data()
+
+
+def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None:
+    """``lower_kind="graph"`` -> metadata ``lower_input_kind == "graph"``."""
+    with tempfile.TemporaryDirectory() as d:
+        p = os.path.join(d, "m_graph.pt2")
+        deserialize_to_file(
+            p,
+            copy.deepcopy(dpa1_dpmodel_data),
+            do_atomic_virial=True,
+            lower_kind="graph",
+        )
+        meta = _read_metadata(p)
+    assert meta["lower_input_kind"] == "graph"
+
+
+def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None:
+    """Default (``lower_kind="nlist"``) -> metadata ``lower_input_kind == "nlist"``."""
+    with tempfile.TemporaryDirectory() as d:
+        p = os.path.join(d, "m_dense.pt2")
+        deserialize_to_file(
+            p,
+            copy.deepcopy(dpa1_dpmodel_data),
+            do_atomic_virial=True,
+        )
+        meta = _read_metadata(p)
+    assert meta["lower_input_kind"] == "nlist"

From 148fa0e7bdff0aa73a0b3d6a4a3f7c26330e2ce4 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 20:03:18 +0800
Subject: [PATCH 05/33] =?UTF-8?q?fix(pt=5Fexpt):=20B1.3=20review=20?=
 =?UTF-8?q?=E2=80=94=20persist=20static=20edge=5Fcapacity=20(E=5Fmax)=20in?=
 =?UTF-8?q?=20graph=20.pt2=20metadata=20for=20the=20C++=20hub=20(B2)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepmd/pt_expt/utils/serialization.py                 | 5 +++++
 source/tests/pt_expt/utils/test_graph_pt2_metadata.py | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 8c1f9e0d1e..dcb8aa58c1 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -957,6 +957,11 @@ def _trace_and_export(
             exported = move_to_device_pass(exported, target_device)
 
         metadata["do_atomic_virial"] = do_atomic_virial
+        # The edge axis is specialized STATIC: torch.export bakes E to exactly
+        # e_max, so the AOTI forward only accepts edge tensors of this length.
+        # Persist it so the C++ conversion hub (PR-B Phase B2) pads/masks runtime
+        # edges to precisely this value instead of re-deriving the constant.
+        metadata["edge_capacity"] = e_max
 
         json_source = model_json_override if model_json_override is not None else data
         data_for_json = deepcopy(json_source)
diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
index ffbf2ae9e7..5c38b5046d 100644
--- a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
+++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
@@ -89,6 +89,10 @@ def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None:
         )
         meta = _read_metadata(p)
     assert meta["lower_input_kind"] == "graph"
+    # the static edge axis is baked into the AOTI artifact; E_max must be
+    # persisted so the C++ conversion hub (PR-B B2) pads runtime edges to it.
+    # E_max = ceil(1.25 * nloc_sample(7) * nnei(sum(sel)=30)) = 263.
+    assert meta["edge_capacity"] == 263
 
 
 def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None:
@@ -102,3 +106,5 @@ def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None:
         )
         meta = _read_metadata(p)
     assert meta["lower_input_kind"] == "nlist"
+    # edge_capacity is a graph-only artifact constant; the dense path omits it.
+    assert "edge_capacity" not in meta

From ce2fd12a5728e78f77ae0bdcf2a070e613db8207 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 20:28:35 +0800
Subject: [PATCH 06/33] test(pt_expt): graph .pt2 DeepEval parity vs eager
 dense dpa1 (pbc+nopbc, 1e-10)

Add a graph-form .pt2 DeepEval dispatch and parity test (NeighborGraph PR-B
Phase B1.4).

DeepEval: route lower_input_kind=="graph" archives through a new
_eval_model_graph that builds a carry-all NeighborGraph (padded to the static
edge_capacity baked in metadata), feeds the positional schema
(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin)
to the AOTI forward, and reshapes the LOCAL public outputs by category (no
communicate_extended_output, since the graph path is ghost-free). The dense
(nlist) path is untouched.

Fix dynamic-shape generalization of the graph .pt2 export: under symbolic
make_fx/torch.export, int() on a SymInt SPECIALIZES the axis, baking the
trace-time sample size (N=14, nf=2) into the autograd-derived force/energy/
virial outputs. Drop int() on node_capacity (edge_force_virial) and on
n_node.shape[0] (fit_output_to_model_output_graph), and derive nf-1 in
frame_id_from_n_node as a runtime 0-d tensor instead of xp.asarray(shape-1).
The exported graph now generalizes across arbitrary N and nf.

Test: dpa1(attn_layer=0) energy model exported with lower_kind="graph",
evaluated through the pt_expt DeepPot and compared to the eager dense
(sel-capped, neighbor_graph_method="legacy") reference at rtol=atol=1e-10 for
pbc and nopbc. The fixture is a sparse 8-atom cluster and asserts non-binding
sel (max neighbors < 30) so the carry-all/sel-capped neighbor sets coincide
(anti-vacuity).
---
 .../utils/neighbor_graph/derivatives.py       |  10 +-
 deepmd/dpmodel/utils/neighbor_graph/graph.py  |   9 +-
 deepmd/pt_expt/infer/deep_eval.py             | 113 +++++++++
 deepmd/pt_expt/model/edge_transform_output.py |   6 +-
 .../pt_expt/infer/test_graph_deepeval.py      | 219 ++++++++++++++++++
 5 files changed, 351 insertions(+), 6 deletions(-)
 create mode 100644 source/tests/pt_expt/infer/test_graph_deepeval.py

diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index 494e97a0c9..2c8c50eaca 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -80,9 +80,13 @@ def edge_force_virial(
         frame via the frame of their ``dst`` node.
     """
     xp = array_api_compat.array_namespace(g_e)
-    # node-axis size; when a static ``node_capacity`` is supplied (the jax/export
-    # path) short-circuit so we never call int() on the traced ``sum(n_node)``.
-    n_out = int(node_capacity) if node_capacity is not None else int(xp.sum(n_node))
+    # node-axis size; when a ``node_capacity`` is supplied (the jax/export path)
+    # use it AS-IS so we never call int() on the traced ``sum(n_node)`` -- and,
+    # crucially, never on ``node_capacity`` itself: under symbolic make_fx /
+    # torch.export it is a SymInt (``atype.shape[0]``); ``int(SymInt)`` would
+    # SPECIALIZE the node axis to the trace-time sample size, baking a constant
+    # ``N`` into the scatter and breaking dynamic-``N`` inference.
+    n_out = node_capacity if node_capacity is not None else int(xp.sum(n_node))
     nf = n_node.shape[0]
     # zero padding/guard contributions; cast mask to g's dtype (array-API pure,
     # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict)
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index e527a84bf0..0ce10efdf6 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -153,13 +153,18 @@ def frame_id_from_n_node(n_node: Array, n_total: int | None = None) -> Array:
     dev = array_api_compat.device(n_node)
     if n_total is None:
         n_total = int(xp.sum(n_node))
-    nf = n_node.shape[0]
     idx = xp.arange(n_total, dtype=n_node.dtype, device=dev)
     boundaries = xp.cumulative_sum(n_node)  # (nf,) upper bounds, exclusive
     frame_id = xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64)
     # padding nodes (idx >= sum(n_node)) land at frame ``nf`` (OOB); clamp them to
     # the last real frame so the per-frame scatter never indexes out of range.
-    return xp.minimum(frame_id, xp.asarray(nf - 1, dtype=xp.int64, device=dev))
+    # Derive ``nf - 1`` as a RUNTIME 0-d tensor (sum of ones over the frame axis)
+    # rather than ``xp.asarray(n_node.shape[0] - 1)``: under symbolic make_fx /
+    # torch.export, ``shape[0]`` is a SymInt and materializing it into a constant
+    # tensor SPECIALIZES the frame axis -- baking the trace-time frame count into
+    # every downstream per-frame reduction and breaking dynamic-``nf`` inference.
+    last_frame = xp.sum(xp.ones_like(n_node)) - 1  # 0-d int == nf - 1
+    return xp.minimum(frame_id, xp.astype(last_frame, xp.int64))
 
 
 def node_validity_mask(n_node: Array, n_total: int) -> Array:
diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py
index 97bba3d4a5..47bcbff731 100644
--- a/deepmd/pt_expt/infer/deep_eval.py
+++ b/deepmd/pt_expt/infer/deep_eval.py
@@ -66,6 +66,20 @@
     import ase.neighborlist
 
 
+# Public output keys emitted by the graph-form AOTI forward
+# (``forward_lower_graph_exportable``) keyed by the output-variable category that
+# ``request_defs`` carries.  The graph path is LOCAL-only (``N == sum(n_node)``
+# nodes, no ghosts), so its outputs are already at local-atom resolution -- no
+# ``communicate_extended_output`` fold-back is needed.
+_GRAPH_CATEGORY_TO_KEY = {
+    OutputVariableCategory.OUT: "atom_energy",
+    OutputVariableCategory.REDU: "energy",
+    OutputVariableCategory.DERV_R: "force",
+    OutputVariableCategory.DERV_C_REDU: "virial",
+    OutputVariableCategory.DERV_C: "atom_virial",
+}
+
+
 def _reshape_charge_spin(
     charge_spin: np.ndarray, nframes: int, dim_chg_spin: int
 ) -> np.ndarray:
@@ -1423,6 +1437,10 @@ def _eval_model(
         request_defs: list[OutputVariableDef],
         charge_spin: np.ndarray | None = None,
     ) -> tuple[np.ndarray, ...]:
+        if self.metadata.get("lower_input_kind") == "graph":
+            return self._eval_model_graph(
+                coords, cells, atom_types, fparam, aparam, request_defs, charge_spin
+            )
         model_inputs, mapping_t, nframes, natoms = self._prepare_inputs(
             coords, cells, atom_types, fparam, aparam, charge_spin
         )
@@ -1621,6 +1639,101 @@ def _eval_model_spin(
                 )
         return tuple(results)
 
+    def _eval_model_graph(
+        self,
+        coords: np.ndarray,
+        cells: np.ndarray | None,
+        atom_types: np.ndarray,
+        fparam: np.ndarray | None,
+        aparam: np.ndarray | None,
+        request_defs: list[OutputVariableDef],
+        charge_spin: np.ndarray | None = None,
+    ) -> tuple[np.ndarray, ...]:
+        """Evaluate a graph-form ``.pt2`` (``lower_input_kind == "graph"``).
+
+        Builds a carry-all :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`
+        from the eval system, padded to the static ``edge_capacity`` read from the
+        archive metadata, and feeds ``(atype, n_node, edge_index, edge_vec,
+        edge_mask, fparam, aparam, charge_spin)`` positionally to the exported
+        forward.  Its LOCAL public keys are reshaped directly, without
+        ``communicate_extended_output``; requested outputs it lacks are NaN-filled.
+        NOTE(review): confirm behaviour when real edges exceed ``edge_capacity``.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            GraphLayout,
+            build_neighbor_graph,
+        )
+        from deepmd.pt_expt.utils.env import (
+            DEVICE,
+        )
+
+        nframes = coords.shape[0]
+        if len(atom_types.shape) == 1:
+            natoms = len(atom_types)
+            atom_types = np.tile(atom_types, nframes).reshape(nframes, -1)
+        else:
+            natoms = len(atom_types[0])
+
+        coord_input = coords.reshape(nframes, natoms, 3)
+        box_input = cells.reshape(nframes, 9) if cells is not None else None
+        edge_capacity = int(self.metadata["edge_capacity"])
+        graph = build_neighbor_graph(
+            coord_input,
+            atom_types,
+            box_input,
+            self._rcut,
+            layout=GraphLayout(edge_capacity=edge_capacity),
+        )
+
+        atype_t = torch.tensor(
+            np.asarray(atom_types).reshape(-1), dtype=torch.int64, device=DEVICE
+        )
+        n_node_t = torch.tensor(
+            np.asarray(graph.n_node), dtype=torch.int64, device=DEVICE
+        )
+        edge_index_t = torch.tensor(
+            np.asarray(graph.edge_index), dtype=torch.int64, device=DEVICE
+        )
+        edge_vec_t = torch.tensor(
+            np.asarray(graph.edge_vec), dtype=torch.float64, device=DEVICE
+        )
+        edge_mask_t = torch.tensor(
+            np.asarray(graph.edge_mask), dtype=torch.bool, device=DEVICE
+        )
+
+        fparam_t, aparam_t = self._prepare_optional_lower_inputs(
+            fparam, aparam, nframes, natoms, DEVICE
+        )
+        charge_spin_t = self._make_charge_spin_input(nframes, charge_spin)
+
+        model_inputs = (
+            atype_t,
+            n_node_t,
+            edge_index_t,
+            edge_vec_t,
+            edge_mask_t,
+            fparam_t,
+            aparam_t,
+            charge_spin_t,
+        )
+        if self._is_pt2:
+            model_ret = self._pt2_runner(*model_inputs)
+        else:
+            model_ret = self.exported_module(*model_inputs)
+
+        results = []
+        for odef in request_defs:
+            shape = self._get_output_shape(odef, nframes, natoms)
+            gkey = _GRAPH_CATEGORY_TO_KEY.get(odef.category)
+            val = model_ret.get(gkey) if gkey is not None else None
+            if val is not None:
+                results.append(val.detach().cpu().numpy().reshape(shape))
+            else:
+                results.append(
+                    np.full(np.abs(shape), np.nan, dtype=GLOBAL_NP_FLOAT_PRECISION)
+                )
+        return tuple(results)
+
     def _get_output_shape(
         self, odef: OutputVariableDef, nframes: int, natoms: int
     ) -> list[int]:
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 1eb3e4363b..9202a64d8f 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -136,7 +136,11 @@ def fit_output_to_model_output_graph(
     edge_mask = graph.edge_mask
     n_node = graph.n_node
     redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
-    nf = int(n_node.shape[0])
+    # Keep ``nf`` as a (possibly symbolic) shape value: under symbolic make_fx /
+    # torch.export ``n_node`` dim-0 is the dynamic frame axis, and ``int()`` on a
+    # SymInt SPECIALIZES it -- baking the trace-time frame count into every
+    # per-frame reduction (energy_redu / virial) and breaking multi-frame infer.
+    nf = n_node.shape[0]
     # Derive N from the fitting output's leading shape rather than int(n_node.sum()).
     # shape attributes are always static Python ints (or SymInts in symbolic-mode
     # tracing) and are trace-safe; reading a tensor VALUE via int() is not.
diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py
new file mode 100644
index 0000000000..e2bdabf04f
--- /dev/null
+++ b/source/tests/pt_expt/infer/test_graph_deepeval.py
@@ -0,0 +1,219 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-form ``.pt2`` DeepEval parity vs the eager dense reference.
+
+A graph-form ``.pt2`` (exported with ``lower_kind="graph"``) carries the
+NeighborGraph schema ``(atype, n_node, edge_index, edge_vec, edge_mask, ...)``
+in its AOTI forward.  This test verifies that evaluating such an archive
+through the pt_expt :class:`DeepPot` reproduces the eager dpa1 energy / force /
+virial to ``rtol=atol=1e-10`` (fp64), for both PBC and non-PBC.
+
+The graph path is CARRY-ALL (every neighbor within ``rcut``); the eager dense
+reference is sel-capped (``sel=30``, forced via
+``neighbor_graph_method="legacy"``).  They coincide only at NON-BINDING ``sel``
+(max neighbor count ``< sel``), so the test fixture is a small, sparse cluster
+and the non-binding condition is asserted explicitly -- otherwise the parity
+would vacuously compare two different neighbor sets.
+"""
+
+import copy
+import os
+import tempfile
+
+import numpy as np
+import pytest
+import torch
+
+from deepmd.infer import (
+    DeepPot,
+)
+from deepmd.pt_expt.utils.env import (
+    DEVICE,
+)
+from deepmd.pt_expt.utils.serialization import (
+    deserialize_to_file,
+)
+
+# dpa1 with attn_layer == 0 -- the energy model exercised by the graph path.
+DPA1_CONFIG = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "se_atten",
+        "sel": 30,
+        "rcut_smth": 2.0,
+        "rcut": 6.0,
+        "neuron": [2, 4, 8],
+        "axis_neuron": 4,
+        "attn": 5,
+        "attn_layer": 0,
+        "attn_dotr": True,
+        "attn_mask": False,
+        "activation_function": "tanh",
+        "scaling_factor": 1.0,
+        "normalize": True,
+        "temperature": 1.0,
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5, 5],
+        "resnet_dt": True,
+        "seed": 1,
+    },
+}
+
+RCUT = 6.0
+SEL = 30
+
+
+def _build_system() -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """A small, sparse cluster: 8 atoms inside a 5 A blob, centered in an 18 A box.
+
+    The blob keeps every atom within ``rcut`` of at most 7 others (<< ``sel``),
+    so the carry-all graph neighbor set equals the sel-capped dense one.
+    """
+    rng = np.random.default_rng(20240626)
+    natoms = 8
+    box_size = 18.0
+    blob = rng.random((natoms, 3)) * 5.0 + box_size * 0.5 - 2.5
+    coords = blob.reshape(1, natoms, 3)
+    cells = (np.eye(3) * box_size).reshape(1, 9)
+    atype = np.array([0, 1, 1, 0, 1, 1, 0, 1], dtype=np.int32)
+    return coords, cells, atype
+
+
+def _max_neighbors(
+    coords: np.ndarray, cells: np.ndarray | None, atype: np.ndarray
+) -> int:
+    """Max carry-all neighbor count per center within ``rcut`` (for the non-binding check)."""
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        build_neighbor_graph,
+    )
+
+    natoms = atype.shape[0]
+    graph = build_neighbor_graph(
+        coords.reshape(1, natoms, 3),
+        atype.reshape(1, natoms),
+        cells.reshape(1, 9) if cells is not None else None,
+        RCUT,
+    )
+    real = np.asarray(graph.edge_mask)
+    dst = np.asarray(graph.edge_index)[1][real]
+    counts = np.bincount(dst, minlength=natoms)
+    return int(counts.max())
+
+
+def _eager_dense_reference(
+    model: torch.nn.Module,
+    coords: np.ndarray,
+    cells: np.ndarray | None,
+    atype: np.ndarray,
+) -> dict[str, np.ndarray]:
+    """Reference energy/force/virial from the eager dense (sel-capped) path."""
+    natoms = atype.shape[0]
+    coord_t = torch.tensor(
+        coords.reshape(1, natoms, 3), dtype=torch.float64, device=DEVICE
+    ).requires_grad_(True)
+    atype_t = torch.tensor(atype.reshape(1, natoms), dtype=torch.int64, device=DEVICE)
+    box_t = (
+        torch.tensor(cells.reshape(1, 9), dtype=torch.float64, device=DEVICE)
+        if cells is not None
+        else None
+    )
+    ret = model.call_common(
+        coord_t,
+        atype_t,
+        box_t,
+        do_atomic_virial=True,
+        neighbor_graph_method="legacy",
+    )
+    out = {
+        "atom_energy": ret["energy"],
+        "energy": ret["energy_redu"],
+        "force": ret["energy_derv_r"].squeeze(-2),
+        "virial": ret["energy_derv_c_redu"].squeeze(-2),
+        "atom_virial": ret["energy_derv_c"].squeeze(-2),
+    }
+    return {k: v.detach().cpu().numpy() for k, v in out.items()}
+
+
+@pytest.fixture(scope="module")
+def graph_pt2():
+    """Build a dpa1(attn_layer=0) model and export it to a graph-form ``.pt2``.
+
+    The AOTI compile is slow (~90 s), so it is done once per module.  The eager
+    pt_expt model is returned alongside the archive path to serve as the dense
+    parity reference.
+    """
+    from deepmd.pt_expt.model import (
+        get_model,
+    )
+
+    model = get_model(copy.deepcopy(DPA1_CONFIG)).to(torch.float64)
+    model.eval()
+    data = {"model": model.serialize()}
+
+    # TemporaryDirectory (not mkdtemp + unlink/rmdir) so teardown also runs when
+    # the export raises and tolerates extra files left next to the archive.
+    with tempfile.TemporaryDirectory() as tmpdir:
+        pt2_path = os.path.join(tmpdir, "deeppot_dpa1_graph.pt2")
+        deserialize_to_file(
+            pt2_path,
+            copy.deepcopy(data),
+            do_atomic_virial=True,
+            lower_kind="graph",
+        )
+        yield pt2_path, model
+
+
+@pytest.mark.parametrize("pbc", [True, False])  # periodic vs non-periodic
+def test_graph_pt2_deepeval_parity(graph_pt2, pbc) -> None:
+    """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10."""
+    pt2_path, model = graph_pt2
+    coords, cells, atype = _build_system()
+    box = cells if pbc else None
+
+    # Anti-vacuity: the carry-all graph and the sel-capped dense reference only
+    # agree when no center is sel-bound.  Assert the system is non-binding.
+    max_nn = _max_neighbors(coords, box, atype)
+    assert max_nn < SEL, (
+        f"test system is sel-binding (max neighbors {max_nn} >= sel {SEL}); "
+        "carry-all graph and sel-capped dense reference would diverge"
+    )
+
+    dp = DeepPot(pt2_path)
+    assert dp.deep_eval.metadata["lower_input_kind"] == "graph"
+
+    e, f, v, ae, av = dp.eval(coords, box, atype, atomic=True)
+    ref = _eager_dense_reference(model, coords, box, atype)
+
+    np.testing.assert_allclose(
+        e.reshape(-1),
+        ref["energy"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="energy",
+    )
+    np.testing.assert_allclose(
+        f.reshape(-1), ref["force"].reshape(-1), rtol=1e-10, atol=1e-10, err_msg="force"
+    )
+    np.testing.assert_allclose(
+        v.reshape(-1),
+        ref["virial"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="virial",
+    )
+    np.testing.assert_allclose(
+        ae.reshape(-1),
+        ref["atom_energy"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="atom_energy",
+    )
+    np.testing.assert_allclose(
+        av.reshape(-1),
+        ref["atom_virial"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="atom_virial",
+    )

From e35fc389770f39f5c9df6d2aa48f50b8b63359b2 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 22:04:49 +0800
Subject: [PATCH 07/33] fix(pt_expt): compiled training runs the graph lower
 (eager==compiled); drop force_legacy_descriptor

Retarget _CompiledModel to compile forward_common_lower_graph for graph-eligible
descriptors (dpa1 attn_layer==0), gated by the same mixed_types()+uses_graph_lower()
predicate the eager default-flip uses; se_e2_a/dpa2/dpa3 keep compiling the dense
forward_lower. _trace_and_compile_graph builds a synthetic NeighborGraph with
prime-distinct nf/N/E axes (no make_fx duck-shape merge) and edge_vec as the
autograd leaf; _forward_graph builds the carry-all graph eagerly and unravels flat
(N,*) node outputs to (nf,nloc,*). cpp.simdlen=0 for the graph compile avoids an
inductor CPU scatter-vectorizer crash on the per-frame virial atomic_add.

Also fixes an eager autograd bug in dpa1 call_graph: xp.asarray(type_embedding,
device=dev) DETACHES under torch, so the type-embedding weights never trained in
the graph path (grad None despite a real finite-diff dependency). make_fx traced
through it, so compiled != eager and the optimizer diverged after step 0. Use
type_embedding directly (mirrors the dense path); the tebd net now trains and
eager==compiled to 1e-10 across the varying-natoms trajectory.

Drops the force_legacy_descriptor workaround + uses_graph_lower monkeypatch.
---
 deepmd/dpmodel/descriptor/dpa1.py     |  13 +-
 deepmd/pt_expt/train/training.py      | 533 +++++++++++++++++++++++++-
 source/tests/pt_expt/test_training.py |  48 +--
 3 files changed, 554 insertions(+), 40 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 27e2d68bfc..3f49b6c04f 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -757,9 +757,13 @@ def call_graph(
         )
         # FLAT node axis (N, ...): no (nf, nloc) reshape -- ragged-native, spec.
         if self.concat_output_tebd:
-            tebd = xp.asarray(type_embedding, device=dev)
+            # Use type_embedding directly (mirrors the dense path's
+            # ``xp.take(type_embedding, ...)``): ``xp.asarray(..., device=dev)``
+            # DETACHES under torch, silently severing the type-embedding weight
+            # gradient so the tebd net never trains; type_embedding already lives
+            # on the model device, so the device cast was redundant anyway.
             atype_local = xp.asarray(atype, device=dev)
-            atype_embd = xp.take(tebd, atype_local, axis=0)  # (N, tebd_dim)
+            atype_embd = xp.take(type_embedding, atype_local, axis=0)  # (N, tebd_dim)
             grrg = xp.concat([grrg, atype_embd], axis=-1)
         return grrg, rot_mat
 
@@ -1523,7 +1527,10 @@ def call_graph(
         ss = rr[:, 0:1]  # (E, 1)
         # neighbor / center type embeddings (concat mode); ghost type == owner type
         # so gathering by the LOCAL owner (src) reproduces the dense neighbor tebd.
-        tebd = xp.asarray(type_embedding, device=dev)
+        # NB: do NOT wrap in ``xp.asarray(..., device=dev)`` -- that DETACHES under
+        # torch and severs the type-embedding weight gradient (the tebd net would
+        # never train); type_embedding already lives on the model device.
+        tebd = type_embedding
         atype_embd_nlist = xp.take(tebd, nei_type, axis=0)  # (E, tebd_dim)
         if not self.type_one_side:
             atype_embd_nnei = xp.take(tebd, center_type, axis=0)  # (E, tebd_dim)
diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index bd6fdb02a3..ff685cb0c7 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -477,6 +477,25 @@ def fn(
         *task_buf_vals_trace,
     )
 
+    return (
+        _finalize_compiled_lower(traced_lower, model, was_training, compile_opts),
+        task_buf_order,
+    )
+
+
+def _finalize_compiled_lower(
+    traced_lower: "torch.fx.GraphModule",
+    model: torch.nn.Module,
+    was_training: bool,
+    compile_opts: dict[str, Any] | None,
+    extra_options: dict[str, Any] | None = None,
+) -> torch.nn.Module:
+    """Shared post-``make_fx`` tail: strip detach, rebuild, inductor-compile.
+
+    Used by both the dense :func:`_trace_and_compile` and the graph
+    :func:`_trace_and_compile_graph` so the second-order-gradient handling
+    (detach removal) and inductor options stay identical on both paths.
+    """
     # make_fx inserts aten.detach.default for saved tensors used in the
     # decomposed autograd.grad backward ops.  These detach nodes break
     # second-order gradient flow (d(force)/d(params) for force training).
@@ -503,6 +522,8 @@ def fn(
         # pytorch/pytorch#174379, #178080, #179494 under
         # data-dependent symbolic shapes.
     }
+    if extra_options:
+        inductor_options.update(extra_options)
     if compile_opts:
         inductor_options.update(compile_opts)
 
@@ -511,7 +532,356 @@ def fn(
         backend="inductor",
         dynamic=True,
         options=inductor_options,
-    ), task_buf_order
+    )
+
+
+def _model_uses_graph_lower(model: torch.nn.Module) -> bool:
+    """Whether ``model``'s eager default-flip routes through the GRAPH lower.
+
+    Mirrors the predicate in
+    :meth:`~deepmd.pt_expt.model.make_model.make_model.<locals>.CM._resolve_graph_method`
+    for ``neighbor_graph_method is None`` (the training default): a model is
+    graph-eligible iff it is ``mixed_types`` AND its single descriptor reports
+    ``uses_graph_lower() == True`` (currently only dpa1 ``attn_layer == 0``).
+
+    When True the compiled lower must be the GRAPH ``forward_common_lower_graph``
+    so the compiled path matches eager training (which already default-flips to
+    the carry-all graph forward); when False the dense ``forward_lower`` is
+    compiled (se_e2_a / dpa2 / dpa3 / linear / zbl).
+    """
+    if not hasattr(model, "mixed_types"):
+        return False
+    try:
+        if not model.mixed_types():
+            return False
+    except (AttributeError, NotImplementedError):  # probe unusable -> dense
+        return False
+    # Linear / ZBL atomic models have no single ``descriptor`` -> dense.
+    descriptor = getattr(getattr(model, "atomic_model", None), "descriptor", None)
+    uses_graph = getattr(descriptor, "uses_graph_lower", None)
+    if uses_graph is None:
+        return False
+    try:
+        return bool(uses_graph())
+    except (AttributeError, NotImplementedError):  # probe unusable -> dense
+        return False
+
+
+def _trace_and_compile_graph(
+    model: torch.nn.Module,
+    fparam: torch.Tensor | None,
+    aparam: torch.Tensor | None,
+    charge_spin: torch.Tensor | None,
+    compile_opts: dict[str, Any] | None = None,
+    task_buffers: dict[str, torch.Tensor] | None = None,
+) -> tuple[torch.nn.Module, tuple[str, ...]]:
+    """Symbolic-trace ``forward_common_lower_graph`` and inductor-compile it.
+
+    The GRAPH analogue of :func:`_trace_and_compile`.  Builds a small synthetic
+    NeighborGraph with prime-controlled ``nf`` / ``N`` / ``E`` axes (so make_fx's
+    duck-shape unification keeps the three dynamic dims as distinct symbols),
+    traces ``model.forward_common_lower_graph`` with ``edge_vec`` as the autograd
+    leaf, and translates the internal fitting keys to the public energy-model
+    keys (``atom_energy`` / ``energy`` / ``force`` / ``virial``).  The compiled
+    callable accepts the positional graph tensors plus the promoted task buffers
+    and returns those public keys on the FLAT node axis (``N == sum(n_node)``);
+    :meth:`_CompiledModel._forward_graph` unravels them to ``(nf, nloc, *)``.
+
+    Parameters
+    ----------
+    model
+        The (uncompiled) graph-eligible energy model.
+    fparam, aparam, charge_spin
+        Representative optional inputs (or ``None``) so the traced branch
+        matches what :meth:`_CompiledModel._forward_graph` passes at run time.
+    compile_opts
+        User-supplied inductor options (merged over the built-in defaults).
+    task_buffers
+        Per-task buffers promoted to FX placeholders (see
+        :func:`_detect_task_buffers`).
+    """
+    import math
+
+    from torch._decomp import (
+        get_decompositions,
+    )
+    from torch.fx.experimental.proxy_tensor import (
+        make_fx,
+    )
+
+    from deepmd.pt_expt.model.ener_model import (
+        _translate_energy_keys,
+    )
+
+    was_training = model.training
+    # Trace in train mode so create_graph=True is captured inside the graph
+    # force backward (forward_common_lower_graph passes create_graph=self.training).
+    model.train()
+
+    task_buf_order: tuple[str, ...] = tuple(task_buffers.keys()) if task_buffers else ()
+    task_buf_vals_trace: tuple[torch.Tensor, ...] = (
+        tuple(task_buffers[k] for k in task_buf_order) if task_buffers else ()
+    )
+
+    _fitting: torch.nn.Module | None = None
+    _atomic_model: torch.nn.Module | None = None
+    if task_buf_order:
+        try:
+            _fitting = model.get_fitting_net()
+        except AttributeError:
+            pass
+        try:
+            _atomic_model = model.atomic_model
+        except AttributeError:
+            pass
+
+    do_grad_r = model.do_grad_r("energy")
+    do_grad_c = model.do_grad_c("energy")
+
+    # ------------------------------------------------------------------
+    # Build the trace-time NeighborGraph with prime-distinct nf / N / E.
+    #
+    # make_fx (tracing_mode="symbolic") unifies dimension symbols that share a
+    # concrete value (duck-shape merging).  The three dynamic axes of the graph
+    # lower must stay distinct symbols, otherwise the per-frame segment_sum
+    # (N -> nf) and the per-edge scatter (E -> N) bake in a false equality:
+    #   * nf  = n_node.shape[0]      (per-frame reductions)
+    #   * N   = atype.shape[0]       (flat node axis = sum(n_node))
+    #   * E   = edge_vec.shape[0]    (edge axis)
+    # nf / E are primes and N = nf * nloc; all three avoid every ``_forbidden`` dim.
+    # ------------------------------------------------------------------
+    _forbidden: set[int] = {
+        int(_d)
+        for _src in (model.parameters(), model.buffers())
+        for _p in _src
+        for _d in _p.shape
+        if _d > 1
+    }
+    try:
+        _dim_fp = model.get_dim_fparam()
+        if _dim_fp > 1:
+            _forbidden.add(_dim_fp)
+    except Exception:
+        pass
+    try:
+        _dim_ap = model.get_dim_aparam()
+        if _dim_ap > 1:
+            _forbidden.add(_dim_ap)
+    except Exception:
+        pass
+    if charge_spin is not None and charge_spin.shape[-1] > 1:
+        _forbidden.add(int(charge_spin.shape[-1]))
+    for _tbv in task_buf_vals_trace:
+        for _d in _tbv.shape:
+            if _d > 1:
+                _forbidden.add(int(_d))
+
+    trace_nf = _next_safe_prime(5, _forbidden)
+    # nloc such that N = trace_nf * nloc is collision-free (and != trace_nf).
+    nloc_trace = 7
+    while (trace_nf * nloc_trace) in (_forbidden | {trace_nf}):
+        nloc_trace += 1
+    trace_N = trace_nf * nloc_trace
+    # Static edge capacity, prime-padded to stay distinct from nf and N.
+    nnei = sum(model.get_sel())
+    e_max_base = max(math.ceil(1.25 * nloc_trace * nnei), 7)
+    e_max = _next_safe_prime(e_max_base, _forbidden | {trace_nf, trace_N})
+
+    sample = _make_graph_trace_inputs(
+        model,
+        e_max=e_max,
+        nframes=trace_nf,
+        nloc=nloc_trace,
+        want_fparam=fparam is not None,
+        want_aparam=aparam is not None,
+        want_charge_spin=charge_spin is not None,
+    )
+    (
+        s_atype,
+        s_n_node,
+        s_edge_index,
+        s_edge_vec,
+        s_edge_mask,
+        s_fparam,
+        s_aparam,
+        s_charge_spin,
+    ) = sample
+
+    def fn(
+        atype: torch.Tensor,
+        n_node: torch.Tensor,
+        edge_index: torch.Tensor,
+        edge_vec: torch.Tensor,
+        edge_mask: torch.Tensor,
+        fparam: torch.Tensor | None,
+        aparam: torch.Tensor | None,
+        charge_spin: torch.Tensor | None,
+        *task_buf_vals: torch.Tensor,
+    ) -> dict[str, torch.Tensor]:
+        # Patch task-specific buffers with the proxy tensors so make_fx records
+        # them as FX placeholders (mirrors the dense ``_trace_and_compile``).
+        originals: dict[str, torch.Tensor | None] = {}
+        if task_buf_order:
+            for name, val in zip(task_buf_order, task_buf_vals, strict=True):
+                if name.startswith(_AM_PREFIX):
+                    actual = name[len(_AM_PREFIX) :]
+                    if _atomic_model is not None:
+                        originals[name] = _atomic_model._buffers.get(actual)
+                        _atomic_model._buffers[actual] = val
+                else:
+                    if _fitting is not None:
+                        originals[name] = _fitting._buffers.get(name)
+                        _fitting._buffers[name] = val
+        try:
+            # forward_common_lower_graph makes edge_vec the autograd leaf
+            # internally, so no outer detach/requires_grad_ here.
+            model_ret = model.forward_common_lower_graph(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                do_atomic_virial=False,
+                fparam=fparam,
+                aparam=aparam,
+                charge_spin=charge_spin,
+            )
+            return _translate_energy_keys(
+                model_ret,
+                do_grad_r=do_grad_r,
+                do_grad_c=do_grad_c,
+                do_atomic_virial=False,
+                local=True,
+            )
+        finally:
+            for name, orig in originals.items():
+                if name.startswith(_AM_PREFIX):
+                    actual = name[len(_AM_PREFIX) :]
+                    if _atomic_model is not None:
+                        _atomic_model._buffers[actual] = orig
+                else:
+                    if _fitting is not None:
+                        _fitting._buffers[name] = orig
+
+    decomp_table = get_decompositions([torch.ops.aten.silu_backward.default])
+
+    traced_lower = make_fx(
+        fn,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+        decomposition_table=decomp_table,
+    )(
+        s_atype,
+        s_n_node,
+        s_edge_index,
+        s_edge_vec,
+        s_edge_mask,
+        s_fparam,
+        s_aparam,
+        s_charge_spin,
+        *task_buf_vals_trace,
+    )
+
+    # The per-frame virial reduction scatters E edges into the (nf, 3, 3) virial
+    # via an atomic_add; inductor's CPU vectorizer asserts on that scatter's
+    # scalar index (``index.is_vec``).  Disable CPU SIMD for the graph lower so
+    # the scatter is emitted scalar — numerically this only removes a
+    # reduction-order source, keeping eager==compiled within fp64 tolerance.
+    return (
+        _finalize_compiled_lower(
+            traced_lower,
+            model,
+            was_training,
+            compile_opts,
+            extra_options={"cpp.simdlen": 0},
+        ),
+        task_buf_order,
+    )
+
+
+def _make_graph_trace_inputs(
+    model: torch.nn.Module,
+    e_max: int,
+    nframes: int,
+    nloc: int,
+    *,
+    want_fparam: bool,
+    want_aparam: bool,
+    want_charge_spin: bool,
+) -> tuple[torch.Tensor | None, ...]:
+    """Build a synthetic carry-all NeighborGraph for the graph-compile trace.
+
+    Returns positional tensors in the order
+    ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
+    charge_spin)`` matching ``forward_common_lower_graph``.  The edge axis is
+    padded to the STATIC ``e_max`` (masked) so its concrete value is a chosen
+    prime; ``fparam`` / ``aparam`` / ``charge_spin`` are emitted only when the
+    model+data path actually carries them (``want_*``), so the traced branch
+    matches the run-time call.
+    """
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        GraphLayout,
+        build_neighbor_graph,
+    )
+
+    rcut = model.get_rcut()
+    ntypes = len(model.get_type_map())
+    dim_fparam = model.get_dim_fparam()
+    dim_aparam = model.get_dim_aparam()
+
+    box_size = rcut * 3.0  # cubic cell edge; atoms fill its middle half per axis
+    box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9)
+    rng = np.random.default_rng(42)  # fixed seed -> reproducible trace inputs
+    coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25
+    atype_np = np.zeros((nframes, nloc), dtype=np.int64)
+    for i in range(nloc):  # cycle types 0..ntypes-1 along the atom axis
+        atype_np[:, i] = i % ntypes
+
+    coord_t = torch.tensor(coord_np, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
+    atype_t = torch.tensor(atype_np, dtype=torch.int64, device=DEVICE)
+    box_t = torch.tensor(
+        np.tile(box_np, (nframes, 1)), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
+    )
+
+    graph = build_neighbor_graph(
+        coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max)
+    )
+
+    s_atype = atype_t.reshape(-1)  # flatten (nframes, nloc) -> (nframes * nloc,)
+    s_n_node = graph.n_node
+    s_edge_index = graph.edge_index
+    s_edge_vec = graph.edge_vec
+    s_edge_mask = graph.edge_mask
+
+    s_fparam = (
+        torch.zeros(nframes, dim_fparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
+        if (want_fparam and dim_fparam > 0)
+        else None
+    )
+    s_aparam = (
+        torch.zeros(
+            nframes, nloc, dim_aparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
+        )
+        if (want_aparam and dim_aparam > 0)
+        else None
+    )
+    dim_cs = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0
+    s_charge_spin = (
+        torch.zeros(nframes, dim_cs, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
+        if (want_charge_spin and dim_cs > 0)
+        else None
+    )
+
+    return (
+        s_atype,
+        s_n_node,
+        s_edge_index,
+        s_edge_vec,
+        s_edge_mask,
+        s_fparam,
+        s_aparam,
+        s_charge_spin,
+    )
 
 
 class _CompiledModel(torch.nn.Module):
@@ -546,6 +916,9 @@ def __init__(
         self._compiled_by_structure: dict = (
             compiled_by_structure if compiled_by_structure is not None else {}
         )
+        # Resolved on the first forward: whether to compile the GRAPH lower
+        # (graph-eligible mixed_types descriptors) or the dense forward_lower.
+        self._graph_eligible: bool | None = None
 
     def __getattr__(self, name: str) -> Any:
         # Delegate unknown lookups to original_model so that callers such as
@@ -579,6 +952,18 @@ def forward(
 
         nframes, nloc = atype.shape[:2]
         rcut = self.original_model.get_rcut()
+
+        # Graph-eligible models (dpa1 attn_layer==0) default-flip to the carry-all
+        # GRAPH forward in eager training; the compiled lower must be the GRAPH
+        # lower too, otherwise the eager (graph) and compiled (dense) backward
+        # gradients diverge at fp64 accumulation and the optimizer amplifies it.
+        if self._graph_eligible is None:
+            self._graph_eligible = _model_uses_graph_lower(self.original_model)
+        if self._graph_eligible:
+            return self._forward_graph(
+                coord, atype, box, fparam, aparam, charge_spin, nframes, nloc, rcut
+            )
+
         sel = self.original_model.get_sel()
 
         # coord extension + nlist (data-dependent, run in eager)
@@ -751,6 +1136,152 @@ def forward(
             out["mask"] = result["mask"]
         return out
 
+    def _forward_graph(
+        self,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
+        box: torch.Tensor | None,
+        fparam: torch.Tensor | None,
+        aparam: torch.Tensor | None,
+        charge_spin: torch.Tensor | None,
+        nframes: int,
+        nloc: int,
+        rcut: float,
+    ) -> dict[str, torch.Tensor]:
+        """Carry-all GRAPH forward -> compiled ``forward_common_lower_graph``.
+
+        Builds the carry-all NeighborGraph eagerly (the SAME builder the eager
+        uncompiled default-flip uses, so the graph tensors are bit-identical),
+        then calls the compiled graph lower.  The graph force is per-LOCAL-node
+        ``(N, 3)`` with ``N == nframes * nloc`` for a single-rank carry-all graph,
+        so no extended->local scatter is needed; only the flat ``(N, *)`` node
+        keys are unravelled to ``(nf, nloc, *)`` at the I/O boundary.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph,
+        )
+
+        _model = self.original_model
+
+        coord_3d = coord.detach().reshape(nframes, nloc, 3)
+        box_flat = box.detach().reshape(nframes, 9) if box is not None else None
+
+        # Mirror the optional-input defaulting of the dense path / eager
+        # call_common: a model configured with fparam / charge_spin substitutes
+        # its default when the data omits it, so the compiled (frozen) branch
+        # always sees a tensor.
+        _dim_fparam = (
+            _model.get_dim_fparam() if hasattr(_model, "get_dim_fparam") else 0
+        )
+        if fparam is None and _dim_fparam > 0:
+            _default_fparam = _model.get_default_fparam()
+            if _default_fparam is not None:
+                fparam = (
+                    torch.as_tensor(
+                        _default_fparam, dtype=coord_3d.dtype, device=coord_3d.device
+                    )
+                    .reshape(1, _dim_fparam)
+                    .expand(nframes, -1)
+                )
+        _dim_cs = (
+            _model.get_dim_chg_spin() if hasattr(_model, "get_dim_chg_spin") else 0
+        )
+        if charge_spin is None and _dim_cs > 0:
+            _default_cs = _model.get_default_chg_spin()
+            if _default_cs is not None:
+                charge_spin = (
+                    torch.as_tensor(
+                        _default_cs, dtype=coord_3d.dtype, device=coord_3d.device
+                    )
+                    .reshape(1, _dim_cs)
+                    .expand(nframes, -1)
+                )
+
+        # Carry-all graph (dynamic E, no edge_capacity) — identical to the eager
+        # uncompiled ``_call_common_graph`` builder so the two paths match.
+        ng = build_neighbor_graph(coord_3d, atype, box_flat, rcut)
+        atype_flat = atype.reshape(nframes * nloc)
+
+        # Lazy compile of the GRAPH lower (cached per structure key).
+        if self.compiled_forward_lower is None:
+            if self._structure_key in self._compiled_by_structure:
+                compiled_lower, buf_order = self._compiled_by_structure[
+                    self._structure_key
+                ]
+                log.info("Reusing compiled graph lower (shared structure, lazy).")
+            else:
+                log.info(
+                    "Lazy compile (graph lower): tracing on first forward call "
+                    "(structure_key=%s).",
+                    self._structure_key,
+                )
+                compiled_lower, buf_order = _trace_and_compile_graph(
+                    _model,
+                    fparam,
+                    aparam,
+                    charge_spin,
+                    task_buffers=self._task_buffers,
+                    compile_opts=self._compile_opts,
+                )
+                self._compiled_by_structure[self._structure_key] = (
+                    compiled_lower,
+                    buf_order,
+                )
+            self.compiled_forward_lower = compiled_lower
+            self._task_buf_order = buf_order
+            self._task_buffers = None
+
+        # Feed a detached, grad-enabled edge_vec leaf: the traced graph's internal
+        # ``edge_vec.detach()`` is stripped by ``_strip_saved_tensor_detach`` (as
+        # for the dense ext_coord leaf), so the force backward roots at this input.
+        edge_vec = ng.edge_vec.detach().requires_grad_(True)
+
+        if self._task_buf_order:
+            try:
+                _fitting = _model.get_fitting_net()
+                _am = getattr(_model, "atomic_model", None)
+                _vals: list[torch.Tensor] = []
+                for _name in self._task_buf_order:
+                    if _name.startswith(_AM_PREFIX):
+                        _actual = _name[len(_AM_PREFIX) :]
+                        _vals.append(_am._buffers[_actual])
+                    else:
+                        _vals.append(getattr(_fitting, _name))
+                task_buf_vals: tuple = tuple(_vals)
+            except (AttributeError, KeyError) as exc:  # missing attr or buffer
+                raise RuntimeError(
+                    f"Compiled graph expects task buffers {self._task_buf_order!r} "
+                    "but they could not be retrieved from the model. "
+                    "This is a bug in the compile path."
+                ) from exc
+        else:
+            task_buf_vals = ()
+
+        result = self.compiled_forward_lower(
+            atype_flat,
+            ng.n_node,
+            ng.edge_index,
+            edge_vec,
+            ng.edge_mask,
+            fparam,
+            aparam,
+            charge_spin,
+            *task_buf_vals,
+        )
+
+        # The compiled graph lower emits PUBLIC keys on the FLAT node axis
+        # (``atom_energy`` / ``force`` are (N, *); ``energy`` / ``virial`` are
+        # (nf, *)).  Unravel the node-level keys to rectangular (nf, nloc, *) so
+        # callers receive the same shapes as the dense path.
+        N = nframes * nloc
+        out: dict[str, torch.Tensor] = {}
+        for key, val in result.items():
+            if val is not None and val.shape[:1] == torch.Size([N]) and N != nframes:
+                out[key] = val.reshape(nframes, nloc, *val.shape[1:])
+            else:
+                out[key] = val
+        return out
+
 
 # ---------------------------------------------------------------------------
 # Trainer
diff --git a/source/tests/pt_expt/test_training.py b/source/tests/pt_expt/test_training.py
index 45061c084a..a9764947c0 100644
--- a/source/tests/pt_expt/test_training.py
+++ b/source/tests/pt_expt/test_training.py
@@ -1352,9 +1352,7 @@ def _make_varying_config(
         config = normalize(config)
         return config
 
-    def _check_varying_natoms(
-        self, descriptor: dict | None = None, force_legacy_descriptor: bool = False
-    ) -> None:
+    def _check_varying_natoms(self, descriptor: dict | None = None) -> None:
         """Per-step compiled-vs-uncompiled comparison for the given descriptor.
 
         The loss config has ``start_pref_f=1000`` and ``start_pref_v=1.0``,
@@ -1370,17 +1368,10 @@ def _check_varying_natoms(
         cannot meet that on float64 the descriptor has a real numerical
         problem (see the DPA1 limitation note where this happened).
 
-        ``force_legacy_descriptor`` makes a graph-eligible descriptor (dpa1
-        ``attn_layer==0``) take the legacy *dense* (env-mat) path on BOTH the
-        compiled and uncompiled sides, so this stays a true compile-correctness
-        check (same computation, compiled vs eager).  The pt_expt eager default
-        for such a descriptor is the carry-all GRAPH forward while the compiled
-        ``forward_lower`` is the sel-capped DENSE forward; those are two
-        *different* force computations whose parameter gradients agree only to
-        fp64 accumulation (~1e-12), which the optimizer then amplifies into a
-        diverging training trajectory.  Making the compiled GRAPH lower (so
-        eager==compiled) is tracked for PR-B; until then this test exercises the
-        dense path it actually compiles.
+        Graph-eligible descriptors (dpa1 ``attn_layer==0``) compile the GRAPH
+        lower (``forward_common_lower_graph``) so the compiled path matches the
+        eager carry-all graph default-flip; non-eligible descriptors
+        (se_e2_a / dpa2 / dpa3) compile the dense ``forward_lower``.
         """
         from deepmd.pt_expt.train.training import (
             _CompiledModel,
@@ -1400,16 +1391,6 @@ def _check_varying_natoms(
                 compiled_model = trainer_c.wrapper.model["Default"]
                 self.assertIsInstance(compiled_model, _CompiledModel)
 
-                if force_legacy_descriptor:
-                    # Pin BOTH sides to the legacy dense (env-mat) path so the
-                    # uncompiled reference matches the dense ``forward_lower``
-                    # that gets compiled (must happen before the first forward,
-                    # i.e. before the lazy compile trace).  See the docstring /
-                    # PR-B note: the graph forward vs dense forward differ in the
-                    # backward at fp64 precision, which the optimizer amplifies.
-                    for _m in (trainer_uc.model, compiled_model.original_model):
-                        _m.get_descriptor().uses_graph_lower = lambda: False
-
                 # Sync weights so predictions can be compared exactly
                 compiled_model.original_model.load_state_dict(
                     trainer_uc.model.state_dict()
@@ -1482,25 +1463,20 @@ def test_compiled_matches_uncompiled_varying_natoms_dpa3(self) -> None:
         self._check_varying_natoms(_DESCRIPTOR_DPA3)
 
     def test_compiled_matches_uncompiled_varying_natoms_dpa1_no_attn(self) -> None:
-        """DPA1 (attn_layer=0): compiled vs uncompiled match (dense path).
+        """DPA1 (attn_layer=0): compiled vs uncompiled match (GRAPH lower).
 
-        ``force_legacy_descriptor=True`` pins both sides to the legacy dense
-        (env-mat) forward -- the path the compiled ``forward_lower`` actually
-        uses.  The pt_expt eager default for dpa1(attn_layer=0) is the carry-all
-        GRAPH forward, a *different* force computation from the compiled dense
-        forward; their backward gradients agree only to fp64 accumulation, which
-        the optimizer amplifies, so comparing graph-vs-dense through training is
-        ill-posed.  Making the compiled path the GRAPH lower (eager==compiled)
-        is tracked for PR-B (graph .pt2/export).
+        The pt_expt eager default for dpa1(attn_layer=0) is the carry-all GRAPH
+        forward, and the compiled path now compiles the matching GRAPH lower
+        (``forward_common_lower_graph``) -- so eager==compiled and the
+        multi-step varying-natoms trajectory (predictions + per-parameter grads
+        + loss) agrees to the strict ``atol=rtol=1e-10`` tolerance.
 
         DPA1 with attention layers is intentionally not covered: the
         compiled se_atten path is hardware-sensitive on multi-threaded
         CPUs (parallel reduction order diverges from eager above the
         1e-10 tolerance).  ``_compile_model`` warns the user instead.
         """
-        self._check_varying_natoms(
-            _DESCRIPTOR_DPA1_NO_ATTN, force_legacy_descriptor=True
-        )
+        self._check_varying_natoms(_DESCRIPTOR_DPA1_NO_ATTN)
 
     def test_compile_warns_dpa1_with_attention(self) -> None:
         """DPA1 (attn_layer>0) under compile must emit a warning.

From 47fb700efb0d0225762beacde4d315144d13e28f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 22:46:07 +0800
Subject: [PATCH 08/33] =?UTF-8?q?docs(pt=5Fexpt):=20B1=20final-review=20mi?=
 =?UTF-8?q?nors=20=E2=80=94=20document=20nloc=3D=3D1=20unravel-skip=20+=20?=
 =?UTF-8?q?legacy-gate=20assumption=20in=20the=20graph=20compile=20path?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepmd/pt_expt/train/training.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index ff685cb0c7..6393d78e39 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -548,6 +548,11 @@ def _model_uses_graph_lower(model: torch.nn.Module) -> bool:
     so the compiled path matches eager training (which already default-flips to
     the carry-all graph forward); when False the dense ``forward_lower`` is
     compiled (se_e2_a / dpa2 / dpa3 / linear / zbl).
+
+    ASSUMPTION: training uses the default ``neighbor_graph_method`` (None). If a
+    user-facing ``"legacy"`` opt-out is ever plumbed into the trainer, this gate
+    must also honor it (else eager would run dense while the compiled path runs
+    the graph lower, re-introducing the eager!=compiled divergence this fixes).
     """
     if not hasattr(model, "mixed_types"):
         return False
@@ -1276,6 +1281,12 @@ def _forward_graph(
         N = nframes * nloc
         out: dict[str, torch.Tensor] = {}
         for key, val in result.items():
+            # ``N != nframes`` distinguishes node-level keys (lead dim N) from
+            # frame-level keys (lead dim nf) by shape. DEGENERATE: when nloc==1,
+            # N == nframes, so node-level keys are NOT unravelled and stay
+            # (nf, *) instead of (nf, 1, *). Harmless for the varying-natoms
+            # trainer (nloc >> 1); a single-atom-per-frame system would need an
+            # explicit per-key category check instead of the shape heuristic.
             if val is not None and val.shape[:1] == torch.Size([N]) and N != nframes:
                 out[key] = val.reshape(nframes, nloc, *val.shape[1:])
             else:

From b046874f03c237c0863d28a44bd83487fefe1537 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 29 Jun 2026 14:47:28 +0000
Subject: [PATCH 09/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/utils/serialization.py         |  3 +--
 .../tests/pt_expt/model/test_graph_export.py  | 26 ++++++++++++++-----
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index dcb8aa58c1..532fbbb4b2 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -337,13 +337,12 @@ def _make_graph_sample_inputs(
     nloc : int
         Number of local atoms per frame (``N == nframes * nloc``).
     """
+    import deepmd.pt_expt.utils.env as _env
     from deepmd.dpmodel.utils.neighbor_graph import (
         GraphLayout,
         build_neighbor_graph,
     )
 
-    import deepmd.pt_expt.utils.env as _env
-
     rcut = model.get_rcut()
     ntypes = len(model.get_type_map())
     dim_fparam = model.get_dim_fparam()
diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py
index 56e2d6eb7b..6b735aa3d5 100644
--- a/source/tests/pt_expt/model/test_graph_export.py
+++ b/source/tests/pt_expt/model/test_graph_export.py
@@ -3,12 +3,26 @@
 
 import pytest
 import torch
-from deepmd.pt.utils import env
-from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1
-from deepmd.pt_expt.fitting import InvarFitting
-from deepmd.pt_expt.model import EnergyModel
-from deepmd.dpmodel.utils.neighbor_graph import build_neighbor_graph
-from ...seed import GLOBAL_SEED
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    build_neighbor_graph,
+)
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt_expt.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.pt_expt.fitting import (
+    InvarFitting,
+)
+from deepmd.pt_expt.model import (
+    EnergyModel,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
 
 _RCUT, _NT = 4.0, 2
 

From 0d3860e88e91d9e2806adf312e47bb2ef12bf4af Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 23:45:21 +0800
Subject: [PATCH 10/33] feat(pt_expt): graph .pt2 uses a dynamic edge axis
 (Dim(nedge)); drop static edge_capacity

The graph-form .pt2 export now marks the edge axis dynamic
(Dim("nedge", min=2)) instead of baking a static E_max=ceil(1.25*nloc*nnei)
capacity. The AOTI artifact accepts any system size with no capacity ceiling.

- _build_graph_dynamic_shapes: edge_index dim1 / edge_vec dim0 / edge_mask dim0
  are now dynamic; mirrors the dense Dim("nnei", min=...) precedent.
- _trace_and_export graph branch: drop the metadata["edge_capacity"] write;
  apply _strip_shape_assertions to neutralise the SIGFPE-prone deferred
  shape guards on the dynamic E axis (same handling the spin dense path uses).
- deep_eval._eval_model_graph: build the carry-all graph at its tight edge
  count (no edge_capacity padding).
- test_graph_deepeval: eval TWO different-size systems (8- and 20-atom,
  56 vs 380 real edges) through the SAME exported artifact; both match eager
  dense dpa1 at 1e-10 pbc+nopbc, both non-binding. The 20-atom system (380 > 263)
  would have overflowed the B1 static artifact -> the RED.
- test_graph_pt2_metadata: graph metadata no longer carries edge_capacity.
---
 deepmd/pt_expt/infer/deep_eval.py             | 16 ++---
 deepmd/pt_expt/utils/serialization.py         | 59 ++++++++++++-------
 .../pt_expt/infer/test_graph_deepeval.py      | 42 +++++++++----
 .../pt_expt/utils/test_graph_pt2_metadata.py  |  7 +--
 4 files changed, 80 insertions(+), 44 deletions(-)

diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py
index 47bcbff731..e03f893040 100644
--- a/deepmd/pt_expt/infer/deep_eval.py
+++ b/deepmd/pt_expt/infer/deep_eval.py
@@ -1652,15 +1652,15 @@ def _eval_model_graph(
         """Evaluate a graph-form ``.pt2`` (``lower_input_kind == "graph"``).
 
         Builds a carry-all :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`
-        from the eval system, padded to the static ``edge_capacity`` baked into
-        the AOTI artifact, and feeds the positional schema
+        from the eval system at its exact (tight) edge count and feeds the
+        positional schema
         ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
-        charge_spin)`` to the exported forward.  The forward returns the LOCAL
-        public keys directly, so results are reshaped without
-        ``communicate_extended_output``.
+        charge_spin)`` to the exported forward.  The AOTI artifact's edge axis
+        is DYNAMIC (B2.0), so no ``edge_capacity`` padding is needed.  The
+        forward returns the LOCAL public keys directly, so results are reshaped
+        without ``communicate_extended_output``.
         """
         from deepmd.dpmodel.utils.neighbor_graph import (
-            GraphLayout,
             build_neighbor_graph,
         )
         from deepmd.pt_expt.utils.env import (
@@ -1676,13 +1676,13 @@ def _eval_model_graph(
 
         coord_input = coords.reshape(nframes, natoms, 3)
         box_input = cells.reshape(nframes, 9) if cells is not None else None
-        edge_capacity = int(self.metadata["edge_capacity"])
+        # Dynamic edge axis (B2.0): build the carry-all graph at its exact edge
+        # count (no static padding); the AOTI artifact accepts any E.
         graph = build_neighbor_graph(
             coord_input,
             atom_types,
             box_input,
             self._rcut,
-            layout=GraphLayout(edge_capacity=edge_capacity),
         )
 
         atype_t = torch.tensor(
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 532fbbb4b2..03c678c0fa 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -422,10 +422,13 @@ def _build_graph_dynamic_shapes(
 ) -> tuple:
     """Build dynamic-shape specifications for the graph-form forward_lower export.
 
-    ``nframes`` (the ``n_node`` axis) and ``N`` (the flat node axis) are
-    dynamic dims; the edge axis ``E`` is STATIC (decision #16: the masked
-    ``edge_capacity`` path), expressed by leaving the edge dims unmarked
-    (``None``) so torch.export specialises them to the sample value.
+    ``nframes`` (the ``n_node`` axis), ``N`` (the flat node axis) AND the edge
+    axis ``E`` are all dynamic dims (B2.0: the dynamic edge axis replaces the
+    static ``edge_capacity`` of B1).  ``E`` is marked ``Dim("nedge", min=2)`` so
+    the AOTI artifact accepts any system size with no capacity ceiling — the
+    ``min=2`` lower bound mirrors the dense path's ``Dim("nnei", min=...)`` (a
+    dynamic, SIGFPE-tamed axis) and matches the carry-all builder's
+    ``min_edges=2`` guard (every dynamic graph carries >=2 edges).
 
     Parameters
     ----------
@@ -438,12 +441,13 @@ def _build_graph_dynamic_shapes(
     charge_spin = sample_inputs[7]
     nframes_dim = torch.export.Dim("nframes", min=1)
     n_node_total_dim = torch.export.Dim("n_node_total", min=1)
+    nedge_dim = torch.export.Dim("nedge", min=2)
     return (
         {0: n_node_total_dim},  # atype: (N,)
         {0: nframes_dim},  # n_node: (nf,)
-        None,  # edge_index: (2, E) — E static
-        None,  # edge_vec: (E, 3) — E static
-        None,  # edge_mask: (E,) — E static
+        {1: nedge_dim},  # edge_index: (2, E) — E dynamic
+        {0: nedge_dim},  # edge_vec: (E, 3) — E dynamic
+        {0: nedge_dim},  # edge_mask: (E,) — E dynamic
         {0: nframes_dim} if fparam is not None else None,  # fparam: (nf, ndf)
         {0: nframes_dim} if aparam is not None else None,  # aparam: (nf, nloc, nda)
         {0: nframes_dim} if charge_spin is not None else None,  # charge_spin
@@ -777,8 +781,9 @@ def deserialize_to_file(
         (``extended_coord``/``extended_atype``/``nlist``/``mapping``);
         ``"graph"`` traces the NeighborGraph schema
         (``atype``/``n_node``/``edge_index``/``edge_vec``/``edge_mask``) with a
-        static edge axis ``E = ceil(1.25 * nloc * nnei)``.  The selected schema
-        is recorded as ``lower_input_kind`` in ``metadata.json``.
+        DYNAMIC edge axis ``E`` (``Dim("nedge", min=2)``), so the artifact
+        accepts any system size.  The selected schema is recorded as
+        ``lower_input_kind`` in ``metadata.json``.
     """
     if model_file.endswith(".pt2"):
         _deserialize_to_file_pt2(
@@ -824,7 +829,7 @@ def _trace_and_export(
     lower_kind
         ``"nlist"`` (default) traces the dense quartet forward; ``"graph"``
         traces ``forward_lower_graph_exportable`` over the NeighborGraph schema
-        with a static edge axis. Recorded as ``lower_input_kind`` in metadata.
+        with a dynamic edge axis. Recorded as ``lower_input_kind`` in metadata.
 
     Returns
     -------
@@ -866,8 +871,8 @@ def _trace_and_export(
 
     # 2b. Graph-form export branch (NeighborGraph schema). The graph path is
     # LOCAL-only (no ghosts), single-rank, energy-model only in PR-A/PR-B; it
-    # traces ``forward_lower_graph_exportable`` with a STATIC edge axis. The
-    # dense (nlist) path below is left byte-unchanged.
+    # traces ``forward_lower_graph_exportable`` with a DYNAMIC edge axis (B2.0).
+    # The dense (nlist) path below is left byte-unchanged.
     if lower_kind == "graph":
         import math
 
@@ -887,17 +892,20 @@ def _trace_and_export(
                 "requires an energy model"
             )
 
-        # Static export edge capacity E_max = ceil(1.25 * nloc * nnei)
-        # (decision #12 headroom). nloc is the sample-system local-atom count.
+        # The edge axis is DYNAMIC (B2.0): the AOTI artifact accepts any edge
+        # count, so there is no capacity to bake. The trace sample is built at a
+        # concrete, padded edge size only to keep the trace tensors distinct
+        # from the other dynamic dims (nframes=2, N=14) under torch.export's
+        # duck-sizing; the value itself does NOT constrain runtime.
         nloc_sample = 7
         nnei = sum(model.get_sel())
-        e_max = math.ceil(1.25 * nloc_sample * nnei)
+        e_sample = math.ceil(1.25 * nloc_sample * nnei)
 
         _orig_device = _env.DEVICE
         _env.DEVICE = torch.device("cpu")
         try:
             sample_inputs = _make_graph_sample_inputs(
-                model, e_max=e_max, nframes=2, nloc=nloc_sample
+                model, e_max=e_sample, nframes=2, nloc=nloc_sample
             )
         finally:
             _env.DEVICE = _orig_device
@@ -948,6 +956,16 @@ def _trace_and_export(
             prefer_deferred_runtime_asserts_over_guards=True,
         )
 
+        # Neutralise shape-guard assertion nodes on the dynamic edge axis.
+        # ``prefer_deferred_runtime_asserts_over_guards=True`` converts the
+        # symbolic-shape guards discovered while tracing into deferred
+        # ``aten._assert_scalar`` nodes; on the dynamic ``E`` axis these are the
+        # SIGFPE-prone ``nloc_min``-family checks (CLAUDE.md AOTI pitfalls) that
+        # the dense spin path already strips. Replacing each condition with
+        # ``True`` (not erasing the node) keeps the graph well-formed while
+        # letting the AOTI artifact generalise across edge counts.
+        _strip_shape_assertions(exported.graph_module)
+
         if target_device.type != "cpu":
             from torch.export.passes import (
                 move_to_device_pass,
@@ -956,11 +974,10 @@ def _trace_and_export(
             exported = move_to_device_pass(exported, target_device)
 
         metadata["do_atomic_virial"] = do_atomic_virial
-        # The edge axis is specialized STATIC: torch.export bakes E to exactly
-        # e_max, so the AOTI forward only accepts edge tensors of this length.
-        # Persist it so the C++ conversion hub (PR-B Phase B2) pads/masks runtime
-        # edges to precisely this value instead of re-deriving the constant.
-        metadata["edge_capacity"] = e_max
+        # The edge axis is DYNAMIC (B2.0): the AOTI forward accepts any edge
+        # count, so there is no ``edge_capacity`` to persist. The C++ / Python
+        # conversion hub builds the carry-all graph at its exact (tight) edge
+        # count and feeds it straight through.
 
         json_source = model_json_override if model_json_override is not None else data
         data_for_json = deepcopy(json_source)
diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py
index e2bdabf04f..7fc83b677f 100644
--- a/source/tests/pt_expt/infer/test_graph_deepeval.py
+++ b/source/tests/pt_expt/infer/test_graph_deepeval.py
@@ -65,22 +65,36 @@
 SEL = 30
 
 
-def _build_system() -> tuple[np.ndarray, np.ndarray, np.ndarray]:
-    """A small, sparse cluster: 8 atoms inside a 5 A blob, centered in an 18 A box.
-
-    The blob keeps every atom within ``rcut`` of at most 7 others (<< ``sel``),
-    so the carry-all graph neighbor set equals the sel-capped dense one.
+def _build_system(
+    natoms: int = 8, seed: int = 20240626
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """A small, sparse cluster: ``natoms`` inside a 5 A blob, centered in an 18 A box.
+
+    The blob gives every atom at most ``natoms - 1`` neighbours within ``rcut``
+    (below ``sel`` provided ``natoms <= sel``), so the carry-all graph neighbor
+    set equals the sel-capped dense one.  Varying ``natoms`` changes the edge
+    count, exercising the DYNAMIC edge axis of the exported ``.pt2`` (B2.0).
     """
-    rng = np.random.default_rng(20240626)
-    natoms = 8
+    rng = np.random.default_rng(seed)
     box_size = 18.0
     blob = rng.random((natoms, 3)) * 5.0 + box_size * 0.5 - 2.5
     coords = blob.reshape(1, natoms, 3)
     cells = (np.eye(3) * box_size).reshape(1, 9)
-    atype = np.array([0, 1, 1, 0, 1, 1, 0, 1], dtype=np.int32)
+    # Alternate O/H types; both species are present whenever natoms >= 2.
+    atype = np.array([i % 2 for i in range(natoms)], dtype=np.int32)
     return coords, cells, atype
 
 
+# Two DIFFERENT-size systems evaluated through the SAME exported ``.pt2``.
+# Both are sparse, non-binding clusters but with different edge counts, so the
+# second size FAILS against a static-``E`` artifact (B1) and PASSES only once
+# the edge axis is dynamic (B2.0).
+_SYSTEMS = {
+    "small_8": {"natoms": 8, "seed": 20240626},
+    "large_20": {"natoms": 20, "seed": 20240701},
+}
+
+
 def _max_neighbors(
     coords: np.ndarray, cells: np.ndarray | None, atype: np.ndarray
 ) -> int:
@@ -165,11 +179,17 @@ def graph_pt2():
     os.rmdir(tmpdir)
 
 
+@pytest.mark.parametrize("system", list(_SYSTEMS))  # two different edge counts
 @pytest.mark.parametrize("pbc", [True, False])  # periodic vs non-periodic
-def test_graph_pt2_deepeval_parity(graph_pt2, pbc) -> None:
-    """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10."""
+def test_graph_pt2_deepeval_parity(graph_pt2, pbc, system) -> None:
+    """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10.
+
+    Both ``_SYSTEMS`` are fed through the SAME module-scoped ``.pt2``; the
+    differing edge counts prove the exported artifact's edge axis is dynamic
+    (a static-``E`` B1 artifact would reject / mis-shape the larger system).
+    """
     pt2_path, model = graph_pt2
-    coords, cells, atype = _build_system()
+    coords, cells, atype = _build_system(**_SYSTEMS[system])
     box = cells if pbc else None
 
     # Anti-vacuity: the carry-all graph and the sel-capped dense reference only
diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
index 5c38b5046d..17aef4d671 100644
--- a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
+++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
@@ -89,10 +89,9 @@ def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None:
         )
         meta = _read_metadata(p)
     assert meta["lower_input_kind"] == "graph"
-    # the static edge axis is baked into the AOTI artifact; E_max must be
-    # persisted so the C++ conversion hub (PR-B B2) pads runtime edges to it.
-    # E_max = ceil(1.25 * nloc_sample(7) * nnei(sum(sel)=30)) = 263.
-    assert meta["edge_capacity"] == 263
+    # B2.0: the edge axis is DYNAMIC (Dim("nedge", min=2)); there is no static
+    # capacity baked into the AOTI artifact, so no ``edge_capacity`` is persisted.
+    assert "edge_capacity" not in meta
 
 
 def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None:

From 40487c413d9331f84bb3c0d74f88c9bb16066f26 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Mon, 29 Jun 2026 23:49:51 +0800
Subject: [PATCH 11/33] =?UTF-8?q?docs(pt=5Fexpt):=20B2.0=20review=20?=
 =?UTF-8?q?=E2=80=94=20=5Fstrip=5Fshape=5Fassertions=20now=20documents=20t?=
 =?UTF-8?q?he=20graph=20(dynamic-E)=20caller=20+=20edge-axis=20safety?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 deepmd/pt_expt/utils/serialization.py | 47 +++++++++++++++------------
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 03c678c0fa..1bb49a8b5a 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -40,27 +40,32 @@
 
 
 def _strip_shape_assertions(graph_module: torch.nn.Module) -> None:
-    """Neutralise shape-guard assertion nodes in a spin model's exported graph.
-
-    ``torch.export`` inserts ``aten._assert_scalar`` nodes for symbolic shape
-    relationships discovered during tracing.  For the spin model, the atom-
-    doubling logic creates slice patterns that depend on ``(nall - nloc)``,
-    producing guards like ``Ne(nall, nloc)``.  These guards are spurious: the
-    model computes correct results even when ``nall == nloc`` (NoPBC, no ghost
-    atoms).
-
-    This function is **only called for spin models** (guarded by ``if is_spin``
-    in ``_trace_and_export``).  The assertion messages use opaque symbolic
-    variable names (e.g. ``Ne(s22, s96)``) rather than human-readable names,
-    so filtering by message content is not reliable.  Since
-    ``prefer_deferred_runtime_asserts_over_guards=True`` converts all shape
-    guards into these deferred assertions, and the only shape relationships in
-    the spin model involve nall/nloc, neutralising all of them is safe in this
-    context.
-
-    We replace each assertion's condition with ``True`` rather than erasing the
-    node; erasing nodes can disturb the FX graph structure and produce NaN
-    gradients on some Python/torch versions.
+    """Neutralise deferred shape-guard assertion nodes in an exported graph.
+
+    ``torch.export`` (with ``prefer_deferred_runtime_asserts_over_guards=True``)
+    inserts ``aten._assert_scalar`` nodes for symbolic-shape relationships
+    discovered during tracing.  The assertion messages use opaque symbolic names
+    (e.g. ``Ne(s22, s96)``), so filtering by message content is not reliable; we
+    replace each assertion's condition with ``True`` rather than erasing the node
+    (erasing can disturb the FX graph and yield NaN gradients on some torch
+    versions).
+
+    Called from TWO export paths in ``_trace_and_export``:
+
+    * **spin (dense) models** — atom-doubling slice patterns depend on
+      ``(nall - nloc)``, producing spurious guards like ``Ne(nall, nloc)``; the
+      model is correct even when ``nall == nloc`` (NoPBC, no ghosts).
+    * **graph models** — the DYNAMIC edge axis (``Dim("nedge")``) produces guards
+      of the ``nloc_min``/SIGFPE family on the edge count ``E``.  These are the
+      shape-specialization guards the static-``edge_capacity`` path was designed
+      to avoid; neutralising them is what makes one artifact eval any edge count.
+
+    **Safety:** in both contexts every input is constructed well-formed by the
+    builder (spin: valid atom doubling; graph: ``build_neighbor_graph`` /
+    ``buildGraphTensors`` always emit ``E >= min_edges == 2`` with in-range,
+    masked edges), so the neutralised guards would never legitimately fire.  The
+    only cost is that a MALFORMED runtime tensor no longer throws cleanly — the
+    documented AOTI trade-off (CLAUDE.md), accepted identically on both paths.
     """
     graph = graph_module.graph
     for node in list(graph.nodes):

From 9a8727b872e14ca7c246fb06c02c2db740a5fd71 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 00:18:15 +0800
Subject: [PATCH 12/33] test(infer): extend gen_dpa1.py with graph-eligible
 dpa1(attn_layer=0) generator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Section B of gen_dpa1.py produces deeppot_dpa1_graph.pt2 (lower_kind="graph",
do_atomic_virial=True) and the accompanying deeppot_dpa1_graph.expected sidecar.

Reference values come from an independent nlist .pt2 eval (NOT the graph .pt2)
so the C++ gtest (B2.5) validates the graph AOTI path against a known-good
reference. Sanity check: graph .pt2 vs nlist ref force diff 1.3e-18 (machine
precision; sel=30 >> actual neighbors, so both paths see identical neighbor
sets). Forces non-degenerate: max |F| ~5.3e-4 (PBC).

Config: type_map=[O,H], sel=30, rcut=6.0, attn_layer=0, neuron=[2,4,8],
axis_neuron=4, fitting neuron=[5,5,5], resnet_dt=True, seed=1 — mirrors
DPA1_CONFIG in source/tests/pt_expt/utils/test_graph_pt2_metadata.py.
---
 source/tests/infer/gen_dpa1.py | 166 ++++++++++++++++++++++++++++++++-
 1 file changed, 162 insertions(+), 4 deletions(-)

diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py
index 7eaaae4ae2..5ec523f5f5 100644
--- a/source/tests/infer/gen_dpa1.py
+++ b/source/tests/infer/gen_dpa1.py
@@ -1,15 +1,25 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Generate deeppot_dpa1.pth and deeppot_dpa1.pt2 test models.
+"""Generate deeppot_dpa1.pth, deeppot_dpa1.pt2, and deeppot_dpa1_graph.pt2 test models.
 
-Creates a DPA1 model from dpmodel config, serializes, and exports to both
-.pth (torch.jit) and .pt2 (torch.export) from the same weights.
-Also prints reference values for C++ tests (PBC and NoPbc).
+Creates two DPA1 models from dpmodel configs:
+  - deeppot_dpa1.pt2 / deeppot_dpa1.pth  (attn_layer=2, dense nlist-form export)
+  - deeppot_dpa1_graph.pt2                (attn_layer=0, graph-form export via
+                                            lower_kind="graph"; the graph forward
+                                            is eligible only when attn_layer==0)
+
+Each model is serialized once, and all of its exports share that model's weights.
+Reference sidecar files (.expected) consumed by C++ gtests are also written:
+  - deeppot_dpa1.expected   — from the nlist .pt2 eval (existing)
+  - deeppot_dpa1_graph.expected — from an nlist-form .pt2 eval of the same
+      weights (independent ground truth, NOT from the graph .pt2); the graph
+      .pt2 forces are sanity-checked against this reference at ≤1e-5.
 """
 
 import copy
 import os
 import sys
+import tempfile
 
 import numpy as np
 
@@ -171,6 +181,154 @@ def main():
     print(f"// .pth NoPbc total energy: {e_pth_np[0, 0]:.18e}")  # noqa: T201
     print(f"// .pth vs .pt2 NoPbc energy diff: {abs(e_np[0, 0] - e_pth_np[0, 0]):.2e}")  # noqa: T201
 
+    # ============================================================
+    # Section B: graph-eligible DPA1 (attn_layer=0) model
+    # ============================================================
+    # attn_layer=0 disables the attention layers, making the descriptor
+    # a plain two-body embedding (se_e2_a-like) that is eligible for the
+    # NeighborGraph forward path (forward_lower_graph_exportable).
+    # Config mirrors DPA1_CONFIG in
+    # source/tests/pt_expt/utils/test_graph_pt2_metadata.py
+    graph_config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_atten",
+            "sel": 30,
+            "rcut_smth": 2.0,
+            "rcut": 6.0,
+            "neuron": [2, 4, 8],
+            "axis_neuron": 4,
+            "attn": 5,
+            "attn_layer": 0,
+            "attn_dotr": True,
+            "attn_mask": False,
+            "activation_function": "tanh",
+            "scaling_factor": 1.0,
+            "normalize": True,
+            "temperature": 1.0,
+            "type_one_side": True,
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [5, 5, 5],
+            "resnet_dt": True,
+            "seed": 1,
+        },
+    }
+
+    print("\n---- Building graph-eligible DPA1 (attn_layer=0) ----")  # noqa: T201
+
+    # ---- B.1  Build dpmodel, serialize ----
+    model_g = get_model(copy.deepcopy(graph_config))
+    model_dict_g = model_g.serialize()
+
+    data_g = {
+        "model": copy.deepcopy(model_dict_g),
+        "model_def_script": graph_config,
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+
+    # ---- B.2  Compute reference via nlist .pt2 (independent of graph path) ----
+    # The reference for deeppot_dpa1_graph.expected comes from the NLIST .pt2
+    # (dense-quartet forward), NOT the graph .pt2.  This ensures the C++ gtest
+    # (B2.5) independently validates the graph AOTI path against a known-good
+    # nlist evaluation.
+    print("Exporting reference nlist .pt2 (independent ground truth) ...")  # noqa: T201
+    with tempfile.TemporaryDirectory() as _tmp:
+        nlist_ref_pt2 = os.path.join(_tmp, "dpa1_graph_nlist_ref.pt2")
+        pt_expt_deserialize_to_file(
+            nlist_ref_pt2,
+            copy.deepcopy(data_g),
+            do_atomic_virial=True,
+            lower_kind="nlist",  # independent: dense nlist, NOT graph
+        )
+        dp_nlist_ref = DeepPot(nlist_ref_pt2)
+
+        # PBC reference from nlist path
+        e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval(
+            coord, box, atype, atomic=True
+        )
+        # NoPBC reference from nlist path
+        e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval(
+            coord, None, atype, atomic=True
+        )
+
+    print(f"Nlist ref PBC energy: {e_r1[0, 0]:.18e}")  # noqa: T201
+    print(f"Nlist ref NoPBC energy: {e_rnp[0, 0]:.18e}")  # noqa: T201
+    max_ref_force_pbc = float(np.max(np.abs(f_r1)))
+    max_ref_force_nopbc = float(np.max(np.abs(f_rnp)))
+    print(f"Nlist ref PBC max |force|: {max_ref_force_pbc:.6e}")  # noqa: T201
+    print(f"Nlist ref NoPBC max |force|: {max_ref_force_nopbc:.6e}")  # noqa: T201
+    if max_ref_force_pbc < 1e-10:
+        raise RuntimeError(
+            f"Graph model nlist-ref forces are degenerate "
+            f"(max={max_ref_force_pbc:.2e}); weights may need perturbation."
+        )
+
+    # ---- B.3  Write sidecar reference file ----
+    graph_ref_path = os.path.join(base_dir, "deeppot_dpa1_graph.expected")
+    write_expected_ref(
+        graph_ref_path,
+        sections={
+            "pbc": {
+                "expected_e": ae_r1[0, :, 0],
+                "expected_f": f_r1[0],
+                "expected_v": av_r1[0],
+            },
+            "nopbc": {
+                "expected_e": ae_rnp[0, :, 0],
+                "expected_f": f_rnp[0],
+                "expected_v": av_rnp[0],
+            },
+        },
+        source_script="source/tests/infer/gen_dpa1.py",
+    )
+    print(f"Wrote {graph_ref_path}")  # noqa: T201
+
+    # ---- B.4  Export graph-form .pt2 ----
+    graph_pt2_path = os.path.join(base_dir, "deeppot_dpa1_graph.pt2")
+    print(f"Exporting to {graph_pt2_path} (lower_kind='graph') ...")  # noqa: T201
+    pt_expt_deserialize_to_file(
+        graph_pt2_path,
+        copy.deepcopy(data_g),
+        do_atomic_virial=True,
+        lower_kind="graph",
+    )
+    print("Graph .pt2 export done.")  # noqa: T201
+
+    # ---- B.5  Sanity-check: graph .pt2 vs nlist reference ----
+    # Both use the SAME weights; at non-binding sel the math is equivalent.
+    # Verifies that forward_lower_graph_exportable + edge_energy_deriv match
+    # the nlist forward for this concrete system.
+    dp_graph = DeepPot(graph_pt2_path)
+
+    # PBC sanity check
+    e_g1, f_g1, v_g1, ae_g1, av_g1 = dp_graph.eval(coord, box, atype, atomic=True)
+    force_diff_pbc = float(np.max(np.abs(f_g1[0] - f_r1[0])))
+    print(  # noqa: T201
+        f"Graph .pt2 vs nlist ref PBC force max diff: {force_diff_pbc:.2e}"
+    )
+    if force_diff_pbc > 1e-5:
+        raise RuntimeError(
+            f"BLOCKED: graph .pt2 PBC force differs from nlist reference by "
+            f"{force_diff_pbc:.2e} (threshold 1e-5)."
+        )
+
+    # NoPBC sanity check
+    e_gnp, f_gnp, v_gnp, ae_gnp, av_gnp = dp_graph.eval(coord, None, atype, atomic=True)
+    force_diff_nopbc = float(np.max(np.abs(f_gnp[0] - f_rnp[0])))
+    print(  # noqa: T201
+        f"Graph .pt2 vs nlist ref NoPBC force max diff: {force_diff_nopbc:.2e}"
+    )
+    if force_diff_nopbc > 1e-5:
+        raise RuntimeError(
+            f"BLOCKED: graph .pt2 NoPBC force differs from nlist reference by "
+            f"{force_diff_nopbc:.2e} (threshold 1e-5)."
+        )
+
+    print("\nAll graph sanity checks passed.")  # noqa: T201
     print("\nDone!")  # noqa: T201
 
 

From f97129c88c147e7992ac7495c4ccd615826fe605 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 00:28:55 +0800
Subject: [PATCH 13/33] feat(api_cc): graph-schema .pt2 ingestion in
 DeepPotPTExpt (single-rank)

B2.2: read lower_input_kind="graph" -> lower_input_is_graph_.
B2.3: run_model_graph with NeighborGraph AOTI input order
  (atype, n_node, edge_index, edge_vec, edge_mask, [fparam], [aparam],
  [charge_spin]); no coord / edge_scatter_index.
B2.4: GraphTensorPack + buildGraphTensors (delegates to
  createEdgeTensors+compactEdgeTensors for the rcut filter, dynamic edge
  count and 2 masked dummy edges; drops edge_index_ext, adds n_node=[nloc],
  node types from atype_ext[0:nloc]) + compute_inner & standalone dispatch
  branches. Multi-rank graph fails fast (PR-B3).
---
 source/api_cc/include/DeepPotPTExpt.h | 25 ++++++++
 source/api_cc/include/commonPT.h      | 84 +++++++++++++++++++++++++++
 source/api_cc/src/DeepPotPTExpt.cc    | 64 ++++++++++++++++++++
 3 files changed, 173 insertions(+)

diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index 68a553e29c..b569c7eb16 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -308,6 +308,7 @@ class DeepPotPTExpt : public DeepPotBackend {
   bool do_atomic_virial;  // whether model was exported with atomic virial corr
   int nnei;               // expected nlist nnei dimension (= sum(sel))
   bool lower_input_is_edge_ = false;
+  bool lower_input_is_graph_ = false;
   NeighborListData nlist_data;
   at::Tensor mapping_tensor;         // cached mapping tensor (LAMMPS path)
   at::Tensor firstneigh_tensor;      // cached nlist tensor (LAMMPS path)
@@ -398,6 +399,30 @@ class DeepPotPTExpt : public DeepPotBackend {
       const torch::Tensor& aparam,
       const torch::Tensor& charge_spin);
 
+  /**
+   * @brief Run a NeighborGraph-schema ``.pt2`` (lower_input_kind="graph").
+   *
+   * Positional AOTI input order matches the Python export ABI:
+   * ``(atype, n_node, edge_index, edge_vec, edge_mask, [fparam], [aparam],
+   * [charge_spin])``.  Unlike the edge schema there is no ``coord`` and no
+   * ``edge_scatter_index`` input; node count is carried by ``n_node`` and the
+   * geometry is fully described by ``edge_vec``.
+   *
+   * @param[in] atype Per-node local types, shape ``(N,)`` int64.
+   * @param[in] n_node Per-frame node count, shape ``(nf,)`` int64.
+   * @param[in] edge_index Folded edge graph ``(2, E)`` int64 [src, dst].
+   * @param[in] edge_vec Edge vectors ``(E, 3)`` (neighbour - center).
+   * @param[in] edge_mask Physical-edge mask ``(E,)`` bool.
+   */
+  std::vector<torch::Tensor> run_model_graph(const torch::Tensor& atype,
+                                             const torch::Tensor& n_node,
+                                             const torch::Tensor& edge_index,
+                                             const torch::Tensor& edge_vec,
+                                             const torch::Tensor& edge_mask,
+                                             const torch::Tensor& fparam,
+                                             const torch::Tensor& aparam,
+                                             const torch::Tensor& charge_spin);
+
   /**
    * @brief Run the with-comm .pt2 artifact with comm tensors appended.
    *
diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 643e53974a..30cd8e5e9f 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -364,6 +364,90 @@ inline EdgeTensorPack compactEdgeTensors(const torch::Tensor& edge_index,
   return pack;
 }
 
+struct GraphTensorPack {
+  torch::Tensor atype;
+  torch::Tensor n_node;
+  torch::Tensor edge_index;
+  torch::Tensor edge_vec;
+  torch::Tensor edge_mask;
+};
+
+/**
+ * @brief Build NeighborGraph input tensors from a host neighbor list
+ *        (single-rank, dynamic edge axis).
+ *
+ * Mirrors the edge schema but drops ``coord``/``edge_scatter_index`` and adds
+ * ``n_node``.  Edge construction is delegated to the existing
+ * ``createEdgeTensors``/``compactEdgeTensors`` helpers (same rcut filter,
+ * variable edge count and two masked dummy edges that keep the dynamic edge
+ * dimension non-empty); the wrapper then (a) drops the extended scatter index,
+ * (b) emits ``n_node = [nloc]`` for the single frame, and (c) sets the node
+ * types from the local slice of ``atype_ext``.
+ *
+ * @param nlist Neighbor-list rows (local idx into the extended set).
+ * @param coord Extended coordinates shaped as nall x 3.
+ * @param atype_ext Extended atom types, length nall.  Node types are taken from
+ *   the extended types (NOT ``atype[mapping]``); for single-rank ghost-free
+ *   this is just ``atype_ext[0:nloc]``, while multi-rank (B3) passes the halo
+ *   types directly.
+ * @param mapping Extended-to-local atom map, length nall.
+ * @param nloc Number of local atoms.
+ * @param nall Number of extended atoms.
+ * @param rcut Model cutoff (edges with ``rr > rcut**2`` are dropped).
+ * @param device Target device for the returned tensors.
+ * @param row_centers Optional center atom index for each neighbor-list row
+ *   (LAMMPS compacts away empty rows); ``nullptr`` means row i is center i.
+ */
+template <typename VALUETYPE>
+inline GraphTensorPack buildGraphTensors(
+    const std::vector<std::vector<int>>& nlist,
+    const std::vector<VALUETYPE>& coord,
+    const std::vector<int>& atype_ext,
+    const std::vector<std::int64_t>& mapping,
+    const int nloc,
+    const int nall,
+    const double rcut,
+    const torch::Device& device,
+    const std::vector<int>* row_centers = nullptr) {
+  auto int_options = torch::TensorOptions().dtype(torch::kInt64);
+
+  // 1. Cached-style topology only (no geometry): edge_index folds ghost
+  //    neighbours onto their local owners (fold_to_local=true), edge_index_ext
+  //    keeps extended indices for the on-device geometry recompute.
+  const EdgeTensorPack topo =
+      createEdgeTensors(nlist, coord, mapping, nloc, nall, device,
+                        /*with_geometry=*/false, row_centers,
+                        /*fold_to_local=*/true);
+
+  // 2. Recompute geometry from the current coords on-device, filter by rcut and
+  //    append the two masked dummy edges.  The model is compiled for float64
+  //    inputs, so build the coord tensor as float64 to match the edge path.
+  std::vector<double> coord_d(coord.begin(), coord.end());
+  at::Tensor coord_tensor =
+      torch::from_blob(coord_d.data(),
+                       {static_cast<std::int64_t>(nall), 3},
+                       torch::TensorOptions().dtype(torch::kFloat64))
+          .clone()
+          .to(device);
+  const EdgeTensorPack edges = compactEdgeTensors(
+      topo.edge_index, topo.edge_index_ext, coord_tensor, rcut);
+
+  GraphTensorPack pack;
+  pack.edge_index = edges.edge_index;  // local-folded (2, E)
+  pack.edge_vec = edges.edge_vec;      // (E, 3) neighbour - center
+  pack.edge_mask = edges.edge_mask;    // (E,) bool
+  pack.n_node =
+      torch::full({1}, static_cast<std::int64_t>(nloc), int_options).to(device);
+  // Node types from the local slice of the extended types.
+  std::vector<std::int64_t> atype_loc(atype_ext.begin(),
+                                      atype_ext.begin() + nloc);
+  pack.atype = torch::from_blob(atype_loc.data(),
+                                {static_cast<std::int64_t>(nloc)}, int_options)
+                   .clone()
+                   .to(device);
+  return pack;
+}
+
 }  // namespace deepmd
 
 #endif  // BUILD_PYTORCH
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 96033fcab4..25d82f623a 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -155,8 +155,10 @@ void DeepPotPTExpt::init(const std::string& model,
     const std::string lower_input_kind =
         metadata["lower_input_kind"].as_string();
     lower_input_is_edge_ = lower_input_kind == "edge_vec";
+    lower_input_is_graph_ = lower_input_kind == "graph";
   } else {
     lower_input_is_edge_ = false;
+    lower_input_is_graph_ = false;
   }
 
   type_map.clear();
@@ -289,6 +291,31 @@ std::vector<torch::Tensor> DeepPotPTExpt::run_model_edges(
   return loader->run(inputs);
 }
 
+std::vector<torch::Tensor> DeepPotPTExpt::run_model_graph(
+    const torch::Tensor& atype,
+    const torch::Tensor& n_node,
+    const torch::Tensor& edge_index,
+    const torch::Tensor& edge_vec,
+    const torch::Tensor& edge_mask,
+    const torch::Tensor& fparam,
+    const torch::Tensor& aparam,
+    const torch::Tensor& charge_spin) {
+  // NeighborGraph ABI: (atype, n_node, edge_index, edge_vec, edge_mask,
+  // [fparam], [aparam], [charge_spin]).  No coord, no edge_scatter_index.
+  std::vector<torch::Tensor> inputs = {atype, n_node, edge_index, edge_vec,
+                                       edge_mask};
+  if (dfparam > 0) {
+    inputs.push_back(fparam);
+  }
+  if (daparam > 0) {
+    inputs.push_back(aparam);
+  }
+  if (dchgspin > 0) {
+    inputs.push_back(charge_spin);
+  }
+  return loader->run(inputs);
+}
+
 std::vector<torch::Tensor> DeepPotPTExpt::run_model_with_comm(
     const torch::Tensor& coord,
     const torch::Tensor& atype,
@@ -475,6 +502,15 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   bool multi_rank = (lmp_list.nprocs > 1);
   bool atom_map_present = (lmp_list.mapping != nullptr);
   bool use_with_comm = has_comm_artifact_ && multi_rank;
+  // The NeighborGraph schema only has a single-rank artifact so far; the
+  // multi-rank (with-comm) graph path is PR-B3.  Fail fast before building
+  // any tensors so callers get a clear message instead of a wrong answer.
+  if (lower_input_is_graph_ && multi_rank) {
+    throw deepmd::deepmd_exception(
+        "Multi-rank graph (NeighborGraph) .pt2 inference is not yet "
+        "supported (PR-B3). Run single-rank, or use a dense/edge .pt2 for "
+        "multi-rank LAMMPS.");
+  }
   // Decision matrix (see PR #5450 description):
   //   non-GNN model (has_message_passing_ == false): regular path is
   //                                                  always safe.
@@ -556,6 +592,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
           /*fold_to_local=*/!use_with_comm);
       edge_index_tensor = edge_tensors.edge_index;
       edge_index_ext_tensor = edge_tensors.edge_index_ext;
+    } else if (lower_input_is_graph_) {
+      // Graph schema rebuilds the edge topology from the host nlist every step
+      // inside buildGraphTensors (raw, unpadded nlist_data.jlist +
+      // nlist_data.ilist centers), so nothing is cached here and the nlist is
+      // left unpadded (createEdgeTensors handles ragged rows and skips -1).
     } else {
       nlist_data.padding();
       firstneigh_tensor = createNlistTensor(nlist_data.jlist, nnei)
@@ -771,6 +812,17 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                           edge_tensors.edge_index, edge_tensors.edge_vec,
                           edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                           fparam_tensor, aparam_tensor, charge_spin_tensor);
+    } else if (lower_input_is_graph_) {
+      // Single-rank NeighborGraph schema: build (atype, n_node, edge_index,
+      // edge_vec, edge_mask) from the host nlist (node types from the extended
+      // types, folded local edge graph) and run the graph artifact.
+      const auto graph_tensors = buildGraphTensors(
+          nlist_data.jlist, dcoord, datype, mapping, nloc, nall_real,
+          static_cast<double>(rcut), device, &nlist_data.ilist);
+      flat_outputs = run_model_graph(
+          graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,
+          graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor,
+          aparam_tensor, charge_spin_tensor);
     } else {
       flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,
                                mapping_tensor, fparam_tensor, aparam_tensor,
@@ -1015,9 +1067,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
           .to(device);
   at::Tensor nlist_tensor;
   EdgeTensorPack edge_tensors;
+  GraphTensorPack graph_tensors;
   if (lower_input_is_edge_) {
     edge_tensors = createEdgeTensors(nlist_raw, coord_cpy_d, mapping_64, nloc,
                                      nall, device);
+  } else if (lower_input_is_graph_) {
+    // Standalone (no nlist) graph schema: build_nlist already cut at rcut and
+    // keys row i to center i, so no row_centers remapping is needed.
+    graph_tensors =
+        buildGraphTensors(nlist_raw, coord_cpy_d, atype_cpy, mapping_64, nloc,
+                          nall, static_cast<double>(rcut), device);
   } else {
     nlist_tensor =
         createNlistTensor(nlist_raw, nnei).to(torch::kInt64).to(device);
@@ -1104,6 +1163,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                         edge_tensors.edge_index, edge_tensors.edge_vec,
                         edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                         fparam_tensor, aparam_tensor, charge_spin_tensor);
+  } else if (lower_input_is_graph_) {
+    flat_outputs = run_model_graph(
+        graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,
+        graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor,
+        aparam_tensor, charge_spin_tensor);
   } else {
     flat_outputs =
         run_model(coord_Tensor, atype_Tensor, nlist_tensor, mapping_tensor,

From 074b3ff63bfadd2da298d12255df1ecb723579e8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 00:37:43 +0800
Subject: [PATCH 14/33] fix(api_cc): cache mapping vector as member to fix OOB
 on ago>0 graph path

Local std::vector<int64_t> mapping was declared in compute_inner and populated
only inside if(ago==0). The graph branch called buildGraphTensors with this
local vector on every step, causing an OOB heap read on ago>0 (mapping.size()==0).

Fix: promote mapping to a member mapping_ (parallel to mapping_tensor) so it
persists across steps. Edge-path (createEdgeTensors) and dense-path (mapping_tensor)
are unaffected in behavior; only the vector source changes from local to member.
---
 source/api_cc/include/DeepPotPTExpt.h |  5 +++--
 source/api_cc/src/DeepPotPTExpt.cc    | 20 ++++++++++----------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index b569c7eb16..d8bd22cbad 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -310,8 +310,9 @@ class DeepPotPTExpt : public DeepPotBackend {
   bool lower_input_is_edge_ = false;
   bool lower_input_is_graph_ = false;
   NeighborListData nlist_data;
-  at::Tensor mapping_tensor;         // cached mapping tensor (LAMMPS path)
-  at::Tensor firstneigh_tensor;      // cached nlist tensor (LAMMPS path)
+  at::Tensor mapping_tensor;              // cached mapping tensor (LAMMPS path)
+  std::vector<std::int64_t> mapping_;    // cached mapping vector (LAMMPS path)
+  at::Tensor firstneigh_tensor;          // cached nlist tensor (LAMMPS path)
   at::Tensor edge_index_tensor;      // cached local edge graph (LAMMPS path)
   at::Tensor edge_index_ext_tensor;  // cached extended edge graph (LAMMPS path)
   std::unique_ptr<torch::inductor::AOTIModelPackageLoader> loader;
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 25d82f623a..3b0e976da3 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -541,19 +541,19 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   // LAMMPS sets ago=0 on every nlist rebuild (neighbor rebuild, re-partition,
   // atom exchange between subdomains), so `ago > 0` implies the cached
   // mapping and nlist tensors are still valid.  Rebuild only on ago==0.
-  std::vector<std::int64_t> mapping;
   if (ago == 0) {
     nlist_data.copy_from_nlist(lmp_list, nall - nghost);
     nlist_data.shuffle_exclude_empty(fwd_map);
 
-    // Rebuild mapping tensor
+    // Rebuild mapping vector and tensor (cached as members; graph branch reads
+    // mapping_ on every step, not just ago==0, so the vector must persist).
     if (lmp_list.mapping) {
-      mapping.resize(nall_real);
+      mapping_.resize(nall_real);
       for (int ii = 0; ii < nall_real; ii++) {
-        mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]];
+        mapping_[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]];
       }
       mapping_tensor =
-          torch::from_blob(mapping.data(), {1, nall_real}, int_option)
+          torch::from_blob(mapping_.data(), {1, nall_real}, int_option)
               .clone()
               .to(device);
     } else {
@@ -566,12 +566,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       //     features via border_op and ignores this tensor for ghost
       //     gather — see deepmd/pt_expt/descriptor/
       //     repflows.py::_exchange_ghosts).
-      mapping.resize(nall_real);
+      mapping_.resize(nall_real);
       for (int ii = 0; ii < nall_real; ii++) {
-        mapping[ii] = ii;
+        mapping_[ii] = ii;
       }
       mapping_tensor =
-          torch::from_blob(mapping.data(), {1, nall_real}, int_option)
+          torch::from_blob(mapping_.data(), {1, nall_real}, int_option)
               .clone()
               .to(device);
     }
@@ -587,7 +587,7 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       // their features can be exchanged across ranks via border_op, instead of
       // being folded onto a local owner that this rank does not own.
       const auto edge_tensors = createEdgeTensors(
-          nlist_data.jlist, dcoord, mapping, nloc, nall_real, device,
+          nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device,
           /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist,
           /*fold_to_local=*/!use_with_comm);
       edge_index_tensor = edge_tensors.edge_index;
@@ -817,7 +817,7 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       // edge_vec, edge_mask) from the host nlist (node types from the extended
       // types, folded local edge graph) and run the graph artifact.
       const auto graph_tensors = buildGraphTensors(
-          nlist_data.jlist, dcoord, datype, mapping, nloc, nall_real,
+          nlist_data.jlist, dcoord, datype, mapping_, nloc, nall_real,
           static_cast<double>(rcut), device, &nlist_data.ilist);
       flat_outputs = run_model_graph(
           graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,

From 26b2c9d7cbe200bfb42e726242ad2f3bb94db968 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 01:05:37 +0800
Subject: [PATCH 15/33] test(api_cc): dpa1 graph .pt2 single-rank parity + fix
 graph output extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add test_deeppot_dpa1_graph_ptexpt.cc — the first runtime exercise of the
NeighborGraph .pt2 C++ ingestion (B2.2-4). Four cases x {double,float}:
build-nlist parity vs the committed .expected, a second 12-atom system through
the same model (dynamic edge axis), the LAMMPS InputNlist+ago=0/ago=1 path
(compute_inner + cached mapping_), and a tiny no-edge system (nedge_min=2 guard).

The first run surfaced a B2.2-4 bug: the graph forward emits LOCAL flat-N PUBLIC
keys (atom_energy/energy/force/virial/atom_virial) but compute()'s output
extraction read the dense INTERNAL keys (energy_redu/energy_derv_r/...), so
output_map["energy_redu"].view() threw on an undefined tensor. The graph branch
had only ever been compiled, never run.

Fix: remap_graph_outputs_to_dense_keys() in commonPT.h, called after
extract_outputs in both compute overloads (gated on lower_input_is_graph_).
Rewrites the public keys into the dense internal-key layout; per-atom
force/atom_virial are local (nloc) and zero-padded up to nall so the existing
fold-back is a no-op on ghost rows (local rows already carry the folded ghost
contributions). Dense/edge paths untouched.

gen_dpa1.py now persists the dense nlist-ref .pt2 (deeppot_dpa1_graph_nlist_ref.pt2)
as a live graph-vs-dense oracle for the dynamic-edge cases.
---
 source/api_cc/include/commonPT.h              |  64 ++++
 source/api_cc/src/DeepPotPTExpt.cc            |  17 ++
 .../tests/test_deeppot_dpa1_graph_ptexpt.cc   | 282 ++++++++++++++++++
 source/tests/infer/gen_dpa1.py                |  40 +--
 4 files changed, 384 insertions(+), 19 deletions(-)
 create mode 100644 source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc

diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 30cd8e5e9f..f6f82d89a0 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -6,6 +6,8 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <map>
+#include <string>
 #include <type_traits>
 #include <vector>
 
@@ -448,6 +450,68 @@ inline GraphTensorPack buildGraphTensors(
   return pack;
 }
 
+/**
+ * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense
+ *        internal-key layout the rest of ``compute`` consumes.
+ *
+ * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and emits
+ * flat-N PUBLIC keys:
+ *   - ``atom_energy`` (N, 1)      per-atom energy        (N == nloc)
+ *   - ``energy``      (nf, 1)     reduced total energy
+ *   - ``force``       (N, 3)      per-atom force (ghosts already folded onto
+ *                                 their local owners via ``edge_index``)
+ *   - ``virial``      (nf, 9)     reduced total virial
+ *   - ``atom_virial`` (N, 9)      per-atom (full-to-src) virial
+ *
+ * The downstream extraction in ``DeepPotPTExpt::compute`` was written for the
+ * dense forward's internal keys with their extra dims:
+ *   ``energy_redu`` (nf,1), ``energy_derv_c_redu`` (nf,1,9),
+ *   ``energy_derv_r`` (nf,nall,1,3), ``energy`` (nf,nloc,1),
+ *   ``energy_derv_c`` (nf,nall,1,9).
+ *
+ * This helper rewrites the public keys into those internal keys (single frame,
+ * nf == 1).  The per-atom force / atom-virial are LOCAL (nloc rows); they are
+ * zero-padded up to the extended length ``nall`` so the existing fold-back
+ * (``fold_back`` / ``select_map``) is a no-op on the ghost rows — the local
+ * rows already carry the folded ghost contributions, so zero ghosts avoid
+ * double counting (and keep LAMMPS reverse-comm correct).
+ *
+ * @param[in,out] output_map Output tensor map (public keys in, internal keys
+ *   added; if ``atomic``, ``energy`` is overwritten by the (nf,nloc,1) form).
+ * @param[in] nloc Number of local atoms (== N, the graph node count).
+ * @param[in] nall Extended atom count to pad the per-atom outputs up to.
+ * @param[in] atomic Whether atomic energy / virial were requested.
+ */
+inline void remap_graph_outputs_to_dense_keys(
+    std::map<std::string, torch::Tensor>& output_map,
+    const std::int64_t nloc,
+    const std::int64_t nall,
+    const bool atomic) {
+  using torch::indexing::Slice;
+  const std::int64_t nf = 1;
+  const auto& energy_pub = output_map.at("energy");  // (nf, 1)
+  const auto& force_pub = output_map.at("force");    // (N, 3), N == nloc
+  const auto& virial_pub = output_map.at("virial");  // (nf, 9)
+
+  output_map["energy_redu"] = energy_pub.reshape({nf, 1});
+  output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9});
+
+  // Local force -> (nf, nall, 1, 3) with zero ghost rows.
+  auto force_full = torch::zeros({nf, nall, 1, 3}, force_pub.options());
+  force_full.index_put_({0, Slice(0, nloc), 0}, force_pub);
+  output_map["energy_derv_r"] = force_full;
+
+  if (atomic) {
+    const auto& atom_energy_pub = output_map.at("atom_energy");  // (N, 1)
+    const auto& atom_virial_pub = output_map.at("atom_virial");  // (N, 9)
+    output_map["energy"] = atom_energy_pub.reshape({nf, nloc, 1});
+    auto atom_virial_full =
+        torch::zeros({nf, nall, 1, 9}, atom_virial_pub.options());
+    atom_virial_full.index_put_({0, Slice(0, nloc), 0}, atom_virial_pub);
+    output_map["energy_derv_c"] = atom_virial_full;
+  }
+}
+
 }  // namespace deepmd
 
 #endif  // BUILD_PYTORCH
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 3b0e976da3..c591ae23e6 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -834,6 +834,15 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   std::map<std::string, torch::Tensor> output_map;
   extract_outputs(output_map, flat_outputs);
 
+  if (lower_input_is_graph_) {
+    // The graph forward emits LOCAL public keys (atom_energy/energy/force/
+    // virial/atom_virial); rewrite them into the dense internal-key layout the
+    // downstream extraction/fold-back expects.  nloc == N (graph node count);
+    // pad the per-atom force/virial up to nall_real with zero ghost rows.
+    deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real,
+                                              atomic);
+  }
+
   if (phantom_n > 0) {
     // Strip the phantom local prefix and zero the empty rank's energy.  The
     // phantom atoms carry no edges, so their force / per-atom virial are
@@ -1178,6 +1187,14 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   std::map<std::string, torch::Tensor> output_map;
   extract_outputs(output_map, flat_outputs);
 
+  if (lower_input_is_graph_) {
+    // The graph forward emits LOCAL public keys; rewrite them into the dense
+    // internal-key layout used below.  nloc == N (graph node count); pad the
+    // per-atom force/virial up to the extended nall with zero ghost rows so the
+    // fold-back is a no-op on ghosts.
+    deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic);
+  }
+
   // 7. Extract energy
   torch::Tensor flat_energy_ =
       output_map["energy_redu"].view({-1}).to(torch::kCPU);
diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
new file mode 100644
index 0000000000..bbe751026c
--- /dev/null
+++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Test C++ inference for the NeighborGraph (graph-schema) .pt2 path of the
+// pt_expt backend.  The graph model is a dpa1(attn_layer=0) descriptor exported
+// with lower_kind="graph" (gen_dpa1.py section B); this is the FIRST runtime
+// exercise of the C++ graph ingestion added in PR-B Phase B2
+// (lower_input_is_graph_ / run_model_graph / buildGraphTensors / the
+// compute_inner graph branch).
+//
+// Reference values (deeppot_dpa1_graph.expected) come from an INDEPENDENT
+// nlist (dense-quartet) evaluation of the same weights, so a match validates
+// the graph AOTI ABI/geometry, not just self-consistency.  A second, persisted
+// nlist .pt2 of the same weights (deeppot_dpa1_graph_nlist_ref.pt2) is loaded
+// alongside the graph model so arbitrary system sizes (dynamic edge axis) can
+// be cross-checked graph≈dense live without baking more reference blocks.
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "DeepPot.h"
+#include "DeepPotPTExpt.h"
+#include "expected_ref.h"
+#include "neighbor_list.h"
+#include "test_utils.h"
+
+namespace {
+constexpr const char* kGraphModel = "../../tests/infer/deeppot_dpa1_graph.pt2";
+constexpr const char* kNlistRefModel =
+    "../../tests/infer/deeppot_dpa1_graph_nlist_ref.pt2";
+constexpr const char* kRefPath = "../../tests/infer/deeppot_dpa1_graph.expected";
+}  // namespace
+
+template <class VALUETYPE>
+class TestInferDpa1GraphPtExpt : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+  // Per-atom reference (energy/force/virial) loaded from the .expected sidecar.
+  std::vector<VALUETYPE> expected_e;
+  std::vector<VALUETYPE> expected_f;
+  std::vector<VALUETYPE> expected_v;
+  int natoms;
+  double expected_tot_e;
+  std::vector<VALUETYPE> expected_tot_v;
+
+  // Graph-schema model under test.
+  static deepmd::DeepPot dp;
+  // Independent nlist (dense) model with identical weights — used as a live
+  // graph≈dense oracle for arbitrary system sizes.
+  static deepmd::DeepPot dp_ref;
+
+  static void SetUpTestSuite() {
+#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT
+    dp.init(kGraphModel);
+    dp_ref.init(kNlistRefModel);
+#endif
+  }
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT
+    GTEST_SKIP() << "Skip because PyTorch support is not enabled.";
+#endif
+    deepmd_test::ExpectedRef ref;
+    ref.load(kRefPath);
+    expected_e = ref.get<VALUETYPE>("pbc", "expected_e");
+    expected_f = ref.get<VALUETYPE>("pbc", "expected_f");
+    expected_v = ref.get<VALUETYPE>("pbc", "expected_v");
+
+    natoms = expected_e.size();
+    EXPECT_EQ(natoms * 3, static_cast<int>(expected_f.size()));
+    EXPECT_EQ(natoms * 9, static_cast<int>(expected_v.size()));
+    expected_tot_e = 0.;
+    expected_tot_v.assign(9, 0.);
+    for (int ii = 0; ii < natoms; ++ii) {
+      expected_tot_e += expected_e[ii];
+    }
+    for (int ii = 0; ii < natoms; ++ii) {
+      for (int dd = 0; dd < 9; ++dd) {
+        expected_tot_v[dd] += expected_v[ii * 9 + dd];
+      }
+    }
+  };
+
+  void TearDown() override {};
+
+  static void TearDownTestSuite() {
+    dp = deepmd::DeepPot();
+    dp_ref = deepmd::DeepPot();
+  }
+};
+
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1GraphPtExpt<VALUETYPE>::dp;
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1GraphPtExpt<VALUETYPE>::dp_ref;
+
+TYPED_TEST_SUITE(TestInferDpa1GraphPtExpt, ValueTypes);
+
+// Case 1: DeepPot builds its own neighbor list and runs the standalone graph
+// branch (lower_input_is_graph_, build_nlist -> buildGraphTensors).  Validates
+// the graph AOTI ABI/geometry against the independent nlist reference.
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial;
+  dp.compute(ener, force, virial, coord, atype, box);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+// Case 2: a SECOND, larger system (12 atoms, different edge count) through the
+// SAME loaded graph model — proves the dynamic edge axis works in C++.  The
+// graph result is cross-checked against the dense nlist .pt2 (same weights);
+// at non-binding sel they must agree to within round-off (fp64 ~1e-10).
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_sys2_dynamic_edges) {
+  using VALUETYPE = TypeParam;
+  deepmd::DeepPot& dp = this->dp;
+  deepmd::DeepPot& dp_ref = this->dp_ref;
+
+  // 12 atoms: original 6 stacked with a +13 z-shifted copy, box doubled in z.
+  // Same local density as the 6-atom fixture, so per-atom neighbor counts stay
+  // far below sel=30 and graph(carry-all) == dense(sel-truncated).
+  std::vector<VALUETYPE> coord2 = {
+      12.83, 2.56, 2.18,  12.09, 2.87, 2.74,  00.25, 3.32, 1.68,
+      3.36,  3.00, 1.81,  3.51,  2.51, 2.60,  4.27,  3.22, 1.56,
+      12.83, 2.56, 15.18, 12.09, 2.87, 15.74, 00.25, 3.32, 14.68,
+      3.36,  3.00, 14.81, 3.51,  2.51, 15.60, 4.27,  3.22, 14.56};
+  std::vector<int> atype2 = {0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box2 = {13., 0., 0., 0., 13., 0., 0., 0., 26.};
+  int natoms2 = atype2.size();
+
+  double ener_g, ener_r;
+  std::vector<VALUETYPE> force_g, virial_g, force_r, virial_r;
+  dp.compute(ener_g, force_g, virial_g, coord2, atype2, box2);
+  dp_ref.compute(ener_r, force_r, virial_r, coord2, atype2, box2);
+
+  EXPECT_EQ(force_g.size(), static_cast<size_t>(natoms2 * 3));
+  EXPECT_EQ(virial_g.size(), 9u);
+
+  EXPECT_LT(fabs(ener_g - ener_r), EPSILON);
+  for (int ii = 0; ii < natoms2 * 3; ++ii) {
+    EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial_g[ii] - virial_r[ii]), EPSILON);
+  }
+}
+
+// Case 3 (CRITICAL): exercise the LAMMPS compute_inner graph branch with an
+// explicit InputNlist and the `ago` cache.  Calling compute twice WITHOUT
+// rebuilding the nlist — first ago=0 (rebuild), then ago=1 (reuse) — must give
+// identical results.  This is the only case that hits compute_inner + the
+// member-cached mapping_ vector; the build-nlist cases above never touch it.
+// Regression guard for the OOB-on-ago>0 bug fixed by caching mapping_ as a
+// member ("cache mapping vector as member to fix OOB on ago>0 graph path").
+TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  // The graph branch folds ghost neighbours onto their local owners via the
+  // LAMMPS atom-map; without it periodic (ghost) edges would be dropped.
+  inlist.mapping = mapping.data();
+
+  // ago=0: rebuild the cached nlist/mapping, then run the graph branch.
+  double ener;
+  std::vector<VALUETYPE> force_, virial;
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 0);
+  std::vector<VALUETYPE> force;
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+
+  // ago=1: reuse the cached nlist/mapping (NO rebuild).  Must match again.
+  // This is the path that previously read the local mapping vector OOB.
+  ener = 0.;
+  std::fill(force_.begin(), force_.end(), 0.0);
+  std::fill(virial.begin(), virial.end(), 0.0);
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 1);
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+// Case 4: a tiny system with no in-cutoff neighbors — only the two masked
+// dummy edges survive (nedge_min=2 guard / SIGFPE-edge family).  The graph
+// must run cleanly, produce finite, interaction-free output (zero force/virial)
+// and agree with the dense reference.
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_tiny_no_edges) {
+  using VALUETYPE = TypeParam;
+  deepmd::DeepPot& dp = this->dp;
+  deepmd::DeepPot& dp_ref = this->dp_ref;
+
+  // Two atoms ~33 apart in a 40-box: no neighbor within rcut=6 and no periodic
+  // image either, so the graph sees zero real edges (only the 2 dummy edges).
+  std::vector<VALUETYPE> coord_t = {1.0, 1.0, 1.0, 20.0, 20.0, 20.0};
+  std::vector<int> atype_t = {0, 1};
+  std::vector<VALUETYPE> box_t = {40., 0., 0., 0., 40., 0., 0., 0., 40.};
+  int natoms_t = atype_t.size();
+
+  double ener_g, ener_r;
+  std::vector<VALUETYPE> force_g, virial_g, force_r, virial_r;
+  ASSERT_NO_THROW(
+      dp.compute(ener_g, force_g, virial_g, coord_t, atype_t, box_t));
+  dp_ref.compute(ener_r, force_r, virial_r, coord_t, atype_t, box_t);
+
+  EXPECT_EQ(force_g.size(), static_cast<size_t>(natoms_t * 3));
+  EXPECT_EQ(virial_g.size(), 9u);
+
+  EXPECT_TRUE(std::isfinite(ener_g));
+  // No interactions: force and virial must vanish.
+  for (int ii = 0; ii < natoms_t * 3; ++ii) {
+    EXPECT_TRUE(std::isfinite(force_g[ii]));
+    EXPECT_LT(fabs(force_g[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_TRUE(std::isfinite(virial_g[ii]));
+    EXPECT_LT(fabs(virial_g[ii]), EPSILON);
+  }
+  // graph == dense for the isolated-atom limit.
+  EXPECT_LT(fabs(ener_g - ener_r), EPSILON);
+  for (int ii = 0; ii < natoms_t * 3; ++ii) {
+    EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON);
+  }
+}
diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py
index 5ec523f5f5..b45b476520 100644
--- a/source/tests/infer/gen_dpa1.py
+++ b/source/tests/infer/gen_dpa1.py
@@ -19,7 +19,6 @@
 import copy
 import os
 import sys
-import tempfile
 
 import numpy as np
 
@@ -235,25 +234,28 @@ def main():
     # (dense-quartet forward), NOT the graph .pt2.  This ensures the C++ gtest
     # (B2.5) independently validates the graph AOTI path against a known-good
     # nlist evaluation.
-    print("Exporting reference nlist .pt2 (independent ground truth) ...")  # noqa: T201
-    with tempfile.TemporaryDirectory() as _tmp:
-        nlist_ref_pt2 = os.path.join(_tmp, "dpa1_graph_nlist_ref.pt2")
-        pt_expt_deserialize_to_file(
-            nlist_ref_pt2,
-            copy.deepcopy(data_g),
-            do_atomic_virial=True,
-            lower_kind="nlist",  # independent: dense nlist, NOT graph
-        )
-        dp_nlist_ref = DeepPot(nlist_ref_pt2)
+    #
+    # The nlist .pt2 is also PERSISTED (deeppot_dpa1_graph_nlist_ref.pt2): the
+    # C++ gtest loads it alongside the graph .pt2 to cross-check graph≈dense at
+    # 1e-9 on arbitrary system sizes (dynamic-edge-axis cases) without baking a
+    # second reference block into the .expected sidecar.  Same weights as the
+    # graph model, so at non-binding sel the two paths must agree.
+    nlist_ref_pt2 = os.path.join(base_dir, "deeppot_dpa1_graph_nlist_ref.pt2")
+    print(f"Exporting reference nlist .pt2 to {nlist_ref_pt2} ...")  # noqa: T201
+    pt_expt_deserialize_to_file(
+        nlist_ref_pt2,
+        copy.deepcopy(data_g),
+        do_atomic_virial=True,
+        lower_kind="nlist",  # independent: dense nlist, NOT graph
+    )
+    dp_nlist_ref = DeepPot(nlist_ref_pt2)
 
-        # PBC reference from nlist path
-        e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval(
-            coord, box, atype, atomic=True
-        )
-        # NoPBC reference from nlist path
-        e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval(
-            coord, None, atype, atomic=True
-        )
+    # PBC reference from nlist path
+    e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval(coord, box, atype, atomic=True)
+    # NoPBC reference from nlist path
+    e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval(
+        coord, None, atype, atomic=True
+    )
 
     print(f"Nlist ref PBC energy: {e_r1[0, 0]:.18e}")  # noqa: T201
     print(f"Nlist ref NoPBC energy: {e_rnp[0, 0]:.18e}")  # noqa: T201

From af92be11b4bbf42da7c668f0c3600968d4c0887c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 01:20:49 +0800
Subject: [PATCH 16/33] =?UTF-8?q?docs(infer):=20B2=20final-review=20?=
 =?UTF-8?q?=E2=80=94=20correct=20gen=5Fdpa1=20graph-reference=20docstring?=
 =?UTF-8?q?=20(nlist=20.pt2,=201e-5)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 source/tests/infer/gen_dpa1.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py
index b45b476520..9c743c9f88 100644
--- a/source/tests/infer/gen_dpa1.py
+++ b/source/tests/infer/gen_dpa1.py
@@ -11,9 +11,11 @@
 Both are serialized and exported to their respective formats from the same weights.
 Reference sidecar files (.expected) consumed by C++ gtests are also written:
   - deeppot_dpa1.expected   — from the nlist .pt2 eval (existing)
-  - deeppot_dpa1_graph.expected — from a direct dpmodel eval (independent ground
-      truth, NOT from the graph .pt2); the graph .pt2 is sanity-checked against
-      this reference at ≤1e-6.
+  - deeppot_dpa1_graph.expected — from an independent NLIST .pt2 eval (NOT the
+      graph .pt2; dpmodel se_atten has no analytical force, so the dense nlist
+      path is the independent ground truth). At non-binding sel the graph and
+      nlist paths see the same neighbor set, so the graph .pt2 is sanity-checked
+      against this reference at ≤1e-5.
 """
 
 import copy

From b25fdfc2f258e5e1d88c8f07af88d3ddfe5c9ce6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 29 Jun 2026 17:22:43 +0000
Subject: [PATCH 17/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/api_cc/include/DeepPotPTExpt.h                 | 8 ++++----
 source/api_cc/include/commonPT.h                      | 7 +++----
 source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc | 3 ++-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index d8bd22cbad..ddaea35646 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -310,10 +310,10 @@ class DeepPotPTExpt : public DeepPotBackend {
   bool lower_input_is_edge_ = false;
   bool lower_input_is_graph_ = false;
   NeighborListData nlist_data;
-  at::Tensor mapping_tensor;              // cached mapping tensor (LAMMPS path)
-  std::vector<std::int64_t> mapping_;    // cached mapping vector (LAMMPS path)
-  at::Tensor firstneigh_tensor;          // cached nlist tensor (LAMMPS path)
-  at::Tensor edge_index_tensor;      // cached local edge graph (LAMMPS path)
+  at::Tensor mapping_tensor;           // cached mapping tensor (LAMMPS path)
+  std::vector<std::int64_t> mapping_;  // cached mapping vector (LAMMPS path)
+  at::Tensor firstneigh_tensor;        // cached nlist tensor (LAMMPS path)
+  at::Tensor edge_index_tensor;        // cached local edge graph (LAMMPS path)
   at::Tensor edge_index_ext_tensor;  // cached extended edge graph (LAMMPS path)
   std::unique_ptr<torch::inductor::AOTIModelPackageLoader> loader;
   // Optional second AOTInductor artifact for the multi-rank GNN code
diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index f6f82d89a0..c56f366bbd 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -426,8 +426,7 @@ inline GraphTensorPack buildGraphTensors(
   //    inputs, so build the coord tensor as float64 to match the edge path.
   std::vector<double> coord_d(coord.begin(), coord.end());
   at::Tensor coord_tensor =
-      torch::from_blob(coord_d.data(),
-                       {static_cast<std::int64_t>(nall), 3},
+      torch::from_blob(coord_d.data(), {static_cast<std::int64_t>(nall), 3},
                        torch::TensorOptions().dtype(torch::kFloat64))
           .clone()
           .to(device);
@@ -454,8 +453,8 @@ inline GraphTensorPack buildGraphTensors(
  * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense
  *        internal-key layout the rest of ``compute`` consumes.
  *
- * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and emits
- * flat-N PUBLIC keys:
+ * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and
+ * emits flat-N PUBLIC keys:
  *   - ``atom_energy`` (N, 1)      per-atom energy        (N == nloc)
  *   - ``energy``      (nf, 1)     reduced total energy
  *   - ``force``       (N, 3)      per-atom force (ghosts already folded onto
diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
index bbe751026c..95dcf72e74 100644
--- a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
+++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
@@ -28,7 +28,8 @@ namespace {
 constexpr const char* kGraphModel = "../../tests/infer/deeppot_dpa1_graph.pt2";
 constexpr const char* kNlistRefModel =
     "../../tests/infer/deeppot_dpa1_graph_nlist_ref.pt2";
-constexpr const char* kRefPath = "../../tests/infer/deeppot_dpa1_graph.expected";
+constexpr const char* kRefPath =
+    "../../tests/infer/deeppot_dpa1_graph.expected";
 }  // namespace
 
 template <class VALUETYPE>

From 10d82a68baece6168dac87f0e25861762b389144 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 08:11:07 +0800
Subject: [PATCH 18/33] fix(api_cc): guard graph remap single-rank-only +
 atomic-overload gtest (B2.5 follow-ups)

---
 source/api_cc/include/commonPT.h              | 16 ++++++-
 source/api_cc/src/DeepPotPTExpt.cc            |  9 +++-
 .../tests/test_deeppot_dpa1_graph_ptexpt.cc   | 44 +++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index c56f366bbd..865f9d3837 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -475,17 +475,31 @@ inline GraphTensorPack buildGraphTensors(
  * rows already carry the folded ghost contributions, so zero ghosts avoid
  * double counting (and keep LAMMPS reverse-comm correct).
  *
+ * **Single-rank only.**  Multi-rank inference (B3.2) must NOT call this
+ * function: ghost/halo forces are real cross-rank contributions that must be
+ * returned as-is and folded back via reverse-comm rather than being zeroed.
+ * Calling this function on a multi-rank result would silently zero those forces
+ * and produce wrong energetics.  ``single_rank`` is a guard flag: a caller that
+ * passes ``false`` gets an explicit exception instead of silent corruption.
+ *
  * @param[in,out] output_map Output tensor map (public keys in, internal keys
  *   added).
  * @param[in] nloc Number of local atoms (== N, the graph node count).
  * @param[in] nall Extended atom count to pad the per-atom outputs up to.
  * @param[in] atomic Whether atomic energy / virial were requested.
+ * @param[in] single_rank Must be true; throws deepmd_exception if false.
  */
 inline void remap_graph_outputs_to_dense_keys(
     std::map<std::string, torch::Tensor>& output_map,
     const std::int64_t nloc,
     const std::int64_t nall,
-    const bool atomic) {
+    const bool atomic,
+    const bool single_rank = true) {
+  if (!single_rank) {
+    throw deepmd::deepmd_exception(
+        "remap_graph_outputs_to_dense_keys is single-rank-only; multi-rank "
+        "uses the extended-region reverse-comm fold (PR-B3.2)");
+  }
   using torch::indexing::Slice;
   const std::int64_t nf = 1;
   const auto& energy_pub = output_map.at("energy");  // (nf, 1)
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index c591ae23e6..30cffe8c99 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -839,8 +839,10 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
     // virial/atom_virial); rewrite them into the dense internal-key layout the
     // downstream extraction/fold-back expects.  nloc == N (graph node count);
     // pad the per-atom force/virial up to nall_real with zero ghost rows.
+    // single_rank=true: the multi-rank fail-fast at line ~508 guarantees we
+    // never reach here on a multi-rank graph call.
     deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real,
-                                              atomic);
+                                              atomic, /*single_rank=*/true);
   }
 
   if (phantom_n > 0) {
@@ -1192,7 +1194,10 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
     // internal-key layout used below.  nloc == N (graph node count); pad the
     // per-atom force/virial up to the extended nall with zero ghost rows so the
     // fold-back is a no-op on ghosts.
-    deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic);
+    // single_rank=true: the standalone (build_nlist) path is always
+    // single-rank; there is no comm_dict / cross-rank ghost exchange here.
+    deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic,
+                                              /*single_rank=*/true);
   }
 
   // 7. Extract energy
diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
index 95dcf72e74..8b347c5e57 100644
--- a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
+++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
@@ -240,6 +240,50 @@ TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) {
   }
 }
 
+// Case 5: exercise the DeepPot::compute ATOMIC overload on the graph .pt2.
+// This is the first test to reach the ``if (atomic)`` branch inside
+// remap_graph_outputs_to_dense_keys (the atom_energy/atom_virial remapping).
+// The per-atom reference values are already loaded by SetUp() from
+// deeppot_dpa1_graph.expected into this->expected_e and this->expected_v.
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_e = this->expected_e;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  std::vector<VALUETYPE>& expected_v = this->expected_v;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+
+  double ener;
+  std::vector<VALUETYPE> force, virial, atom_energy, atom_virial;
+  // Standalone atomic overload: DeepPot builds its own nlist (graph branch) and
+  // returns per-atom energy + atom-virial alongside total energy/force/virial.
+  dp.compute(ener, force, virial, atom_energy, atom_virial, coord, atype, box);
+  // Fatal size checks: the indexed comparisons below rely on these lengths.
+  ASSERT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  ASSERT_EQ(virial.size(), 9u);
+  ASSERT_EQ(atom_energy.size(), static_cast<size_t>(natoms));
+  ASSERT_EQ(atom_virial.size(), static_cast<size_t>(natoms * 9));
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms; ++ii) {
+    EXPECT_LT(fabs(atom_energy[ii] - expected_e[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms * 9; ++ii) {
+    EXPECT_LT(fabs(atom_virial[ii] - expected_v[ii]), EPSILON);
+  }
+}
+
 // Case 4: a tiny system with no in-cutoff neighbors — only the two masked
 // dummy edges survive (nedge_min=2 guard / SIGFPE-edge family).  The graph
 // must run cleanly, produce finite, interaction-free output (zero force/virial)

From 7d37319c7095064dfb69557cef89b34d2f519c49 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 08:24:24 +0800
Subject: [PATCH 19/33] feat(api_cc): non-MP multi-rank graph path (extended
 region + reverse-comm; no with-comm)

---
 source/api_cc/include/commonPT.h   | 95 +++++++++++++++++++++++++-----
 source/api_cc/src/DeepPotPTExpt.cc | 56 ++++++++++++------
 2 files changed, 119 insertions(+), 32 deletions(-)

diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 865f9d3837..919df504f0 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -399,6 +399,14 @@ struct GraphTensorPack {
  * @param device Target device for the returned tensors.
  * @param row_centers Optional center atom index for each neighbor-list row
  *   (LAMMPS compacts away empty rows); ``nullptr`` means row i is center i.
+ * @param fold_to_local Whether ghost neighbours are folded onto their local
+ *   owners (single-rank, ``N == nloc``, ``n_node = [nloc]``, node types from
+ *   ``atype_ext[0:nloc]``) or kept as distinct extended nodes (multi-rank,
+ *   ``N == nall``, ``n_node = [nall]``, node types from the full ``atype_ext``
+ *   including the real halo types — the #5583 invariant).  In the multi-rank
+ *   case ``edge_index`` indexes the extended atoms directly, so ghost reaction
+ *   forces land on the ghost rows and are folded to their owners by LAMMPS
+ *   reverse-comm (no with-comm artifact / no border_op — dpa1 is non-MP).
  */
 template <typename VALUETYPE>
 inline GraphTensorPack buildGraphTensors(
@@ -410,16 +418,18 @@ inline GraphTensorPack buildGraphTensors(
     const int nall,
     const double rcut,
     const torch::Device& device,
-    const std::vector<int>* row_centers = nullptr) {
+    const std::vector<int>* row_centers = nullptr,
+    const bool fold_to_local = true) {
   auto int_options = torch::TensorOptions().dtype(torch::kInt64);
 
-  // 1. Cached-style topology only (no geometry): edge_index folds ghost
-  //    neighbours onto their local owners (fold_to_local=true), edge_index_ext
-  //    keeps extended indices for the on-device geometry recompute.
+  // 1. Cached-style topology only (no geometry): when fold_to_local=true,
+  //    edge_index folds ghost neighbours onto their local owners (single-rank);
+  //    when false, edge_index indexes the extended atoms directly (multi-rank).
+  //    edge_index_ext always keeps extended indices for the on-device geometry
+  //    recompute.
   const EdgeTensorPack topo =
       createEdgeTensors(nlist, coord, mapping, nloc, nall, device,
-                        /*with_geometry=*/false, row_centers,
-                        /*fold_to_local=*/true);
+                        /*with_geometry=*/false, row_centers, fold_to_local);
 
   // 2. Recompute geometry from the current coords on-device, filter by rcut and
   //    append the two masked dummy edges.  The model is compiled for float64
@@ -434,16 +444,19 @@ inline GraphTensorPack buildGraphTensors(
       topo.edge_index, topo.edge_index_ext, coord_tensor, rcut);
 
   GraphTensorPack pack;
-  pack.edge_index = edges.edge_index;  // local-folded (2, E)
+  pack.edge_index = edges.edge_index;  // (2, E): local-folded or extended
   pack.edge_vec = edges.edge_vec;      // (E, 3) neighbour - center
   pack.edge_mask = edges.edge_mask;    // (E,) bool
-  pack.n_node =
-      torch::full({1}, static_cast<std::int64_t>(nloc), int_options).to(device);
-  // Node types from the local slice of the extended types.
-  std::vector<std::int64_t> atype_loc(atype_ext.begin(),
-                                      atype_ext.begin() + nloc);
-  pack.atype = torch::from_blob(atype_loc.data(),
-                                {static_cast<std::int64_t>(nloc)}, int_options)
+  // Single-rank: N == nloc (ghosts folded onto owners).  Multi-rank: N == nall
+  // (ghosts are distinct nodes whose features come from their real halo types).
+  const std::int64_t n_node_count = fold_to_local ? nloc : nall;
+  pack.n_node = torch::full({1}, n_node_count, int_options).to(device);
+  // Node types from the extended types (NOT atype[mapping]): the local slice
+  // for single-rank, the full extended set (incl. real halo types) for
+  // multi-rank.
+  std::vector<std::int64_t> atype_nodes(atype_ext.begin(),
+                                        atype_ext.begin() + n_node_count);
+  pack.atype = torch::from_blob(atype_nodes.data(), {n_node_count}, int_options)
                    .clone()
                    .to(device);
   return pack;
@@ -525,6 +538,60 @@ inline void remap_graph_outputs_to_dense_keys(
   }
 }
 
+/**
+ * @brief Translate the graph artifact's public outputs into the dense
+ *        internal keys for the MULTI-RANK extended-region (non-MP) path.
+ *
+ * With ``fold_to_local=false`` the graph carries ``N == nall`` nodes: every
+ * ghost (halo) atom is its own node, so the per-node ``force`` already is the
+ * EXTENDED force, one row per extended atom.  Reaction forces on ghosts stay
+ * on the ghost rows; LAMMPS reverse-comm folds them onto the owning rank, the
+ * same way the dense path hands back its extended force.  Nothing is
+ * zero-padded (unlike the single-rank helper) and no with-comm artifact is
+ * involved (dpa1 is non-MP).
+ *
+ * How this differs from the single-rank helper:
+ *   - ``energy_redu`` sums ONLY the LOCAL atom energies
+ *     (``atom_energy[0:nloc]``).  The public ``energy`` key reduces over all
+ *     ``N == nall`` nodes and would therefore double-count the bias energy of
+ *     ghost nodes owned by other ranks (ghosts have no center edges, so they
+ *     carry a bias-only energy and zero force/virial gradient — harmless for
+ *     force/virial, wrong for the owned energy).
+ *   - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no padding).
+ *
+ * @param[in,out] output_map Output tensor map; public keys are read and the
+ *   internal keys are inserted.
+ * @param[in] nloc Number of local atoms (owned by this rank).
+ * @param[in] nall Extended atom count (== N, the graph node count).
+ * @param[in] atomic Whether atomic energy / virial were requested.
+ */
+inline void remap_graph_outputs_to_dense_keys_extended(
+    std::map<std::string, torch::Tensor>& output_map,
+    const std::int64_t nloc,
+    const std::int64_t nall,
+    const bool atomic) {
+  using torch::indexing::Slice;
+  const std::int64_t nframes = 1;
+  // Public inputs; N == nall on this path (std::map refs survive inserts).
+  const torch::Tensor& node_energy = output_map.at("atom_energy");  // (N, 1)
+  const torch::Tensor& node_force = output_map.at("force");         // (N, 3)
+  const torch::Tensor& total_virial = output_map.at("virial");      // (nf, 9)
+
+  // Rows [0, nloc) are the atoms this rank owns; ghost nodes carry bias-only
+  // energy that belongs to other ranks, so they are left out of the sum.
+  const torch::Tensor owned_energy = node_energy.index({Slice(0, nloc)});
+  output_map["energy_redu"] = owned_energy.sum().reshape({nframes, 1});
+  output_map["energy_derv_c_redu"] = total_virial.reshape({nframes, 1, 9});
+  // Ghost rows are kept as-is: LAMMPS reverse-comm folds them back.
+  output_map["energy_derv_r"] = node_force.reshape({nframes, nall, 1, 3});
+
+  if (atomic) {
+    const torch::Tensor& node_virial = output_map.at("atom_virial");  // (N, 9)
+    output_map["energy"] = owned_energy.reshape({nframes, nloc, 1});
+    output_map["energy_derv_c"] = node_virial.reshape({nframes, nall, 1, 9});
+  }
+}
+
 }  // namespace deepmd
 
 #endif  // BUILD_PYTORCH
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 30cffe8c99..49e7ed0662 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -502,14 +502,23 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   bool multi_rank = (lmp_list.nprocs > 1);
   bool atom_map_present = (lmp_list.mapping != nullptr);
   bool use_with_comm = has_comm_artifact_ && multi_rank;
-  // The NeighborGraph schema only has a single-rank artifact so far; the
-  // multi-rank (with-comm) graph path is PR-B3.  Fail fast before building
-  // any tensors so callers get a clear message instead of a wrong answer.
-  if (lower_input_is_graph_ && multi_rank) {
+  // NeighborGraph multi-rank dispatch:
+  //   - NON-message-passing (dpa1, se_e2_a, ...): the SAME single-rank graph
+  //     .pt2 runs on the EXTENDED region (fold_to_local=false; ghosts are
+  //     distinct nodes whose features come from their real halo types).  No
+  //     with-comm artifact / no border_op is needed; ghost reaction forces are
+  //     folded to their owners by LAMMPS reverse-comm.  Handled below.
+  //   - message-passing graph (DPA2/DPA3, PR-G): would need a with-comm graph
+  //     artifact for cross-rank ghost-feature exchange — not yet supported.
+  //     Fail fast before building any tensors so callers get a clear message
+  //     instead of a wrong answer.
+  if (lower_input_is_graph_ && multi_rank && has_message_passing_) {
     throw deepmd::deepmd_exception(
-        "Multi-rank graph (NeighborGraph) .pt2 inference is not yet "
-        "supported (PR-B3). Run single-rank, or use a dense/edge .pt2 for "
-        "multi-rank LAMMPS.");
+        "Multi-rank message-passing graph (NeighborGraph) .pt2 inference is "
+        "not yet supported (PR-G). Non-message-passing graph models (e.g. "
+        "dpa1) run multi-rank on the extended-region single-rank artifact; "
+        "for message-passing models run single-rank, or use a dense/edge "
+        ".pt2 for multi-rank LAMMPS.");
   }
   // Decision matrix (see PR #5450 description):
   //   non-GNN model (has_message_passing_ == false): regular path is
@@ -813,12 +822,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                           edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                           fparam_tensor, aparam_tensor, charge_spin_tensor);
     } else if (lower_input_is_graph_) {
-      // Single-rank NeighborGraph schema: build (atype, n_node, edge_index,
-      // edge_vec, edge_mask) from the host nlist (node types from the extended
-      // types, folded local edge graph) and run the graph artifact.
+      // NeighborGraph schema: build (atype, n_node, edge_index, edge_vec,
+      // edge_mask) from the host nlist and run the (single-rank) graph
+      // artifact.  Single-rank folds ghosts onto local owners (N == nloc);
+      // multi-rank (non-MP only — the fail-fast above blocks MP graph
+      // multi-rank) keeps the extended region (N == nall_real, node types from
+      // the real halo types) so LAMMPS reverse-comm folds ghost forces back.
       const auto graph_tensors = buildGraphTensors(
           nlist_data.jlist, dcoord, datype, mapping_, nloc, nall_real,
-          static_cast<double>(rcut), device, &nlist_data.ilist);
+          static_cast<double>(rcut), device, &nlist_data.ilist,
+          /*fold_to_local=*/!multi_rank);
       flat_outputs = run_model_graph(
           graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,
           graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor,
@@ -835,14 +848,21 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   extract_outputs(output_map, flat_outputs);
 
   if (lower_input_is_graph_) {
-    // The graph forward emits LOCAL public keys (atom_energy/energy/force/
+    // The graph forward emits flat-N PUBLIC keys (atom_energy/energy/force/
     // virial/atom_virial); rewrite them into the dense internal-key layout the
-    // downstream extraction/fold-back expects.  nloc == N (graph node count);
-    // pad the per-atom force/virial up to nall_real with zero ghost rows.
-    // single_rank=true: the multi-rank fail-fast at line ~508 guarantees we
-    // never reach here on a multi-rank graph call.
-    deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real,
-                                              atomic, /*single_rank=*/true);
+    // downstream extraction/fold-back expects.
+    if (multi_rank) {
+      // Extended region (N == nall_real): force is already per-extended-atom,
+      // owned energy = sum over local atom energies, no zero-padding.  Ghost
+      // forces fold back via LAMMPS reverse-comm (no with-comm artifact).
+      deepmd::remap_graph_outputs_to_dense_keys_extended(output_map, nloc,
+                                                         nall_real, atomic);
+    } else {
+      // Single-rank (N == nloc): ghosts folded onto owners; pad the per-atom
+      // force/virial up to nall_real with zero ghost rows.
+      deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real,
+                                                atomic, /*single_rank=*/true);
+    }
   }
 
   if (phantom_n > 0) {

From 92c35a6b14f46a10fae2deaec84b89bd8355cfd2 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 08:29:49 +0800
Subject: [PATCH 20/33] test(lammps): dpa1 graph .pt2 single + multi-rank
 (mpirun -n 2, local)

---
 .../lmp/tests/test_lammps_dpa1_graph_pt2.py   | 306 ++++++++++++++++++
 1 file changed, 306 insertions(+)
 create mode 100644 source/lmp/tests/test_lammps_dpa1_graph_pt2.py

diff --git a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
new file mode 100644
index 0000000000..3e6a11428d
--- /dev/null
+++ b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
@@ -0,0 +1,306 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Test LAMMPS with the NeighborGraph (graph-schema) .pt2 DPA1 model.
+
+The model ``deeppot_dpa1_graph.pt2`` is a dpa1(attn_layer=0) descriptor
+exported with ``lower_kind="graph"`` (gen_dpa1.py section B).  dpa1 is
+NON-message-passing, so the SAME single-rank graph .pt2 also drives the
+multi-rank path: the C++ ``DeepPotPTExpt`` builds an EXTENDED-region graph
+(``fold_to_local=False``; ghosts are distinct nodes whose features come from
+their real halo types) and returns per-extended-atom forces, which LAMMPS
+reverse-comm folds back to their owners.  There is NO with-comm artifact and
+NO ``border_op`` (that is the message-passing PR-G path) — hence no
+``use_loc_mapping=False`` variant.
+
+Reference values come from ``source/tests/infer/gen_dpa1.py`` (the same
+``deeppot_dpa1_graph.expected`` the C++ gtest uses); the multi-rank run must
+match the single-rank reference for energy, per-atom force, and per-atom
+virial.  This is the core multi-rank correctness gate for the non-MP graph
+path implemented in B3.1.
+"""
+
+import importlib.util
+import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+from pathlib import (
+    Path,
+)
+
+import constants
+import numpy as np
+import pytest
+from expected_ref import (
+    read_expected_ref,
+)
+from lammps import (
+    PyLammps,
+)
+from write_lmp_data import (
+    write_lmp_data,
+)
+
+pb_file = (
+    Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa1_graph.pt2"
+)
+ref_file = (
+    Path(__file__).parent.parent.parent
+    / "tests"
+    / "infer"
+    / "deeppot_dpa1_graph.expected"
+)
+# The MPI runner is backend-agnostic (DATAFILE PB_FILE OUTPUT + flags); reuse
+# the DPA3 driver verbatim rather than duplicate it.
+mpi_runner = Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"
+
+data_file = Path(__file__).parent / "data_dpa1_graph_pt2.lmp"
+# Elongated-box variant for the empty-subdomain MPI corner: x extended to
+# 30 A while atoms stay in x in [0.25, 12.83]; with ``processors 2 1 1`` the
+# split at x = 15 leaves rank 1 with zero local atoms.
+data_file_empty_subdomain = (
+    Path(__file__).parent / "data_dpa1_graph_pt2_empty_subdomain.lmp"
+)
+
+# Reference values written by source/tests/infer/gen_dpa1.py (PBC case).
+# Guarded with try/except because gen_dpa1.py only runs when PyTorch is built.
+try:
+    _ref = read_expected_ref(ref_file)["pbc"]
+    expected_e = float(np.sum(_ref["expected_e"]))
+    expected_f = _ref["expected_f"].reshape(6, 3)
+    # LAMMPS uses the opposite sign convention for virial vs DeepPot.
+    expected_v = -_ref["expected_v"].reshape(6, 9)
+except FileNotFoundError:
+    expected_e = expected_f = expected_v = None
+
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [0.25, 3.32, 1.68],
+        [3.36, 3.00, 1.81],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+# Model type_map is ["O", "H"]; gtest atype = [0, 1, 1, 0, 1, 1] -> LAMMPS
+# types [1, 2, 2, 1, 2, 2] under identity ``pair_coeff * *``.
+type_OH = np.array([1, 2, 2, 1, 2, 2])
+
+
+def setup_module() -> None:
+    if os.environ.get("ENABLE_PYTORCH", "1") != "1":
+        pytest.skip(
+            "Skip test because PyTorch support is not enabled.",
+        )
+    write_lmp_data(box, coord, type_OH, data_file)
+    box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0])
+    write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain)
+
+
+def teardown_module() -> None:
+    for f in [data_file, data_file_empty_subdomain]:
+        if f.exists():
+            os.remove(f)
+
+
+def _lammps(data_file, units="metal", atom_map: str = "yes") -> PyLammps:
+    lammps = PyLammps()
+    lammps.units(units)
+    lammps.boundary("p p p")
+    lammps.atom_style("atomic")
+    if atom_map != "no":
+        lammps.atom_modify(f"map {atom_map}")
+    lammps.neighbor("2.0 bin")
+    lammps.neigh_modify("every 10 delay 0 check no")
+    lammps.read_data(data_file.resolve())
+    lammps.mass("1 16")
+    lammps.mass("2 2")
+    lammps.timestep(0.0005)
+    lammps.fix("1 all nve")
+    return lammps
+
+
+@pytest.fixture
+def lammps():
+    lmp = _lammps(data_file=data_file)
+    yield lmp
+    lmp.close()
+
+
+def test_pair_deepmd(lammps) -> None:
+    """Single-rank serial run (``atom_modify map yes``): the graph .pt2
+    folds ghosts onto local owners (``fold_to_local=True``) and must match
+    the gen_dpa1.py reference for energy and per-atom force.
+    """
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    lammps.run(1)
+
+
+def test_pair_deepmd_virial(lammps) -> None:
+    """Single-rank per-atom virial via ``centroid/stress/atom``."""
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps.variable(f"virial{jj} atom c_virial[{ii + 1}]")
+    lammps.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1
+    for ii in range(9):
+        assert np.array(
+            lammps.variables[f"virial{ii}"].value
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
+
+
+# ---------------------------------------------------------------------------
+# Multi-rank test (non-MP extended-region graph path; B3.1).
+#
+# dpa1 is non-message-passing, so multi-rank uses the SAME single-rank graph
+# .pt2 on the extended region.  The expected energy/force/virial are the
+# single-rank reference: each rank evaluates its local atoms over the extended
+# graph; ghost reaction forces fold back via LAMMPS reverse-comm.
+# ---------------------------------------------------------------------------
+
+
+def _run_mpi_subprocess(
+    extra_args: list[str] | None = None,
+    nprocs: int = 2,
+    data_path: Path | None = None,
+    processors: str | None = None,
+    runner_args: list[str] | None = None,
+) -> dict:
+    """Invoke the (backend-agnostic) DPA3 MPI runner under
+    ``mpirun -n <nprocs>`` against the dpa1 graph .pt2 and return
+    ``{"pe": float, "forces": (n, 3), "virials": (n, 9)}``.
+
+    ``nprocs == 1`` forces ``--processors 1 1 1`` so the C++ side sees
+    ``nprocs == 1`` and routes to the single-rank graph path — a
+    same-archive reference for the multi-rank comparison.
+    """
+    if data_path is None:
+        data_path = data_file
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        argv = [
+            "mpirun",
+            "-n",
+            str(nprocs),
+            sys.executable,
+            str(mpi_runner),
+            str(data_path.resolve()),
+            str(pb_file.resolve()),
+            out_path,
+        ]
+        if processors is not None:
+            argv.extend(["--processors", processors])
+        elif nprocs == 1:
+            argv.extend(["--processors", "1 1 1"])
+        if extra_args:
+            argv.extend(extra_args)
+        if runner_args:
+            argv.extend(runner_args)
+        sp.check_call(argv)
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe = float(lines[0])
+        rows = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        forces = rows[:, :3]
+        virials = rows[:, 3:]
+        return {"pe": pe, "forces": forces, "virials": virials}
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa1_graph() -> None:
+    """Multi-rank LAMMPS run for the dpa1 graph .pt2 must match the
+    single-rank reference within numerical tolerance for energy, forces,
+    and per-atom virial.
+
+    This is the core correctness gate for the non-MP extended-region
+    multi-rank C++ path (B3.1): the extended graph + reverse-comm
+    fold-back must reproduce the folded single-rank result.
+    """
+    out = _run_mpi_subprocess()
+    assert out["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8)
+    for ii in range(6):
+        np.testing.assert_allclose(out["forces"][ii], expected_f[ii], atol=1e-8, rtol=0)
+    # ``centroid/stress/atom`` column order [xx, yy, zz, xy, xz, yz, yx, zx,
+    # zy]; the inverse permutation maps it back to the expected_v columns.
+    expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2]
+    np.testing.assert_allclose(
+        out["virials"][:, expected_v_to_lammps] / constants.nktv2p,
+        expected_v,
+        atol=1e-8,
+        rtol=0,
+    )
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa1_graph_matches_single_rank() -> None:
+    """Multi-rank (``-n 2``) ≡ single-rank (``-n 1``) on the SAME archive
+    and trajectory — isolates the extended-region multi-rank C++ path from
+    the .pt2 reference values (a wrong-but-finite divergence would show up
+    here even if the hardcoded reference drifted).
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2)
+    out_ref = _run_mpi_subprocess(nprocs=1)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa1_graph_empty_subdomain() -> None:
+    """Multi-rank with one rank owning zero local atoms (elongated box,
+    ``processors 2 1 1``, split at x = 15).  The extended-region graph path
+    must still produce correct forces/virial on the populated rank and a
+    zero contribution from the empty rank — compared against a same-archive
+    single-rank reference of the same fixture.
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain)
+    out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)

From e2e07f16046f6c44f592e342cba71ed83a53aa4c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 30 Jun 2026 00:38:02 +0000
Subject: [PATCH 21/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/api_cc/include/commonPT.h                  | 15 ++++++++-------
 .../tests/test_deeppot_dpa1_graph_ptexpt.cc       |  8 +++++---
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 919df504f0..02c25aa047 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -551,13 +551,14 @@ inline void remap_graph_outputs_to_dense_keys(
  * is non-MP).
  *
  * Key differences from the single-rank helper:
- *   - ``energy_redu`` = sum of the LOCAL atom energies (``atom_energy[0:nloc]``)
- *     ONLY.  The public ``energy`` key reduces over all ``N == nall`` nodes,
- *     which would double-count the bias energy of ghost nodes that belong to
- *     other ranks (ghost nodes have no center edges, so they carry a bias-only
- *     energy and zero force/virial gradient — harmless for force/virial but
- *     wrong for the owned energy).
- *   - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no padding).
+ *   - ``energy_redu`` = sum of the LOCAL atom energies
+ * (``atom_energy[0:nloc]``) ONLY.  The public ``energy`` key reduces over all
+ * ``N == nall`` nodes, which would double-count the bias energy of ghost nodes
+ * that belong to other ranks (ghost nodes have no center edges, so they carry a
+ * bias-only energy and zero force/virial gradient — harmless for force/virial
+ * but wrong for the owned energy).
+ *   - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no
+ * padding).
  *
  * @param[in,out] output_map Output tensor map (public keys in, internal keys
  *   added).
diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
index 8b347c5e57..c57abaf0a0 100644
--- a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
+++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
@@ -243,8 +243,9 @@ TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) {
 // Case 5: exercise the DeepPot::compute ATOMIC overload on the graph .pt2.
 // This is the first test to reach the ``if (atomic)`` branch inside
 // remap_graph_outputs_to_dense_keys (the atom_energy/atom_virial remapping).
-// The per-atom reference values are already loaded from deeppot_dpa1_graph.expected
-// into this->expected_e and this->expected_v by SetUp().
+// The per-atom reference values are already loaded from
+// deeppot_dpa1_graph.expected into this->expected_e and this->expected_v by
+// SetUp().
 TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) {
   using VALUETYPE = TypeParam;
   std::vector<VALUETYPE>& coord = this->coord;
@@ -261,7 +262,8 @@ TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) {
   double ener;
   std::vector<VALUETYPE> force, virial, atom_energy, atom_virial;
   // Standalone atomic overload: DeepPot builds its own nlist (graph branch),
-  // then returns per-atom energy + atom-virial alongside total energy/force/virial.
+  // then returns per-atom energy + atom-virial alongside total
+  // energy/force/virial.
   dp.compute(ener, force, virial, atom_energy, atom_virial, coord, atype, box);
 
   EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));

From 942de1f4f4dcd58574709cc4d25955069f4b6cce Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 11:50:14 +0800
Subject: [PATCH 22/33] refactor(pt_expt): group node_capacity with graph-shape
 args, make edge_energy_deriv flags keyword-only

Move node_capacity up next to n_node (it is a shape parameter, not a
behavior flag) and put a keyword-only barrier before do_atomic_virial/
create_graph. The signature now reads target -> graph-shape -> behavior,
mirroring the edge_force_virial it wraps. The sole caller already passed the
flags by keyword, so this is a no-behavior-change reorder.
---
 deepmd/pt_expt/model/edge_transform_output.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 9202a64d8f..98a620b524 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -29,9 +29,10 @@ def edge_energy_deriv(
     edge_index: torch.Tensor,
     edge_mask: torch.Tensor,
     n_node: torch.Tensor,
+    node_capacity: int | None = None,
+    *,
     do_atomic_virial: bool = False,
     create_graph: bool = False,
-    node_capacity: int | None = None,
 ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]:
     """Return (force, atom_virial_or_None, virial) from a graph energy.
 
@@ -50,14 +51,14 @@ def edge_energy_deriv(
         (E,) valid-edge mask.
     n_node
         (nf,) per-frame node counts.
-    do_atomic_virial
-        whether to materialize the per-atom virial (else ``None`` is returned).
-    create_graph
-        whether the backward retains a graph (training, for second-order grad).
     node_capacity
         Static node-axis size ``N``.  ``None`` (eager default) falls back to
         ``int(n_node.sum())``.  Pass a static value (e.g. ``atype.shape[0]``)
         to keep this function trace-safe under ``make_fx``/``torch.export``.
+    do_atomic_virial
+        whether to materialize the per-atom virial (else ``None`` is returned).
+    create_graph
+        whether the backward retains a graph (training, for second-order grad).
 
     Returns
     -------
@@ -186,9 +187,9 @@ def fit_output_to_model_output_graph(
                 edge_index,
                 edge_mask,
                 n_node,
+                node_capacity=N,
                 do_atomic_virial=(vdef.c_differentiable and do_atomic_virial),
                 create_graph=create_graph,
-                node_capacity=N,
             )
             # force (N, 3) -> (N, 1, 3)  [flat; caller unravels at I/O boundary]
             ff_list.append(force.reshape(N, 1, 3))

From 7658091dd008394abbf1936d38e34abb443062c0 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 13:17:46 +0800
Subject: [PATCH 23/33] fix(dpmodel): clamp graph edge-scatter indices
 in-bounds (CUDA device-assert on graph .pt2 export)

The graph forward_common_lower_graph .pt2 AOTI export device-asserted on
CUDA ('index out of bounds: 0 <= tmp8 < ks0') in the edge_force_virial
scatter. A padding/guard edge's node index reaches the index_add address
computation BEFORE edge_mask zeroes that edge's contribution. CPU does not
bounds-check the address (so the CPU-only dev + CI runs were all green);
CUDA device-asserts. Clamp src/dst into [0, n_out) and edge_frame into
[0, nf) before the scatters -- padding edges carry w_edge==0, so a clamped
out-of-range index scatters zero (numerically harmless). Verified on Tesla
T4: graph .pt2 export + DeepEval parity (small_8/large_20) + full graph
suite + consistency all pass; parity confirms the clamp does not corrupt
results (N bound was correct; the stray index was genuine padding).

Also bind the per-node scatter to the input atype.shape[0] (node_capacity)
rather than the re-derived fit_ret.shape[0] -- hardening.
---
 .../dpmodel/utils/neighbor_graph/derivatives.py  | 14 ++++++++++++--
 deepmd/pt_expt/model/edge_transform_output.py    | 16 +++++++++++++++-
 deepmd/pt_expt/model/make_model.py               |  5 +++++
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index 2c8c50eaca..8d6125abfc 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -91,8 +91,15 @@ def edge_force_virial(
     # zero padding/guard contributions; cast mask to g's dtype (array-API pure,
     # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict)
     g = g_e * xp.astype(edge_mask[:, None], g_e.dtype)
-    src = edge_index[0]
-    dst = edge_index[1]
+    # Clamp scatter indices into the valid node range ``[0, n_out)``. Padding/guard
+    # edges (``edge_mask == 0``) carry ``g == 0`` above, so ``w_edge == 0`` and a
+    # clamped out-of-range index scatters ZERO -- numerically harmless. This keeps
+    # the scatter address in-bounds for the CUDA-compiled kernel: under dynamic-edge
+    # ``torch.export`` a padding index can reach the ``index_add`` BEFORE the mask
+    # zeroes its value, tripping ``tl.device_assert(idx < ks0)`` (a hard device-side
+    # assert on CUDA; benign on CPU, which does not bounds-check the address).
+    src = xp.clip(edge_index[0], 0, n_out - 1)
+    dst = xp.clip(edge_index[1], 0, n_out - 1)
     # force (output sized to the node axis, incl. any padding tail)
     force = segment_sum(g, dst, n_out) - segment_sum(g, src, n_out)
     # per-edge virial w_e[k, j] = -g_e[k] * edge_vec[j]  (broadcast, no einsum)
@@ -106,5 +113,8 @@ def edge_force_virial(
     edge_frame = xp.astype(
         xp.searchsorted(boundaries, dst, side="right"), xp.int64
     )  # (E,) in [0, nf)
+    # searchsorted(side="right") can return ``nf`` for an out-of-range ``dst``
+    # (padding/garbage); clamp into ``[0, nf)`` for the same CUDA-bounds reason.
+    edge_frame = xp.clip(edge_frame, 0, nf - 1)
     virial = segment_sum(w_edge, edge_frame, nf)  # (nf, 3, 3)
     return force, atom_virial, virial
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 98a620b524..653f323404 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -88,6 +88,7 @@ def fit_output_to_model_output_graph(
     do_atomic_virial: bool = False,
     create_graph: bool = True,
     mask: torch.Tensor | None = None,
+    node_capacity: int | None = None,
 ) -> dict[str, torch.Tensor]:
     """Graph analogue of the dense pt_expt ``fit_output_to_model_output``.
 
@@ -121,6 +122,15 @@ def fit_output_to_model_output_graph(
         Whether the backward retains a graph (training).
     mask
         (N,) flat realness mask; used only for intensive-output reduction.
+    node_capacity
+        Authoritative node-axis size ``N`` = the scatter bound for the
+        per-node force/atom-virial assembly. Pass the INPUT ``atype.shape[0]``
+        (the pristine node-axis symbol that ``edge_index`` indexes into by
+        construction); ``None`` falls back to the descriptor/fitting output's
+        ``fit_ret.shape[0]`` (value-equal). This makes the scatter bound the
+        input node axis rather than a re-derived shape -- hardening; the actual
+        CUDA out-of-bounds device-assert is prevented by the index clamp in
+        :func:`~deepmd.dpmodel.utils.neighbor_graph.derivatives.edge_force_virial`.
 
     Returns
     -------
@@ -145,7 +155,11 @@ def fit_output_to_model_output_graph(
     # Derive N from the fitting output's leading shape rather than int(n_node.sum()).
     # shape attributes are always static Python ints (or SymInts in symbolic-mode
     # tracing) and are trace-safe; reading a tensor VALUE via int() is not.
-    N = next(iter(fit_ret.values())).shape[0]
+    N = (
+        node_capacity
+        if node_capacity is not None
+        else next(iter(fit_ret.values())).shape[0]
+    )
     frame_id = frame_id_from_n_node(
         n_node, n_total=N
     )  # (N,) int64 frame index per atom
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 3e14ed2d56..5b19cb63f1 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -370,6 +370,11 @@ def forward_common_lower_graph(
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
+                # Bound the per-node scatter by the INPUT node axis (the symbol
+                # ``edge_index`` indexes into), not the re-derived fitting-output
+                # shape -- avoids a CUDA out-of-bounds device-assert under
+                # dynamic-edge torch.export. See fit_output_to_model_output_graph.
+                node_capacity=atype.shape[0],
             )
 
         def _resolve_graph_method(

From 0f437317296b8175579c1d4952e55fb19d70cc14 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 15:24:13 +0800
Subject: [PATCH 24/33] fix(dpmodel): export-safe modulo clamp for graph
 edge-scatter indices

The xp.clip clamp added in the previous commit breaks under torch.export:
the bound n_out is a SymInt and array_api_compat's clip reads .shape on it
('SymInt' object has no attribute 'shape'), failing every graph .pt2 export
on CI/local (it only passed on the GPU box's newer array_api_compat). Replace
with modulo (src % n_out, dst % n_out, edge_frame % nf) -- pure arithmetic, so
torch.export-safe, and a no-op on in-range real indices. Also fixes the case a
mask-multiply missed: the out-of-range index occurs on an edge_mask==1 edge
(ks0=n_out binds to a smaller symbol than the live node count at AOTI runtime),
which only a clamp-every-index (clip or modulo), not a mask-zero, neutralizes.
Out-of-range edges carry ~zero w_edge, so wrapping them is numerically harmless.

Verified on Tesla T4: graph .pt2 export (gen_dpa1) + DeepEval parity
(small_8/large_20) + C++ TestInferDpa1GraphPtExpt 10/10 all pass; CPU export
suite 12/12 (the SymInt failure the prior commit would have hit in CI).
---
 .../utils/neighbor_graph/derivatives.py       | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index 8d6125abfc..affeb57ccc 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -91,15 +91,19 @@ def edge_force_virial(
     # zero padding/guard contributions; cast mask to g's dtype (array-API pure,
     # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict)
     g = g_e * xp.astype(edge_mask[:, None], g_e.dtype)
-    # Clamp scatter indices into the valid node range ``[0, n_out)``. Padding/guard
-    # edges (``edge_mask == 0``) carry ``g == 0`` above, so ``w_edge == 0`` and a
-    # clamped out-of-range index scatters ZERO -- numerically harmless. This keeps
-    # the scatter address in-bounds for the CUDA-compiled kernel: under dynamic-edge
-    # ``torch.export`` a padding index can reach the ``index_add`` BEFORE the mask
-    # zeroes its value, tripping ``tl.device_assert(idx < ks0)`` (a hard device-side
-    # assert on CUDA; benign on CPU, which does not bounds-check the address).
-    src = xp.clip(edge_index[0], 0, n_out - 1)
-    dst = xp.clip(edge_index[1], 0, n_out - 1)
+    # Wrap node indices into ``[0, n_out)`` so every scatter address is in-bounds.
+    # Real edges already have index < n_out (modulo is a no-op). Out-of-range
+    # indices CAN appear in the CUDA-compiled kernel: under dynamic-edge
+    # ``torch.export`` the scatter bound ``ks0 == n_out`` binds to a SMALLER
+    # symbol than the live node count at AOTI runtime, so a valid index trips
+    # ``tl.device_assert(idx < ks0)`` (a hard device-side assert on CUDA; benign
+    # on CPU, which does not bounds-check the address). Such edges carry ~zero
+    # ``w_edge`` (masked ``g`` + tiny ``g_e``), so wrapping them to another node
+    # is numerically harmless. Modulo is pure arithmetic => torch.export-safe,
+    # unlike ``xp.clip`` (SymInt bound breaks array_api_compat) and unlike a
+    # mask-multiply (misses ``edge_mask == 1`` out-of-range indices).
+    src = edge_index[0] % n_out
+    dst = edge_index[1] % n_out
     # force (output sized to the node axis, incl. any padding tail)
     force = segment_sum(g, dst, n_out) - segment_sum(g, src, n_out)
     # per-edge virial w_e[k, j] = -g_e[k] * edge_vec[j]  (broadcast, no einsum)
@@ -112,9 +116,8 @@ def edge_force_virial(
     boundaries = xp.cumulative_sum(n_node)  # (nf,) per-frame node upper bounds
     edge_frame = xp.astype(
         xp.searchsorted(boundaries, dst, side="right"), xp.int64
-    )  # (E,) in [0, nf)
-    # searchsorted(side="right") can return ``nf`` for an out-of-range ``dst``
-    # (padding/garbage); clamp into ``[0, nf)`` for the same CUDA-bounds reason.
-    edge_frame = xp.clip(edge_frame, 0, nf - 1)
+    )  # (E,) in [0, nf]
+    # wrap into [0, nf) for the same CUDA-bounds reason (export-safe modulo)
+    edge_frame = edge_frame % nf
     virial = segment_sum(w_edge, edge_frame, nf)  # (nf, 3, 3)
     return force, atom_virial, virial

From afda4c7b36812aa0c970a9223643cd98a3b2a3ed Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 15:24:41 +0800
Subject: [PATCH 25/33] fix(pt_expt): address AI review (CodeQL + CodeRabbit)
 on #5604

- training.py: add explanatory comments to four pass-only except clauses
  (CodeQL empty-except); replace the nloc==1-fragile 'N != nframes' shape
  heuristic in the compiled-graph unravel with an explicit node-level key set
  so single-atom-per-frame outputs reshape to (nf, 1, *) (CodeRabbit).
- serialization.py: mark aparam's atom axis dynamic in the graph export
  dynamic-shape spec ({0: nframes, 1: nloc}), matching the dense path, so a
  dim_aparam>0 graph export no longer specializes nloc (CodeRabbit).
- test_lammps_dpa1_graph_pt2.py: skip the two reference-comparison tests when
  the gen_dpa1 .expected fixture is absent (clean skip vs TypeError on None);
  force 'processors 2 1 1' in the empty-subdomain MPI test so the empty-rank
  branch is genuinely exercised (CodeRabbit).

Skipped as invalid: CodeQL 'import DEVICE' (house pattern, benign in test);
CodeRabbit do_atomic_virial-in-compiled-graph (the dense compiled path drops it
identically; training never requests atom_virial).

Validated: CPU export 8/8 + compiled-varying-natoms 5/5 (incl. dpa1_no_attn).
---
 deepmd/pt_expt/train/training.py              | 31 +++++++++++--------
 deepmd/pt_expt/utils/serialization.py         |  6 +++-
 .../lmp/tests/test_lammps_dpa1_graph_pt2.py   | 15 ++++++++-
 3 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 6393d78e39..409fcef48d 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -421,13 +421,13 @@ def fn(
         if _dim_fp > 1:
             _forbidden.add(_dim_fp)
     except Exception:
-        pass
+        pass  # best-effort: dim_fparam unavailable -> nothing to forbid
     try:
         _dim_ap = model.get_dim_aparam()
         if _dim_ap > 1:
             _forbidden.add(_dim_ap)
     except Exception:
-        pass
+        pass  # best-effort: dim_aparam unavailable -> nothing to forbid
     if charge_spin is not None:
         _dim_cs = int(charge_spin.shape[1])
         if _dim_cs > 1:
@@ -634,11 +634,11 @@ def _trace_and_compile_graph(
         try:
             _fitting = model.get_fitting_net()
         except AttributeError:
-            pass
+            pass  # optional accessor; a model without a fitting net keeps None
         try:
             _atomic_model = model.atomic_model
         except AttributeError:
-            pass
+            pass  # optional attribute; a model without an atomic model keeps None
 
     do_grad_r = model.do_grad_r("energy")
     do_grad_c = model.do_grad_c("energy")
@@ -667,13 +667,13 @@ def _trace_and_compile_graph(
         if _dim_fp > 1:
             _forbidden.add(_dim_fp)
     except Exception:
-        pass
+        pass  # best-effort: dim_fparam unavailable -> nothing to forbid
     try:
         _dim_ap = model.get_dim_aparam()
         if _dim_ap > 1:
             _forbidden.add(_dim_ap)
     except Exception:
-        pass
+        pass  # best-effort: dim_aparam unavailable -> nothing to forbid
     if charge_spin is not None and charge_spin.shape[-1] > 1:
         _forbidden.add(int(charge_spin.shape[-1]))
     for _tbv in task_buf_vals_trace:
@@ -1279,15 +1279,20 @@ def _forward_graph(
         # (nf, *)).  Unravel the node-level keys to rectangular (nf, nloc, *) so
         # callers receive the same shapes as the dense path.
         N = nframes * nloc
+        # Node-level (per-atom, lead dim N) public keys emitted by the graph
+        # lower; the remaining keys are frame-level (lead dim nf) and must NOT
+        # be unravelled. Keying on the NAME rather than the ``N != nframes``
+        # shape heuristic keeps the single-atom case (nloc == 1, where
+        # N == nframes) correct -- node-level outputs still reshape to
+        # (nf, 1, *) instead of staying (nf, *).
+        node_level_keys = {"atom_energy", "force", "atom_virial", "mask"}
         out: dict[str, torch.Tensor] = {}
         for key, val in result.items():
-            # ``N != nframes`` distinguishes node-level keys (lead dim N) from
-            # frame-level keys (lead dim nf) by shape. DEGENERATE: when nloc==1,
-            # N == nframes, so node-level keys are NOT unravelled and stay
-            # (nf, *) instead of (nf, 1, *). Harmless for the varying-natoms
-            # trainer (nloc >> 1); a single-atom-per-frame system would need an
-            # explicit per-key category check instead of the shape heuristic.
-            if val is not None and val.shape[:1] == torch.Size([N]) and N != nframes:
+            if (
+                key in node_level_keys
+                and val is not None
+                and val.shape[:1] == torch.Size([N])
+            ):
                 out[key] = val.reshape(nframes, nloc, *val.shape[1:])
             else:
                 out[key] = val
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 1bb49a8b5a..f4b538df75 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -447,6 +447,7 @@ def _build_graph_dynamic_shapes(
     nframes_dim = torch.export.Dim("nframes", min=1)
     n_node_total_dim = torch.export.Dim("n_node_total", min=1)
     nedge_dim = torch.export.Dim("nedge", min=2)
+    nloc_dim = torch.export.Dim("nloc", min=1)
     return (
         {0: n_node_total_dim},  # atype: (N,)
         {0: nframes_dim},  # n_node: (nf,)
@@ -454,7 +455,10 @@ def _build_graph_dynamic_shapes(
         {0: nedge_dim},  # edge_vec: (E, 3) — E dynamic
         {0: nedge_dim},  # edge_mask: (E,) — E dynamic
         {0: nframes_dim} if fparam is not None else None,  # fparam: (nf, ndf)
-        {0: nframes_dim} if aparam is not None else None,  # aparam: (nf, nloc, nda)
+        # aparam: (nf, nloc, nda) — both the frame AND atom axes are dynamic,
+        # matching the dense ``_build_dynamic_shapes`` (otherwise a dim_aparam>0
+        # graph export specializes nloc to the sample size and breaks at runtime).
+        {0: nframes_dim, 1: nloc_dim} if aparam is not None else None,  # aparam
         {0: nframes_dim} if charge_spin is not None else None,  # charge_spin
     )
 
diff --git a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
index 3e6a11428d..ab898e15f6 100644
--- a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
+++ b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
@@ -73,6 +73,12 @@
 except FileNotFoundError:
     expected_e = expected_f = expected_v = None
 
+# Gate the reference-comparison tests on the generated ``.expected`` fixture so
+# they skip cleanly (rather than failing with a ``TypeError`` on ``None``) when
+# gen_dpa1.py has not run (e.g. PyTorch not built). The MPI multi-rank tests
+# compare against a single-rank run of the same archive and do not need it.
+_HAS_REF = expected_e is not None
+
 box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
 coord = np.array(
     [
@@ -129,6 +135,7 @@ def lammps():
     lmp.close()
 
 
+@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated")
 def test_pair_deepmd(lammps) -> None:
     """Single-rank serial run (``atom_modify map yes``): the graph .pt2
     folds ghosts onto local owners (``fold_to_local=True``) and must match
@@ -145,6 +152,7 @@ def test_pair_deepmd(lammps) -> None:
     lammps.run(1)
 
 
+@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated")
 def test_pair_deepmd_virial(lammps) -> None:
     """Single-rank per-atom virial via ``centroid/stress/atom``."""
     lammps.pair_style(f"deepmd {pb_file.resolve()}")
@@ -297,7 +305,12 @@ def test_pair_deepmd_mpi_dpa1_graph_empty_subdomain() -> None:
     zero contribution from the empty rank — compared against a same-archive
     single-rank reference of the same fixture.
     """
-    out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain)
+    # Force ``processors 2 1 1`` so the split is along x at 15 and rank 1 is
+    # genuinely empty -- otherwise LAMMPS may auto-pick a grid where neither
+    # rank is empty and the branch under test is not exercised.
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2, data_path=data_file_empty_subdomain, processors="2 1 1"
+    )
     out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain)
     np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
     np.testing.assert_allclose(

From 0ea2c3435567610ae6e00f58aa540ab598fd912f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 15:25:09 +0800
Subject: [PATCH 26/33] feat(pt_expt): dp freeze --lower-kind {nlist,graph} for
 graph .pt2 export

Add a user-facing entry point to the graph C++ inference path. Before this,
the graph lower was reachable only via the internal API / gen_dpa1.py test
fixture, so a user-frozen dpa1 .pt2 always used the dense (nlist) lower and
the tested C++ graph path was unreachable from the CLI.

- main.py: add --lower-kind {nlist,graph} to the freeze subparser (default
  nlist; PyTorch-Exportable backend only, same convention as --head/--node-names).
- entrypoints/main.py: thread lower_kind into freeze() -> deserialize_to_file.
  Fail fast (ValueError) when 'graph' is requested for a non-graph-eligible
  model (reuses _model_uses_graph_lower; currently dpa1 attn_layer==0 only).
  Enable do_atomic_virial for the graph form -- near-free there (one extra
  scatter off the shared single backward).
- test_dp_freeze.py: graph-eligibility rejection (se_e2_a) + a graph freeze of
  dpa1(attn_layer=0) through the public freeze() entry point, asserting
  metadata lower_input_kind=graph.

Opt-in by design; auto-selecting graph for eligible models (mirroring training)
is deferred until the graph path covers attention/angles/MP. Both tests pass.
---
 deepmd/main.py                         | 10 ++++
 deepmd/pt_expt/entrypoints/main.py     | 39 +++++++++++++++-
 source/tests/pt_expt/test_dp_freeze.py | 64 ++++++++++++++++++++++++++
 3 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/deepmd/main.py b/deepmd/main.py
index 43f40dc214..0a4d44137a 100644
--- a/deepmd/main.py
+++ b/deepmd/main.py
@@ -350,6 +350,16 @@ def main_parser() -> argparse.ArgumentParser:
         type=str,
         help="(Supported backend: PyTorch) Task head (alias: model branch) to freeze if in multi-task mode.",
     )
+    parser_frz.add_argument(
+        "--lower-kind",
+        default="nlist",
+        type=str,
+        choices=["nlist", "graph"],
+        help="(Supported backend: PyTorch Exportable) Lower-level export form of the "
+        "frozen .pt2: 'nlist' (default, dense neighbor-list lower) or 'graph' "
+        "(NeighborGraph edge-list lower; only for graph-eligible models, currently "
+        "dpa1 with attn_layer=0). 'graph' selects the C++ graph inference path.",
+    )
 
     # * test script ********************************************************************
     parser_tst = subparsers.add_parser(
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index da28229bf4..2567756578 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -387,6 +387,7 @@ def freeze(
     model: str,
     output: str = "frozen_model.pte",
     head: str | None = None,
+    lower_kind: str = "nlist",
 ) -> None:
     """Freeze a pt_expt checkpoint into a .pte exported model.
 
@@ -398,6 +399,13 @@ def freeze(
         Path for the output .pte file.
     head : str or None
         Head to freeze in multi-task mode.
+    lower_kind : str
+        Lower-level export form: ``"nlist"`` (default, dense neighbor-list lower)
+        or ``"graph"`` (NeighborGraph edge-list lower). ``"graph"`` is only valid
+        for graph-eligible models (``mixed_types`` and ``uses_graph_lower``,
+        currently dpa1 with ``attn_layer == 0``) and selects the C++ graph
+        inference path; the per-atom virial is enabled for it (near-free in the
+        graph path: one extra scatter off the shared single backward).
     """
     import torch
 
@@ -458,12 +466,34 @@ def freeze(
         single_model_params = model_params
 
     m.eval()
+
+    # The graph lower is opt-in and only valid for graph-eligible models (dpa1
+    # attn_layer==0 today). Fail fast with a clear message rather than emitting a
+    # broken .pt2. Enable the per-atom virial for the graph form -- it is
+    # near-free there (one extra scatter off the single shared backward).
+    do_atomic_virial = False
+    if lower_kind == "graph":
+        from deepmd.pt_expt.train.training import (
+            _model_uses_graph_lower,
+        )
+
+        if not _model_uses_graph_lower(m):
+            raise ValueError(
+                "lower_kind='graph' requires a graph-eligible model "
+                "(mixed_types and a descriptor exposing uses_graph_lower()==True, "
+                "currently dpa1 with attn_layer==0). Use lower_kind='nlist' for "
+                "this model."
+            )
+        do_atomic_virial = True
+
     model_dict_serialized = m.serialize()
     deserialize_to_file(
         output,
         {"model": model_dict_serialized, "model_def_script": single_model_params},
+        do_atomic_virial=do_atomic_virial,
+        lower_kind=lower_kind,
     )
-    log.info("Saved frozen model to %s", output)
+    log.info("Saved frozen model to %s (lower_kind=%s)", output, lower_kind)
 
 
 def change_bias(
@@ -703,7 +733,12 @@ def main(args: list[str] | argparse.Namespace | None = None) -> None:
             FLAGS.model = str(model_path)
         if not FLAGS.output.endswith((".pte", ".pt2")):
             FLAGS.output = str(Path(FLAGS.output).with_suffix(".pte"))
-        freeze(model=FLAGS.model, output=FLAGS.output, head=FLAGS.head)
+        freeze(
+            model=FLAGS.model,
+            output=FLAGS.output,
+            head=FLAGS.head,
+            lower_kind=getattr(FLAGS, "lower_kind", "nlist"),
+        )
     elif FLAGS.command == "change-bias":
         change_bias(
             input_file=FLAGS.INPUT,
diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py
index 7c33f0de81..ebfac485ea 100644
--- a/source/tests/pt_expt/test_dp_freeze.py
+++ b/source/tests/pt_expt/test_dp_freeze.py
@@ -41,6 +41,36 @@
     "data_stat_nbatch": 20,
 }
 
+# dpa1 with attn_layer == 0 — the only graph-eligible model family today
+# (mixed_types and uses_graph_lower()==True), used to exercise the
+# ``freeze --lower-kind graph`` public-CLI path.
+model_dpa1_graph = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "se_atten",
+        "sel": 30,
+        "rcut_smth": 2.0,
+        "rcut": 6.0,
+        "neuron": [2, 4, 8],
+        "axis_neuron": 4,
+        "attn": 5,
+        "attn_layer": 0,
+        "attn_dotr": True,
+        "attn_mask": False,
+        "activation_function": "tanh",
+        "scaling_factor": 1.0,
+        "normalize": True,
+        "temperature": 1.0,
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5, 5],
+        "resnet_dt": True,
+        "seed": 1,
+    },
+}
+
 
 class TestDPFreezePtExpt(unittest.TestCase):
     """Test dp freeze for the pt_expt backend."""
@@ -103,6 +133,40 @@ def test_freeze_default_suffix(self) -> None:
         expected = os.path.join(self.tmpdir, "frozen_default_suffix.pte")
         self.assertTrue(os.path.exists(expected))
 
+    def test_freeze_graph_rejects_ineligible(self) -> None:
+        """``--lower-kind graph`` on a non-graph-eligible model (se_e2_a,
+        mixed_types=False) fails fast rather than emitting a broken .pt2.
+        """
+        output = os.path.join(self.tmpdir, "frozen_graph_reject.pt2")
+        with self.assertRaises(ValueError):
+            freeze(model=self.ckpt_file, output=output, lower_kind="graph")
+
+    def test_freeze_graph_dpa1(self) -> None:
+        """``freeze --lower-kind graph`` on a graph-eligible dpa1(attn_layer=0)
+        model produces a .pt2 whose metadata records the graph lower (the
+        user-facing entry point to the C++ graph inference path).
+        """
+        import json
+        import zipfile
+
+        model_params = deepcopy(model_dpa1_graph)
+        model = get_model(model_params)
+        wrapper = ModelWrapper(model, model_params=model_params)
+        ckpt = os.path.join(self.tmpdir, "dpa1_graph.pt")
+        torch.save({"model": wrapper.state_dict()}, ckpt)
+
+        output = os.path.join(self.tmpdir, "frozen_dpa1_graph.pt2")
+        freeze(model=ckpt, output=output, lower_kind="graph")
+        self.assertTrue(os.path.exists(output))
+
+        # the .pt2 is a zip; metadata.json must record the graph lower
+        with zipfile.ZipFile(output) as zf:
+            meta_name = next(
+                n for n in zf.namelist() if n.endswith("extra/metadata.json")
+            )
+            metadata = json.loads(zf.read(meta_name))
+        self.assertEqual(metadata["lower_input_kind"], "graph")
+
     def test_freeze_pt2(self) -> None:
         """Freeze to .pt2 (AOTInductor) and verify the file is loadable."""
         self.assertTrue(os.path.exists(self.shared_pt2))

From 7a50e60e2754437827e6e74c4a3aaed3b39e8724 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 30 Jun 2026 15:52:51 +0800
Subject: [PATCH 27/33] test(api_cc): add deeppot_dpa1_graph.pt2 to
 universal/variant battery

Wire the dpa1 graph .pt2 (attn_layer=0, lower_input_kind=graph) into the
parametrized VariantDeepPotTest as case dpa1_graph_ptexpt (Backend::PTExpt,
ref deeppot_dpa1_graph.expected, pbc/nopbc, tol 1e-10/1e-4). Flags mirror
dpa1_pytorch_pt2 exactly; all 19 enabled subtests pass on remote GPU
(FiniteDifference, LmpNlist*, CutoffTwice, TypeSel, NoPbc*). The dedicated
test_deeppot_dpa1_graph_ptexpt.cc is retained for graph-unique coverage.
---
 source/api_cc/tests/test_deeppot_universal.cc | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/source/api_cc/tests/test_deeppot_universal.cc b/source/api_cc/tests/test_deeppot_universal.cc
index e0ee6fc8f4..1c599d7d33 100644
--- a/source/api_cc/tests/test_deeppot_universal.cc
+++ b/source/api_cc/tests/test_deeppot_universal.cc
@@ -143,6 +143,28 @@ std::vector<VariantDeepPotCase> variant_deeppot_cases() {
            /*supports_no_pbc_atomic=*/false,
            /*supports_no_pbc_lmp_nlist=*/true,
            /*supports_no_pbc_lmp_nlist_atomic=*/false},
+          {"dpa1_graph_ptexpt",
+           Backend::PTExpt,
+           "../../tests/infer/deeppot_dpa1_graph.pt2",
+           /*convert_pbtxt=*/false,
+           nullptr,
+           nullptr,
+           "../../tests/infer/deeppot_dpa1_graph.expected",
+           "pbc",
+           "nopbc",
+           1e-10,
+           1e-4,
+           /*supports_float=*/true,
+           /*supports_finite_difference=*/true,
+           /*supports_lmp_nlist=*/true,
+           /*supports_lmp_nlist_atomic=*/true,
+           /*supports_lmp_nlist_cutoff_twice=*/true,
+           /*supports_lmp_nlist_type_sel=*/true,
+           /*supports_print_summary=*/true,
+           /*supports_no_pbc_simple=*/true,
+           /*supports_no_pbc_atomic=*/false,
+           /*supports_no_pbc_lmp_nlist=*/true,
+           /*supports_no_pbc_lmp_nlist_atomic=*/false},
           {"dpa2_pytorch_pth",
            Backend::PyTorch,
            "../../tests/infer/deeppot_dpa2.pth",

From b4c0b49f522997b8fd373ea1c4cefbf752a8e4a7 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 1 Jul 2026 08:43:48 +0800
Subject: [PATCH 28/33] =?UTF-8?q?fix(pt=5Fexpt):=20address=20iProzd=20revi?=
 =?UTF-8?q?ew=20=E2=80=94=20graph=20freeze=20defaults=20to=20.pt2=20+=20mo?=
 =?UTF-8?q?dulo-clamp=20invariant=20test?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- entrypoints/main.py: 'dp --pt_expt freeze --lower-kind graph' now defaults a
  suffix-less output to .pt2 (the AOTI archive the C++ graph path consumes)
  instead of .pte; explicit .pte/.pt2 suffixes are preserved for both kinds.
  New test (freeze mocked) covers all four suffix/kind combinations.
- test_edge_force_virial.py: add an invariant test that the in-bounds index
  clamp (% n_out) NEVER alters a real (edge_mask==True) edge -- a boundary real
  edge on node n_out-1 scatters unwrapped, and a masked guard edge with
  deliberately out-of-range indices contributes nothing; the result equals the
  real-edges-only reference. Directly answers iProzd's ask to prove the modulo
  cannot silently remap real edges (only zero-weight guard edges).

Both new tests pass.
---
 deepmd/pt_expt/entrypoints/main.py            |  9 ++++-
 .../common/dpmodel/test_edge_force_virial.py  | 32 ++++++++++++++++
 source/tests/pt_expt/test_dp_freeze.py        | 37 +++++++++++++++++++
 3 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index 2567756578..5b3b74b6bd 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -731,13 +731,18 @@ def main(args: list[str] | argparse.Namespace | None = None) -> None:
                     f"Checkpoint path '{model_path}' does not exist."
                 )
             FLAGS.model = str(model_path)
+        _lower_kind = getattr(FLAGS, "lower_kind", "nlist")
         if not FLAGS.output.endswith((".pte", ".pt2")):
-            FLAGS.output = str(Path(FLAGS.output).with_suffix(".pte"))
+            # Default suffix: .pt2 for the graph export (an AOTI .pt2 archive is
+            # what the C++ graph path consumes), .pte otherwise. Explicit user
+            # .pte / .pt2 suffixes are preserved for both.
+            _default_suffix = ".pt2" if _lower_kind == "graph" else ".pte"
+            FLAGS.output = str(Path(FLAGS.output).with_suffix(_default_suffix))
         freeze(
             model=FLAGS.model,
             output=FLAGS.output,
             head=FLAGS.head,
-            lower_kind=getattr(FLAGS, "lower_kind", "nlist"),
+            lower_kind=_lower_kind,
         )
     elif FLAGS.command == "change-bias":
         change_bias(
diff --git a/source/tests/common/dpmodel/test_edge_force_virial.py b/source/tests/common/dpmodel/test_edge_force_virial.py
index fa84ef7ba4..722960f57a 100644
--- a/source/tests/common/dpmodel/test_edge_force_virial.py
+++ b/source/tests/common/dpmodel/test_edge_force_virial.py
@@ -97,6 +97,38 @@ def test_all_edges_masked_gives_zero(self) -> None:
             np.testing.assert_allclose(av, np.zeros((n, 3, 3)))
             np.testing.assert_allclose(vir, np.zeros((nf, 3, 3)))
 
+    def test_modulo_clamp_leaves_real_edges_unchanged(self) -> None:
+        # INVARIANT (iProzd review): the in-bounds index clamp (``% n_out``) that
+        # keeps the CUDA-exported scatter address legal must NEVER alter a real
+        # (edge_mask == True) edge -- only masked/out-of-range guard edges may be
+        # remapped, and those carry zero weight so remapping is harmless. Here a
+        # REAL edge sits on the boundary node ``n_out - 1`` (the largest valid
+        # index, where a wrong wrap would be visible) and a MASKED guard edge
+        # carries deliberately OUT-OF-RANGE indices (>= n_out) with nonzero g/vec.
+        # Correctness requires the result to equal the real-edges-only reference:
+        # the boundary real edge must land on node n_out-1 (not wrapped), and the
+        # out-of-range guard must contribute nothing. If real edges were ever
+        # remapped by the modulo (the shape-binding bug iProzd warned about), the
+        # boundary node's force/virial would be wrong and this test would fail.
+        n_node = np.array([5], dtype=np.int64)  # 1 frame, nodes 0..4 (n_out = 5)
+        # e0: real, src on the boundary node 4 -> node 0 ; e1: real, node 0 -> 4
+        # e2: MASKED guard with out-of-range indices src=99, dst=77 (>= n_out)
+        edge_index = np.array([[4, 0, 99], [0, 4, 77]], dtype=np.int64)
+        edge_vec = np.array([[1.0, 0.0, 0.0], [-1.0, 0.0, 0.0], [9.0, 9.0, 9.0]])
+        edge_mask = np.array([True, True, False])
+        g = np.array([[0.5, 0.2, 0.0], [0.3, 0.0, 0.1], [7.0, 7.0, 7.0]])
+        force, av, vir = edge_force_virial(g, edge_vec, edge_index, edge_mask, n_node)
+
+        # reference: the SAME two real edges only (no guard edge at all)
+        ref_force, ref_av, ref_vir = edge_force_virial(
+            g[:2], edge_vec[:2], edge_index[:, :2], edge_mask[:2], n_node
+        )
+        np.testing.assert_allclose(force, ref_force)
+        np.testing.assert_allclose(av, ref_av)
+        np.testing.assert_allclose(vir, ref_vir)
+        # explicit: the boundary real edge scattered its force to node 4 (unwrapped)
+        self.assertTrue(np.any(force[4] != 0.0))
+
     def test_ragged_multiframe_with_edge_and_node_padding(self) -> None:
         # MOST GENERAL case: 2 frames with DIFFERENT node counts (3 and 5) AND
         # different edge counts (2 and 3), masked guard EDGES, and a padded NODE
diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py
index ebfac485ea..9090e4978d 100644
--- a/source/tests/pt_expt/test_dp_freeze.py
+++ b/source/tests/pt_expt/test_dp_freeze.py
@@ -133,6 +133,43 @@ def test_freeze_default_suffix(self) -> None:
         expected = os.path.join(self.tmpdir, "frozen_default_suffix.pte")
         self.assertTrue(os.path.exists(expected))
 
+    def test_freeze_output_suffix_by_lower_kind(self) -> None:
+        """main() defaults a suffix-less output to .pt2 for --lower-kind graph
+        and .pte for nlist, while preserving an explicit .pte/.pt2 (iProzd
+        review). freeze() is mocked so the suffix logic is checked without the
+        AOTInductor compile cost.
+        """
+        from unittest import mock
+
+        cases = [
+            ("graph", "out_g", None, ".pt2"),  # graph, no suffix -> .pt2
+            ("nlist", "out_n", None, ".pte"),  # nlist, no suffix -> .pte
+            ("graph", "out_g_explicit", ".pte", ".pte"),  # explicit .pte kept
+            ("nlist", "out_n_explicit", ".pt2", ".pt2"),  # explicit .pt2 kept
+        ]
+        for lower_kind, stem, explicit, expected_suffix in cases:
+            with self.subTest(lower_kind=lower_kind, explicit=explicit):
+                name = stem + (explicit or "")
+                captured: dict = {}
+
+                def _fake_freeze(model, output, head=None, lower_kind="nlist", **kw):
+                    captured["output"] = output
+                    captured["lower_kind"] = lower_kind
+
+                flags = argparse.Namespace(
+                    command="freeze",
+                    checkpoint_folder=self.ckpt_file,
+                    output=os.path.join(self.tmpdir, name),
+                    head=None,
+                    lower_kind=lower_kind,
+                    log_level=2,
+                    log_path=None,
+                )
+                with mock.patch("deepmd.pt_expt.entrypoints.main.freeze", _fake_freeze):
+                    main(flags)
+                self.assertTrue(captured["output"].endswith(expected_suffix))
+                self.assertEqual(captured["lower_kind"], lower_kind)
+
     def test_freeze_graph_rejects_ineligible(self) -> None:
         """``--lower-kind graph`` on a non-graph-eligible model (se_e2_a,
         mixed_types=False) fails fast rather than emitting a broken .pt2.

From 4a76f6df9b89118095f554c824c7747d99f57ad6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 00:45:58 +0000
Subject: [PATCH 29/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/tests/pt_expt/test_dp_freeze.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py
index 9090e4978d..cdaa4a02c6 100644
--- a/source/tests/pt_expt/test_dp_freeze.py
+++ b/source/tests/pt_expt/test_dp_freeze.py
@@ -139,7 +139,9 @@ def test_freeze_output_suffix_by_lower_kind(self) -> None:
         review). freeze() is mocked so the suffix logic is checked without the
         AOTInductor compile cost.
         """
-        from unittest import mock
+        from unittest import (
+            mock,
+        )
 
         cases = [
             ("graph", "out_g", None, ".pt2"),  # graph, no suffix -> .pt2

From c095e12db33c833427e5c55a528557a0986d6f69 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 1 Jul 2026 11:42:16 +0800
Subject: [PATCH 30/33] docs(dpmodel): document the graph edge-scatter modulo
 as the permanent fix + root cause

Reframe the comment on the % n_out index guard: it is the PERMANENT fix, not an
interim workaround. Records the GPU-confirmed root cause -- the dynamic-edge
graph torch.export path traces the node count as several equal-but-distinct
symbols tied only by aten._assert_scalar(Eq(...)) nodes, which
_strip_shape_assertions neutralises wholesale (needed for export to trace),
dropping those equalities so inductor can no longer prove idx < n_out and emits
a device_assert. The upstream alternative (making the shared, spin-critical
_strip_shape_assertions selective) is deliberately not taken -- it risks
re-triggering the torch.export bugs that helper bypasses and regressing the
spin .pt2 path. Comment-only change.
---
 .../utils/neighbor_graph/derivatives.py       | 34 +++++++++++++------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index affeb57ccc..599877d7e2 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -91,17 +91,29 @@ def edge_force_virial(
     # zero padding/guard contributions; cast mask to g's dtype (array-API pure,
     # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict)
     g = g_e * xp.astype(edge_mask[:, None], g_e.dtype)
-    # Wrap node indices into ``[0, n_out)`` so every scatter address is in-bounds.
-    # Real edges already have index < n_out (modulo is a no-op). Out-of-range
-    # indices CAN appear in the CUDA-compiled kernel: under dynamic-edge
-    # ``torch.export`` the scatter bound ``ks0 == n_out`` binds to a SMALLER
-    # symbol than the live node count at AOTI runtime, so a valid index trips
-    # ``tl.device_assert(idx < ks0)`` (a hard device-side assert on CUDA; benign
-    # on CPU, which does not bounds-check the address). Such edges carry ~zero
-    # ``w_edge`` (masked ``g`` + tiny ``g_e``), so wrapping them to another node
-    # is numerically harmless. Modulo is pure arithmetic => torch.export-safe,
-    # unlike ``xp.clip`` (SymInt bound breaks array_api_compat) and unlike a
-    # mask-multiply (misses ``edge_mask == 1`` out-of-range indices).
+    # Wrap node indices into ``[0, n_out)`` so every scatter address is provably
+    # in-bounds. For a well-formed graph every real edge already has
+    # ``index < n_out`` (== ``atype.shape[0]``), so this modulo is the IDENTITY on
+    # real edges (pinned by test_modulo_clamp_leaves_real_edges_unchanged) -- a
+    # correctness-preserving guard, not a value fixup.
+    #
+    # Why it is needed (root cause, GPU-confirmed): under the dynamic-edge graph
+    # ``torch.export`` path the node count is traced as several equal-but-distinct
+    # symbols (``atype.shape[0]``, ``fit_ret.shape[0]``, ...), tied only by
+    # ``aten._assert_scalar(Eq(...))`` nodes. ``_strip_shape_assertions``
+    # (pt_expt/utils/serialization.py) neutralises ALL such asserts so export can
+    # trace -- which also drops those node-count equalities, so inductor can no
+    # longer prove the scatter index and its bound ``ks0 == n_out`` share a symbol
+    # and emits ``tl.device_assert(idx < ks0)`` (fatal on CUDA; unchecked on CPU,
+    # which is why all CPU dev/CI was green). ``% n_out`` discharges that guard
+    # unconditionally. This is the PERMANENT fix: the upstream alternative --
+    # making the SHARED, spin-export-critical ``_strip_shape_assertions``
+    # selective -- risks re-triggering the torch.export bugs it exists to bypass
+    # and the spin ``.pt2`` path, so it is deliberately NOT taken.
+    #
+    # Pure arithmetic => torch.export-safe, unlike ``xp.clip`` (SymInt bound
+    # breaks array_api_compat's clip) and unlike a mask-multiply (which misses the
+    # ``edge_mask == 1`` indices the stripped guard mis-bounds).
     src = edge_index[0] % n_out
     dst = edge_index[1] % n_out
     # force (output sized to the node axis, incl. any padding tail)

From 3348c80e4de2d5c8dd4437d37bd8c1da6da4ec20 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 1 Jul 2026 14:36:45 +0800
Subject: [PATCH 31/33] refactor(pt_expt): consolidate graph trace/sample
 builders (OutisLi review)

Merge the two near-identical synthetic-graph builders
_make_graph_sample_inputs (serialization.py) and _make_graph_trace_inputs
(training.py) into one build_synthetic_graph_inputs in serialization.py,
parameterized by dtype (export=float64 ABI, training=GLOBAL_PT precision) and
device (export passes cpu explicitly instead of mutating env.DEVICE) plus the
want_fparam/aparam/charge_spin gating. Also factor the duplicated prime-collision
_forbidden scan into _forbidden_dims_from_model, shared by the dense and graph
_trace_and_compile. Removes the desync risk between the training and export
graph traces flagged in review.
---
 deepmd/pt_expt/train/training.py      | 188 +++++++-------------------
 deepmd/pt_expt/utils/serialization.py | 134 +++++++++---------
 2 files changed, 121 insertions(+), 201 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 409fcef48d..5068ac51af 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -274,6 +274,41 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]:
 # ---------------------------------------------------------------------------
 
 
+def _forbidden_dims_from_model(
+    model: torch.nn.Module,
+    task_buf_vals: tuple[torch.Tensor, ...],
+) -> set[int]:
+    """Prime-collision set for trace-dim selection.
+
+    Collects every ``> 1`` dim of the model's parameters/buffers (so
+    ``_next_safe_prime`` never aliases an internal dim like ``g2_dim`` /
+    ``axis_neuron`` / ``attn_head`` without a hardcoded list), plus
+    ``dim_fparam``/``dim_aparam`` and the task-buffer dims.  Shared by the dense
+    :func:`_trace_and_compile` and the graph :func:`_trace_and_compile_graph`;
+    each caller adds its path-specific dims (nall/nloc/nsel for dense,
+    charge_spin for both) on top of this base set.
+    """
+    forbidden: set[int] = {
+        int(_d)
+        for _src in (model.parameters(), model.buffers())
+        for _p in _src
+        for _d in _p.shape
+        if _d > 1
+    }
+    # Resolve each accessor INSIDE the try: a model lacking the getter raises
+    # AttributeError on lookup, which must stay best-effort like a failing call.
+    for _getter_name in ("get_dim_fparam", "get_dim_aparam"):
+        try:
+            _dim = getattr(model, _getter_name)()
+            if _dim > 1:
+                forbidden.add(int(_dim))
+        except Exception:
+            pass  # best-effort: dim unavailable -> nothing to forbid
+    for _tbv in task_buf_vals:
+        forbidden.update(int(_d) for _d in _tbv.shape if _d > 1)
+    return forbidden
+
+
 def _trace_and_compile(
     model: torch.nn.Module,
     ext_coord: torch.Tensor,
@@ -397,17 +432,11 @@ def fn(
     # large to alias with any architecture dim and need no adjustment.
     #
     # The prime for nf is chosen by enumerating every dimension that appears
-    # in the model's parameters and buffers, then calling _next_safe_prime to
-    # find the first prime that doesn't collide with any of them.  This
-    # catches internal dims like g2_dim, axis_neuron, attn_head, etc.
-    # without requiring a hardcoded list.
-    _forbidden: set[int] = {
-        int(_d)
-        for _src in (model.parameters(), model.buffers())
-        for _p in _src
-        for _d in _p.shape
-        if _d > 1
-    }
+    # in the model's parameters and buffers (see _forbidden_dims_from_model),
+    # then calling _next_safe_prime to find the first prime that doesn't collide
+    # with any of them -- catching internal dims like g2_dim/axis_neuron/
+    # attn_head without a hardcoded list.  Add the dense-path dims on top.
+    _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace)
     # Also add the real nloc and nall so trace_nf never aliases them.
     _forbidden.add(int(ext_coord.shape[1]))  # nall
     _forbidden.add(int(ext_atype.shape[1]))  # nall (same tensor, defensive)
@@ -416,26 +445,10 @@ def fn(
     _nsel = int(nlist.shape[2])
     if _nsel > 1:
         _forbidden.add(_nsel)
-    try:
-        _dim_fp = model.get_dim_fparam()
-        if _dim_fp > 1:
-            _forbidden.add(_dim_fp)
-    except Exception:
-        pass  # best-effort: dim_fparam unavailable -> nothing to forbid
-    try:
-        _dim_ap = model.get_dim_aparam()
-        if _dim_ap > 1:
-            _forbidden.add(_dim_ap)
-    except Exception:
-        pass  # best-effort: dim_aparam unavailable -> nothing to forbid
     if charge_spin is not None:
         _dim_cs = int(charge_spin.shape[1])
         if _dim_cs > 1:
             _forbidden.add(_dim_cs)
-    for _tbv in task_buf_vals_trace:
-        for _d in _tbv.shape:
-            if _d > 1:
-                _forbidden.add(int(_d))
 
     trace_nf = _next_safe_prime(5, _forbidden)
 
@@ -653,33 +666,12 @@ def _trace_and_compile_graph(
     #   * nf  = n_node.shape[0]      (per-frame reductions)
     #   * N   = atype.shape[0]       (flat node axis = sum(n_node))
     #   * E   = edge_vec.shape[0]    (edge axis)
-    # They are chosen as collision-free primes vs every parameter/buffer dim.
+    # They are chosen as collision-free primes vs every parameter/buffer dim
+    # (see _forbidden_dims_from_model) plus charge_spin.
     # ------------------------------------------------------------------
-    _forbidden: set[int] = {
-        int(_d)
-        for _src in (model.parameters(), model.buffers())
-        for _p in _src
-        for _d in _p.shape
-        if _d > 1
-    }
-    try:
-        _dim_fp = model.get_dim_fparam()
-        if _dim_fp > 1:
-            _forbidden.add(_dim_fp)
-    except Exception:
-        pass  # best-effort: dim_fparam unavailable -> nothing to forbid
-    try:
-        _dim_ap = model.get_dim_aparam()
-        if _dim_ap > 1:
-            _forbidden.add(_dim_ap)
-    except Exception:
-        pass  # best-effort: dim_aparam unavailable -> nothing to forbid
+    _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace)
     if charge_spin is not None and charge_spin.shape[-1] > 1:
         _forbidden.add(int(charge_spin.shape[-1]))
-    for _tbv in task_buf_vals_trace:
-        for _d in _tbv.shape:
-            if _d > 1:
-                _forbidden.add(int(_d))
 
     trace_nf = _next_safe_prime(5, _forbidden)
     # nloc such that N = trace_nf * nloc is collision-free (and != trace_nf).
@@ -692,11 +684,18 @@ def _trace_and_compile_graph(
     e_max_base = max(math.ceil(1.25 * nloc_trace * nnei), 7)
     e_max = _next_safe_prime(e_max_base, _forbidden | {trace_nf, trace_N})
 
-    sample = _make_graph_trace_inputs(
+    # Shared with the .pt2 export trace (serialization.py) so the two graph
+    # traces can never desync on the input schema.  Training uses the run-time
+    # float precision and device; optional tensors match the actual call.
+    from deepmd.pt_expt.utils.serialization import build_synthetic_graph_inputs
+
+    sample = build_synthetic_graph_inputs(
         model,
         e_max=e_max,
         nframes=trace_nf,
         nloc=nloc_trace,
+        dtype=GLOBAL_PT_FLOAT_PRECISION,
+        device=DEVICE,
         want_fparam=fparam is not None,
         want_aparam=aparam is not None,
         want_charge_spin=charge_spin is not None,
@@ -804,91 +803,6 @@ def fn(
     )
 
 
-def _make_graph_trace_inputs(
-    model: torch.nn.Module,
-    e_max: int,
-    nframes: int,
-    nloc: int,
-    *,
-    want_fparam: bool,
-    want_aparam: bool,
-    want_charge_spin: bool,
-) -> tuple[torch.Tensor | None, ...]:
-    """Build a synthetic carry-all NeighborGraph for the graph-compile trace.
-
-    Returns positional tensors in the order
-    ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
-    charge_spin)`` matching ``forward_common_lower_graph``.  The edge axis is
-    padded to the STATIC ``e_max`` (masked) so its concrete value is a chosen
-    prime; ``fparam`` / ``aparam`` / ``charge_spin`` are emitted only when the
-    model+data path actually carries them (``want_*``), so the traced branch
-    matches the run-time call.
-    """
-    from deepmd.dpmodel.utils.neighbor_graph import (
-        GraphLayout,
-        build_neighbor_graph,
-    )
-
-    rcut = model.get_rcut()
-    ntypes = len(model.get_type_map())
-    dim_fparam = model.get_dim_fparam()
-    dim_aparam = model.get_dim_aparam()
-
-    box_size = rcut * 3.0
-    box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9)
-    rng = np.random.default_rng(42)
-    coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25
-    atype_np = np.zeros((nframes, nloc), dtype=np.int64)
-    for i in range(nloc):
-        atype_np[:, i] = i % ntypes
-
-    coord_t = torch.tensor(coord_np, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
-    atype_t = torch.tensor(atype_np, dtype=torch.int64, device=DEVICE)
-    box_t = torch.tensor(
-        np.tile(box_np, (nframes, 1)), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
-    )
-
-    graph = build_neighbor_graph(
-        coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max)
-    )
-
-    s_atype = atype_t.reshape(-1)
-    s_n_node = graph.n_node
-    s_edge_index = graph.edge_index
-    s_edge_vec = graph.edge_vec
-    s_edge_mask = graph.edge_mask
-
-    s_fparam = (
-        torch.zeros(nframes, dim_fparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
-        if (want_fparam and dim_fparam > 0)
-        else None
-    )
-    s_aparam = (
-        torch.zeros(
-            nframes, nloc, dim_aparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE
-        )
-        if (want_aparam and dim_aparam > 0)
-        else None
-    )
-    dim_cs = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0
-    s_charge_spin = (
-        torch.zeros(nframes, dim_cs, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE)
-        if (want_charge_spin and dim_cs > 0)
-        else None
-    )
-
-    return (
-        s_atype,
-        s_n_node,
-        s_edge_index,
-        s_edge_vec,
-        s_edge_mask,
-        s_fparam,
-        s_aparam,
-        s_charge_spin,
-    )
-
-
 class _CompiledModel(torch.nn.Module):
     """Coord extension (eager) -> compiled forward_lower (dynamic shapes).
 
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index f4b538df75..e0c08a6f84 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -315,32 +315,57 @@ def _make_sample_inputs(
     return ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin
 
 
-def _make_graph_sample_inputs(
+def build_synthetic_graph_inputs(
     model: torch.nn.Module,
     e_max: int,
     nframes: int = 2,
     nloc: int = 7,
+    *,
+    dtype: torch.dtype,
+    device: torch.device | None = None,
+    want_fparam: bool = True,
+    want_aparam: bool = True,
+    want_charge_spin: bool = True,
 ) -> tuple[torch.Tensor | None, ...]:
-    """Create sample inputs for tracing ``forward_lower_graph``.
+    """Build a synthetic carry-all ``NeighborGraph`` for graph-lower tracing.
 
-    Builds a small random system, runs the carry-all
+    Single source of the trace-time graph inputs, shared by ``.pt2`` export
+    (:func:`_trace_and_export`) and compiled training
+    (:func:`deepmd.pt_expt.train.training._trace_and_compile_graph`), so the two
+    traces can never desync on the graph input schema.  Builds a small random
+    system, runs the carry-all
     :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph` with a
-    STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked
-    static edge axis), and returns tensors in the positional order expected by
-    :meth:`forward_lower_graph_exportable`:
+    STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked static
+    edge axis), and returns tensors in the positional order expected by
+    ``forward_(common_)lower_graph``:
     ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
     charge_spin)``.
 
+    The system (``rng(42)``, ``box = rcut*3``, centered coords, ``atype[:, i] =
+    i % ntypes``) is identical for both callers; the two former differences
+    (float dtype/device and optional-tensor gating) are now parameters.
+
     Parameters
     ----------
     model : torch.nn.Module
         The pt_expt energy model (must expose ``get_rcut``/``get_type_map``/...).
     e_max : int
-        Static edge capacity ``E`` to pad the edge axis to.
+        Static edge capacity ``E`` to pad the (masked) edge axis to.
     nframes : int
         Number of frames in the sample system.
     nloc : int
         Number of local atoms per frame (``N == nframes * nloc``).
+    dtype : torch.dtype
+        Float precision of ``coord``/``edge_vec``/``fparam``/etc.  The exported
+        ``.pt2`` is float64-only (C++ ABI); training passes
+        ``GLOBAL_PT_FLOAT_PRECISION``.
+    device : torch.device, optional
+        Target device.  Defaults to ``deepmd.pt_expt.utils.env.DEVICE``; the
+        export path passes ``cpu`` explicitly (make_fx traces on CPU).
+    want_fparam, want_aparam, want_charge_spin : bool
+        Whether to emit the optional conditioning tensor when its ``dim > 0``.
+        Export passes the defaults (``True`` = include if present); training
+        passes ``x is not None`` so the traced branch matches the run-time call.
     """
     import deepmd.pt_expt.utils.env as _env
     from deepmd.dpmodel.utils.neighbor_graph import (
@@ -348,74 +373,53 @@ def _make_graph_sample_inputs(
         build_neighbor_graph,
     )
 
+    if device is None:
+        device = _env.DEVICE
+
     rcut = model.get_rcut()
     ntypes = len(model.get_type_map())
     dim_fparam = model.get_dim_fparam()
     dim_aparam = model.get_dim_aparam()
+    dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0
 
-    # Box large enough to avoid PBC degeneracy; mirrors _make_sample_inputs.
+    # Box large enough to avoid PBC degeneracy; centered coords.
     box_size = rcut * 3.0
-    box = np.eye(3, dtype=np.float64) * box_size
-    box_np = box.reshape(1, 9)
-
+    box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9)
     rng = np.random.default_rng(42)
-    coord_np = rng.random((nframes, nloc, 3), dtype=np.float64) * box_size * 0.5
-    coord_np += box_size * 0.25  # center in box
-
+    coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25
     atype_np = np.zeros((nframes, nloc), dtype=np.int64)
     for i in range(nloc):
         atype_np[:, i] = i % ntypes
 
+    coord_t = torch.tensor(coord_np, dtype=dtype, device=device)
+    atype_t = torch.tensor(atype_np, dtype=torch.int64, device=device)
+    box_t = torch.tensor(np.tile(box_np, (nframes, 1)), dtype=dtype, device=device)
     graph = build_neighbor_graph(
-        coord_np,
-        atype_np,
-        np.tile(box_np, (nframes, 1)),
-        rcut,
-        layout=GraphLayout(edge_capacity=e_max),
+        coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max)
     )
 
-    atype_t = torch.tensor(atype_np.reshape(-1), dtype=torch.int64, device=_env.DEVICE)
-    n_node_t = torch.tensor(
-        np.asarray(graph.n_node), dtype=torch.int64, device=_env.DEVICE
-    )
-    edge_index_t = torch.tensor(
-        np.asarray(graph.edge_index), dtype=torch.int64, device=_env.DEVICE
+    fparam = (
+        torch.zeros(nframes, dim_fparam, dtype=dtype, device=device)
+        if (want_fparam and dim_fparam > 0)
+        else None
     )
-    edge_vec_t = torch.tensor(
-        np.asarray(graph.edge_vec), dtype=torch.float64, device=_env.DEVICE
+    aparam = (
+        torch.zeros(nframes, nloc, dim_aparam, dtype=dtype, device=device)
+        if (want_aparam and dim_aparam > 0)
+        else None
     )
-    edge_mask_t = torch.tensor(
-        np.asarray(graph.edge_mask), dtype=torch.bool, device=_env.DEVICE
+    charge_spin = (
+        torch.zeros(nframes, dim_chg_spin, dtype=dtype, device=device)
+        if (want_charge_spin and dim_chg_spin > 0)
+        else None
     )
 
-    if dim_fparam > 0:
-        fparam = torch.zeros(
-            nframes, dim_fparam, dtype=torch.float64, device=_env.DEVICE
-        )
-    else:
-        fparam = None
-
-    if dim_aparam > 0:
-        aparam = torch.zeros(
-            nframes, nloc, dim_aparam, dtype=torch.float64, device=_env.DEVICE
-        )
-    else:
-        aparam = None
-
-    dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0
-    if dim_chg_spin > 0:
-        charge_spin = torch.zeros(
-            nframes, dim_chg_spin, dtype=torch.float64, device=_env.DEVICE
-        )
-    else:
-        charge_spin = None
-
     return (
-        atype_t,
-        n_node_t,
-        edge_index_t,
-        edge_vec_t,
-        edge_mask_t,
+        atype_t.reshape(-1),
+        graph.n_node,
+        graph.edge_index,
+        graph.edge_vec,
+        graph.edge_mask,
         fparam,
         aparam,
         charge_spin,
@@ -910,14 +914,16 @@ def _trace_and_export(
         nnei = sum(model.get_sel())
         e_sample = math.ceil(1.25 * nloc_sample * nnei)
 
-        _orig_device = _env.DEVICE
-        _env.DEVICE = torch.device("cpu")
-        try:
-            sample_inputs = _make_graph_sample_inputs(
-                model, e_max=e_sample, nframes=2, nloc=nloc_sample
-            )
-        finally:
-            _env.DEVICE = _orig_device
+        # make_fx traces on CPU; the .pt2 C++ ABI is float64-only.  Pass device
+        # and dtype explicitly instead of mutating the module-level env.DEVICE.
+        sample_inputs = build_synthetic_graph_inputs(
+            model,
+            e_max=e_sample,
+            nframes=2,
+            nloc=nloc_sample,
+            dtype=torch.float64,
+            device=torch.device("cpu"),
+        )
 
         (
             atype_g,

From 282f6410ebcb2583203ffda8b3536d6c5faf9d4f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Wed, 1 Jul 2026 14:45:55 +0800
Subject: [PATCH 32/33] perf(api_cc): cache graph edge topology across steps +
 guard empty rank (OutisLi review)

Comment 1 (perf): the graph LAMMPS path called buildGraphTensors every timestep,
whose createEdgeTensors stage is an O(E) host loop + H2D copy that rebuilds the
edge topology from scratch even when LAMMPS has not refreshed the neighbor list.
Mirror the edge path: at ago==0, cache the skin topology via
createEdgeTensors(with_geometry=false, fold_to_local=!multi_rank) into
edge_index_tensor / edge_index_ext_tensor; on each step, run only the on-device
compactEdgeTensors (geometry recompute + rcut filter) and assemble the cheap
n_node / node-atype tensors. The topology is now rebuilt only on a neighbor-list
rebuild, consistent with the edge path.

Comment 2 (empty-rank nit): a truly-empty rank (nall_real == 0, no local atoms
AND no ghosts) would feed N == 0 into the graph, and edge_force_virial's
edge_index % node_capacity would SIGFPE (div-by-zero). Early-return zero outputs
for that rank. The tested nloc==0 empty-subdomain case has nall_real>0 (ghosts
within rcut) and still runs normally.

Validated: 29/33 graph gtests (4 pre-existing NoPBC skips) + 5/5
test_lammps_dpa1_graph_pt2 (single-rank multi-step ref, multi-rank,
mpirun -n 2 == single-rank, empty-subdomain).
---
 source/api_cc/src/DeepPotPTExpt.cc | 78 +++++++++++++++++++++++-------
 1 file changed, 60 insertions(+), 18 deletions(-)

diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 49e7ed0662..5d9b081a34 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -554,8 +554,13 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
     nlist_data.copy_from_nlist(lmp_list, nall - nghost);
     nlist_data.shuffle_exclude_empty(fwd_map);
 
-    // Rebuild mapping vector and tensor (cached as members; graph branch reads
-    // mapping_ on every step, not just ago==0, so the vector must persist).
+    // Rebuild mapping vector and tensor (cached as members).  ``mapping_tensor``
+    // is consumed every step by the dense ``run_model`` (ghost-feature gather);
+    // the ``mapping_`` vector is read only here at ago==0 -- to build that
+    // tensor and, for the edge/graph paths, to fold ghost neighbours onto their
+    // local owners inside ``createEdgeTensors``.  (The graph path used to read
+    // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now caches
+    // the topology at ago==0 like the edge/dense paths, so no per-step read.)
     if (lmp_list.mapping) {
       mapping_.resize(nall_real);
       for (int ii = 0; ii < nall_real; ii++) {
@@ -602,10 +607,19 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       edge_index_tensor = edge_tensors.edge_index;
       edge_index_ext_tensor = edge_tensors.edge_index_ext;
     } else if (lower_input_is_graph_) {
-      // Graph schema rebuilds the edge topology on-device every step inside
-      // buildGraphTensors (from the raw, unpadded nlist_data.jlist +
-      // nlist_data.ilist centers), so nothing is cached here and the nlist is
-      // left unpadded (createEdgeTensors handles ragged rows and skips -1).
+      // Cache only the real skin topology, exactly like the edge path: the
+      // geometry (edge_vec) + rcut filter are recomputed on-device every step
+      // by compactEdgeTensors, so the O(E) host loop + H2D copy in
+      // createEdgeTensors runs ONLY on a LAMMPS nlist rebuild (ago==0), not
+      // every step.  Single-rank folds ghosts onto local owners
+      // (fold_to_local=true); non-MP multi-rank keeps the extended region
+      // (fold_to_local=false) so ghost forces reverse-comm to their owners.
+      const auto edge_tensors = createEdgeTensors(
+          nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device,
+          /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist,
+          /*fold_to_local=*/!multi_rank);
+      edge_index_tensor = edge_tensors.edge_index;
+      edge_index_ext_tensor = edge_tensors.edge_index_ext;
     } else {
       nlist_data.padding();
       firstneigh_tensor = createNlistTensor(nlist_data.jlist, nnei)
@@ -822,19 +836,47 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                           edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                           fparam_tensor, aparam_tensor, charge_spin_tensor);
     } else if (lower_input_is_graph_) {
-      // NeighborGraph schema: build (atype, n_node, edge_index, edge_vec,
-      // edge_mask) from the host nlist and run the (single-rank) graph
-      // artifact.  Single-rank folds ghosts onto local owners (N == nloc);
-      // multi-rank (non-MP only — the fail-fast above blocks MP graph
-      // multi-rank) keeps the extended region (N == nall_real, node types from
-      // the real halo types) so LAMMPS reverse-comm folds ghost forces back.
-      const auto graph_tensors = buildGraphTensors(
-          nlist_data.jlist, dcoord, datype, mapping_, nloc, nall_real,
-          static_cast<double>(rcut), device, &nlist_data.ilist,
-          /*fold_to_local=*/!multi_rank);
+      if (nall_real == 0) {
+        // Truly-empty rank (no local atoms AND no ghosts): the graph would emit
+        // N == 0 nodes, and edge_force_virial's ``edge_index % node_capacity``
+        // would divide by zero (SIGFPE) -- it also violates the exported
+        // ``Dim("n_node_total", min=1)``.  Such a rank contributes nothing, so
+        // fill zero outputs and return instead of running the model.  (The
+        // tested ``nloc == 0`` empty-subdomain case has ``nall_real > 0`` --
+        // ghosts within rcut -- so it still runs the model normally.)
+        ener.assign(nframes, static_cast<ENERGYTYPE>(0));
+        force.assign(static_cast<size_t>(nframes) * fwd_map.size() * 3,
+                     static_cast<VALUETYPE>(0));
+        virial.assign(static_cast<size_t>(nframes) * 9,
+                      static_cast<VALUETYPE>(0));
+        if (atomic) {
+          atom_energy.assign(static_cast<size_t>(nframes) * fwd_map.size(),
+                             static_cast<VALUETYPE>(0));
+          atom_virial.assign(static_cast<size_t>(nframes) * fwd_map.size() * 9,
+                             static_cast<VALUETYPE>(0));
+        }
+        return;
+      }
+      // NeighborGraph schema: recompute geometry + rcut filter on-device from
+      // the cached skin topology (edge_index[_ext]_tensor built at ago==0),
+      // then assemble the cheap node tensors.  Mirrors the edge path -- no
+      // per-step host rebuild / H2D copy.  Single-rank folds ghosts onto local
+      // owners (N == nloc); multi-rank (non-MP only — the fail-fast above blocks
+      // MP graph multi-rank) keeps the extended region (N == nall_real, node
+      // types from the real halo types) so LAMMPS reverse-comm folds ghost
+      // forces back.  The node types come from the on-device extended
+      // atype_Tensor slice (== atype_ext[0:N]); n_node is a 1-element tensor.
+      const auto edge_tensors =
+          compactEdgeTensors(edge_index_tensor, edge_index_ext_tensor,
+                             coord_Tensor, static_cast<double>(rcut));
+      const std::int64_t n_node_count = multi_rank ? nall_real : nloc;
+      at::Tensor n_node_tensor =
+          torch::full({1}, n_node_count, int_option).to(device);
+      at::Tensor node_atype =
+          atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count});
       flat_outputs = run_model_graph(
-          graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,
-          graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor,
+          node_atype, n_node_tensor, edge_tensors.edge_index,
+          edge_tensors.edge_vec, edge_tensors.edge_mask, fparam_tensor,
           aparam_tensor, charge_spin_tensor);
     } else {
       flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,

From 70b02fef68a0b1a362f4270f24df0b865124b64e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 1 Jul 2026 06:55:25 +0000
Subject: [PATCH 33/33] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/train/training.py   |  4 +++-
 source/api_cc/src/DeepPotPTExpt.cc | 21 +++++++++++----------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index 5068ac51af..4ab43fe329 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -687,7 +687,9 @@ def _trace_and_compile_graph(
     # Shared with the .pt2 export trace (serialization.py) so the two graph
     # traces can never desync on the input schema.  Training uses the run-time
     # float precision and device; optional tensors match the actual call.
-    from deepmd.pt_expt.utils.serialization import build_synthetic_graph_inputs
+    from deepmd.pt_expt.utils.serialization import (
+        build_synthetic_graph_inputs,
+    )
 
     sample = build_synthetic_graph_inputs(
         model,
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 5d9b081a34..315f4cc39b 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -554,13 +554,14 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
     nlist_data.copy_from_nlist(lmp_list, nall - nghost);
     nlist_data.shuffle_exclude_empty(fwd_map);
 
-    // Rebuild mapping vector and tensor (cached as members).  ``mapping_tensor``
+    // Rebuild mapping vector and tensor (cached as members). ``mapping_tensor``
     // is consumed every step by the dense ``run_model`` (ghost-feature gather);
     // the ``mapping_`` vector is read only here at ago==0 -- to build that
     // tensor and, for the edge/graph paths, to fold ghost neighbours onto their
     // local owners inside ``createEdgeTensors``.  (The graph path used to read
-    // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now caches
-    // the topology at ago==0 like the edge/dense paths, so no per-step read.)
+    // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now
+    // caches the topology at ago==0 like the edge/dense paths, so no per-step
+    // read.)
     if (lmp_list.mapping) {
       mapping_.resize(nall_real);
       for (int ii = 0; ii < nall_real; ii++) {
@@ -861,9 +862,9 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       // the cached skin topology (edge_index[_ext]_tensor built at ago==0),
       // then assemble the cheap node tensors.  Mirrors the edge path -- no
       // per-step host rebuild / H2D copy.  Single-rank folds ghosts onto local
-      // owners (N == nloc); multi-rank (non-MP only — the fail-fast above blocks
-      // MP graph multi-rank) keeps the extended region (N == nall_real, node
-      // types from the real halo types) so LAMMPS reverse-comm folds ghost
+      // owners (N == nloc); multi-rank (non-MP only — the fail-fast above
+      // blocks MP graph multi-rank) keeps the extended region (N == nall_real,
+      // node types from the real halo types) so LAMMPS reverse-comm folds ghost
       // forces back.  The node types come from the on-device extended
       // atype_Tensor slice (== atype_ext[0:N]); n_node is a 1-element tensor.
       const auto edge_tensors =
@@ -874,10 +875,10 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
           torch::full({1}, n_node_count, int_option).to(device);
       at::Tensor node_atype =
           atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count});
-      flat_outputs = run_model_graph(
-          node_atype, n_node_tensor, edge_tensors.edge_index,
-          edge_tensors.edge_vec, edge_tensors.edge_mask, fparam_tensor,
-          aparam_tensor, charge_spin_tensor);
+      flat_outputs =
+          run_model_graph(node_atype, n_node_tensor, edge_tensors.edge_index,
+                          edge_tensors.edge_vec, edge_tensors.edge_mask,
+                          fparam_tensor, aparam_tensor, charge_spin_tensor);
     } else {
       flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,
                                mapping_tensor, fparam_tensor, aparam_tensor,