From 1267b1f2b9d6d3ecf1fc6bb2086aaa4c0d499c2f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 10:13:56 +0800
Subject: [PATCH 01/69] feat(dpmodel): per-edge env_mat 4-vector (graph-native
 EnvMat)

---
 deepmd/dpmodel/utils/__init__.py              |   2 +
 .../dpmodel/utils/neighbor_graph/__init__.py  |   4 +
 deepmd/dpmodel/utils/neighbor_graph/env.py    | 103 ++++++++++++++++++
 .../tests/common/dpmodel/test_edge_env_mat.py | 103 ++++++++++++++++++
 .../test_neighbor_graph_lower_smoke.py        |   9 ++
 5 files changed, 221 insertions(+)
 create mode 100644 deepmd/dpmodel/utils/neighbor_graph/env.py
 create mode 100644 source/tests/common/dpmodel/test_edge_env_mat.py
 create mode 100644 source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py

diff --git a/deepmd/dpmodel/utils/__init__.py b/deepmd/dpmodel/utils/__init__.py
index 0179543dd4..eb1f8fb855 100644
--- a/deepmd/dpmodel/utils/__init__.py
+++ b/deepmd/dpmodel/utils/__init__.py
@@ -22,6 +22,7 @@
     GraphLayout,
     NeighborGraph,
     build_neighbor_graph,
+    edge_env_mat,
     edge_force_virial,
     from_dense_quartet,
     node_validity_mask,
@@ -91,6 +92,7 @@
     "build_neighbor_graph",
     "build_neighbor_list",
     "compute_total_numb_batch",
+    "edge_env_mat",
     "edge_force_virial",
     "extend_coord_with_ghosts",
     "from_dense_quartet",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index 08b165f861..58f4d61b8d 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -16,6 +16,9 @@
 from .derivatives import (
     edge_force_virial,
 )
+from .env import (
+    edge_env_mat,
+)
 from .graph import (
     GraphLayout,
     NeighborGraph,
@@ -31,6 +34,7 @@
     "GraphLayout",
     "NeighborGraph",
     "build_neighbor_graph",
+    "edge_env_mat",
     "edge_force_virial",
     "from_dense_quartet",
     "node_validity_mask",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/env.py b/deepmd/dpmodel/utils/neighbor_graph/env.py
new file mode 100644
index 0000000000..e5bec3f04b
--- /dev/null
+++ b/deepmd/dpmodel/utils/neighbor_graph/env.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Per-edge environment-matrix 4-vector, the graph-native analogue of
+EnvMat.call (deepmd/dpmodel/utils/env_mat.py).
+
+Computes, per edge, [1/r, dx/r^2, dy/r^2, dz/r^2] * smooth_weight, then
+normalizes by (davg, dstd) indexed by the edge's CENTER (dst) atom type.
+Stats are (ntypes, 4) — slot-independent — which is valid because
+EnvMatStatSe tiles a single per-type vector across all nnei slots
+(``np.tile(davgunit, [nsel, 1])``), so the slot axis carries no information.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    TYPE_CHECKING,
+)
+
+import array_api_compat
+
+from deepmd.dpmodel.utils.env_mat import (
+    compute_smooth_weight,
+)
+from deepmd.dpmodel.utils.safe_gradient import (
+    safe_for_vector_norm,
+)
+
+if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
+
+def edge_env_mat(
+    edge_vec: Array,
+    center_type: Array,
+    davg: Array,
+    dstd: Array,
+    rcut: float,
+    rcut_smth: float,
+    protection: float = 0.0,
+) -> Array:
+    """Compute the per-edge environment-matrix 4-vector.
+
+    Mirrors the math in ``_make_env_mat`` / ``EnvMat.call`` (env_mat.py)
+    for a single edge batch instead of a dense (nf, nloc, nnei) tensor.
+
+    Parameters
+    ----------
+    edge_vec
+        (E, 3) displacement vectors r_src - r_dst (neighbor minus center);
+        padding edges must have ``edge_vec = 0``.
+    center_type
+        (E,) int — atom type of the center (dst) atom for each edge.
+    davg
+        (ntypes, 4) per-center-type mean (slot-independent).
+    dstd
+        (ntypes, 4) per-center-type standard deviation, the divisor (slot-independent).
+    rcut
+        Outer cutoff radius.
+    rcut_smth
+        Inner radius where the smooth switch begins.
+    protection
+        Small additive offset to avoid exact division-by-zero on
+        atoms that are numerically at the same position (default 0).
+
+    Returns
+    -------
+    Array
+        (E, 4) normalized environment-matrix vectors.
+        Padding edges (``edge_vec = 0``) produce nonzero values but are
+        masked by ``NeighborGraph.edge_mask`` downstream.
+    """
+    xp = array_api_compat.array_namespace(edge_vec)
+    dev = array_api_compat.device(edge_vec)
+
+    # ── geometry ───────────────────────────────────────────────────────────
+    # (E, 1) lengths; safe_for_vector_norm returns 0 for zero vectors
+    length = safe_for_vector_norm(edge_vec, axis=-1, keepdims=True)
+
+    # Guard against exact zero to avoid 1/0 (happens on padding edges where
+    # edge_vec = 0).  Real edges always have length > 0.
+    safe_len = xp.where(length < 1e-10, xp.ones_like(length), length)
+
+    denom = safe_len + protection  # (E, 1)
+    t0 = 1.0 / denom  # (E, 1)  — radial component
+    t1 = edge_vec / (denom**2)  # (E, 3) — angular components
+
+    # ── smooth switch (same polynomial as compute_smooth_weight) ───────────
+    # length has shape (E, 1); compute_smooth_weight broadcasts over any shape
+    sw = compute_smooth_weight(length, rcut_smth, rcut)  # (E, 1)
+
+    # ── raw (unnormalized) env-mat ─────────────────────────────────────────
+    em = xp.concat([t0, t1], axis=-1) * sw  # (E, 4)
+
+    # ── per-type normalization (indexed by center-atom type) ───────────────
+    # davg/dstd must be asarray'd to ensure device placement when called with
+    # numpy stats on a torch/jax edge_vec.
+    avg = xp.take(xp.asarray(davg, device=dev), center_type, axis=0)  # (E, 4)
+    std = xp.take(xp.asarray(dstd, device=dev), center_type, axis=0)  # (E, 4)
+
+    return (em - avg) / std
diff --git a/source/tests/common/dpmodel/test_edge_env_mat.py b/source/tests/common/dpmodel/test_edge_env_mat.py
new file mode 100644
index 0000000000..486a5d226f
--- /dev/null
+++ b/source/tests/common/dpmodel/test_edge_env_mat.py
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.utils.env_mat import EnvMat
+from deepmd.dpmodel.utils.neighbor_graph import (
+    edge_env_mat,
+    from_dense_quartet,
+)
+
+
+class TestEdgeEnvMat(unittest.TestCase):
+    def setUp(self) -> None:
+        rng = np.random.default_rng(0)
+        self.rcut, self.rcut_smth = 4.0, 0.5
+        self.nf, self.nloc, self.nnei = 1, 4, 6
+        self.ext_coord = rng.normal(size=(self.nf, self.nloc, 3)) * 1.5
+        self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+        nlist = -np.ones((self.nf, self.nloc, self.nnei), dtype=np.int64)
+        for i in range(self.nloc):
+            ns = [j for j in range(self.nloc) if j != i][: self.nnei]
+            nlist[0, i, : len(ns)] = ns
+        self.nlist = nlist
+        self.mapping = np.arange(self.nloc, dtype=np.int64)[None]
+        self.nt = 2
+        self.davg = rng.normal(size=(self.nt, 4))
+        self.dstd = np.abs(rng.normal(size=(self.nt, 4))) + 0.5
+
+    def test_matches_envmat_slice(self) -> None:
+        davg_dense = np.broadcast_to(
+            self.davg[:, None, :], (self.nt, self.nnei, 4)
+        ).copy()
+        dstd_dense = np.broadcast_to(
+            self.dstd[:, None, :], (self.nt, self.nnei, 4)
+        ).copy()
+        dmat, _, _ = EnvMat(self.rcut, self.rcut_smth).call(
+            self.ext_coord, self.atype, self.nlist, davg_dense, dstd_dense
+        )
+
+        ng = from_dense_quartet(self.ext_coord, self.nlist, self.mapping)
+        center_type = self.atype.reshape(-1)[ng.edge_index[1]]
+        em = edge_env_mat(
+            ng.edge_vec, center_type, self.davg, self.dstd, self.rcut, self.rcut_smth
+        )
+
+        ei = ng.edge_index[:, ng.edge_mask]
+        for k in range(ei.shape[1]):
+            src, dst = int(ei[0, k]), int(ei[1, k])
+            slot = list(self.nlist[0, dst]).index(src)
+            np.testing.assert_allclose(
+                em[k], dmat[0, dst, slot], rtol=1e-12, atol=1e-12
+            )
+
+    def test_slot_broadcast_stats(self) -> None:
+        """After compute_input_stats, DescrptBlockSeAtten stats must be
+        slot-uniform: mean[:, k, :] == mean[:, 0, :] for all slots k.
+        This property is what allows edge_env_mat to use (ntypes, 4) stats
+        instead of (ntypes, nnei, 4) stats.
+        """
+        from deepmd.dpmodel.descriptor import DescrptDPA1
+
+        rng = np.random.default_rng(42)
+        nloc = 6
+        nf = 3
+        rcut = 4.0
+        rcut_smth = 0.5
+        ntypes = 2
+        sel = [6, 6]
+
+        coord = rng.normal(size=(nf, nloc, 3)).astype(np.float64)
+        # scale so atoms are within rcut of each other
+        coord = coord * 1.2
+        atype = np.array([[0, 1, 0, 1, 0, 1]] * nf, dtype=np.int64)
+        # non-periodic: box=None
+        data = [
+            {
+                "coord": coord,
+                "atype": atype,
+                "box": None,
+            }
+        ]
+
+        dpa1 = DescrptDPA1(rcut, rcut_smth, sel, ntypes=ntypes)
+        dpa1.compute_input_stats(data)
+        block = dpa1.se_atten
+
+        nnei = block.nnei
+        for k in range(1, nnei):
+            np.testing.assert_allclose(
+                block.mean[:, 0, :],
+                block.mean[:, k, :],
+                rtol=0,
+                atol=0,
+                err_msg=f"mean slot {k} != slot 0",
+            )
+            np.testing.assert_allclose(
+                block.stddev[:, 0, :],
+                block.stddev[:, k, :],
+                rtol=0,
+                atol=0,
+                err_msg=f"stddev slot {k} != slot 0",
+            )
diff --git a/source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py b/source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py
new file mode 100644
index 0000000000..e1d860192c
--- /dev/null
+++ b/source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+
+class TestPrASymbolsExist(unittest.TestCase):
+    def test_edge_env_mat_importable(self) -> None:
+        from deepmd.dpmodel.utils.neighbor_graph import edge_env_mat
+
+        self.assertTrue(callable(edge_env_mat))

From 3c75daf044cb38626df2c6fb7d59d4c452f303bb Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 10:21:37 +0800
Subject: [PATCH 02/69] feat(dpmodel): DescrptBlockSeAtten.call_graph
 (attn_layer=0, segment_sum)

---
 deepmd/dpmodel/descriptor/dpa1.py             | 102 ++++++++++++++++++
 .../dpmodel/test_dpa1_call_graph_block.py     |  94 ++++++++++++++++
 2 files changed, 196 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_dpa1_call_graph_block.py

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 2311858180..db3acdccac 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -4,6 +4,7 @@
     Callable,
 )
 from typing import (
+    Any,
     NoReturn,
     Optional,
     Union,
@@ -1240,6 +1241,107 @@ def call(
             xp.reshape(sw, (nf, nloc, nnei, 1)),
         )
 
+    def call_graph(
+        self,
+        graph: Any,
+        atype: Array,
+        type_embedding: Array | None = None,
+    ) -> Array:
+        """Graph-native forward (``attn_layer=0`` only).
+
+        Bit-exact analogue of :meth:`call` on the SAME neighbor list, with the
+        neighbor-axis reduction replaced by a ``segment_sum`` over edge centers
+        (``dst``). Geometry enters only through ``graph.edge_vec``.
+
+        Parameters
+        ----------
+        graph
+            A :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph` whose
+            ``edge_index = [src, dst]`` (src = neighbor local owner, dst = center),
+            ``edge_vec = r_src - r_dst`` and ``edge_mask`` marks real edges.
+        atype
+            (N,) flat node atom types (``N = sum(graph.n_node)``).
+        type_embedding
+            (ntypes_with_padding, tebd_dim) type-embedding table.
+
+        Returns
+        -------
+        Array
+            (N, ng * axis_neuron) per-node descriptor, matching the first output
+            of :meth:`call` flattened over the (nf, nloc) axes.
+
+        Notes
+        -----
+        Known limitations (NeighborGraph PR-A):
+        - ``attn_layer == 0`` only (attention lands in PR-D);
+        - ``tebd_input_mode == "concat"`` only (strip mode lands later);
+        - type exclusion (``exclude_types``) is not applied on the graph path.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            edge_env_mat,
+            segment_sum,
+        )
+
+        if self.attn_layer != 0:
+            raise NotImplementedError(
+                "graph path supports attn_layer=0 only (NeighborGraph PR-A); "
+                "attn_layer>0 lands in PR-D"
+            )
+        if self.tebd_input_mode not in ["concat"]:
+            raise NotImplementedError(
+                "graph path supports tebd_input_mode='concat' only (NeighborGraph PR-A)"
+            )
+        if type_embedding is None:
+            raise ValueError("type_embedding is required for the graph path")
+        xp = array_api_compat.array_namespace(graph.edge_vec)
+        dev = array_api_compat.device(graph.edge_vec)
+        n_total = int(xp.sum(graph.n_node))
+        src = graph.edge_index[0, :]
+        dst = graph.edge_index[1, :]
+        atype = xp.asarray(atype, device=dev)
+        center_type = xp.take(atype, dst, axis=0)  # (E,)
+        nei_type = xp.take(atype, src, axis=0)  # (E,)
+        # per-edge env-mat 4-vector, normalized by the center (dst) atom type.
+        # self.mean/self.stddev are slot-independent (ntypes, nnei, 4); slot 0 is
+        # the canonical per-type vector.
+        rr = edge_env_mat(
+            graph.edge_vec,
+            center_type,
+            self.mean[:, 0, :],
+            self.stddev[:, 0, :],
+            self.rcut,
+            self.rcut_smth,
+            protection=self.env_protection,
+        )  # (E, 4)
+        # radial channel
+        ss = rr[:, 0:1]  # (E, 1)
+        # neighbor / center type embeddings (concat mode); ghost type == owner type
+        # so gathering by the LOCAL owner (src) reproduces the dense neighbor tebd.
+        tebd = xp.asarray(type_embedding, device=dev)
+        atype_embd_nlist = xp.take(tebd, nei_type, axis=0)  # (E, tebd_dim)
+        if not self.type_one_side:
+            atype_embd_nnei = xp.take(tebd, center_type, axis=0)  # (E, tebd_dim)
+            ss = xp.concat([ss, atype_embd_nlist, atype_embd_nnei], axis=-1)
+        else:
+            ss = xp.concat([ss, atype_embd_nlist], axis=-1)
+        # embedding net (same weights as the dense path); applies on the last axis
+        gg = self.embeddings[0].call(ss)  # (E, ng)
+        # zero padding/guard edges BEFORE the segment sum
+        gg = gg * xp.astype(graph.edge_mask[:, None], gg.dtype)
+        # outer product (replaces the dense gg[:,:,:,None] * rr[:,:,None,:])
+        outer = gg[:, :, None] * rr[:, None, :]  # (E, ng, 4)
+        # neighbor-axis reduction -> segment_sum over centers; divide by nnei
+        gr = segment_sum(outer, dst, n_total) / self.nnei  # (N, ng, 4)
+        gr1 = gr[:, : self.axis_neuron, :]
+        # N x ng x ng1 (4-vector axis contracted); flattened to (N, ng * ng1) below
+        grrg = xp.sum(gr[:, :, None, :] * gr1[:, None, :, :], axis=3)  # (N, ng, ng1)
+        ng = self.neuron[-1]
+        grrg = xp.astype(
+            xp.reshape(grrg, (n_total, ng * self.axis_neuron)),
+            graph.edge_vec.dtype,
+        )
+        return grrg
+
     def has_message_passing(self) -> bool:
         """Returns whether the descriptor block has message passing."""
         return False
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
new file mode 100644
index 0000000000..77211f6fa8
--- /dev/null
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Bit-exact parity between the graph-native ``DescrptBlockSeAtten.call_graph``
+(attn_layer=0) and the legacy dense ``DescrptBlockSeAtten.call`` on the SAME
+neighbor list, for binding AND non-binding ``sel``.
+"""
+
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.utils.neighbor_graph import (
+    from_dense_quartet,
+)
+from deepmd.dpmodel.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+)
+
+
+class TestDpa1BlockCallGraph(unittest.TestCase):
+    def _make(self, sel):
+        return DescrptDPA1(
+            rcut=4.0,
+            rcut_smth=0.5,
+            sel=sel,
+            ntypes=2,
+            attn_layer=0,
+            axis_neuron=2,
+            neuron=[6, 12],
+        )
+
+    def setUp(self) -> None:
+        rng = np.random.default_rng(1)
+        self.nloc = 4
+        self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
+        self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+
+    def test_block_graph_equals_dense_any_sel(self) -> None:
+        for sel in ([20], [3]):  # non-binding AND binding
+            with self.subTest(sel=sel):
+                dd = self._make(sel)
+                blk = dd.se_atten
+                # build the dense nlist exactly as the descriptor would
+                (
+                    ext_coord,
+                    ext_atype,
+                    mapping,
+                    nlist,
+                ) = extend_input_and_build_neighbor_list(
+                    self.coord,
+                    self.atype,
+                    dd.get_rcut(),
+                    dd.get_sel(),
+                    mixed_types=dd.mixed_types(),
+                    box=None,
+                )
+                # type embedding as both paths use it
+                tebd = dd.type_embedding.call()
+                nf, nall = ext_atype.shape
+                atype_embd_ext = np.reshape(
+                    np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
+                    (nf, nall, dd.tebd_dim),
+                )
+                dense_g, *_ = blk.call(
+                    nlist,
+                    ext_coord,
+                    ext_atype,
+                    atype_embd_ext=atype_embd_ext,
+                    mapping=None,
+                    type_embedding=tebd,
+                )
+                ng = from_dense_quartet(ext_coord, nlist, mapping)
+                graph_g = blk.call_graph(
+                    ng,
+                    np.reshape(ext_atype, (-1,)),
+                    type_embedding=tebd,
+                )
+                np.testing.assert_allclose(
+                    graph_g.reshape(dense_g.shape),
+                    dense_g,
+                    rtol=1e-12,
+                    atol=1e-12,
+                )
+
+    def test_attn_layer_gt0_raises(self) -> None:
+        dd = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[20], ntypes=2, attn_layer=2)
+        with self.assertRaises(NotImplementedError):
+            dd.se_atten.call_graph(None, np.array([0], dtype=np.int64))
+
+
+if __name__ == "__main__":
+    unittest.main()

From 836000d22069e7d107580f4ec2f81617aa98a27d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 10:24:17 +0800
Subject: [PATCH 03/69] feat(dpmodel): fail-fast on exclude_types in
 DescrptBlockSeAtten.call_graph

The dense path masks excluded type pairs; the graph path does not yet, so
raise NotImplementedError instead of silently diverging.
---
 deepmd/dpmodel/descriptor/dpa1.py             |  7 ++++++-
 .../dpmodel/test_dpa1_call_graph_block.py     | 21 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index db3acdccac..e15f27569e 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -1275,7 +1275,7 @@ def call_graph(
         Known limitations (NeighborGraph PR-A):
         - ``attn_layer == 0`` only (attention lands in PR-D);
         - ``tebd_input_mode == "concat"`` only (strip mode lands later);
-        - type exclusion (``exclude_types``) is not applied on the graph path.
+        - ``exclude_types`` is not yet supported and raises (lands in a later PR).
         """
         from deepmd.dpmodel.utils.neighbor_graph import (
             edge_env_mat,
@@ -1291,6 +1291,11 @@ def call_graph(
             raise NotImplementedError(
                 "graph path supports tebd_input_mode='concat' only (NeighborGraph PR-A)"
             )
+        if self.exclude_types:
+            raise NotImplementedError(
+                "graph path does not yet apply exclude_types (NeighborGraph PR-A); "
+                "type exclusion lands in a later PR"
+            )
         if type_embedding is None:
             raise ValueError("type_embedding is required for the graph path")
         xp = array_api_compat.array_namespace(graph.edge_vec)
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index 77211f6fa8..6a82f3e159 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -89,6 +89,27 @@ def test_attn_layer_gt0_raises(self) -> None:
         with self.assertRaises(NotImplementedError):
             dd.se_atten.call_graph(None, np.array([0], dtype=np.int64))
 
+    def test_exclude_types_raises(self) -> None:
+        # the graph path does not yet apply type exclusion; it must fail-fast
+        # rather than silently diverge from the dense path (which masks edges).
+        dd = DescrptDPA1(
+            rcut=4.0,
+            rcut_smth=0.5,
+            sel=[20],
+            ntypes=2,
+            attn_layer=0,
+            exclude_types=[(0, 1)],
+        )
+        ng = from_dense_quartet(
+            self.coord,
+            -np.ones((1, self.nloc, 1), dtype=np.int64),  # any graph; guard fires first
+            np.arange(self.nloc, dtype=np.int64)[None],
+        )
+        with self.assertRaises(NotImplementedError):
+            dd.se_atten.call_graph(
+                ng, self.atype.reshape(-1), type_embedding=dd.type_embedding.call()
+            )
+
 
 if __name__ == "__main__":
     unittest.main()

From 01beb47f92810618a7dae1834b2217337dec305f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 10:36:34 +0800
Subject: [PATCH 04/69] feat(dpmodel): DescrptDPA1 dense call ->
 from-quartet->call_graph adapter (attn_layer=0)

---
 deepmd/dpmodel/descriptor/dpa1.py             | 114 +++++++++++++++++-
 .../dpmodel/test_dpa1_call_graph_block.py     |   2 +-
 .../test_dpa1_call_graph_descriptor.py        | 102 ++++++++++++++++
 3 files changed, 213 insertions(+), 5 deletions(-)
 create mode 100644 source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index e15f27569e..24806b1b05 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -541,10 +541,38 @@ def call(
         sw
             The smooth switch function.
         """
-        del mapping
         xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
         nf, nloc, nnei = nlist.shape
         nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
+        # attn_layer == 0 routes through the graph-native path; the dense call is
+        # a thin adapter (decision #14: graph = single math source). The full
+        # dense 5-tuple ABI is preserved exactly (see call_graph).
+        if self.se_atten.attn_layer == 0:
+            from deepmd.dpmodel.utils.neighbor_graph import (
+                from_dense_quartet,
+            )
+
+            dev = array_api_compat.device(coord_ext)
+            coord_ext_3 = xp.reshape(coord_ext, (nf, nall, 3))
+            if mapping is None:
+                # default identity mapping (ext == loc, e.g. no-PBC nall == nloc)
+                mapping_g = xp.broadcast_to(
+                    xp.arange(nall, dtype=xp.int64, device=dev)[None, :], (nf, nall)
+                )
+            else:
+                mapping_g = xp.reshape(mapping, (nf, nall))
+            graph = from_dense_quartet(coord_ext_3, nlist, mapping_g)
+            # local atom types, flat (nf * nloc,)
+            atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
+            return self.call_graph(
+                graph,
+                atype_local,
+                type_embedding=self.type_embedding.call(),
+                nlist=nlist,
+                coord_ext=coord_ext,
+                atype_ext=atype_ext,
+            )
+        del mapping
         type_embedding = self.type_embedding.call()
         # nf x nall x tebd_dim
         atype_embd_ext = xp.reshape(
@@ -568,6 +596,77 @@ def call(
             )
         return grrg, rot_mat, None, None, sw
 
+    def call_graph(
+        self,
+        graph: Any,
+        atype: Array,
+        type_embedding: Array | None = None,
+        nlist: Array | None = None,
+        coord_ext: Array | None = None,
+        atype_ext: Array | None = None,
+    ) -> tuple[Array, Array, None, None, Array | None]:
+        """Descriptor-level graph-native forward (``attn_layer == 0``).
+
+        Wraps the block :meth:`DescrptBlockSeAtten.call_graph`, adds the
+        descriptor-level ``concat_output_tebd`` step, and reshapes the per-node
+        outputs back to the dense ABI shapes ``(nf, nloc, ...)``.
+
+        Parameters
+        ----------
+        graph
+            A :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`.
+        atype
+            (nf * nloc,) flat LOCAL atom types.
+        type_embedding
+            (ntypes_with_padding, tebd_dim) type-embedding table.
+        nlist, coord_ext, atype_ext
+            Original dense quartet inputs, used ONLY to reconstruct the dense
+            ``sw`` (nf, nloc, nnei, 1) exactly (a dense-layout artifact tied to
+            neighbor slots, which the graph does not carry). When ``nlist`` is
+            ``None`` the returned ``sw`` is ``None``.
+
+        Returns
+        -------
+        tuple
+            ``(grrg, rot_mat, None, None, sw)`` matching :meth:`call`.
+        """
+        xp = array_api_compat.array_namespace(graph.edge_vec)
+        dev = array_api_compat.device(graph.edge_vec)
+        grrg_node, rot_mat_node = self.se_atten.call_graph(
+            graph, atype, type_embedding=type_embedding
+        )
+        nf = graph.n_node.shape[0]
+        nloc = int(graph.n_node[0])
+        ng = self.se_atten.neuron[-1]
+        axis = self.se_atten.axis_neuron
+        grrg = xp.reshape(grrg_node, (nf, nloc, ng * axis))
+        rot_mat = xp.reshape(rot_mat_node, (nf, nloc, ng, 3))
+        # descriptor-level concat_output_tebd
+        if self.concat_output_tebd:
+            tebd = xp.asarray(type_embedding, device=dev)
+            atype_local = xp.asarray(atype, device=dev)
+            atype_embd = xp.take(tebd, atype_local, axis=0)  # (nf*nloc, tebd_dim)
+            atype_embd = xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))
+            grrg = xp.concat([grrg, atype_embd], axis=-1)
+        # reconstruct the dense-shaped sw exactly the dense way: the env_mat
+        # switch zeroed where nlist == -1. The graph path forbids exclude_types,
+        # so nlist_mask is (nlist != -1), matching DescrptBlockSeAtten.call.
+        sw: Array | None = None
+        if nlist is not None:
+            nf_, nloc_, nnei = nlist.shape
+            # env_mat returns sw with shape (nf, nloc, nnei, 1)
+            _, _, sw = self.se_atten.env_mat.call(
+                coord_ext,
+                atype_ext,
+                nlist,
+                self.se_atten.mean[...],
+                self.se_atten.stddev[...],
+            )
+            nlist_mask = (nlist != -1)[:, :, :, None]
+            sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
+            sw = xp.reshape(sw, (nf_, nloc_, nnei, 1))
+        return grrg, rot_mat, None, None, sw
+
     def serialize(self) -> dict:
         """Serialize the descriptor to dict."""
         obj = self.se_atten
@@ -1246,7 +1345,7 @@ def call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
-    ) -> Array:
+    ) -> tuple[Array, Array]:
         """Graph-native forward (``attn_layer=0`` only).
 
         Bit-exact analogue of :meth:`call` on the SAME neighbor list, with the
@@ -1266,9 +1365,12 @@ def call_graph(
 
         Returns
         -------
-        Array
+        grrg : Array
             (N, ng * axis_neuron) per-node descriptor, matching the first output
             of :meth:`call` flattened over the (nf, nloc) axes.
+        rot_mat : Array
+            (N, ng, 3) per-node equivariant single-particle representation,
+            matching ``gr[..., 1:]`` of :meth:`call` flattened over (nf, nloc).
 
         Notes
         -----
@@ -1345,7 +1447,11 @@ def call_graph(
             xp.reshape(grrg, (n_total, ng * self.axis_neuron)),
             graph.edge_vec.dtype,
         )
-        return grrg
+        # equivariant single-particle representation, dense-ABI slice gr[..., 1:]
+        # (N, ng, 3); not cast, mirroring the dense block which leaves rot_mat in
+        # the working precision before the descriptor-level @cast_precision.
+        rot_mat = gr[:, :, 1:]
+        return grrg, rot_mat
 
     def has_message_passing(self) -> bool:
         """Returns whether the descriptor block has message passing."""
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index 6a82f3e159..58014d70f9 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -72,7 +72,7 @@ def test_block_graph_equals_dense_any_sel(self) -> None:
                     type_embedding=tebd,
                 )
                 ng = from_dense_quartet(ext_coord, nlist, mapping)
-                graph_g = blk.call_graph(
+                graph_g, _rot_mat = blk.call_graph(
                     ng,
                     np.reshape(ext_atype, (-1,)),
                     type_embedding=tebd,
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
new file mode 100644
index 0000000000..7f3561888a
--- /dev/null
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Full 5-tuple ABI parity between the graph-routed ``DescrptDPA1.call``
+(attn_layer=0, which now goes ``from_dense_quartet -> call_graph``) and the
+legacy dense descriptor output captured BEFORE the swap, for binding AND
+non-binding ``sel``.
+
+The dense reference is reconstructed by calling the BLOCK directly
+(``dd.se_atten.call``) and applying the descriptor-level ``concat_output_tebd``
+step by hand (mirroring dpa1.py), because ``dd.call`` itself now routes through
+the graph for ``attn_layer == 0``.
+"""
+
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+)
+
+
+class TestDpa1DescriptorCallGraph(unittest.TestCase):
+    def _make(self, sel):
+        return DescrptDPA1(
+            rcut=4.0,
+            rcut_smth=0.5,
+            sel=sel,
+            ntypes=2,
+            attn_layer=0,
+            axis_neuron=2,
+            neuron=[6, 12],
+        )
+
+    def setUp(self) -> None:
+        rng = np.random.default_rng(2)
+        self.nloc = 4
+        self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
+        self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+
+    def _dense_reference(self, dd, ext_coord, ext_atype, nlist):
+        """Reconstruct the original dense descriptor 5-tuple (pre-swap)."""
+        tebd = dd.type_embedding.call()
+        nf, nall = ext_atype.shape
+        atype_embd_ext = np.reshape(
+            np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
+            (nf, nall, dd.tebd_dim),
+        )
+        grrg, g2, h2, rot_mat, sw = dd.se_atten.call(
+            nlist,
+            ext_coord,
+            ext_atype,
+            atype_embd_ext=atype_embd_ext,
+            mapping=None,
+            type_embedding=tebd,
+        )
+        nloc = nlist.shape[1]
+        # descriptor-level concat_output_tebd (mirror dpa1.py)
+        atype_embd = atype_embd_ext[:, :nloc, :]
+        if dd.concat_output_tebd:
+            grrg = np.concatenate(
+                [grrg, np.reshape(atype_embd, (nf, nloc, dd.tebd_dim))], axis=-1
+            )
+        return grrg, rot_mat, None, None, sw
+
+    def test_descriptor_graph_equals_dense_full_tuple(self) -> None:
+        for sel in ([30], [4]):  # non-binding AND binding
+            with self.subTest(sel=sel):
+                dd = self._make(sel)
+                (
+                    ext_coord,
+                    ext_atype,
+                    mapping,
+                    nlist,
+                ) = extend_input_and_build_neighbor_list(
+                    self.coord,
+                    self.atype,
+                    dd.get_rcut(),
+                    dd.get_sel(),
+                    mixed_types=dd.mixed_types(),
+                    box=None,
+                )
+                # dense reference captured via the block (pre-swap behaviour)
+                ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
+                # the swapped public ABI: routes through the graph
+                out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)
+                self.assertEqual(len(out), 5)
+                # grrg
+                np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
+                # rot_mat
+                np.testing.assert_allclose(out[1], ref[1], rtol=1e-12, atol=1e-12)
+                # positions [2], [3] are always None for this descriptor
+                self.assertIsNone(out[2])
+                self.assertIsNone(out[3])
+                # sw
+                np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 2ac1306c532b34dec1cbd422124e09966121dbea Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 10:43:41 +0800
Subject: [PATCH 05/69] feat(dpmodel): model.call_lower_graph
 (energy/atom-energy via segment_sum)

---
 deepmd/dpmodel/model/make_model.py            | 97 ++++++++++++++++++
 .../common/dpmodel/test_call_lower_graph.py   | 98 +++++++++++++++++++
 2 files changed, 195 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_call_lower_graph.py

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index aba6b9fd48..74c0c91878 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -42,6 +42,10 @@
     NeighborList,
     nlist_distinguish_types,
 )
+from deepmd.dpmodel.utils.neighbor_graph import (
+    NeighborGraph,
+    segment_sum,
+)
 from deepmd.utils.path import (
     DPPath,
 )
@@ -423,6 +427,99 @@ def forward_common_atomic(
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
+        def call_lower_graph(
+            self,
+            atype: Array,
+            n_node: Array,
+            edge_index: Array,
+            edge_vec: Array,
+            edge_mask: Array,
+            n_local: Array | None = None,
+            fparam: Array | None = None,
+            aparam: Array | None = None,
+            comm_dict: dict | None = None,
+        ) -> dict[str, Array]:
+            """Graph-native ENERGY lower (PR-A: dpa1 ``attn_layer == 0``).
+
+            Energy-level only: returns the per-atom ``atom_energy`` and the
+            per-frame reduced ``energy``. Force/virial are produced by the
+            pt_expt autograd path (a later task). Must match the dense
+            :meth:`call_common_lower` energy and atom-energy on the SAME
+            neighbor list.
+
+            Parameters
+            ----------
+            atype
+                (N,) flat LOCAL atom types, ``N == sum(n_node)``.
+            n_node
+                (nf,) per-frame local atom counts; must be equal across frames.
+            edge_index
+                (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+            edge_vec
+                (E, 3) neighbor-minus-center edge vectors.
+            edge_mask
+                (E,) boolean/0-1 valid-edge mask.
+            n_local
+                Per-rank local atom counts for multi-rank inference.
+                Ignored in PR-A (single-rank); accepted for ABI stability.
+            fparam
+                Frame parameter, ``(nf, ndf)``.
+            aparam
+                Atomic parameter, ``(nf, nloc, nda)``.
+            comm_dict
+                MPI communication metadata. Ignored in PR-A; accepted for
+                ABI stability.
+
+            Returns
+            -------
+            dict
+                ``{"atom_energy": (nf, nloc, 1), "energy": (nf, 1)}``.
+            """
+            xp = array_api_compat.array_namespace(edge_vec)
+            dev = array_api_compat.device(edge_vec)
+            graph = NeighborGraph(
+                n_node=n_node,
+                edge_index=edge_index,
+                edge_vec=edge_vec,
+                edge_mask=edge_mask,
+            )
+            nf = n_node.shape[0]
+            nloc = int(n_node[0])
+            descriptor = self.atomic_model.descriptor
+            fitting_net = self.atomic_model.fitting_net
+            # dpa1 call_graph requires the type-embedding table explicitly
+            type_embedding = descriptor.type_embedding.call()
+            gg, rot_mat, g2, h2, _ = descriptor.call_graph(
+                graph, atype, type_embedding=type_embedding
+            )
+            # the fitting expects atype shaped (nf, nloc)
+            atype_2d = xp.reshape(xp.asarray(atype, device=dev), (nf, nloc))
+            fit_ret = fitting_net(
+                gg,
+                atype_2d,
+                gr=rot_mat,
+                g2=g2,
+                h2=h2,
+                fparam=fparam,
+                aparam=aparam,
+            )
+            atom_energy = fit_ret["energy"]  # (nf, nloc, 1)
+            # per-frame reduction via segment_sum, mirroring the dense reduction
+            # (cast to energy precision before summing; see
+            # transform_output.fit_output_to_model_output).
+            frame_id = xp.repeat(
+                xp.arange(nf, dtype=edge_index.dtype, device=dev),
+                xp.asarray(n_node, device=dev),
+            )
+            energy = segment_sum(
+                xp.reshape(
+                    atom_energy.astype(GLOBAL_ENER_FLOAT_PRECISION), (nf * nloc, 1)
+                ),
+                frame_id,
+                nf,
+            )
+            return {"atom_energy": atom_energy, "energy": energy}
+
         call = call_common
         call_lower = call_common_lower
 
diff --git a/source/tests/common/dpmodel/test_call_lower_graph.py b/source/tests/common/dpmodel/test_call_lower_graph.py
new file mode 100644
index 0000000000..40839468c4
--- /dev/null
+++ b/source/tests/common/dpmodel/test_call_lower_graph.py
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Energy-level parity between the graph-native model lower
+(``CM.call_lower_graph``) and the dense ``EnergyModel.call_lower`` on the SAME
+neighbor list (regime-1: ``from_dense_quartet`` reproduces the nlist neighbors).
+
+PR-A is dpa1(attn_layer=0) energy-only; force/virial come from pt_expt autograd
+in a later task, so this only checks ``energy`` (reduced per-frame) and
+``atom_energy`` (per-atom).
+"""
+
+import unittest
+
+import numpy as np
+
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.fitting import (
+    InvarFitting,
+)
+from deepmd.dpmodel.model.ener_model import (
+    EnergyModel,
+)
+from deepmd.dpmodel.utils.neighbor_graph import (
+    from_dense_quartet,
+)
+from deepmd.dpmodel.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+)
+
+
+class TestCallLowerGraph(unittest.TestCase):
+    def _make_model(self):
+        ds = DescrptDPA1(
+            rcut=4.0,
+            rcut_smth=0.5,
+            sel=[30],
+            ntypes=2,
+            attn_layer=0,
+            axis_neuron=2,
+            neuron=[6, 12],
+        )
+        ft = InvarFitting(
+            "energy",
+            2,
+            ds.get_dim_out(),
+            1,
+            mixed_types=ds.mixed_types(),
+        )
+        return EnergyModel(ds, ft, type_map=["foo", "bar"])
+
+    def setUp(self) -> None:
+        rng = np.random.default_rng(2)
+        self.nloc = 4
+        self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
+        self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+
+    def test_graph_lower_matches_dense_lower(self) -> None:
+        model = self._make_model()
+        (
+            ext_coord,
+            ext_atype,
+            mapping,
+            nlist,
+        ) = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            model.get_rcut(),
+            model.get_sel(),
+            mixed_types=model.mixed_types(),
+            box=None,
+        )
+
+        dense = model.call_lower(ext_coord, ext_atype, nlist, mapping)
+
+        ng = from_dense_quartet(ext_coord, nlist, mapping)
+        nloc = nlist.shape[1]
+        out = model.call_lower_graph(
+            atype=ext_atype.reshape(-1)[:nloc],
+            n_node=ng.n_node,
+            edge_index=ng.edge_index,
+            edge_vec=ng.edge_vec,
+            edge_mask=ng.edge_mask,
+        )
+
+        np.testing.assert_allclose(
+            out["energy"], dense["energy"], rtol=1e-12, atol=1e-12
+        )
+        np.testing.assert_allclose(
+            out["atom_energy"].reshape(dense["atom_energy"].shape),
+            dense["atom_energy"],
+            rtol=1e-12,
+            atol=1e-12,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 0a00978aa42c9b3458bffa4aaae1d8f2d56003ae Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 10:59:25 +0800
Subject: [PATCH 06/69] test(dpmodel): remove PR-A import smoke test (no smoke
 tests in repo)

---
 .../common/dpmodel/test_neighbor_graph_lower_smoke.py    | 9 ---------
 1 file changed, 9 deletions(-)
 delete mode 100644 source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py

diff --git a/source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py b/source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py
deleted file mode 100644
index e1d860192c..0000000000
--- a/source/tests/common/dpmodel/test_neighbor_graph_lower_smoke.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: LGPL-3.0-or-later
-import unittest
-
-
-class TestPrASymbolsExist(unittest.TestCase):
-    def test_edge_env_mat_importable(self) -> None:
-        from deepmd.dpmodel.utils.neighbor_graph import edge_env_mat
-
-        self.assertTrue(callable(edge_env_mat))

From 61668ef8722c2934c3a8d12e2af5b847e5176777 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 11:17:03 +0800
Subject: [PATCH 07/69] refactor(dpmodel): single public
 DescrptDPA1.call_graph; private block kernel; sw to dense adapter

---
 deepmd/dpmodel/descriptor/dpa1.py             |  72 ++++++-----
 deepmd/dpmodel/model/make_model.py            |   3 +-
 .../common/dpmodel/test_call_lower_graph.py   |   1 +
 .../dpmodel/test_dpa1_call_graph_block.py     | 115 +++++++++---------
 .../test_dpa1_call_graph_descriptor.py        |  73 ++++++-----
 5 files changed, 127 insertions(+), 137 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 24806b1b05..71587d3900 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -564,14 +564,28 @@ def call(
             graph = from_dense_quartet(coord_ext_3, nlist, mapping_g)
             # local atom types, flat (nf * nloc,)
             atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
-            return self.call_graph(
+            grrg, rot_mat = self.call_graph(
                 graph,
                 atype_local,
                 type_embedding=self.type_embedding.call(),
-                nlist=nlist,
-                coord_ext=coord_ext,
-                atype_ext=atype_ext,
             )
+            # reconstruct the dense-shaped sw exactly the dense way (env_mat
+            # switch masked where nlist == -1; the graph path forbids
+            # exclude_types, so nlist_mask == nlist != -1, matching
+            # DescrptBlockSeAtten.call). This is a dense-layout artifact tied to
+            # neighbor slots, which the graph does not carry, so it lives here in
+            # the dense adapter (which has nlist/coord_ext available).
+            _, _, sw = self.se_atten.env_mat.call(
+                coord_ext,
+                atype_ext,
+                nlist,
+                self.se_atten.mean[...],
+                self.se_atten.stddev[...],
+            )
+            nlist_mask = (nlist != -1)[:, :, :, None]
+            sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
+            sw = xp.reshape(sw, (nf, nloc, nnei, 1))
+            return grrg, rot_mat, None, None, sw
         del mapping
         type_embedding = self.type_embedding.call()
         # nf x nall x tebd_dim
@@ -601,15 +615,17 @@ def call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
-        nlist: Array | None = None,
-        coord_ext: Array | None = None,
-        atype_ext: Array | None = None,
-    ) -> tuple[Array, Array, None, None, Array | None]:
+    ) -> tuple[Array, Array]:
         """Descriptor-level graph-native forward (``attn_layer == 0``).
 
-        Wraps the block :meth:`DescrptBlockSeAtten.call_graph`, adds the
-        descriptor-level ``concat_output_tebd`` step, and reshapes the per-node
-        outputs back to the dense ABI shapes ``(nf, nloc, ...)``.
+        Wraps the private block kernel
+        :meth:`DescrptBlockSeAtten._call_graph`, adds the descriptor-level
+        ``concat_output_tebd`` step, and reshapes the per-node outputs back to
+        the dense ABI shapes ``(nf, nloc, ...)``.
+
+        This method is graph-native: it takes no dense quartet inputs and does
+        not produce the dense ``sw`` (that lives in the dense :meth:`call`
+        adapter, which has the ``nlist``/``coord_ext`` needed to build it).
 
         Parameters
         ----------
@@ -619,20 +635,17 @@ def call_graph(
             (nf * nloc,) flat LOCAL atom types.
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
-        nlist, coord_ext, atype_ext
-            Original dense quartet inputs, used ONLY to reconstruct the dense
-            ``sw`` (nf, nloc, nnei, 1) exactly (a dense-layout artifact tied to
-            neighbor slots, which the graph does not carry). When ``nlist`` is
-            ``None`` the returned ``sw`` is ``None``.
 
         Returns
         -------
-        tuple
-            ``(grrg, rot_mat, None, None, sw)`` matching :meth:`call`.
+        grrg : Array
+            (nf, nloc, ng * axis_neuron [+ tebd_dim]) descriptor.
+        rot_mat : Array
+            (nf, nloc, ng, 3) equivariant single-particle representation.
         """
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
-        grrg_node, rot_mat_node = self.se_atten.call_graph(
+        grrg_node, rot_mat_node = self.se_atten._call_graph(
             graph, atype, type_embedding=type_embedding
         )
         nf = graph.n_node.shape[0]
@@ -648,24 +661,7 @@ def call_graph(
             atype_embd = xp.take(tebd, atype_local, axis=0)  # (nf*nloc, tebd_dim)
             atype_embd = xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))
             grrg = xp.concat([grrg, atype_embd], axis=-1)
-        # reconstruct the dense-shaped sw exactly the dense way (env_mat switch
-        # masked where nlist == -1; the graph path forbids exclude_types, so
-        # nlist_mask == nlist != -1, matching DescrptBlockSeAtten.call).
-        sw: Array | None = None
-        if nlist is not None:
-            nf_, nloc_, nnei = nlist.shape
-            # env_mat returns sw with shape (nf, nloc, nnei, 1)
-            _, _, sw = self.se_atten.env_mat.call(
-                coord_ext,
-                atype_ext,
-                nlist,
-                self.se_atten.mean[...],
-                self.se_atten.stddev[...],
-            )
-            nlist_mask = (nlist != -1)[:, :, :, None]
-            sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
-            sw = xp.reshape(sw, (nf_, nloc_, nnei, 1))
-        return grrg, rot_mat, None, None, sw
+        return grrg, rot_mat
 
     def serialize(self) -> dict:
         """Serialize the descriptor to dict."""
@@ -1340,7 +1336,7 @@ def call(
             xp.reshape(sw, (nf, nloc, nnei, 1)),
         )
 
-    def call_graph(
+    def _call_graph(
         self,
         graph: Any,
         atype: Array,
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 74c0c91878..c5b077d8d3 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -489,9 +489,10 @@ def call_lower_graph(
             fitting_net = self.atomic_model.fitting_net
             # dpa1 call_graph requires the type-embedding table explicitly
             type_embedding = descriptor.type_embedding.call()
-            gg, rot_mat, g2, h2, _ = descriptor.call_graph(
+            gg, rot_mat = descriptor.call_graph(
                 graph, atype, type_embedding=type_embedding
             )
+            g2 = h2 = None
             # the fitting expects atype shaped (nf, nloc)
             atype_2d = xp.reshape(xp.asarray(atype, device=dev), (nf, nloc))
             fit_ret = fitting_net(
diff --git a/source/tests/common/dpmodel/test_call_lower_graph.py b/source/tests/common/dpmodel/test_call_lower_graph.py
index 40839468c4..6ea4e83544 100644
--- a/source/tests/common/dpmodel/test_call_lower_graph.py
+++ b/source/tests/common/dpmodel/test_call_lower_graph.py
@@ -56,6 +56,7 @@ def setUp(self) -> None:
         self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
 
     def test_graph_lower_matches_dense_lower(self) -> None:
+        """Graph model lower energy/atom_energy match the dense lower on the same nlist."""
         model = self._make_model()
         (
             ext_coord,
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index 58014d70f9..1da0f5fd1a 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Bit-exact parity between the graph-native ``DescrptBlockSeAtten.call_graph``
+"""Bit-exact parity between the graph-native ``DescrptBlockSeAtten._call_graph``
 (attn_layer=0) and the legacy dense ``DescrptBlockSeAtten.call`` on the SAME
 neighbor list, for binding AND non-binding ``sel``.
 """
 
-import unittest
-
 import numpy as np
+import pytest
 
 from deepmd.dpmodel.descriptor.dpa1 import (
     DescrptDPA1,
@@ -19,7 +18,7 @@
 )
 
 
-class TestDpa1BlockCallGraph(unittest.TestCase):
+class TestDpa1BlockCallGraph:
     def _make(self, sel):
         return DescrptDPA1(
             rcut=4.0,
@@ -31,65 +30,67 @@ def _make(self, sel):
             neuron=[6, 12],
         )
 
-    def setUp(self) -> None:
+    def setup_method(self) -> None:
         rng = np.random.default_rng(1)
         self.nloc = 4
         self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
         self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
 
-    def test_block_graph_equals_dense_any_sel(self) -> None:
-        for sel in ([20], [3]):  # non-binding AND binding
-            with self.subTest(sel=sel):
-                dd = self._make(sel)
-                blk = dd.se_atten
-                # build the dense nlist exactly as the descriptor would
-                (
-                    ext_coord,
-                    ext_atype,
-                    mapping,
-                    nlist,
-                ) = extend_input_and_build_neighbor_list(
-                    self.coord,
-                    self.atype,
-                    dd.get_rcut(),
-                    dd.get_sel(),
-                    mixed_types=dd.mixed_types(),
-                    box=None,
-                )
-                # type embedding as both paths use it
-                tebd = dd.type_embedding.call()
-                nf, nall = ext_atype.shape
-                atype_embd_ext = np.reshape(
-                    np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
-                    (nf, nall, dd.tebd_dim),
-                )
-                dense_g, *_ = blk.call(
-                    nlist,
-                    ext_coord,
-                    ext_atype,
-                    atype_embd_ext=atype_embd_ext,
-                    mapping=None,
-                    type_embedding=tebd,
-                )
-                ng = from_dense_quartet(ext_coord, nlist, mapping)
-                graph_g, _rot_mat = blk.call_graph(
-                    ng,
-                    np.reshape(ext_atype, (-1,)),
-                    type_embedding=tebd,
-                )
-                np.testing.assert_allclose(
-                    graph_g.reshape(dense_g.shape),
-                    dense_g,
-                    rtol=1e-12,
-                    atol=1e-12,
-                )
+    @pytest.mark.parametrize("sel", [[20], [3]])  # non-binding AND binding
+    def test_block_graph_equals_dense_any_sel(self, sel) -> None:
+        """Graph block output is bit-exact with the dense block on the same nlist."""
+        dd = self._make(sel)
+        blk = dd.se_atten
+        # build the dense nlist exactly as the descriptor would
+        (
+            ext_coord,
+            ext_atype,
+            mapping,
+            nlist,
+        ) = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+        # type embedding as both paths use it
+        tebd = dd.type_embedding.call()
+        nf, nall = ext_atype.shape
+        atype_embd_ext = np.reshape(
+            np.take(tebd, np.reshape(ext_atype, (-1,)), axis=0),
+            (nf, nall, dd.tebd_dim),
+        )
+        dense_g, *_ = blk.call(
+            nlist,
+            ext_coord,
+            ext_atype,
+            atype_embd_ext=atype_embd_ext,
+            mapping=None,
+            type_embedding=tebd,
+        )
+        ng = from_dense_quartet(ext_coord, nlist, mapping)
+        graph_g, _rot_mat = blk._call_graph(
+            ng,
+            np.reshape(ext_atype, (-1,)),
+            type_embedding=tebd,
+        )
+        np.testing.assert_allclose(
+            graph_g.reshape(dense_g.shape),
+            dense_g,
+            rtol=1e-12,
+            atol=1e-12,
+        )
 
     def test_attn_layer_gt0_raises(self) -> None:
+        """The graph block kernel fails fast for attn_layer > 0 (unsupported)."""
         dd = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[20], ntypes=2, attn_layer=2)
-        with self.assertRaises(NotImplementedError):
-            dd.se_atten.call_graph(None, np.array([0], dtype=np.int64))
+        with pytest.raises(NotImplementedError):
+            dd.se_atten._call_graph(None, np.array([0], dtype=np.int64))
 
     def test_exclude_types_raises(self) -> None:
+        """The graph block kernel fails fast for exclude_types (not yet applied)."""
         # the graph path does not yet apply type exclusion; it must fail-fast
         # rather than silently diverge from the dense path (which masks edges).
         dd = DescrptDPA1(
@@ -105,11 +106,7 @@ def test_exclude_types_raises(self) -> None:
             -np.ones((1, self.nloc, 1), dtype=np.int64),  # any graph; guard fires first
             np.arange(self.nloc, dtype=np.int64)[None],
         )
-        with self.assertRaises(NotImplementedError):
-            dd.se_atten.call_graph(
+        with pytest.raises(NotImplementedError):
+            dd.se_atten._call_graph(
                 ng, self.atype.reshape(-1), type_embedding=dd.type_embedding.call()
             )
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index 7f3561888a..aaf0a3e040 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -10,9 +10,8 @@
 the graph for ``attn_layer == 0``.
 """
 
-import unittest
-
 import numpy as np
+import pytest
 
 from deepmd.dpmodel.descriptor.dpa1 import (
     DescrptDPA1,
@@ -22,7 +21,7 @@
 )
 
 
-class TestDpa1DescriptorCallGraph(unittest.TestCase):
+class TestDpa1DescriptorCallGraph:
     def _make(self, sel):
         return DescrptDPA1(
             rcut=4.0,
@@ -34,7 +33,7 @@ def _make(self, sel):
             neuron=[6, 12],
         )
 
-    def setUp(self) -> None:
+    def setup_method(self) -> None:
         rng = np.random.default_rng(2)
         self.nloc = 4
         self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
@@ -65,38 +64,34 @@ def _dense_reference(self, dd, ext_coord, ext_atype, nlist):
             )
         return grrg, rot_mat, None, None, sw
 
-    def test_descriptor_graph_equals_dense_full_tuple(self) -> None:
-        for sel in ([30], [4]):  # non-binding AND binding
-            with self.subTest(sel=sel):
-                dd = self._make(sel)
-                (
-                    ext_coord,
-                    ext_atype,
-                    mapping,
-                    nlist,
-                ) = extend_input_and_build_neighbor_list(
-                    self.coord,
-                    self.atype,
-                    dd.get_rcut(),
-                    dd.get_sel(),
-                    mixed_types=dd.mixed_types(),
-                    box=None,
-                )
-                # dense reference captured via the block (pre-swap behaviour)
-                ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
-                # the swapped public ABI: routes through the graph
-                out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)
-                self.assertEqual(len(out), 5)
-                # grrg
-                np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
-                # rot_mat
-                np.testing.assert_allclose(out[1], ref[1], rtol=1e-12, atol=1e-12)
-                # positions [2], [3] are always None for this descriptor
-                self.assertIsNone(out[2])
-                self.assertIsNone(out[3])
-                # sw
-                np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)
-
-
-if __name__ == "__main__":
-    unittest.main()
+    @pytest.mark.parametrize("sel", [[30], [4]])  # non-binding AND binding
+    def test_descriptor_graph_equals_dense_full_tuple(self, sel) -> None:
+        """Graph-routed dd.call() returns the identical dense 5-tuple ABI."""
+        dd = self._make(sel)
+        (
+            ext_coord,
+            ext_atype,
+            mapping,
+            nlist,
+        ) = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+        # dense reference captured via the block (pre-swap behaviour)
+        ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
+        # the swapped public ABI: routes through the graph
+        out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)
+        assert len(out) == 5
+        # grrg
+        np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
+        # rot_mat
+        np.testing.assert_allclose(out[1], ref[1], rtol=1e-12, atol=1e-12)
+        # positions [2], [3] are always None for this descriptor
+        assert out[2] is None
+        assert out[3] is None
+        # sw
+        np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)

From c22bc13c0b4feba584c5d310686d9dc0286e90b8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 11:31:59 +0800
Subject: [PATCH 08/69] feat(dpmodel): neighbor_graph_from_ijs + ASE carry-all
 builder (optional dep)

---
 deepmd/dpmodel/utils/__init__.py              |   4 +
 .../dpmodel/utils/neighbor_graph/__init__.py  |   8 ++
 .../utils/neighbor_graph/ase_builder.py       | 116 ++++++++++++++++++
 .../dpmodel/utils/neighbor_graph/from_ijs.py  | 115 +++++++++++++++++
 source/tests/common/dpmodel/test_from_ijs.py  |  88 +++++++++++++
 5 files changed, 331 insertions(+)
 create mode 100644 deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
 create mode 100644 deepmd/dpmodel/utils/neighbor_graph/from_ijs.py
 create mode 100644 source/tests/common/dpmodel/test_from_ijs.py

diff --git a/deepmd/dpmodel/utils/__init__.py b/deepmd/dpmodel/utils/__init__.py
index eb1f8fb855..66dd8fe3c1 100644
--- a/deepmd/dpmodel/utils/__init__.py
+++ b/deepmd/dpmodel/utils/__init__.py
@@ -22,9 +22,11 @@
     GraphLayout,
     NeighborGraph,
     build_neighbor_graph,
+    build_neighbor_graph_ase,
     edge_env_mat,
     edge_force_virial,
     from_dense_quartet,
+    neighbor_graph_from_ijs,
     node_validity_mask,
     pad_and_guard_edges,
     segment_mean,
@@ -90,6 +92,7 @@
     "aggregate",
     "build_multiple_neighbor_list",
     "build_neighbor_graph",
+    "build_neighbor_graph_ase",
     "build_neighbor_list",
     "compute_total_numb_batch",
     "edge_env_mat",
@@ -105,6 +108,7 @@
     "make_fitting_network",
     "make_multilayer_network",
     "make_neighbor_stat_data",
+    "neighbor_graph_from_ijs",
     "nlist_distinguish_types",
     "node_validity_mask",
     "normalize_coord",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index 58f4d61b8d..27e49a4e18 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -9,6 +9,9 @@
 See memory/spec_unified_edge_nlist.md.
 """
 
+from .ase_builder import (
+    build_neighbor_graph_ase,
+)
 from .builder import (
     build_neighbor_graph,
     from_dense_quartet,
@@ -19,6 +22,9 @@
 from .env import (
     edge_env_mat,
 )
+from .from_ijs import (
+    neighbor_graph_from_ijs,
+)
 from .graph import (
     GraphLayout,
     NeighborGraph,
@@ -34,9 +40,11 @@
     "GraphLayout",
     "NeighborGraph",
     "build_neighbor_graph",
+    "build_neighbor_graph_ase",
     "edge_env_mat",
     "edge_force_virial",
     "from_dense_quartet",
+    "neighbor_graph_from_ijs",
     "node_validity_mask",
     "pad_and_guard_edges",
     "segment_mean",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
new file mode 100644
index 0000000000..eaa01359ee
--- /dev/null
+++ b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Carry-all NeighborGraph builder backed by ASE's O(N) cell list (optional dep).
+
+``build_neighbor_graph_ase`` is a carry-all search backend: it uses ASE's
+``neighbor_list("ijS", ...)`` to enumerate EVERY neighbor within ``rcut`` (no
+``sel`` cutoff), then routes the resulting sparse ``(i, j, S)`` edge list through
+:func:`neighbor_graph_from_ijs` so ``edge_vec`` is recomputed differentiably from
+``coord``/``box`` -- ASE's own distance vectors are intentionally NOT used, to
+keep the geometry convention and autograd leaf consistent with every other
+builder. ASE is an OPTIONAL dependency, imported lazily inside the function.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    TYPE_CHECKING,
+)
+
+import numpy as np
+
+from .from_ijs import (
+    neighbor_graph_from_ijs,
+)
+
+if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+    from .graph import (
+        GraphLayout,
+        NeighborGraph,
+    )
+
+
+def build_neighbor_graph_ase(
+    coord: Array,
+    atype: Array,
+    box: Array | None,
+    rcut: float,
+    layout: GraphLayout | None = None,
+) -> NeighborGraph:
+    """Build a CARRY-ALL NeighborGraph using ASE's O(N) cell-list search.
+
+    Per frame, ASE ``neighbor_list("ijS", atoms, rcut)`` returns center ``i``,
+    neighbor ``j`` and periodic shift ``S`` such that the neighbor image sits at
+    ``positions[j] + S @ cell``. These map directly to the graph convention
+    (src=neighbor=j, dst=center=i), and the edge list is fed to
+    :func:`neighbor_graph_from_ijs` which recomputes ``edge_vec`` from
+    ``coord``/``box`` (ASE's distance vectors are discarded for convention +
+    differentiability consistency).
+
+    Parameters
+    ----------
+    coord
+        (nf, nloc, 3) local coordinates; flattened (nf, nloc * 3) is accepted too.
+    atype
+        (nf, nloc) local atom types (unused for the search; carried for API parity).
+    box
+        (nf, 3, 3) or (nf, 9) simulation cell, or ``None`` for non-periodic.
+    rcut
+        cutoff radius.
+    layout
+        edge-axis length policy; ``None`` => dynamic (torch) with ``min_edges`` guards.
+
+    Raises
+    ------
+    ImportError
+        if the optional ``ase`` package is not installed.
+    """
+    try:
+        from ase import (
+            Atoms,
+        )
+        from ase.neighborlist import (
+            neighbor_list,
+        )
+    except ImportError as e:
+        raise ImportError(
+            "build_neighbor_graph_ase requires the optional 'ase' package; "
+            "install ase or use neighbor-graph method 'dense'."
+        ) from e
+
+    coord_np = np.asarray(coord)
+    nf, nloc = coord_np.shape[0], int(np.prod(coord_np.shape[1:])) // 3
+    coord_np = coord_np.reshape(nf, nloc, 3)
+    box_np = np.asarray(box).reshape(nf, 3, 3) if box is not None else None
+    periodic = box is not None
+
+    i_parts = []
+    j_parts = []
+    S_parts = []
+    nframe_parts = []
+    for f in range(nf):
+        atoms = Atoms(
+            positions=coord_np[f],
+            cell=(box_np[f] if periodic else None),
+            pbc=periodic,
+        )
+        ii, jj, SS = neighbor_list("ijS", atoms, rcut)
+        i_parts.append(np.asarray(ii, dtype=np.int64))
+        j_parts.append(np.asarray(jj, dtype=np.int64))
+        S_parts.append(np.asarray(SS, dtype=np.int64).reshape(-1, 3))
+        nframe_parts.append(np.full((len(ii),), f, dtype=np.int64))
+
+    i_all = np.concatenate(i_parts) if i_parts else np.zeros((0,), dtype=np.int64)
+    j_all = np.concatenate(j_parts) if j_parts else np.zeros((0,), dtype=np.int64)
+    S_all = np.concatenate(S_parts) if S_parts else np.zeros((0, 3), dtype=np.int64)
+    nframe_all = (
+        np.concatenate(nframe_parts) if nframe_parts else np.zeros((0,), dtype=np.int64)
+    )
+
+    return neighbor_graph_from_ijs(
+        i_all, j_all, S_all, coord, box, nframe_all, nloc, layout=layout
+    )
diff --git a/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py b/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py
new file mode 100644
index 0000000000..8aa533c61e
--- /dev/null
+++ b/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py
@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Sparse ``(i, j, S)`` edge-list converter to :class:`NeighborGraph`.
+
+``neighbor_graph_from_ijs`` is the canonical sparse converter: it takes an
+already-built sparse edge list -- per-edge center ``i``, neighbor ``j`` (both
+per-frame LOCAL indices in ``[0, nloc)``) and integer periodic-image shift ``S``
+-- and emits a :class:`NeighborGraph` whose ``edge_vec`` is recomputed
+DIFFERENTIABLY from ``coord``/``box`` (it never trusts the builder's distance
+vectors). It is the format-conversion step shared by every O(N) search backend
+(ASE/vesin/LAMMPS): a backend searches, then hands its ``(i, j, S)`` here.
+
+Convention (matching :mod:`...graph`): ``edge_index = [src, dst]`` with
+``src = j`` (neighbor's local owner), ``dst = i`` (center), and
+``edge_vec = r_j + S @ box - r_i`` (neighbor image minus center).
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    TYPE_CHECKING,
+)
+
+import array_api_compat
+
+from .graph import (
+    GraphLayout,
+    NeighborGraph,
+    pad_and_guard_edges,
+)
+
+if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
+
+def neighbor_graph_from_ijs(
+    i: Array,
+    j: Array,
+    S: Array,
+    coord: Array,
+    box: Array | None,
+    nframe_id: Array,
+    nloc: int,
+    layout: GraphLayout | None = None,
+) -> NeighborGraph:
+    """Convert a sparse ``(i, j, S)`` edge list into a :class:`NeighborGraph`.
+
+    ``edge_vec`` is recomputed from ``coord``/``box`` (NOT from any distance vector
+    the search backend may carry), so it is a differentiable function of the input
+    coordinates and follows the graph convention exactly.
+
+    Parameters
+    ----------
+    i
+        (E,) int per-edge center, per-frame LOCAL index in ``[0, nloc)``.
+    j
+        (E,) int per-edge neighbor, per-frame LOCAL index in ``[0, nloc)``.
+    S
+        (E, 3) int periodic-image shift: the neighbor sits at ``coord[j] + S @ box``.
+    coord
+        (nf, nloc, 3) local coordinates (reshaped here, so (nf, nloc * 3) also works).
+    box
+        (nf, 3, 3) simulation cell, or ``None`` for non-periodic (``S`` ignored).
+    nframe_id
+        (E,) int frame index of each edge.
+    nloc
+        number of local atoms per frame (used for the frame-major node offset).
+    layout
+        edge-axis policy (``edge_capacity``/``min_edges``); ``None`` => ``GraphLayout()``.
+
+    Returns
+    -------
+    NeighborGraph
+        ``edge_index = [j + nframe_id*nloc, i + nframe_id*nloc]`` (src=neighbor,
+        dst=center); ``edge_vec = coord[j] + S@box - coord[i]``; ``n_node`` is
+        ``nloc`` per frame.
+    """
+    if layout is None:
+        layout = GraphLayout()
+    xp = array_api_compat.array_namespace(coord)
+    dev = array_api_compat.device(coord)
+    nf = coord.shape[0]
+    coord = xp.reshape(coord, (nf, nloc, 3))
+    i = xp.astype(xp.asarray(i, device=dev), xp.int64)
+    j = xp.astype(xp.asarray(j, device=dev), xp.int64)
+    nframe_id = xp.astype(xp.asarray(nframe_id, device=dev), xp.int64)
+    # flat frame-major node indices: local index + frame offset (nframe_id * nloc)
+    i_flat = i + nframe_id * nloc
+    j_flat = j + nframe_id * nloc
+    coord_flat = xp.reshape(coord, (nf * nloc, 3))
+    r_i = xp.take(coord_flat, i_flat, axis=0)
+    r_j = xp.take(coord_flat, j_flat, axis=0)
+    edge_vec = r_j - r_i  # neighbor minus center, before any periodic shift
+    if box is not None:
+        box = xp.reshape(box, (nf, 3, 3))
+        box_per_edge = xp.take(box, nframe_id, axis=0)  # (E, 3, 3)
+        S = xp.astype(xp.asarray(S, device=dev), box.dtype)
+        # S @ box per edge via broadcast sum (NEVER np.einsum, which breaks on torch):
+        # shift[e, b] = sum_a S[e, a] * box[e, a, b]
+        shift = xp.sum(S[:, :, None] * box_per_edge, axis=1)  # (E, 3)
+        edge_vec = edge_vec + shift
+    edge_index = xp.stack([j_flat, i_flat], axis=0)  # row 0 = src (j), row 1 = dst (i)
+    edge_index, edge_vec, edge_mask = pad_and_guard_edges(
+        edge_index, edge_vec, layout.edge_capacity, layout.min_edges
+    )
+    n_node = xp.full((nf,), nloc, dtype=xp.int64, device=dev)
+    return NeighborGraph(
+        n_node=n_node,
+        edge_index=edge_index,
+        edge_vec=edge_vec,
+        edge_mask=edge_mask,
+    )
diff --git a/source/tests/common/dpmodel/test_from_ijs.py b/source/tests/common/dpmodel/test_from_ijs.py
new file mode 100644
index 0000000000..f4c965372b
--- /dev/null
+++ b/source/tests/common/dpmodel/test_from_ijs.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.utils.neighbor_graph import neighbor_graph_from_ijs
+
+
+class TestFromIjs(unittest.TestCase):
+    def test_edge_vec_and_index(self) -> None:
+        """src=j, dst=i, edge_vec = coord[j] + S@box - coord[i] (single frame, S=0)."""
+        coord = np.array([[[0.0, 0, 0], [1.0, 0, 0], [0, 2.0, 0]]])  # (1,3,3)
+        box = np.eye(3)[None] * 6.0
+        i = np.array([0, 1])  # center
+        j = np.array([1, 0])  # neighbor
+        S = np.array([[0, 0, 0], [0, 0, 0]], dtype=np.int64)
+        ng = neighbor_graph_from_ijs(
+            i, j, S, coord, box, nframe_id=np.zeros(2, np.int64), nloc=3
+        )
+        np.testing.assert_array_equal(ng.edge_index[0][ng.edge_mask], j)  # src
+        np.testing.assert_array_equal(ng.edge_index[1][ng.edge_mask], i)  # dst
+        # every real edge (not only the first) must equal coord[j] - coord[i]
+        expected = coord[0, j] - coord[0, i]
+        np.testing.assert_allclose(ng.edge_vec[ng.edge_mask], expected)
+
+    def test_periodic_shift_in_edge_vec(self) -> None:
+        """A nonzero S contributes S@box to edge_vec (image neighbor)."""
+        coord = np.array([[[0.5, 0, 0], [5.5, 0, 0]]])  # (1,2,3)
+        box = np.eye(3)[None] * 6.0
+        i = np.array([0])
+        j = np.array([1])
+        S = np.array([[-1, 0, 0]], dtype=np.int64)
+        ng = neighbor_graph_from_ijs(
+            i, j, S, coord, box, nframe_id=np.zeros(1, np.int64), nloc=2
+        )
+        # coord[1] + (-1,0,0)@box - coord[0] = 5.5 - 6 - 0.5 = -1.0
+        np.testing.assert_allclose(
+            ng.edge_vec[ng.edge_mask][0], np.array([-1.0, 0.0, 0.0])
+        )
+
+
+class TestAseCarryAll(unittest.TestCase):
+    def _sets(self, ng, nloc: int) -> list[set]:
+        """Per-center (dst) sets of (src, rounded edge_vec); real edges only."""
+        ei = ng.edge_index[:, ng.edge_mask]
+        ev = ng.edge_vec[ng.edge_mask]
+        s = [set() for _ in range(nloc)]
+        for k in range(ei.shape[1]):
+            s[int(ei[1, k])].add((int(ei[0, k]), tuple(np.round(ev[k], 6))))
+        return s
+
+    def test_ase_matches_intree_carry_all(self) -> None:
+        """ASE carry-all builder yields the SAME neighbor set as the in-tree
+        carry-all build_neighbor_graph (both carry ALL neighbors in rcut).
+        """
+        pytest.importorskip("ase")
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph,
+            build_neighbor_graph_ase,
+        )
+
+        rng = np.random.default_rng(3)
+        coord = rng.normal(size=(1, 8, 3)) * 2.0
+        atype = np.array([[0, 1] * 4], dtype=np.int64)
+        box = np.eye(3)[None] * 8.0
+        ng_ase = build_neighbor_graph_ase(coord, atype, box, rcut=4.0)
+        ng_ref = build_neighbor_graph(coord, atype, box, rcut=4.0)
+        self.assertEqual(self._sets(ng_ase, 8), self._sets(ng_ref, 8))
+
+    def test_ase_matches_intree_carry_all_nonperiodic(self) -> None:
+        """Non-periodic (box=None): ASE carry-all == in-tree carry-all."""
+        pytest.importorskip("ase")
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph,
+            build_neighbor_graph_ase,
+        )
+
+        rng = np.random.default_rng(7)
+        coord = rng.normal(size=(1, 6, 3)) * 2.0
+        atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+        ng_ase = build_neighbor_graph_ase(coord, atype, None, rcut=4.0)
+        ng_ref = build_neighbor_graph(coord, atype, None, rcut=4.0)
+        self.assertEqual(self._sets(ng_ase, 6), self._sets(ng_ref, 6))
+
+
+if __name__ == "__main__":
+    unittest.main()

From 178c17491b3e68fe834ce34abd16227849681987 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 11:41:21 +0800
Subject: [PATCH 09/69] feat(dpmodel): opt-in carry-all graph energy forward
 via neighbor_graph_method (Option B)

---
 deepmd/dpmodel/descriptor/dpa1.py             |   8 ++
 deepmd/dpmodel/model/make_model.py            |  81 ++++++++++++++
 .../dpmodel/test_dpa1_graph_model_energy.py   | 100 ++++++++++++++++++
 3 files changed, 189 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_dpa1_graph_model_energy.py

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 71587d3900..d79bf778b9 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -418,6 +418,14 @@ def get_numb_attn_layer(self) -> int:
         """Returns the number of se_atten attention layers."""
         return self.se_atten.attn_layer
 
+    def uses_graph_lower(self) -> bool:
+        """Returns whether this descriptor supports the graph-native lower.
+
+        ``True`` only when ``se_atten.attn_layer == 0``: the graph-native energy
+        lower (``call_graph``) currently covers the non-attention path only.
+        """
+        return self.se_atten.attn_layer == 0
+
     def share_params(
         self, base_class: "DescrptDPA1", shared_level: int, resume: bool = False
     ) -> NoReturn:
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index c5b077d8d3..066fce37f4 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -44,6 +44,8 @@
 )
 from deepmd.dpmodel.utils.neighbor_graph import (
     NeighborGraph,
+    build_neighbor_graph,
+    build_neighbor_graph_ase,
     segment_sum,
 )
 from deepmd.utils.path import (
@@ -263,6 +265,7 @@ def call_common(
             coord_corr_for_virial: Array | None = None,
             charge_spin: Array | None = None,
             neighbor_list: NeighborList | None = None,
+            neighbor_graph_method: str | None = None,
         ) -> dict[str, Array]:
             """Return model prediction.
 
@@ -289,6 +292,17 @@ def call_common(
                 default all-pairs builder; an alternative strategy (e.g. an O(N)
                 cell list) may be injected to speed up neighbor-list construction
                 without changing model outputs.
+            neighbor_graph_method
+                Opt-in CARRY-ALL graph energy forward (Option B). ``None``
+                (default) keeps the existing dense nlist path UNCHANGED. When
+                set to ``"dense"`` (in-tree all-pairs search) or ``"ase"``
+                (O(N) ASE cell list), the model builds a carry-all
+                :class:`NeighborGraph` and routes the ENERGY forward through
+                :meth:`call_lower_graph`. Requires a ``mixed_types`` descriptor
+                with a graph lower (dpa1 ``attn_layer == 0``). At non-binding
+                ``sel`` this matches the dense path exactly; at binding ``sel``
+                the carry-all graph keeps neighbors the dense path truncates, so
+                energy intentionally differs.
 
             Returns
             -------
@@ -301,6 +315,17 @@ def call_common(
                 coord, box=box, fparam=fparam, aparam=aparam, charge_spin=charge_spin
             )
             del coord, box, fparam, aparam, charge_spin
+            if neighbor_graph_method is not None:
+                model_predict = self._call_common_graph(
+                    cc,
+                    atype,
+                    bb,
+                    fp,
+                    ap,
+                    neighbor_graph_method,
+                )
+                model_predict = self._output_type_cast(model_predict, input_prec)
+                return model_predict
             model_predict = model_call_from_call_lower(
                 call_lower=self.call_common_lower,
                 rcut=self.get_rcut(),
@@ -320,6 +345,62 @@ def call_common(
             model_predict = self._output_type_cast(model_predict, input_prec)
             return model_predict
 
+        def _call_common_graph(
+            self,
+            cc: Array,
+            atype: Array,
+            bb: Array | None,
+            fp: Array | None,
+            ap: Array | None,
+            method: str,
+        ) -> dict[str, Array]:
+            """Carry-all graph energy forward (opt-in, Option B).
+
+            Builds a carry-all :class:`NeighborGraph` from ``cc``/``atype``/``bb`` and
+            routes the ENERGY forward through :meth:`call_lower_graph`; returns keys
+            ``energy`` (nf x nloc x 1), ``energy_redu`` (per-frame) and ``mask``.
+            Raises NotImplementedError (unsupported descriptor) or ValueError (unknown
+            ``method``). Input and output type-casting are done by the caller.
+            """
+            descriptor = self.atomic_model.descriptor
+            uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
+            if not (self.mixed_types() and uses_graph_lower()):
+                raise NotImplementedError(
+                    "neighbor_graph_method requires a mixed_types descriptor "
+                    "with a graph lower (e.g. dpa1 attn_layer=0)"
+                )
+            if method == "dense":
+                ng = build_neighbor_graph(cc, atype, bb, self.get_rcut())
+            elif method == "ase":
+                ng = build_neighbor_graph_ase(cc, atype, bb, self.get_rcut())
+            else:
+                raise ValueError(
+                    f"unknown neighbor_graph_method {method!r}; use 'dense' or 'ase'"
+                )
+            xp = array_api_compat.array_namespace(atype)
+            dev = array_api_compat.device(atype)
+            nf, nloc = atype.shape[:2]
+            graph_ret = self.call_lower_graph(
+                atype=xp.reshape(atype, (nf * nloc,)),
+                n_node=ng.n_node,
+                edge_index=ng.edge_index,
+                edge_vec=ng.edge_vec,
+                edge_mask=ng.edge_mask,
+                fparam=fp,
+                aparam=ap,
+            )
+            # mirror the dense ``call_common`` energy keys: ``energy`` is the
+            # per-atom energy (nf, nloc, 1); ``energy_redu`` is the per-frame
+            # reduction (nf, 1); ``mask`` is the (nf, nloc) realness mask.
+            model_predict = {
+                "energy": xp.reshape(graph_ret["atom_energy"], (nf, nloc, 1)),
+                "energy_redu": graph_ret["energy"],
+                # carry-all graph: all local atoms are real -> all-ones int mask,
+                # matching the dense path (base_atomic_model: mask = int32 atom_mask).
+                "mask": xp.ones((nf, nloc), dtype=xp.int32, device=dev),
+            }
+            return model_predict
+
         def call_common_lower(
             self,
             extended_coord: Array,
diff --git a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
new file mode 100644
index 0000000000..e8e5285ebd
--- /dev/null
+++ b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Opt-in carry-all graph energy forward via ``neighbor_graph_method`` (Option B).
+
+PR-A 5c: ``CM.call_common`` gains a ``neighbor_graph_method`` keyword that,
+when set, routes a dpa1(``attn_layer == 0``) ENERGY forward through the
+carry-all graph builder + ``call_lower_graph`` instead of the dense nlist path.
+
+Option-B behavior (decision #17 / spec_unified_edge_nlist):
+
+* non-binding ``sel`` -- the carry-all graph and the dense path see the SAME
+  neighbors, so ``energy_redu``/``energy`` (reduced/per-atom) are EXACTLY equal;
+* binding ``sel`` -- the carry-all graph keeps neighbors the dense path
+  truncates, so energy DIFFERS (intended).
+
+The DEFAULT (``neighbor_graph_method=None``) keeps the dense path unchanged.
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.fitting import (
+    InvarFitting,
+)
+from deepmd.dpmodel.model.ener_model import (
+    EnergyModel,
+)
+
+
+def _make_model(sel: list[int]) -> EnergyModel:
+    ds = DescrptDPA1(
+        rcut=4.0,
+        rcut_smth=0.5,
+        sel=sel,
+        ntypes=2,
+        attn_layer=0,  # the graph lower is only available for attn_layer == 0
+        axis_neuron=2,
+        neuron=[6, 12],
+    )
+    ft = InvarFitting(
+        "energy",
+        2,
+        ds.get_dim_out(),
+        1,
+        mixed_types=ds.mixed_types(),
+    )
+    return EnergyModel(ds, ft, type_map=["foo", "bar"])
+
+
+@pytest.mark.parametrize("method", ["dense", "ase"])  # in-tree carry-all AND ase
+@pytest.mark.parametrize("periodic", [True, False])  # PBC and non-PBC
+def test_energy_parity_non_binding_sel(method: str, periodic: bool) -> None:
+    """At non-binding sel the carry-all graph and the dense path see the SAME
+    neighbors, so model energy is exactly equal.
+    """
+    if method == "ase":
+        pytest.importorskip("ase")
+    rng = np.random.default_rng(0)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+    box = None
+    if periodic:
+        # large box (edge 20.0, flattened to (1, 9)): effectively non-periodic
+        box = np.eye(3).reshape(1, 9) * 20.0
+    # LARGE sel -> non-binding (no truncation)
+    model = _make_model([200])
+
+    dense = model.call_common(coord, atype, box)
+    graph = model.call_common(coord, atype, box, neighbor_graph_method=method)
+
+    # dense energy keys: ``energy_redu`` (reduced, nf x 1) and ``energy``
+    # (per-atom, nf x nloc x 1). Compare matching keys.
+    np.testing.assert_allclose(
+        graph["energy_redu"], dense["energy_redu"], rtol=1e-12, atol=1e-12
+    )
+    np.testing.assert_allclose(graph["energy"], dense["energy"], rtol=1e-12, atol=1e-12)
+    # mask must match the dense all-ones (nf, nloc) int mask
+    np.testing.assert_array_equal(graph["mask"], dense["mask"])
+
+
+def test_binding_sel_carries_more_than_dense() -> None:
+    """At binding sel the carry-all graph includes neighbors the dense path
+    truncates, so energy DIFFERS (intended, decision #17 / Option B).
+    """
+    rng = np.random.default_rng(1)
+    nloc = 14
+    # a tightly packed cluster: many atoms well within rcut=4.0 of each other
+    coord = rng.normal(size=(1, nloc, 3)) * 0.8
+    atype = np.array([[0, 1] * 7], dtype=np.int64)
+    box = None
+    # binding sel -> the dense-nlist path truncates to 4 neighbors per atom
+    model = _make_model([4])
+
+    dense = model.call_common(coord, atype, box)
+    graph = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+
+    assert not np.allclose(graph["energy_redu"], dense["energy_redu"])

From 09c8b33c33a4739bdfbb3359276c62717dd4a613 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 12:05:08 +0800
Subject: [PATCH 10/69] refactor(dpmodel): explicit if/else for
 neighbor_graph_method routing in call_common

---
 deepmd/dpmodel/model/make_model.py | 37 +++++++++++++++---------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 066fce37f4..60ec031b0e 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -316,6 +316,7 @@ def call_common(
             )
             del coord, box, fparam, aparam, charge_spin
             if neighbor_graph_method is not None:
+                # carry-all NeighborGraph energy forward (Option B / decision #17)
                 model_predict = self._call_common_graph(
                     cc,
                     atype,
@@ -324,24 +325,24 @@ def call_common(
                     ap,
                     neighbor_graph_method,
                 )
-                model_predict = self._output_type_cast(model_predict, input_prec)
-                return model_predict
-            model_predict = model_call_from_call_lower(
-                call_lower=self.call_common_lower,
-                rcut=self.get_rcut(),
-                sel=self.get_sel(),
-                mixed_types=self.mixed_types(),
-                model_output_def=self.model_output_def(),
-                coord=cc,
-                atype=atype,
-                box=bb,
-                fparam=fp,
-                aparam=ap,
-                do_atomic_virial=do_atomic_virial,
-                coord_corr_for_virial=coord_corr_for_virial,
-                charge_spin=cs,
-                neighbor_list=neighbor_list,
-            )
+            else:
+                # legacy dense-nlist path (builds the extended quartet)
+                model_predict = model_call_from_call_lower(
+                    call_lower=self.call_common_lower,
+                    rcut=self.get_rcut(),
+                    sel=self.get_sel(),
+                    mixed_types=self.mixed_types(),
+                    model_output_def=self.model_output_def(),
+                    coord=cc,
+                    atype=atype,
+                    box=bb,
+                    fparam=fp,
+                    aparam=ap,
+                    do_atomic_virial=do_atomic_virial,
+                    coord_corr_for_virial=coord_corr_for_virial,
+                    charge_spin=cs,
+                    neighbor_list=neighbor_list,
+                )
             model_predict = self._output_type_cast(model_predict, input_prec)
             return model_predict
 

From cfebef9bc9659132a71cc05c8e13c840d9313c0c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 12:48:22 +0800
Subject: [PATCH 11/69] feat(pt_expt): edge_energy_deriv (autograd
 grad(E,edge_vec) -> edge_force_virial)

---
 deepmd/pt_expt/model/edge_transform_output.py | 39 ++++++++
 .../tests/pt_expt/test_edge_energy_deriv.py   | 89 +++++++++++++++++++
 2 files changed, 128 insertions(+)
 create mode 100644 deepmd/pt_expt/model/edge_transform_output.py
 create mode 100644 source/tests/pt_expt/test_edge_energy_deriv.py

diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
new file mode 100644
index 0000000000..6d3bf2df4f
--- /dev/null
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Autograd assembly: graph energy -> force/virial/atom_virial via grad(E, edge_vec).
+
+torch-only. The pure-array scatter (edge_force_virial) is shared with dpmodel;
+this module supplies the single backward pass that produces g_e = dE/d(edge_vec).
+"""
+
+import torch
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    edge_force_virial,
+)
+
+
+def edge_energy_deriv(
+    energy: torch.Tensor,
+    edge_vec: torch.Tensor,
+    edge_index: torch.Tensor,
+    edge_mask: torch.Tensor,
+    n_node: torch.Tensor,
+    do_atomic_virial: bool = False,
+    create_graph: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]:
+    """Differentiate a graph energy into (force, atom_virial_or_None, virial).
+
+    One ``torch.autograd.grad`` call yields ``dE/d(edge_vec)``; the shared
+    ``edge_force_virial`` scatter turns it into the outputs. The per-frame
+    ``virial`` is always returned; ``atom_virial`` is ``None`` unless
+    ``do_atomic_virial`` is set.
+    """
+    scalar_energy = energy if energy.dim() == 0 else energy.sum()
+    (edge_grad,) = torch.autograd.grad(
+        scalar_energy, edge_vec, create_graph=create_graph, retain_graph=True
+    )
+    outputs = edge_force_virial(edge_grad, edge_vec, edge_index, edge_mask, n_node)
+    force, atom_virial, virial = outputs
+    if not do_atomic_virial:
+        atom_virial = None
+    return force, atom_virial, virial
diff --git a/source/tests/pt_expt/test_edge_energy_deriv.py b/source/tests/pt_expt/test_edge_energy_deriv.py
new file mode 100644
index 0000000000..9fc8dadc4e
--- /dev/null
+++ b/source/tests/pt_expt/test_edge_energy_deriv.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import torch
+
+from deepmd.pt.utils import env
+from deepmd.pt_expt.model.edge_transform_output import edge_energy_deriv
+
+
+class TestEdgeEnergyDeriv(unittest.TestCase):
+    def test_force_matches_autograd_wrt_node_coords(self) -> None:
+        """The graph force equals -dE/d(node coord): build edge_vec from node
+        coords, so force from edge_energy_deriv == -autograd.grad(E, coords).
+        """
+        torch.manual_seed(0)
+        N, nf = 5, 1
+        n_node = torch.tensor([N], dtype=torch.int64, device=env.DEVICE)
+        coord = torch.randn(
+            N, 3, dtype=torch.float64, device=env.DEVICE, requires_grad=True
+        )
+        # a connected edge set (both directions), all real
+        src = torch.tensor([0, 1, 1, 2, 3, 4], device=env.DEVICE)
+        dst = torch.tensor([1, 0, 2, 1, 4, 3], device=env.DEVICE)
+        edge_index = torch.stack([src, dst], 0)
+        edge_mask = torch.ones(src.shape[0], dtype=torch.bool, device=env.DEVICE)
+        edge_vec = coord[src] - coord[dst]  # differentiable wrt coord
+        energy = (torch.sin(edge_vec).sum(-1) ** 2).sum()  # toy scalar energy
+        force, av, gv = edge_energy_deriv(
+            energy, edge_vec, edge_index, edge_mask, n_node, do_atomic_virial=True
+        )
+        # reference physical force = -dE/d(coord)
+        f_ref = -torch.autograd.grad(energy, coord, retain_graph=True)[0]
+        torch.testing.assert_close(force, f_ref, rtol=1e-10, atol=1e-10)
+        # atom-virial sums (per frame) to the global virial
+        torch.testing.assert_close(av.sum(0), gv[0], rtol=1e-10, atol=1e-10)
+        self.assertEqual(gv.shape, (nf, 3, 3))
+
+    def test_padding_edges_contribute_nothing(self) -> None:
+        """A masked guard edge with a huge edge_vec must not change force/virial."""
+        torch.manual_seed(1)
+        N = 4
+        n_node = torch.tensor([N], dtype=torch.int64, device=env.DEVICE)
+        coord = torch.randn(
+            N, 3, dtype=torch.float64, device=env.DEVICE, requires_grad=True
+        )
+        src = torch.tensor([0, 1, 2], device=env.DEVICE)
+        dst = torch.tensor([1, 2, 3], device=env.DEVICE)
+        ev = coord[src] - coord[dst]
+        # append a masked guard edge with a huge vec
+        guard = torch.tensor(
+            [[99.0, 99.0, 99.0]], dtype=torch.float64, device=env.DEVICE
+        )
+        edge_vec = torch.cat([ev, guard], 0).detach().requires_grad_(True)
+        edge_index = torch.tensor([[0, 1, 2, 0], [1, 2, 3, 0]], device=env.DEVICE)
+        edge_mask = torch.tensor([True, True, True, False], device=env.DEVICE)
+        energy = (edge_vec**2).sum()
+        force, av, gv = edge_energy_deriv(
+            energy, edge_vec, edge_index, edge_mask, n_node, do_atomic_virial=True
+        )
+        # run again with ONLY the real edges; results must match
+        ev2 = edge_vec[:3].detach().requires_grad_(True)
+        e2 = (ev2**2).sum()
+        f2, av2, gv2 = edge_energy_deriv(
+            e2, ev2, edge_index[:, :3], edge_mask[:3], n_node, do_atomic_virial=True
+        )
+        torch.testing.assert_close(force, f2, rtol=1e-12, atol=1e-12)
+        torch.testing.assert_close(gv, gv2, rtol=1e-12, atol=1e-12)
+
+    def test_atom_virial_optional(self) -> None:
+        """do_atomic_virial=False returns None for atom_virial; force+virial still computed."""
+        N = 3
+        n_node = torch.tensor([N], dtype=torch.int64, device=env.DEVICE)
+        coord = torch.randn(
+            N, 3, dtype=torch.float64, device=env.DEVICE, requires_grad=True
+        )
+        edge_index = torch.tensor([[0, 1], [1, 0]], device=env.DEVICE)
+        edge_mask = torch.ones(2, dtype=torch.bool, device=env.DEVICE)
+        edge_vec = coord[edge_index[0]] - coord[edge_index[1]]
+        energy = (edge_vec**2).sum()
+        force, av, gv = edge_energy_deriv(
+            energy, edge_vec, edge_index, edge_mask, n_node, do_atomic_virial=False
+        )
+        self.assertIsNone(av)
+        self.assertEqual(force.shape, (N, 3))
+        self.assertEqual(gv.shape, (1, 3, 3))
+
+
+if __name__ == "__main__":
+    unittest.main()

From 57202ae149a0913e3d343f8244c6c7cbd82795f8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:00:54 +0800
Subject: [PATCH 12/69] feat(pt_expt): forward_common_lower_graph (force/virial
 via edge_energy_deriv) + parity

---
 deepmd/dpmodel/model/make_model.py            |   6 +-
 deepmd/pt_expt/model/make_model.py            |  96 +++++++
 .../pt_expt/model/test_dpa1_graph_lower.py    | 240 ++++++++++++++++++
 3 files changed, 341 insertions(+), 1 deletion(-)
 create mode 100644 source/tests/pt_expt/model/test_dpa1_graph_lower.py

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 60ec031b0e..e3b4140c23 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -594,9 +594,13 @@ def call_lower_graph(
                 xp.arange(nf, dtype=edge_index.dtype, device=dev),
                 xp.asarray(n_node, device=dev),
             )
+            ener_dtype = get_xp_precision(
+                xp, RESERVED_PRECISION_DICT[self.global_ener_float_precision]
+            )
             energy = segment_sum(
                 xp.reshape(
-                    atom_energy.astype(GLOBAL_ENER_FLOAT_PRECISION), (nf * nloc, 1)
+                    xp.astype(atom_energy, ener_dtype),
+                    (nf * nloc, 1),
                 ),
                 frame_id,
                 nf,
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 878ed21a38..9e9947511b 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -24,6 +24,9 @@
     torch_module,
 )
 
+from .edge_transform_output import (
+    edge_energy_deriv,
+)
 from .transform_output import (
     fit_output_to_model_output,
 )
@@ -277,6 +280,99 @@ def forward_common_lower(
             """Forward common lower delegates to call_common_lower()."""
             return self.call_common_lower(*args, **kwargs)
 
+        def forward_common_lower_graph(
+            self,
+            atype: torch.Tensor,
+            n_node: torch.Tensor,
+            edge_index: torch.Tensor,
+            edge_vec: torch.Tensor,
+            edge_mask: torch.Tensor,
+            do_atomic_virial: bool = False,
+            fparam: torch.Tensor | None = None,
+            aparam: torch.Tensor | None = None,
+        ) -> dict[str, torch.Tensor]:
+            """Graph-native lower with autograd force/virial (PR-A: dpa1 ``attn_layer==0``).
+
+            Runs the dpmodel ENERGY-only :meth:`call_lower_graph` with ``edge_vec``
+            as the autograd leaf, then assembles force / per-frame virial /
+            (optional) atom virial from a SINGLE backward pass via
+            :func:`edge_energy_deriv` (``g_e = dE/d(edge_vec)``, then the shared
+            full-to-``src`` scatter).
+
+            The returned dict uses the SAME internal key names as the legacy dense
+            :meth:`forward_common_lower` (``energy``, ``energy_redu``,
+            ``energy_derv_r``, ``energy_derv_c_redu``, and ``energy_derv_c`` when
+            ``do_atomic_virial``).  Unlike the dense lower (which returns EXTENDED
+            ``nall`` force/atom-virial), the graph is ghost-free, so force and
+            atom-virial here live on the ``nloc`` LOCAL atoms (ghost contributions
+            are already folded onto their local owner via ``src = mapping[neighbor]``).
+            They equal the dense lower's extended quantities once the latter are
+            folded onto local atoms via ``mapping`` (i.e. ``communicate_extended_output``).
+
+            Parameters
+            ----------
+            atype
+                (N,) flat LOCAL atom types, ``N == sum(n_node)``.
+            n_node
+                (nf,) per-frame local atom counts; assumed equal to ``n_node[0]``.
+            edge_index
+                (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+            edge_vec
+                (E, 3) neighbor-minus-center edge vectors.
+            edge_mask
+                (E,) valid-edge mask.
+            do_atomic_virial
+                Whether to also return the per-atom virial ``energy_derv_c``.
+            fparam
+                Frame parameter, ``(nf, ndf)``.
+            aparam
+                Atomic parameter, ``(nf, nloc, nda)``.
+
+            Returns
+            -------
+            dict
+                ``energy`` (nf, nloc, 1), ``energy_redu`` (nf, 1),
+                ``energy_derv_r`` (nf, nloc, 1, 3),
+                ``energy_derv_c_redu`` (nf, 1, 9), and -- when
+                ``do_atomic_virial`` -- ``energy_derv_c`` (nf, nloc, 1, 9).
+            """
+            nf = int(n_node.shape[0])
+            nloc = int(n_node[0])
+            # make edge_vec the autograd leaf for the energy backward
+            edge_vec = edge_vec.detach().requires_grad_(True)
+            ret = self.call_lower_graph(
+                atype=atype,
+                n_node=n_node,
+                edge_index=edge_index,
+                edge_vec=edge_vec,
+                edge_mask=edge_mask,
+                fparam=fparam,
+                aparam=aparam,
+            )
+            atom_energy = ret["atom_energy"]  # (nf, nloc, 1)
+            energy = ret["energy"]  # (nf, 1)
+            force, atom_virial, virial = edge_energy_deriv(
+                energy,
+                edge_vec,
+                edge_index,
+                edge_mask,
+                n_node,
+                do_atomic_virial=do_atomic_virial,
+                create_graph=self.training,
+            )
+            out = {
+                "energy": atom_energy,
+                "energy_redu": energy,
+                # force (N, 3) -> (nf, nloc, 1, 3); virial (nf, 3, 3) -> (nf, 1, 9)
+                "energy_derv_r": force.reshape(nf, nloc, 1, 3),
+                "energy_derv_c_redu": virial.reshape(nf, 1, 9),
+            }
+            if do_atomic_virial:
+                assert atom_virial is not None
+                # atom_virial (N, 3, 3) -> (nf, nloc, 1, 9)
+                out["energy_derv_c"] = atom_virial.reshape(nf, nloc, 1, 9)
+            return out
+
         def forward_common_atomic(
             self,
             extended_coord: torch.Tensor,
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
new file mode 100644
index 0000000000..b192a66b86
--- /dev/null
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -0,0 +1,240 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Parity: graph lower (forward_common_lower_graph) vs legacy dense lower.
+
+Builds a same-weights pt_expt dpa1(attn_layer=0) EnergyModel and a small
+extended system, then compares the graph-native lower (energy/force/virial/
+atom_virial assembled from ``edge_energy_deriv``) against the legacy dense
+``forward_common_lower`` on the SAME neighbor set (the graph is built REGIME-1
+from the same extended quartet via ``from_dense_quartet``).
+
+The graph lower is inherently LOCAL (ghost-free): its force/atom_virial live on
+``nloc`` nodes, while the legacy lower returns EXTENDED (``nall``) force/
+atom_virial.  The two are reconciled by folding the legacy extended force/
+atom_virial onto local atoms via ``mapping`` (a scatter-add on the atom axis,
+identical to ``communicate_extended_output``).  Energy, reduced energy and the
+reduced (per-frame) virial are frame/local quantities and compare directly.
+"""
+
+import unittest
+
+import numpy as np
+import torch
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    from_dense_quartet,
+)
+from deepmd.dpmodel.utils.nlist import (
+    build_neighbor_list,
+    extend_coord_with_ghosts,
+)
+from deepmd.dpmodel.utils.region import (
+    normalize_coord,
+)
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt_expt.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.pt_expt.fitting import (
+    InvarFitting,
+)
+from deepmd.pt_expt.model import (
+    EnergyModel,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
+
+
+def _fold_extended_to_local(
+    ext: torch.Tensor, mapping: torch.Tensor, nloc: int
+) -> torch.Tensor:
+    """Scatter-add an extended (nf, nall, 1, K) tensor onto local atoms.
+
+    Mirrors ``communicate_extended_output``: ``local[mapping[j]] += ext[j]``
+    along the atom axis (dim 1).
+    """
+    nf, nall = mapping.shape
+    K = ext.shape[-1]
+    out = torch.zeros(nf, nloc, 1, K, dtype=ext.dtype, device=ext.device)
+    idx = mapping.view(nf, nall, 1, 1).expand(nf, nall, 1, K)
+    out.scatter_add_(1, idx, ext)
+    return out
+
+
+class TestDpa1GraphLower(unittest.TestCase):
+    def setUp(self) -> None:
+        self.device = env.DEVICE
+        self.natoms = 5
+        self.rcut = 4.0
+        self.rcut_smth = 0.5
+        self.sel = 20  # mixed-type single int sel
+        self.nt = 2
+        self.type_map = ["foo", "bar"]
+
+        generator = torch.Generator(device=self.device).manual_seed(GLOBAL_SEED)
+        cell = torch.rand(
+            [3, 3], dtype=torch.float64, device=self.device, generator=generator
+        )
+        cell = (cell + cell.T) + 5.0 * torch.eye(3, device=self.device)
+        self.cell = cell.unsqueeze(0)  # [1, 3, 3]
+        coord = torch.rand(
+            [self.natoms, 3],
+            dtype=torch.float64,
+            device=self.device,
+            generator=generator,
+        )
+        coord = torch.matmul(coord, cell)
+        self.coord = coord.unsqueeze(0).to(self.device)  # [1, natoms, 3]
+        self.atype = torch.tensor(
+            [[0, 0, 0, 1, 1]], dtype=torch.int64, device=self.device
+        )
+
+    def _make_model(self) -> EnergyModel:
+        ds = DescrptDPA1(
+            self.rcut,
+            self.rcut_smth,
+            self.sel,
+            self.nt,
+            neuron=[3, 6],
+            axis_neuron=2,
+            attn=4,
+            attn_layer=0,  # graph lower only supports attn_layer == 0
+            attn_dotr=True,
+            attn_mask=False,
+            activation_function="tanh",
+            set_davg_zero=False,
+            type_one_side=True,
+            precision="float64",
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        ft = InvarFitting(
+            "energy",
+            self.nt,
+            ds.get_dim_out(),
+            1,
+            mixed_types=ds.mixed_types(),
+            precision="float64",
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        return EnergyModel(ds, ft, type_map=self.type_map).to(self.device)
+
+    def _prepare_lower_inputs(self, periodic: bool):
+        """Build extended coords, atype, nlist, mapping as torch tensors."""
+        coord_np = self.coord.detach().cpu().numpy()
+        atype_np = self.atype.detach().cpu().numpy()
+        if periodic:
+            cell_np = self.cell.reshape(1, 9).detach().cpu().numpy()
+            coord_normalized = normalize_coord(
+                coord_np.reshape(1, self.natoms, 3),
+                cell_np.reshape(1, 3, 3),
+            )
+            extended_coord, extended_atype, mapping = extend_coord_with_ghosts(
+                coord_normalized,
+                atype_np,
+                cell_np,
+                self.rcut,
+            )
+            nlist = build_neighbor_list(
+                extended_coord,
+                extended_atype,
+                self.natoms,
+                self.rcut,
+                [self.sel],
+                distinguish_types=False,
+            )
+            extended_coord = extended_coord.reshape(1, -1, 3)
+        else:
+            extended_coord = coord_np.reshape(1, self.natoms, 3)
+            extended_atype = atype_np.reshape(1, self.natoms)
+            mapping = np.arange(self.natoms, dtype=np.int64).reshape(1, self.natoms)
+            nlist = build_neighbor_list(
+                extended_coord,
+                extended_atype,
+                self.natoms,
+                self.rcut,
+                [self.sel],
+                distinguish_types=False,
+            )
+        ext_coord = torch.tensor(
+            extended_coord, dtype=torch.float64, device=self.device
+        )
+        ext_atype = torch.tensor(extended_atype, dtype=torch.int64, device=self.device)
+        nlist_t = torch.tensor(nlist, dtype=torch.int64, device=self.device)
+        mapping_t = torch.tensor(mapping, dtype=torch.int64, device=self.device)
+        return ext_coord, ext_atype, nlist_t, mapping_t
+
+    def test_force_virial_parity_vs_legacy(self) -> None:
+        """Graph lower energy/force/virial/atom_virial == legacy dense lower on
+        the SAME neighbor set (regime-1 graph from from_dense_quartet).
+        """
+        model = self._make_model()
+        model.eval()
+        tol = (
+            {"rtol": 1e-12, "atol": 1e-12}
+            if self.device.type == "cpu"
+            else {"rtol": 1e-10, "atol": 1e-10}
+        )
+        for periodic in (True, False):
+            for do_av in (False, True):
+                with self.subTest(periodic=periodic, do_av=do_av):
+                    ext_coord, ext_atype, nlist, mapping = self._prepare_lower_inputs(
+                        periodic
+                    )
+                    nf = ext_coord.shape[0]
+                    nloc = self.natoms
+
+                    legacy = model.forward_common_lower(
+                        ext_coord.clone().requires_grad_(True),
+                        ext_atype,
+                        nlist,
+                        mapping,
+                        do_atomic_virial=do_av,
+                    )
+
+                    # build the regime-1 graph from the SAME extended quartet.
+                    # from_dense_quartet is array-API; feed torch tensors so the
+                    # returned edge_vec is already a torch tensor on env.DEVICE.
+                    ng = from_dense_quartet(ext_coord, nlist, mapping)
+                    atype_local = ext_atype[:, :nloc].reshape(nf * nloc)
+                    graph = model.forward_common_lower_graph(
+                        atype_local,
+                        ng.n_node,
+                        ng.edge_index,
+                        ng.edge_vec,
+                        ng.edge_mask,
+                        do_atomic_virial=do_av,
+                    )
+
+                    # energy / reduced energy / reduced virial: direct compare
+                    torch.testing.assert_close(graph["energy"], legacy["energy"], **tol)
+                    torch.testing.assert_close(
+                        graph["energy_redu"], legacy["energy_redu"], **tol
+                    )
+                    torch.testing.assert_close(
+                        graph["energy_derv_c_redu"],
+                        legacy["energy_derv_c_redu"],
+                        **tol,
+                    )
+
+                    # force: fold legacy extended (nall) -> local (nloc)
+                    legacy_force_local = _fold_extended_to_local(
+                        legacy["energy_derv_r"], mapping, nloc
+                    )
+                    torch.testing.assert_close(
+                        graph["energy_derv_r"], legacy_force_local, **tol
+                    )
+
+                    if do_av:
+                        legacy_av_local = _fold_extended_to_local(
+                            legacy["energy_derv_c"], mapping, nloc
+                        )
+                        torch.testing.assert_close(
+                            graph["energy_derv_c"], legacy_av_local, **tol
+                        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 6e9742328e4b3a63f4f7ecbe1c16e953bfda71e7 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:02:14 +0800
Subject: [PATCH 13/69] test(pt_expt): move test_edge_energy_deriv into model/
 (mirrors deepmd/pt_expt/model/)

---
 source/tests/pt_expt/{ => model}/test_edge_energy_deriv.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename source/tests/pt_expt/{ => model}/test_edge_energy_deriv.py (100%)

diff --git a/source/tests/pt_expt/test_edge_energy_deriv.py b/source/tests/pt_expt/model/test_edge_energy_deriv.py
similarity index 100%
rename from source/tests/pt_expt/test_edge_energy_deriv.py
rename to source/tests/pt_expt/model/test_edge_energy_deriv.py

From 4e426afc7c3b24e998a05fa07286a1fe0a3cacdf Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:06:38 +0800
Subject: [PATCH 14/69] test(pt_expt): dpa1(attn_layer=0) graph-path serialize
 round-trip + interop

---
 .../pt_expt/test_dpa1_graph_serialize.py      | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 source/tests/pt_expt/test_dpa1_graph_serialize.py

diff --git a/source/tests/pt_expt/test_dpa1_graph_serialize.py b/source/tests/pt_expt/test_dpa1_graph_serialize.py
new file mode 100644
index 0000000000..f7864fc040
--- /dev/null
+++ b/source/tests/pt_expt/test_dpa1_graph_serialize.py
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import unittest
+
+import numpy as np
+import torch
+
+from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
+from deepmd.dpmodel.utils.nlist import extend_input_and_build_neighbor_list
+from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1 as DescrptDPA1PT
+from deepmd.pt_expt.utils import env
+
+
+def _small_extended():
+    rng = np.random.default_rng(7)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+        coord, atype, 4.0, [30], mixed_types=True, box=None
+    )
+    return ext_coord, ext_atype, nlist, mapping
+
+
+class TestDpa1GraphSerialize(unittest.TestCase):
+    def _make(self):
+        return DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0)
+
+    def test_roundtrip_forward_identical(self) -> None:
+        """dpa1(attn_layer=0) serialize->deserialize gives an identical forward."""
+        dd = self._make()
+        dd2 = DescrptDPA1.deserialize(dd.serialize())
+        ext_coord, ext_atype, nlist, mapping = _small_extended()
+        np.testing.assert_allclose(
+            dd2.call(ext_coord, ext_atype, nlist, mapping)[0],
+            dd.call(ext_coord, ext_atype, nlist, mapping)[0],
+            rtol=1e-12,
+            atol=1e-12,
+        )
+
+    def test_dpmodel_to_pt_expt_interop(self) -> None:
+        """Dpmodel dpa1 serialize -> pt_expt deserialize -> identical descriptor
+        (cross-backend checkpoint interop, graph-routed attn_layer=0 forward).
+        """
+        dd = self._make()
+        dd_pt = DescrptDPA1PT.deserialize(dd.serialize()).to(env.DEVICE)
+        ext_coord, ext_atype, nlist, mapping = _small_extended()
+        ref = dd.call(ext_coord, ext_atype, nlist, mapping)[0]
+        got = dd_pt(
+            torch.from_numpy(ext_coord).to(env.DEVICE),
+            torch.from_numpy(ext_atype).to(env.DEVICE),
+            torch.from_numpy(nlist).to(env.DEVICE),
+            torch.from_numpy(mapping).to(env.DEVICE),
+        )[0]
+        np.testing.assert_allclose(
+            got.detach().cpu().numpy(),
+            ref,
+            rtol=1e-10,
+            atol=1e-10,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From ca77ac3e3618d0fb43331371d0568c11cd8a5c9a Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:07:37 +0800
Subject: [PATCH 15/69] test(pt_expt): move dpa1 graph serialize test into
 descriptor/

---
 .../tests/pt_expt/{ => descriptor}/test_dpa1_graph_serialize.py   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename source/tests/pt_expt/{ => descriptor}/test_dpa1_graph_serialize.py (100%)

diff --git a/source/tests/pt_expt/test_dpa1_graph_serialize.py b/source/tests/pt_expt/descriptor/test_dpa1_graph_serialize.py
similarity index 100%
rename from source/tests/pt_expt/test_dpa1_graph_serialize.py
rename to source/tests/pt_expt/descriptor/test_dpa1_graph_serialize.py

From 48c0ea42c8e0c0b19018b1b51918191a5e15ac52 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:10:06 +0800
Subject: [PATCH 16/69] test(pt_expt): drop redundant dpa1 graph serialize test

serialize roundtrip + dpmodel->pt_expt interop on the attn_layer=0 graph path
are already covered by test_dpa1.py::test_consistency (lines 86-113), which
routes through the graph forward via the Task-3 dense-call adapter.
---
 .../descriptor/test_dpa1_graph_serialize.py   | 62 -------------------
 1 file changed, 62 deletions(-)
 delete mode 100644 source/tests/pt_expt/descriptor/test_dpa1_graph_serialize.py

diff --git a/source/tests/pt_expt/descriptor/test_dpa1_graph_serialize.py b/source/tests/pt_expt/descriptor/test_dpa1_graph_serialize.py
deleted file mode 100644
index f7864fc040..0000000000
--- a/source/tests/pt_expt/descriptor/test_dpa1_graph_serialize.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# SPDX-License-Identifier: LGPL-3.0-or-later
-import unittest
-
-import numpy as np
-import torch
-
-from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
-from deepmd.dpmodel.utils.nlist import extend_input_and_build_neighbor_list
-from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1 as DescrptDPA1PT
-from deepmd.pt_expt.utils import env
-
-
-def _small_extended():
-    rng = np.random.default_rng(7)
-    coord = rng.normal(size=(1, 5, 3)) * 1.5
-    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
-    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
-        coord, atype, 4.0, [30], mixed_types=True, box=None
-    )
-    return ext_coord, ext_atype, nlist, mapping
-
-
-class TestDpa1GraphSerialize(unittest.TestCase):
-    def _make(self):
-        return DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0)
-
-    def test_roundtrip_forward_identical(self) -> None:
-        """dpa1(attn_layer=0) serialize->deserialize gives an identical forward."""
-        dd = self._make()
-        dd2 = DescrptDPA1.deserialize(dd.serialize())
-        ext_coord, ext_atype, nlist, mapping = _small_extended()
-        np.testing.assert_allclose(
-            dd2.call(ext_coord, ext_atype, nlist, mapping)[0],
-            dd.call(ext_coord, ext_atype, nlist, mapping)[0],
-            rtol=1e-12,
-            atol=1e-12,
-        )
-
-    def test_dpmodel_to_pt_expt_interop(self) -> None:
-        """Dpmodel dpa1 serialize -> pt_expt deserialize -> identical descriptor
-        (cross-backend checkpoint interop, graph-routed attn_layer=0 forward).
-        """
-        dd = self._make()
-        dd_pt = DescrptDPA1PT.deserialize(dd.serialize()).to(env.DEVICE)
-        ext_coord, ext_atype, nlist, mapping = _small_extended()
-        ref = dd.call(ext_coord, ext_atype, nlist, mapping)[0]
-        got = dd_pt(
-            torch.from_numpy(ext_coord).to(env.DEVICE),
-            torch.from_numpy(ext_atype).to(env.DEVICE),
-            torch.from_numpy(nlist).to(env.DEVICE),
-            torch.from_numpy(mapping).to(env.DEVICE),
-        )[0]
-        np.testing.assert_allclose(
-            got.detach().cpu().numpy(),
-            ref,
-            rtol=1e-10,
-            atol=1e-10,
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()

From 33284bbc0603e5c89b8e0d79402441190369af8a Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:16:54 +0800
Subject: [PATCH 17/69] test: dpa1 graph attn_layer=0 make_fx + type_one_side +
 multi-frame coverage

---
 .../dpmodel/test_dpa1_call_graph_block.py     | 15 ++--
 .../dpmodel/test_dpa1_graph_model_energy.py   | 35 ++++++++++
 source/tests/pt_expt/descriptor/test_dpa1.py  | 69 +++++++++++++++++++
 3 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index 1da0f5fd1a..d00abedd94 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -19,7 +19,7 @@
 
 
 class TestDpa1BlockCallGraph:
-    def _make(self, sel):
+    def _make(self, sel, type_one_side=False):
         return DescrptDPA1(
             rcut=4.0,
             rcut_smth=0.5,
@@ -28,6 +28,7 @@ def _make(self, sel):
             attn_layer=0,
             axis_neuron=2,
             neuron=[6, 12],
+            type_one_side=type_one_side,
         )
 
     def setup_method(self) -> None:
@@ -36,10 +37,16 @@ def setup_method(self) -> None:
         self.coord = rng.normal(size=(1, self.nloc, 3)) * 1.5
         self.atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
 
+    @pytest.mark.parametrize("type_one_side", [False, True])  # tebd concat branch
     @pytest.mark.parametrize("sel", [[20], [3]])  # non-binding AND binding
-    def test_block_graph_equals_dense_any_sel(self, sel) -> None:
-        """Graph block output is bit-exact with the dense block on the same nlist."""
-        dd = self._make(sel)
+    def test_block_graph_equals_dense_any_sel(self, sel, type_one_side) -> None:
+        """Graph block output is bit-exact with the dense block on the same nlist.
+
+        ``type_one_side`` toggles the concat branch in the block: when True the
+        per-edge feature concatenates only the NEIGHBOR tebd (no center tebd),
+        so both the graph and dense paths must agree for either branch.
+        """
+        dd = self._make(sel, type_one_side=type_one_side)
         blk = dd.se_atten
         # build the dense nlist exactly as the descriptor would
         (
diff --git a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
index e8e5285ebd..4389ff6a17 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
@@ -81,6 +81,41 @@ def test_energy_parity_non_binding_sel(method, periodic) -> None:
     np.testing.assert_array_equal(graph["mask"], dense["mask"])
 
 
+@pytest.mark.parametrize("method", ["dense", "ase"])  # in-tree carry-all AND ase
+def test_energy_parity_multiframe_periodic(method) -> None:
+    """Multi-frame (nf=2) PERIODIC energy parity at non-binding sel.
+
+    Exercises the nf>1 graph reductions (``frame_id = repeat(arange(nf),
+    n_node)`` energy segment-sum and the ``frame * nloc`` node offsetting in
+    ``from_dense_quartet``) with DIFFERENT per-frame coordinates and a box.
+    At non-binding sel the carry-all graph and the dense path see the SAME
+    neighbors, so ``energy_redu``/``energy`` agree per frame to within 1e-12.
+    """
+    if method == "ase":
+        pytest.importorskip("ase")
+    rng = np.random.default_rng(3)
+    nf, nloc = 2, 6
+    # distinct coordinates per frame (not a broadcast of one frame)
+    coord = rng.normal(size=(nf, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]] * nf, dtype=np.int64)
+    # large box so the cell is essentially non-periodic for rcut=4.0
+    box = np.tile(np.eye(3).reshape(1, 9) * 20.0, (nf, 1))
+    # LARGE sel -> non-binding (no truncation)
+    model = _make_model([200])
+
+    dense = model.call_common(coord, atype, box)
+    graph = model.call_common(coord, atype, box, neighbor_graph_method=method)
+
+    np.testing.assert_allclose(
+        graph["energy_redu"], dense["energy_redu"], rtol=1e-12, atol=1e-12
+    )
+    np.testing.assert_allclose(graph["energy"], dense["energy"], rtol=1e-12, atol=1e-12)
+    np.testing.assert_array_equal(graph["mask"], dense["mask"])
+    # the two frames must produce DIFFERENT energies (genuine nf>1 test, not a
+    # broadcast of one frame); they differ here by ~1e-5.
+    assert not np.array_equal(dense["energy_redu"][0], dense["energy_redu"][1])
+
+
 def test_binding_sel_carries_more_than_dense() -> None:
     """At binding sel the carry-all graph includes neighbors the dense path
     truncates, so energy DIFFERS (intended, decision #17 / Option B).
diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py
index c5a2ed57a6..f7c3edb9c2 100644
--- a/source/tests/pt_expt/descriptor/test_dpa1.py
+++ b/source/tests/pt_expt/descriptor/test_dpa1.py
@@ -252,6 +252,75 @@ def fn(coord_ext, atype_ext, nlist):
             atol=atol,
         )
 
+    @pytest.mark.xfail(
+        reason=(
+            "graph forward fx-trace lands in PR-B: the attn_layer=0 graph path "
+            "has data-dependent ops that default (real-tensor) make_fx cannot "
+            "trace -- `int(xp.sum(graph.n_node))` (dpa1.py _call_graph) and "
+            "`xp.nonzero` in from_dense_quartet. PR-B exports via dynamic/symbolic "
+            "shapes. Eager forward+grad still runs correctly (asserted below)."
+        ),
+        strict=False,
+    )
+    @pytest.mark.parametrize("prec", ["float64"])  # precision
+    def test_make_fx_graph(self, prec) -> None:
+        """make_fx (export-readiness) of the attn_layer=0 GRAPH forward.
+
+        For ``attn_layer == 0`` the dense ``forward`` routes through the
+        graph-native path (``from_dense_quartet -> call_graph``). This proves
+        that graph forward + ``autograd.grad`` is fx-traceable (full .pt2
+        export is PR-B).
+        """
+        rng = np.random.default_rng(GLOBAL_SEED)
+        _, _, nnei = self.nlist.shape
+        davg = rng.normal(size=(self.nt, nnei, 4))
+        dstd = rng.normal(size=(self.nt, nnei, 4))
+        dstd = 0.1 + np.abs(dstd)
+
+        dtype = PRECISION_DICT[prec]
+        rtol, atol = get_tols(prec)
+        dd0 = DescrptDPA1(
+            self.rcut,
+            self.rcut_smth,
+            self.sel_mix,
+            self.nt,
+            attn_layer=0,
+            precision=prec,
+            seed=GLOBAL_SEED,
+        ).to(self.device)
+        dd0.se_atten.mean = torch.tensor(davg, dtype=dtype, device=self.device)
+        dd0.se_atten.stddev = torch.tensor(dstd, dtype=dtype, device=self.device)
+        dd0 = dd0.eval()
+        coord_ext = torch.tensor(self.coord_ext, dtype=dtype, device=self.device)
+        atype_ext = torch.tensor(self.atype_ext, dtype=int, device=self.device)
+        nlist = torch.tensor(self.nlist, dtype=int, device=self.device)
+        # the attn_layer=0 graph adapter (from_dense_quartet) maps every ghost
+        # neighbor to its LOCAL owner via ``mapping``; the mixin's nall(4) > nloc(3)
+        # so a real mapping is required (identity mapping would index out of range).
+        mapping = torch.tensor(self.mapping, dtype=int, device=self.device)
+
+        def fn(coord_ext, atype_ext, nlist, mapping):
+            coord_ext = coord_ext.detach().requires_grad_(True)
+            rd = dd0(coord_ext, atype_ext, nlist, mapping)[0]
+            grad = torch.autograd.grad(rd.sum(), coord_ext, create_graph=False)[0]
+            return rd, grad
+
+        rd_eager, grad_eager = fn(coord_ext, atype_ext, nlist, mapping)
+        traced = make_fx(fn)(coord_ext, atype_ext, nlist, mapping)
+        rd_traced, grad_traced = traced(coord_ext, atype_ext, nlist, mapping)
+        np.testing.assert_allclose(
+            rd_eager.detach().cpu().numpy(),
+            rd_traced.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+        np.testing.assert_allclose(
+            grad_eager.detach().cpu().numpy(),
+            grad_traced.detach().cpu().numpy(),
+            rtol=rtol,
+            atol=atol,
+        )
+
     @pytest.mark.parametrize("shared_level", [0, 1])  # sharing level
     def test_share_params(self, shared_level) -> None:
         """share_params level 0: share all; level 1: share type_embedding only."""

From f85b0f6fabaf74bb0c12054469cc267f627d8f24 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:20:49 +0800
Subject: [PATCH 18/69] fix(dpmodel): only route graph-ELIGIBLE configs through
 call_graph; fall back to dense

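Task 3's adapter routed ALL attn_layer==0 configs through the graph, but the
graph only supports tebd_input_mode='concat' with no exclude_types, and needs
`mapping` to fold ghosts onto their local owners. attn_layer=0 models using
strip mode or exclude_types raised NotImplementedError, and calls with
mapping=None on a system with ghosts hit an IndexError. uses_graph_lower()
now encodes the config eligibility (attn_layer == 0, concat tebd, no
exclude_types), and call() additionally requires a mapping or nall == nloc;
every other case falls back to the legacy dense body unchanged.
Fixes test_compressed_forward (attn_layer=0 strip).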
---
 deepmd/dpmodel/descriptor/dpa1.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index d79bf778b9..c473b40e11 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -421,10 +421,17 @@ def get_numb_attn_layer(self) -> int:
     def uses_graph_lower(self) -> bool:
         """Returns whether this descriptor supports the graph-native lower.
 
-        The graph-native energy lower (``call_graph``) currently covers only
-        the non-attention (``attn_layer == 0``) factorizable path.
+        The graph-native energy lower (``call_graph``) currently covers only the
+        non-attention (``attn_layer == 0``) factorizable path with concat
+        type-embedding and no type exclusion. Any other config (attention,
+        ``tebd_input_mode == "strip"``, ``exclude_types``) falls back to the
+        legacy dense path, so those models keep working unchanged.
         """
-        return self.se_atten.attn_layer == 0
+        return (
+            self.se_atten.attn_layer == 0
+            and self.se_atten.tebd_input_mode == "concat"
+            and not self.se_atten.exclude_types
+        )
 
     def share_params(
         self, base_class: "DescrptDPA1", shared_level: int, resume: bool = False
@@ -552,10 +559,14 @@ def call(
         xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
         nf, nloc, nnei = nlist.shape
         nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
-        # attn_layer == 0 routes through the graph-native path; the dense call is
-        # a thin adapter (decision #14: graph = single math source). The full
-        # dense 5-tuple ABI is preserved exactly (see call_graph).
-        if self.se_atten.attn_layer == 0:
+        # graph-eligible configs route through the graph-native path; the dense
+        # call is a thin adapter (decision #14: graph = single math source) that
+        # preserves the dense 5-tuple ABI exactly (see call_graph). Ineligible
+        # configs (attention, strip tebd, exclude_types) and the ghost case with
+        # no mapping fall back to the legacy dense body below, so those models
+        # keep working unchanged. The graph needs `mapping` to fold ghosts to
+        # local owners; without it only the no-ghost case (nall == nloc) is valid.
+        if self.uses_graph_lower() and (mapping is not None or nall == nloc):
             from deepmd.dpmodel.utils.neighbor_graph import (
                 from_dense_quartet,
             )

From 01f84a1ab47a1778a1f5033d9178b9c722b56eeb Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:22:14 +0800
Subject: [PATCH 19/69] test(dpmodel): regression-lock graph-ineligible
 (strip/exclude/no-mapping-ghosts) dense fallback

---
 .../test_dpa1_call_graph_descriptor.py        | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index aaf0a3e040..45d698d931 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -95,3 +95,50 @@ def test_descriptor_graph_equals_dense_full_tuple(self, sel) -> None:
         assert out[3] is None
         # sw
         np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)
+
+    @pytest.mark.parametrize(
+        "kwargs",
+        [
+            {"tebd_input_mode": "strip"},  # strip tebd: graph unsupported -> dense
+            {"exclude_types": [(0, 1)]},  # type exclusion: graph unsupported -> dense
+        ],
+    )
+    def test_ineligible_config_falls_back_to_dense(self, kwargs) -> None:
+        """attn_layer=0 configs the graph can't handle (strip tebd, exclude_types)
+        must report uses_graph_lower()=False and run the dense body without
+        raising (regression: Task-3 routing previously raised NotImplementedError).
+        """
+        dd = DescrptDPA1(
+            rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0, **kwargs
+        )
+        assert dd.uses_graph_lower() is False
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+        out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)  # must not raise
+        assert len(out) == 5
+
+    def test_eligible_no_mapping_with_ghosts_falls_back(self) -> None:
+        """An eligible (concat) attn_layer=0 descriptor called with mapping=None
+        on a PERIODIC system (nall > nloc ghosts) must fall back to the dense
+        body and match it (regression: the graph needs mapping for ghosts, the
+        identity-mapping default previously indexed out of range)."""
+        dd = self._make([30])
+        box = np.eye(3, dtype=np.float64)[None] * 6.0
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=box,
+        )
+        assert ext_atype.shape[1] > self.nloc  # ghosts present
+        ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
+        out = dd.call(ext_coord, ext_atype, nlist, mapping=None)  # must not IndexError
+        np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)

From 5d88e3db4e066a8b01944bccb48fb7764548143e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:23:25 +0800
Subject: [PATCH 20/69] style: fix D209 docstring in dpa1 fallback regression
 test

---
 source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index 45d698d931..aa9fc23575 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -127,7 +127,8 @@ def test_eligible_no_mapping_with_ghosts_falls_back(self) -> None:
         """An eligible (concat) attn_layer=0 descriptor called with mapping=None
         on a PERIODIC system (nall > nloc ghosts) must fall back to the dense
         body and match it (regression: the graph needs mapping for ghosts, the
-        identity-mapping default previously indexed out of range)."""
+        identity-mapping default previously indexed out of range).
+        """
         dd = self._make([30])
         box = np.eye(3, dtype=np.float64)[None] * 6.0
         ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(

From 912f0542eb614baa075cf4079f3dc1095023419b Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 13:59:58 +0800
Subject: [PATCH 21/69] feat: default eligible dpa1(attn_layer=0) to carry-all
 graph forward (decision #17)

---
 deepmd/dpmodel/model/make_model.py | 27 +++++++++++++++--
 deepmd/pt_expt/model/make_model.py | 48 ++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index e3b4140c23..b78ea24ce3 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -315,7 +315,8 @@ def call_common(
                 coord, box=box, fparam=fparam, aparam=aparam, charge_spin=charge_spin
             )
             del coord, box, fparam, aparam, charge_spin
-            if neighbor_graph_method is not None:
+            graph_method = self._resolve_graph_method(neighbor_graph_method)
+            if graph_method is not None:
                 # carry-all NeighborGraph energy forward (Option B / decision #17)
                 model_predict = self._call_common_graph(
                     cc,
@@ -323,7 +324,8 @@ def call_common(
                     bb,
                     fp,
                     ap,
-                    neighbor_graph_method,
+                    graph_method,
+                    do_atomic_virial,
                 )
             else:
                 # legacy dense-nlist path (builds the extended quartet)
@@ -346,6 +348,26 @@ def call_common(
             model_predict = self._output_type_cast(model_predict, input_prec)
             return model_predict
 
+        def _resolve_graph_method(
+            self, neighbor_graph_method: str | None
+        ) -> str | None:
+            """Resolve the neighbor-graph method.
+
+            ``None`` => AUTO: carry-all graph for graph-eligible mixed_types
+            descriptors (the decision-#17 default), else the legacy dense path.
+            ``"legacy"`` => force the dense path (opt-out). ``"dense"``/``"ase"``
+            => force the graph with that builder.
+            """
+            if neighbor_graph_method == "legacy":
+                return None
+            if neighbor_graph_method is not None:
+                return neighbor_graph_method
+            descriptor = self.atomic_model.descriptor
+            uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
+            if self.mixed_types() and uses_graph_lower():
+                return "dense"
+            return None
+
         def _call_common_graph(
             self,
             cc: Array,
@@ -354,6 +376,7 @@ def _call_common_graph(
             fp: Array | None,
             ap: Array | None,
             method: str,
+            do_atomic_virial: bool = False,
         ) -> dict[str, Array]:
             """Carry-all graph energy forward (opt-in, Option B).
 
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 9e9947511b..a6e9a340af 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -373,6 +373,54 @@ def forward_common_lower_graph(
                 out["energy_derv_c"] = atom_virial.reshape(nf, nloc, 1, 9)
             return out
 
+        def _call_common_graph(
+            self,
+            cc: torch.Tensor,
+            atype: torch.Tensor,
+            bb: torch.Tensor | None,
+            fp: torch.Tensor | None,
+            ap: torch.Tensor | None,
+            method: str,
+            do_atomic_virial: bool = False,
+        ) -> dict[str, torch.Tensor]:
+            """Carry-all graph forward with autograd force/virial (pt_expt override).
+
+            Builds the carry-all :class:`NeighborGraph` in TORCH (the array-API
+            builder runs natively and yields a differentiable ``edge_vec``), then
+            routes through :meth:`forward_common_lower_graph` so force / virial /
+            (optional) atom-virial are produced via autograd.  The returned dict
+            uses the SAME internal key names as the legacy dense
+            :meth:`call_common` output (``energy``, ``energy_redu``,
+            ``energy_derv_r``, ``energy_derv_c_redu``, and ``energy_derv_c`` when
+            ``do_atomic_virial``).
+            """
+            from deepmd.dpmodel.utils.neighbor_graph import (
+                build_neighbor_graph,
+                build_neighbor_graph_ase,
+            )
+
+            rcut = self.get_rcut()
+            if method == "dense":
+                ng = build_neighbor_graph(cc, atype, bb, rcut)
+            elif method == "ase":
+                ng = build_neighbor_graph_ase(cc, atype, bb, rcut)
+            else:
+                raise ValueError(
+                    f"unknown neighbor_graph_method {method!r}; use 'dense' or 'ase'"
+                )
+            nf, nloc = atype.shape[:2]
+            atype_flat = atype.reshape(nf * nloc)
+            return self.forward_common_lower_graph(
+                atype_flat,
+                ng.n_node,
+                ng.edge_index,
+                ng.edge_vec,
+                ng.edge_mask,
+                do_atomic_virial=do_atomic_virial,
+                fparam=fp,
+                aparam=ap,
+            )
+
         def forward_common_atomic(
             self,
             extended_coord: torch.Tensor,

From 291c4b007726a49eef27d102ecff01add360cc4d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 14:15:32 +0800
Subject: [PATCH 22/69] fix: guard descriptor-less atomic models in graph
 auto-resolve; pt_expt graph mask key; legacy opt-out in Option-B test

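- _resolve_graph_method/_call_common_graph look up the descriptor with
  getattr(self.atomic_model, "descriptor", None), so Linear/ZBL models (no
  single descriptor) are treated as graph-ineligible instead of raising
  AttributeError; in AUTO mode they fall back to the dense path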
- pt_expt _call_common_graph override adds the all-ones mask key for dense parity
- test_dpa1_graph_model_energy dense refs use neighbor_graph_method='legacy'
  to opt out of the now-default carry-all graph (decision #17 default-flip)
---
 deepmd/dpmodel/model/make_model.py            |  6 +++--
 deepmd/pt_expt/model/make_model.py            |  8 ++++++-
 .../dpmodel/test_dpa1_graph_model_energy.py   | 24 +++++++++----------
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index b78ea24ce3..21459ff263 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -362,7 +362,9 @@ def _resolve_graph_method(
                 return None
             if neighbor_graph_method is not None:
                 return neighbor_graph_method
-            descriptor = self.atomic_model.descriptor
+            # Linear/ZBL atomic models have no single ``descriptor`` -> not graph
+            # eligible (AUTO falls back to the dense path).
+            descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
             if self.mixed_types() and uses_graph_lower():
                 return "dense"
@@ -386,7 +388,7 @@ def _call_common_graph(
             (``atom_energy``, ``energy``, ``mask``). Input type-casting is done
             by the caller; output type-casting is also applied by the caller.
             """
-            descriptor = self.atomic_model.descriptor
+            descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
             if not (self.mixed_types() and uses_graph_lower()):
                 raise NotImplementedError(
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index a6e9a340af..5414a17582 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -410,7 +410,7 @@ def _call_common_graph(
                 )
             nf, nloc = atype.shape[:2]
             atype_flat = atype.reshape(nf * nloc)
-            return self.forward_common_lower_graph(
+            model_predict = self.forward_common_lower_graph(
                 atype_flat,
                 ng.n_node,
                 ng.edge_index,
@@ -420,6 +420,12 @@ def _call_common_graph(
                 fparam=fp,
                 aparam=ap,
             )
+            # carry-all graph: every local atom is real -> all-ones mask, matching
+            # the dense ``call_common`` output (which carries a ``mask`` key).
+            model_predict["mask"] = torch.ones(
+                (nf, nloc), dtype=torch.int32, device=atype.device
+            )
+            return model_predict
 
         def forward_common_atomic(
             self,
diff --git a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
index 4389ff6a17..8baac16818 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
@@ -1,18 +1,18 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Opt-in carry-all graph energy forward via ``neighbor_graph_method`` (Option B).
+"""Carry-all graph energy forward via ``neighbor_graph_method`` (Option B).
 
-PR-A 5c: ``CM.call_common`` gains a ``neighbor_graph_method`` keyword that,
-when set, routes a dpa1(``attn_layer == 0``) ENERGY forward through the
-carry-all graph builder + ``call_lower_graph`` instead of the dense nlist path.
+``CM.call_common`` routes a graph-eligible dpa1(``attn_layer == 0``) ENERGY
+forward through the carry-all graph builder + ``call_lower_graph``. Per the
+default-flip (decision #17) this is now the DEFAULT for eligible models;
+``neighbor_graph_method="legacy"`` opts out to the truncating dense nlist path,
+and ``"dense"``/``"ase"`` force the carry-all graph with that builder.
 
 Option-B behavior (decision #17 / spec_unified_edge_nlist):
 
-* non-binding ``sel`` -- the carry-all graph and the dense path see the SAME
-  neighbors, so ``energy``/``atom_energy`` are EXACTLY equal;
-* binding ``sel`` -- the carry-all graph keeps neighbors the dense path
+* non-binding ``sel`` -- the carry-all graph and the legacy dense path see the
+  SAME neighbors, so ``energy``/``atom_energy`` are EXACTLY equal;
+* binding ``sel`` -- the carry-all graph keeps neighbors the legacy dense path
   truncates, so energy DIFFERS (intended).
-
-The DEFAULT (``neighbor_graph_method=None``) keeps the dense path unchanged.
 """
 
 import numpy as np
@@ -68,7 +68,7 @@ def test_energy_parity_non_binding_sel(method, periodic) -> None:
     # LARGE sel -> non-binding (no truncation)
     model = _make_model([200])
 
-    dense = model.call_common(coord, atype, box)
+    dense = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
     graph = model.call_common(coord, atype, box, neighbor_graph_method=method)
 
     # dense energy keys: ``energy_redu`` (reduced, nf x 1) and ``energy``
@@ -103,7 +103,7 @@ def test_energy_parity_multiframe_periodic(method) -> None:
     # LARGE sel -> non-binding (no truncation)
     model = _make_model([200])
 
-    dense = model.call_common(coord, atype, box)
+    dense = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
     graph = model.call_common(coord, atype, box, neighbor_graph_method=method)
 
     np.testing.assert_allclose(
@@ -129,7 +129,7 @@ def test_binding_sel_carries_more_than_dense() -> None:
     # binding sel -> dense path truncates to 4 neighbors per atom
     model = _make_model([4])
 
-    dense = model.call_common(coord, atype, box)
+    dense = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
     graph = model.call_common(coord, atype, box, neighbor_graph_method="dense")
 
     assert not np.allclose(graph["energy_redu"], dense["energy_redu"])

From 18e26c53963949a5e8a2fbfc4dabd2e2da9f9456 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 14:26:06 +0800
Subject: [PATCH 23/69] fix: gate carry-all default-flip to pt_expt only;
 dpmodel/jax keep dense default

dpmodel/jax compute force/virial analytically inside call_common (energy_derv_r);
the energy-only graph lower drops it -> KeyError when force is requested. Only
pt_expt has the autograd graph force/virial path, so only pt_expt defaults
eligible models to the graph. dpmodel base _resolve_graph_method no longer
auto-routes; pt_expt overrides it to re-enable AUTO.
---
 deepmd/dpmodel/model/make_model.py | 23 ++++++++++-------------
 deepmd/pt_expt/model/make_model.py | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 13 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 21459ff263..c87da1a8b3 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -353,22 +353,19 @@ def _resolve_graph_method(
         ) -> str | None:
             """Resolve the neighbor-graph method.
 
-            ``None`` => AUTO: carry-all graph for graph-eligible mixed_types
-            descriptors (the decision-#17 default), else the legacy dense path.
-            ``"legacy"`` => force the dense path (opt-out). ``"dense"``/``"ase"``
-            => force the graph with that builder.
+            Base (dpmodel/jax): ``None`` => the dense path. These backends compute
+            force/virial ANALYTICALLY inside ``call_common`` (``energy_derv_r`` in
+            the output); the carry-all graph lower here is ENERGY-only, so it is
+            NOT used by default (it would drop force). ``"legacy"`` => dense;
+            explicit ``"dense"``/``"ase"`` => opt into the (energy-only) graph.
+
+            pt_expt OVERRIDES this so ``None`` defaults graph-eligible mixed_types
+            descriptors to the carry-all graph (decision #17) -- pt_expt has the
+            autograd ``forward_common_lower_graph`` that produces force/virial.
             """
             if neighbor_graph_method == "legacy":
                 return None
-            if neighbor_graph_method is not None:
-                return neighbor_graph_method
-            # Linear/ZBL atomic models have no single ``descriptor`` -> not graph
-            # eligible (AUTO falls back to the dense path).
-            descriptor = getattr(self.atomic_model, "descriptor", None)
-            uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
-            if self.mixed_types() and uses_graph_lower():
-                return "dense"
-            return None
+            return neighbor_graph_method
 
         def _call_common_graph(
             self,
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 5414a17582..7e54d643b4 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -373,6 +373,26 @@ def forward_common_lower_graph(
                 out["energy_derv_c"] = atom_virial.reshape(nf, nloc, 1, 9)
             return out
 
+        def _resolve_graph_method(
+            self, neighbor_graph_method: str | None
+        ) -> str | None:
+            """pt_expt default-flip (decision #17): ``None`` => carry-all graph for
+            graph-eligible mixed_types descriptors, else dense. Unlike dpmodel/jax,
+            pt_expt has the autograd ``forward_common_lower_graph`` that produces
+            force/virial on the graph, so the graph can be the DEFAULT here.
+            ``"legacy"`` forces dense; explicit ``"dense"``/``"ase"`` force the graph.
+            """
+            if neighbor_graph_method == "legacy":
+                return None
+            if neighbor_graph_method is not None:
+                return neighbor_graph_method
+            # Linear/ZBL atomic models have no single ``descriptor`` -> dense.
+            descriptor = getattr(self.atomic_model, "descriptor", None)
+            uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
+            if self.mixed_types() and uses_graph_lower():
+                return "dense"
+            return None
+
         def _call_common_graph(
             self,
             cc: torch.Tensor,

From b59f5dd9c6d2f4b967c2f14e26465dae0151338f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 14:55:16 +0800
Subject: [PATCH 24/69] feat(dpmodel): shape-static from_dense_quartet + static
 n_total -> dense call jit/export-traceable (decision #16)

---
 deepmd/dpmodel/descriptor/dpa1.py             | 17 ++++--
 .../dpmodel/utils/neighbor_graph/builder.py   | 52 +++++++++++++++++++
 source/tests/pt_expt/descriptor/test_dpa1.py  | 10 ----
 3 files changed, 66 insertions(+), 13 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index c473b40e11..56f922d5f7 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -580,7 +580,13 @@ def call(
                 )
             else:
                 mapping_g = xp.reshape(mapping, (nf, nall))
-            graph = from_dense_quartet(coord_ext_3, nlist, mapping_g)
+            # shape-static converter (compact=False, layout=None) so this dense
+            # adapter is jit/export-traceable: no nonzero-compaction (data-dependent
+            # shape). Masked invalid edges contribute zero in call_graph's
+            # segment_sum, so the descriptor output is unchanged.
+            graph = from_dense_quartet(
+                coord_ext_3, nlist, mapping_g, layout=None, compact=False
+            )
             # local atom types, flat (nf * nloc,)
             atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
             grrg, rot_mat = self.call_graph(
@@ -668,7 +674,10 @@ def call_graph(
             graph, atype, type_embedding=type_embedding
         )
         nf = graph.n_node.shape[0]
-        nloc = int(graph.n_node[0])
+        # atype is the flat (nf*nloc,) node axis; derive nloc from the STATIC shape
+        # (n_node[i] == nloc for all frames by contract) so this adapter stays
+        # jit/export-traceable (no concretize of n_node).
+        nloc = atype.shape[0] // nf
         ng = self.se_atten.neuron[-1]
         axis = self.se_atten.axis_neuron
         grrg = xp.reshape(grrg_node, (nf, nloc, ng * axis))
@@ -1417,7 +1426,9 @@ def _call_graph(
             raise ValueError("type_embedding is required for the graph path")
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
-        n_total = int(xp.sum(graph.n_node))
+        # N == sum(graph.n_node) by contract (atype is (N,)); use the static shape
+        # value so the kernel stays jit/export-traceable (no concretize of n_node).
+        n_total = atype.shape[0]
         src = graph.edge_index[0, :]
         dst = graph.edge_index[1, :]
         atype = xp.asarray(atype, device=dev)
diff --git a/deepmd/dpmodel/utils/neighbor_graph/builder.py b/deepmd/dpmodel/utils/neighbor_graph/builder.py
index 9a10d3f805..88e4ef0bb4 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/builder.py
@@ -56,6 +56,7 @@ def from_dense_quartet(
     nlist: Array,
     mapping: Array,
     layout: GraphLayout | None = None,
+    compact: bool = True,
 ) -> NeighborGraph:
     """Convert a legacy extended quartet into a ghost-free NeighborGraph (CONVERTER).
 
@@ -85,6 +86,15 @@ def from_dense_quartet(
         (nf, nall) extended -> local-owner index (local atoms map to themselves).
     layout
         edge-axis length policy; ``None`` => dynamic (torch) with ``min_edges`` guards.
+    compact
+        If True (default), COMPACT real edges with ``nonzero`` and pad/guard via
+        :func:`pad_and_guard_edges` -- the data-dependent output shape breaks
+        jax.jit / torch.export. If False, emit a SHAPE-STATIC graph: every nlist
+        slot becomes an edge (``E = nf * nloc * nsel``, a static shape), invalid
+        slots (``nlist == -1``) get ``edge_mask=False``, zero ``edge_vec`` and a
+        ``src`` pointing at the center (in-range, masked) -- so no ``nonzero`` is
+        used and the converter is jit/export-traceable. The masked edges contribute
+        zero in a downstream ``segment_sum``, so the descriptor output is unchanged.
     """
     if layout is None:
         layout = GraphLayout()
@@ -92,6 +102,48 @@ def from_dense_quartet(
     dev = array_api_compat.device(extended_coord)
     nf, nloc, nsel = nlist.shape
     nall = extended_coord.shape[1]
+    if not compact:
+        if layout.edge_capacity is not None:
+            raise NotImplementedError(
+                "shape-static from_dense_quartet pads to E=nf*nloc*nsel; "
+                "edge_capacity unsupported here"
+            )
+        # (E,) flat grids, E = nf*nloc*nsel, row-major (frame, center, slot)
+        ff = xp.reshape(
+            xp.broadcast_to(
+                xp.reshape(xp.arange(nf, dtype=xp.int64, device=dev), (nf, 1, 1)),
+                (nf, nloc, nsel),
+            ),
+            (-1,),
+        )
+        center = xp.reshape(
+            xp.broadcast_to(
+                xp.reshape(xp.arange(nloc, dtype=xp.int64, device=dev), (1, nloc, 1)),
+                (nf, nloc, nsel),
+            ),
+            (-1,),
+        )
+        nl = xp.reshape(nlist, (-1,))  # neighbor ext idx or -1
+        valid = nl >= 0  # (E,) bool <-- the mask
+        j_safe = xp.where(valid, nl, xp.zeros_like(nl))  # clamp -1 -> 0 (avoid OOB)
+        ec_flat = xp.reshape(extended_coord, (nf * nall, 3))
+        map_flat = xp.reshape(mapping, (nf * nall,))
+        g_nei = ff * nall + j_safe
+        g_cen = ff * nall + center
+        src_local = xp.take(map_flat, g_nei, axis=0)
+        edge_vec = xp.take(ec_flat, g_nei, axis=0) - xp.take(ec_flat, g_cen, axis=0)
+        edge_vec = edge_vec * xp.astype(valid[:, None], edge_vec.dtype)  # zero invalid
+        src = xp.where(valid, ff * nloc + src_local, ff * nloc + center)  # -> center
+        dst = ff * nloc + center
+        edge_index = xp.astype(xp.stack([src, dst], axis=0), xp.int64)
+        edge_mask = valid
+        n_node = xp.full((nf,), nloc, dtype=xp.int64, device=dev)
+        return NeighborGraph(
+            n_node=n_node,
+            edge_index=edge_index,
+            edge_vec=edge_vec,
+            edge_mask=edge_mask,
+        )
     # per-slot (nf, nloc, nsel) index grids, flattened frame-major
     ff_grid = xp.broadcast_to(
         xp.reshape(xp.arange(nf, dtype=xp.int64, device=dev), (nf, 1, 1)),
diff --git a/source/tests/pt_expt/descriptor/test_dpa1.py b/source/tests/pt_expt/descriptor/test_dpa1.py
index f7c3edb9c2..d7d2718e67 100644
--- a/source/tests/pt_expt/descriptor/test_dpa1.py
+++ b/source/tests/pt_expt/descriptor/test_dpa1.py
@@ -252,16 +252,6 @@ def fn(coord_ext, atype_ext, nlist):
             atol=atol,
         )
 
-    @pytest.mark.xfail(
-        reason=(
-            "graph forward fx-trace lands in PR-B: the attn_layer=0 graph path "
-            "has data-dependent ops that default (real-tensor) make_fx cannot "
-            "trace -- `int(xp.sum(graph.n_node))` (dpa1.py _call_graph) and "
-            "`xp.nonzero` in from_dense_quartet. PR-B exports via dynamic/symbolic "
-            "shapes. Eager forward+grad still runs correctly (asserted below)."
-        ),
-        strict=False,
-    )
     @pytest.mark.parametrize("prec", ["float64"])  # precision
     def test_make_fx_graph(self, prec) -> None:
         """make_fx (export-readiness) of the attn_layer=0 GRAPH forward.

From 663cca6e6b7b795811403e68bd5f7ab21448d36c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 17:21:25 +0800
Subject: [PATCH 25/69] refactor(dpmodel): explicit if/else for graph-vs-dense
 routing in DescrptDPA1.call

---
 deepmd/dpmodel/descriptor/dpa1.py | 47 ++++++++++++++++---------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 56f922d5f7..e72cd55b34 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -611,29 +611,32 @@ def call(
             sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
             sw = xp.reshape(sw, (nf, nloc, nnei, 1))
             return grrg, rot_mat, None, None, sw
-        del mapping
-        type_embedding = self.type_embedding.call()
-        # nf x nall x tebd_dim
-        atype_embd_ext = xp.reshape(
-            xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0),
-            (nf, nall, self.tebd_dim),
-        )
-        # nfnl x tebd_dim
-        atype_embd = xp_take_first_n(atype_embd_ext, 1, nloc)
-        grrg, g2, h2, rot_mat, sw = self.se_atten(
-            nlist,
-            coord_ext,
-            atype_ext,
-            atype_embd_ext,
-            mapping=None,
-            type_embedding=type_embedding,
-        )
-        # nf x nloc x (ng x ng1 + tebd_dim)
-        if self.concat_output_tebd:
-            grrg = xp.concat(
-                [grrg, xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))], axis=-1
+        else:
+            # legacy dense body (attention, strip tebd, exclude_types, or the
+            # ghost case with no mapping) -- kept working unchanged.
+            del mapping
+            type_embedding = self.type_embedding.call()
+            # nf x nall x tebd_dim
+            atype_embd_ext = xp.reshape(
+                xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0),
+                (nf, nall, self.tebd_dim),
             )
-        return grrg, rot_mat, None, None, sw
+            # nfnl x tebd_dim
+            atype_embd = xp_take_first_n(atype_embd_ext, 1, nloc)
+            grrg, g2, h2, rot_mat, sw = self.se_atten(
+                nlist,
+                coord_ext,
+                atype_ext,
+                atype_embd_ext,
+                mapping=None,
+                type_embedding=type_embedding,
+            )
+            # nf x nloc x (ng x ng1 + tebd_dim)
+            if self.concat_output_tebd:
+                grrg = xp.concat(
+                    [grrg, xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))], axis=-1
+                )
+            return grrg, rot_mat, None, None, sw
 
     def call_graph(
         self,

From 37f9d4a7a14a09664e080cf243f35c911279d9f8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 17:32:57 +0800
Subject: [PATCH 26/69] fix(dpmodel): address OutisLi #5581 review (spec refs,
 jax int-sum, Array typing)

- swap dangling memory/spec_unified_edge_nlist.md refs -> public design
  discussion (wanghan-iapcm/deepmd-kit#4) so the references resolve
- edge_force_virial: short-circuit n_out=int(node_capacity) when supplied so
  the static jax/export path never calls int() on a traced sum(n_node)
- derivatives.py: move Array import under TYPE_CHECKING (+ from __future__
  import annotations) for subpackage uniformity
---
 .../dpmodel/utils/neighbor_graph/__init__.py  |  2 +-
 .../dpmodel/utils/neighbor_graph/builder.py   |  2 +-
 .../utils/neighbor_graph/derivatives.py       | 22 ++++++++++++++-----
 deepmd/dpmodel/utils/neighbor_graph/graph.py  |  2 +-
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index 27e49a4e18..b20c321dd4 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -6,7 +6,7 @@
 + edge padding), ``builder`` (the carry-all ``build_neighbor_graph`` dispatcher +
 the ``from_dense_quartet`` legacy converter), ``segment`` (mask-aware
 segment-reduction toolkit), and ``derivatives`` (edge force/virial assembly).
-See memory/spec_unified_edge_nlist.md.
+See the design discussion wanghan-iapcm/deepmd-kit#4.
 """
 
 from .ase_builder import (
diff --git a/deepmd/dpmodel/utils/neighbor_graph/builder.py b/deepmd/dpmodel/utils/neighbor_graph/builder.py
index 88e4ef0bb4..e3d8229ce4 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/builder.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """Builders/converters that produce a :class:`NeighborGraph`.
 
-Two distinct groups (see memory/spec_unified_edge_nlist.md decision #17), kept
+Two distinct groups (see the design discussion wanghan-iapcm/deepmd-kit#4 decision #17), kept
 separate so a consumer can never assume completeness while a function silently
 truncated:
 
diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index 1c0bafc234..a7c73eaaf5 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -4,7 +4,7 @@
 The autograd that produces g_e (grad(E, edge_vec)) is wired in the torch/jax
 backend later; this pure-array-API assembly is shared by all backends.
 
-Conventions (see memory/spec_unified_edge_nlist.md):
+Conventions (see the unified edge-nlist design discussion, wanghan-iapcm/deepmd-kit#4):
   edge_vec_e = r_src - r_dst ;  F_k = sum_{dst=k} g - sum_{src=k} g
   per-edge virial w_e = -g_e (x) edge_vec_e
   atom virial attributed FULL-TO-src (canonical TF==pt-legacy convention)
@@ -13,16 +13,25 @@
 Padding/guard edges (edge_mask == 0) are zeroed before any scatter.
 """
 
-import array_api_compat
+from __future__ import (
+    annotations,
+)
 
-from deepmd.dpmodel.array_api import (
-    Array,
+from typing import (
+    TYPE_CHECKING,
 )
 
+import array_api_compat
+
 from .segment import (
     segment_sum,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
 
 def edge_force_virial(
     g_e: Array,
@@ -62,8 +71,9 @@ def edge_force_virial(
         frame via the frame of their ``dst`` node.
     """
     xp = array_api_compat.array_namespace(g_e)
-    n_real = int(xp.sum(n_node))  # real node count
-    n_out = n_real if node_capacity is None else int(node_capacity)  # node-axis size
+    # node-axis size; when a static ``node_capacity`` is supplied (the jax/export
+    # path) short-circuit so we never call int() on the traced ``sum(n_node)``.
+    n_out = int(node_capacity) if node_capacity is not None else int(xp.sum(n_node))
     nf = n_node.shape[0]
     # zero padding/guard contributions; cast mask to g's dtype (array-API pure,
     # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict)
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index 232145bda0..eb0e4f7c38 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """Backend-agnostic edge-graph neighbor-list contract (NeighborGraph) and its
-length policy (GraphLayout). See memory/spec_unified_edge_nlist.md.
+length policy (GraphLayout). See the design discussion wanghan-iapcm/deepmd-kit#4.
 
 Node validity (real vs padding) is NOT a stored field: it is derived as
 ``arange(N) < sum(n_node)`` because ``n_node`` already encodes the real-node

From c2e0d960dfacc7f063151977d8688e5bd82a1ca8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 25 Jun 2026 09:42:55 +0000
Subject: [PATCH 27/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/dpmodel/utils/neighbor_graph/ase_builder.py   | 1 +
 source/tests/common/dpmodel/test_edge_env_mat.py     | 8 ++++++--
 source/tests/common/dpmodel/test_from_ijs.py         | 4 +++-
 source/tests/pt_expt/model/test_edge_energy_deriv.py | 8 ++++++--
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
index eaa01359ee..364c1a3c0b 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
@@ -28,6 +28,7 @@
     from deepmd.dpmodel.array_api import (
         Array,
     )
+
     from .graph import (
         GraphLayout,
         NeighborGraph,
diff --git a/source/tests/common/dpmodel/test_edge_env_mat.py b/source/tests/common/dpmodel/test_edge_env_mat.py
index 486a5d226f..ca6884de39 100644
--- a/source/tests/common/dpmodel/test_edge_env_mat.py
+++ b/source/tests/common/dpmodel/test_edge_env_mat.py
@@ -3,7 +3,9 @@
 
 import numpy as np
 
-from deepmd.dpmodel.utils.env_mat import EnvMat
+from deepmd.dpmodel.utils.env_mat import (
+    EnvMat,
+)
 from deepmd.dpmodel.utils.neighbor_graph import (
     edge_env_mat,
     from_dense_quartet,
@@ -58,7 +60,9 @@ def test_slot_broadcast_stats(self) -> None:
         This property is what allows edge_env_mat to use (ntypes, 4) stats
         instead of (ntypes, nnei, 4) stats.
         """
-        from deepmd.dpmodel.descriptor import DescrptDPA1
+        from deepmd.dpmodel.descriptor import (
+            DescrptDPA1,
+        )
 
         rng = np.random.default_rng(42)
         nloc = 6
diff --git a/source/tests/common/dpmodel/test_from_ijs.py b/source/tests/common/dpmodel/test_from_ijs.py
index f4c965372b..bab616e452 100644
--- a/source/tests/common/dpmodel/test_from_ijs.py
+++ b/source/tests/common/dpmodel/test_from_ijs.py
@@ -4,7 +4,9 @@
 import numpy as np
 import pytest
 
-from deepmd.dpmodel.utils.neighbor_graph import neighbor_graph_from_ijs
+from deepmd.dpmodel.utils.neighbor_graph import (
+    neighbor_graph_from_ijs,
+)
 
 
 class TestFromIjs(unittest.TestCase):
diff --git a/source/tests/pt_expt/model/test_edge_energy_deriv.py b/source/tests/pt_expt/model/test_edge_energy_deriv.py
index 9fc8dadc4e..fafc8ac180 100644
--- a/source/tests/pt_expt/model/test_edge_energy_deriv.py
+++ b/source/tests/pt_expt/model/test_edge_energy_deriv.py
@@ -3,8 +3,12 @@
 
 import torch
 
-from deepmd.pt.utils import env
-from deepmd.pt_expt.model.edge_transform_output import edge_energy_deriv
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt_expt.model.edge_transform_output import (
+    edge_energy_deriv,
+)
 
 
 class TestEdgeEnergyDeriv(unittest.TestCase):

From c9c8c2121e3bd16be2375e84c37cfc7a7f2e3d76 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 18:00:14 +0800
Subject: [PATCH 28/69] test(pt_expt): parametrize periodic/do_av in
 test_dpa1_graph_lower (drop for-loops + subTest)

---
 .../pt_expt/model/test_dpa1_graph_lower.py    | 116 ++++++++----------
 1 file changed, 50 insertions(+), 66 deletions(-)

diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index b192a66b86..ddaf450faf 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -15,9 +15,8 @@
 reduced (per-frame) virial are frame/local quantities and compare directly.
 """
 
-import unittest
-
 import numpy as np
+import pytest
 import torch
 
 from deepmd.dpmodel.utils.neighbor_graph import (
@@ -64,8 +63,8 @@ def _fold_extended_to_local(
     return out
 
 
-class TestDpa1GraphLower(unittest.TestCase):
-    def setUp(self) -> None:
+class TestDpa1GraphLower:
+    def setup_method(self) -> None:
         self.device = env.DEVICE
         self.natoms = 5
         self.rcut = 4.0
@@ -166,7 +165,9 @@ def _prepare_lower_inputs(self, periodic: bool):
         mapping_t = torch.tensor(mapping, dtype=torch.int64, device=self.device)
         return ext_coord, ext_atype, nlist_t, mapping_t
 
-    def test_force_virial_parity_vs_legacy(self) -> None:
+    @pytest.mark.parametrize("periodic", [True, False])  # PBC vs non-PBC
+    @pytest.mark.parametrize("do_av", [False, True])  # atom-virial off / on
+    def test_force_virial_parity_vs_legacy(self, periodic, do_av) -> None:
         """Graph lower energy/force/virial/atom_virial == legacy dense lower on
         the SAME neighbor set (regime-1 graph from from_dense_quartet).
         """
@@ -177,64 +178,47 @@ def test_force_virial_parity_vs_legacy(self) -> None:
             if self.device.type == "cpu"
             else {"rtol": 1e-10, "atol": 1e-10}
         )
-        for periodic in (True, False):
-            for do_av in (False, True):
-                with self.subTest(periodic=periodic, do_av=do_av):
-                    ext_coord, ext_atype, nlist, mapping = self._prepare_lower_inputs(
-                        periodic
-                    )
-                    nf = ext_coord.shape[0]
-                    nloc = self.natoms
-
-                    legacy = model.forward_common_lower(
-                        ext_coord.clone().requires_grad_(True),
-                        ext_atype,
-                        nlist,
-                        mapping,
-                        do_atomic_virial=do_av,
-                    )
-
-                    # build the regime-1 graph from the SAME extended quartet.
-                    # from_dense_quartet is array-API; feed torch tensors so the
-                    # returned edge_vec is already a torch tensor on env.DEVICE.
-                    ng = from_dense_quartet(ext_coord, nlist, mapping)
-                    atype_local = ext_atype[:, :nloc].reshape(nf * nloc)
-                    graph = model.forward_common_lower_graph(
-                        atype_local,
-                        ng.n_node,
-                        ng.edge_index,
-                        ng.edge_vec,
-                        ng.edge_mask,
-                        do_atomic_virial=do_av,
-                    )
-
-                    # energy / reduced energy / reduced virial: direct compare
-                    torch.testing.assert_close(graph["energy"], legacy["energy"], **tol)
-                    torch.testing.assert_close(
-                        graph["energy_redu"], legacy["energy_redu"], **tol
-                    )
-                    torch.testing.assert_close(
-                        graph["energy_derv_c_redu"],
-                        legacy["energy_derv_c_redu"],
-                        **tol,
-                    )
-
-                    # force: fold legacy extended (nall) -> local (nloc)
-                    legacy_force_local = _fold_extended_to_local(
-                        legacy["energy_derv_r"], mapping, nloc
-                    )
-                    torch.testing.assert_close(
-                        graph["energy_derv_r"], legacy_force_local, **tol
-                    )
-
-                    if do_av:
-                        legacy_av_local = _fold_extended_to_local(
-                            legacy["energy_derv_c"], mapping, nloc
-                        )
-                        torch.testing.assert_close(
-                            graph["energy_derv_c"], legacy_av_local, **tol
-                        )
-
-
-if __name__ == "__main__":
-    unittest.main()
+        ext_coord, ext_atype, nlist, mapping = self._prepare_lower_inputs(periodic)
+        nf = ext_coord.shape[0]
+        nloc = self.natoms
+
+        legacy = model.forward_common_lower(
+            ext_coord.clone().requires_grad_(True),
+            ext_atype,
+            nlist,
+            mapping,
+            do_atomic_virial=do_av,
+        )
+
+        # build the regime-1 graph from the SAME extended quartet.
+        # from_dense_quartet is array-API; feed torch tensors so the
+        # returned edge_vec is already a torch tensor on env.DEVICE.
+        ng = from_dense_quartet(ext_coord, nlist, mapping)
+        atype_local = ext_atype[:, :nloc].reshape(nf * nloc)
+        graph = model.forward_common_lower_graph(
+            atype_local,
+            ng.n_node,
+            ng.edge_index,
+            ng.edge_vec,
+            ng.edge_mask,
+            do_atomic_virial=do_av,
+        )
+
+        # energy / reduced energy / reduced virial: direct compare
+        torch.testing.assert_close(graph["energy"], legacy["energy"], **tol)
+        torch.testing.assert_close(graph["energy_redu"], legacy["energy_redu"], **tol)
+        torch.testing.assert_close(
+            graph["energy_derv_c_redu"], legacy["energy_derv_c_redu"], **tol
+        )
+
+        # force: fold legacy extended (nall) -> local (nloc)
+        legacy_force_local = _fold_extended_to_local(
+            legacy["energy_derv_r"], mapping, nloc
+        )
+        torch.testing.assert_close(graph["energy_derv_r"], legacy_force_local, **tol)
+
+        if do_av:
+            legacy_av_local = _fold_extended_to_local(
+                legacy["energy_derv_c"], mapping, nloc
+            )
+            torch.testing.assert_close(graph["energy_derv_c"], legacy_av_local, **tol)

From f7e84fc1a05992cb945c7d1724a99bc83b66d678 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 18:02:07 +0800
Subject: [PATCH 29/69] refactor(dpmodel): explicit if/else for compact vs
 shape-static in from_dense_quartet

---
 .../dpmodel/utils/neighbor_graph/builder.py   | 78 ++++++++++---------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/deepmd/dpmodel/utils/neighbor_graph/builder.py b/deepmd/dpmodel/utils/neighbor_graph/builder.py
index e3d8229ce4..a0343ecf68 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/builder.py
@@ -144,43 +144,47 @@ def from_dense_quartet(
             edge_vec=edge_vec,
             edge_mask=edge_mask,
         )
-    # per-slot (nf, nloc, nsel) index grids, flattened frame-major
-    ff_grid = xp.broadcast_to(
-        xp.reshape(xp.arange(nf, dtype=xp.int64, device=dev), (nf, 1, 1)),
-        (nf, nloc, nsel),
-    )
-    center_grid = xp.broadcast_to(
-        xp.reshape(xp.arange(nloc, dtype=xp.int64, device=dev), (1, nloc, 1)),
-        (nf, nloc, nsel),
-    )
-    ff_flat = xp.reshape(ff_grid, (-1,))
-    center_flat = xp.reshape(center_grid, (-1,))
-    nl_flat = xp.reshape(nlist, (-1,))
-    keep = xp.reshape(xp.nonzero(nl_flat >= 0)[0], (-1,))
-    ff_k = xp.take(ff_flat, keep, axis=0)
-    dst_local = xp.take(center_flat, keep, axis=0)  # center index in [0, nloc)
-    j_ext = xp.take(nl_flat, keep, axis=0)  # neighbor index in [0, nall)
-    # cross-frame gathers via flat (frame * nall + idx) indices; centers are the
-    # first nloc extended atoms (local atoms precede ghosts).
-    ec_flat = xp.reshape(extended_coord, (nf * nall, 3))
-    map_flat = xp.reshape(mapping, (nf * nall,))
-    g_nei = ff_k * nall + j_ext
-    g_cen = ff_k * nall + dst_local
-    src_local = xp.take(map_flat, g_nei, axis=0)  # local owner of the neighbor
-    edge_vec = xp.take(ec_flat, g_nei, axis=0) - xp.take(ec_flat, g_cen, axis=0)
-    edge_index = xp.astype(
-        xp.stack([ff_k * nloc + src_local, ff_k * nloc + dst_local], axis=0), xp.int64
-    )
-    edge_index, edge_vec, edge_mask = pad_and_guard_edges(
-        edge_index, edge_vec, layout.edge_capacity, layout.min_edges
-    )
-    n_node = xp.full((nf,), nloc, dtype=xp.int64, device=dev)
-    return NeighborGraph(
-        n_node=n_node,
-        edge_index=edge_index,
-        edge_vec=edge_vec,
-        edge_mask=edge_mask,
-    )
+    else:
+        # COMPACT: drop invalid slots via nonzero (dynamic shape -> eager only,
+        # NOT jit/export-traceable) then pad/guard.
+        # per-slot (nf, nloc, nsel) index grids, flattened frame-major
+        ff_grid = xp.broadcast_to(
+            xp.reshape(xp.arange(nf, dtype=xp.int64, device=dev), (nf, 1, 1)),
+            (nf, nloc, nsel),
+        )
+        center_grid = xp.broadcast_to(
+            xp.reshape(xp.arange(nloc, dtype=xp.int64, device=dev), (1, nloc, 1)),
+            (nf, nloc, nsel),
+        )
+        ff_flat = xp.reshape(ff_grid, (-1,))
+        center_flat = xp.reshape(center_grid, (-1,))
+        nl_flat = xp.reshape(nlist, (-1,))
+        keep = xp.reshape(xp.nonzero(nl_flat >= 0)[0], (-1,))
+        ff_k = xp.take(ff_flat, keep, axis=0)
+        dst_local = xp.take(center_flat, keep, axis=0)  # center index in [0, nloc)
+        j_ext = xp.take(nl_flat, keep, axis=0)  # neighbor index in [0, nall)
+        # cross-frame gathers via flat (frame * nall + idx) indices; centers are
+        # the first nloc extended atoms (local atoms precede ghosts).
+        ec_flat = xp.reshape(extended_coord, (nf * nall, 3))
+        map_flat = xp.reshape(mapping, (nf * nall,))
+        g_nei = ff_k * nall + j_ext
+        g_cen = ff_k * nall + dst_local
+        src_local = xp.take(map_flat, g_nei, axis=0)  # local owner of the neighbor
+        edge_vec = xp.take(ec_flat, g_nei, axis=0) - xp.take(ec_flat, g_cen, axis=0)
+        edge_index = xp.astype(
+            xp.stack([ff_k * nloc + src_local, ff_k * nloc + dst_local], axis=0),
+            xp.int64,
+        )
+        edge_index, edge_vec, edge_mask = pad_and_guard_edges(
+            edge_index, edge_vec, layout.edge_capacity, layout.min_edges
+        )
+        n_node = xp.full((nf,), nloc, dtype=xp.int64, device=dev)
+        return NeighborGraph(
+            n_node=n_node,
+            edge_index=edge_index,
+            edge_vec=edge_vec,
+            edge_mask=edge_mask,
+        )
 
 
 def build_neighbor_graph(

From 5a487c73424318fdcefa19413f7c1a3b62d47888 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 18:06:30 +0800
Subject: [PATCH 30/69] refactor(dpmodel): extract DescrptDPA1.call into thin
 dispatcher + _call_graph_adapter/_call_dense

---
 deepmd/dpmodel/descriptor/dpa1.py | 172 +++++++++++++++++-------------
 1 file changed, 98 insertions(+), 74 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index e72cd55b34..768a04a2e7 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -557,86 +557,110 @@ def call(
             The smooth switch function.
         """
         xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
-        nf, nloc, nnei = nlist.shape
-        nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
-        # graph-eligible configs route through the graph-native path; the dense
-        # call is a thin adapter (decision #14: graph = single math source) that
-        # preserves the dense 5-tuple ABI exactly (see call_graph). Ineligible
+        nloc = nlist.shape[1]
+        nall = xp.reshape(coord_ext, (nlist.shape[0], -1)).shape[1] // 3
+        # graph-eligible configs route through the graph-native adapter (decision
+        # #14: graph = single math source, dense call = thin adapter). Ineligible
         # configs (attention, strip tebd, exclude_types) and the ghost case with
-        # no mapping fall back to the legacy dense body below, so those models
-        # keep working unchanged. The graph needs `mapping` to fold ghosts to
-        # local owners; without it only the no-ghost case (nall == nloc) is valid.
+        # no mapping fall back to the legacy dense body. The graph needs `mapping`
+        # to fold ghosts to local owners; without it only nall == nloc is valid.
         if self.uses_graph_lower() and (mapping is not None or nall == nloc):
-            from deepmd.dpmodel.utils.neighbor_graph import (
-                from_dense_quartet,
-            )
+            return self._call_graph_adapter(coord_ext, atype_ext, nlist, mapping)
+        else:
+            return self._call_dense(coord_ext, atype_ext, nlist)
 
-            dev = array_api_compat.device(coord_ext)
-            coord_ext_3 = xp.reshape(coord_ext, (nf, nall, 3))
-            if mapping is None:
-                # default identity mapping (ext == loc, e.g. no-PBC nall == nloc)
-                mapping_g = xp.broadcast_to(
-                    xp.arange(nall, dtype=xp.int64, device=dev)[None, :], (nf, nall)
-                )
-            else:
-                mapping_g = xp.reshape(mapping, (nf, nall))
-            # shape-static converter (compact=False, layout=None) so this dense
-            # adapter is jit/export-traceable: no nonzero-compaction (data-dependent
-            # shape). Masked invalid edges contribute zero in call_graph's
-            # segment_sum, so the descriptor output is unchanged.
-            graph = from_dense_quartet(
-                coord_ext_3, nlist, mapping_g, layout=None, compact=False
-            )
-            # local atom types, flat (nf * nloc,)
-            atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
-            grrg, rot_mat = self.call_graph(
-                graph,
-                atype_local,
-                type_embedding=self.type_embedding.call(),
-            )
-            # reconstruct the dense-shaped sw exactly the dense way (env_mat
-            # switch masked where nlist == -1; the graph path forbids
-            # exclude_types, so nlist_mask == nlist != -1, matching
-            # DescrptBlockSeAtten.call). This is a dense-layout artifact tied to
-            # neighbor slots, which the graph does not carry, so it lives here in
-            # the dense adapter (which has nlist/coord_ext available).
-            _, _, sw = self.se_atten.env_mat.call(
-                coord_ext,
-                atype_ext,
-                nlist,
-                self.se_atten.mean[...],
-                self.se_atten.stddev[...],
+    def _call_graph_adapter(
+        self,
+        coord_ext: Array,
+        atype_ext: Array,
+        nlist: Array,
+        mapping: Array | None,
+    ) -> Array:
+        """Regime-1 dense->graph adapter (the eligible ``call`` path).
+
+        Builds a NeighborGraph from the dense quartet with the SHAPE-STATIC
+        converter (``compact=False``, so this is jit/export-traceable -- no
+        ``nonzero``), runs :meth:`call_graph`, and reconstructs the dense-shaped
+        ``sw``. Preserves the dense 5-tuple ABI exactly; masked invalid edges
+        contribute zero in ``call_graph``'s ``segment_sum`` so the output is
+        identical to the legacy dense body.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            from_dense_quartet,
+        )
+
+        xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
+        dev = array_api_compat.device(coord_ext)
+        nf, nloc, nnei = nlist.shape
+        nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
+        coord_ext_3 = xp.reshape(coord_ext, (nf, nall, 3))
+        if mapping is None:
+            # default identity mapping (ext == loc, e.g. no-PBC nall == nloc)
+            mapping_g = xp.broadcast_to(
+                xp.arange(nall, dtype=xp.int64, device=dev)[None, :], (nf, nall)
             )
-            nlist_mask = (nlist != -1)[:, :, :, None]
-            sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
-            sw = xp.reshape(sw, (nf, nloc, nnei, 1))
-            return grrg, rot_mat, None, None, sw
         else:
-            # legacy dense body (attention, strip tebd, exclude_types, or the
-            # ghost case with no mapping) -- kept working unchanged.
-            del mapping
-            type_embedding = self.type_embedding.call()
-            # nf x nall x tebd_dim
-            atype_embd_ext = xp.reshape(
-                xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0),
-                (nf, nall, self.tebd_dim),
-            )
-            # nfnl x tebd_dim
-            atype_embd = xp_take_first_n(atype_embd_ext, 1, nloc)
-            grrg, g2, h2, rot_mat, sw = self.se_atten(
-                nlist,
-                coord_ext,
-                atype_ext,
-                atype_embd_ext,
-                mapping=None,
-                type_embedding=type_embedding,
+            mapping_g = xp.reshape(mapping, (nf, nall))
+        graph = from_dense_quartet(
+            coord_ext_3, nlist, mapping_g, layout=None, compact=False
+        )
+        # local atom types, flat (nf * nloc,)
+        atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
+        grrg, rot_mat = self.call_graph(
+            graph,
+            atype_local,
+            type_embedding=self.type_embedding.call(),
+        )
+        # reconstruct the dense-shaped sw the dense way (env_mat switch masked
+        # where nlist == -1; the graph path forbids exclude_types, so nlist_mask
+        # == nlist != -1, matching DescrptBlockSeAtten.call). A dense-layout
+        # artifact tied to neighbor slots, which the graph does not carry.
+        _, _, sw = self.se_atten.env_mat.call(
+            coord_ext,
+            atype_ext,
+            nlist,
+            self.se_atten.mean[...],
+            self.se_atten.stddev[...],
+        )
+        nlist_mask = (nlist != -1)[:, :, :, None]
+        sw = xp.where(nlist_mask, sw, xp.zeros_like(sw))
+        sw = xp.reshape(sw, (nf, nloc, nnei, 1))
+        return grrg, rot_mat, None, None, sw
+
+    def _call_dense(
+        self,
+        coord_ext: Array,
+        atype_ext: Array,
+        nlist: Array,
+    ) -> Array:
+        """Legacy dense descriptor body (the ineligible ``call`` path: attention,
+        strip tebd, exclude_types, or the no-mapping ghost case).
+        """
+        xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
+        nf, nloc = nlist.shape[:2]
+        nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
+        type_embedding = self.type_embedding.call()
+        # nf x nall x tebd_dim
+        atype_embd_ext = xp.reshape(
+            xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0),
+            (nf, nall, self.tebd_dim),
+        )
+        # nfnl x tebd_dim
+        atype_embd = xp_take_first_n(atype_embd_ext, 1, nloc)
+        grrg, g2, h2, rot_mat, sw = self.se_atten(
+            nlist,
+            coord_ext,
+            atype_ext,
+            atype_embd_ext,
+            mapping=None,
+            type_embedding=type_embedding,
+        )
+        # nf x nloc x (ng x ng1 + tebd_dim)
+        if self.concat_output_tebd:
+            grrg = xp.concat(
+                [grrg, xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))], axis=-1
             )
-            # nf x nloc x (ng x ng1 + tebd_dim)
-            if self.concat_output_tebd:
-                grrg = xp.concat(
-                    [grrg, xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))], axis=-1
-                )
-            return grrg, rot_mat, None, None, sw
+        return grrg, rot_mat, None, None, sw
 
     def call_graph(
         self,

From 7cfb2a863e561733006f4aaedc7949f7e6c220f5 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 21:45:14 +0800
Subject: [PATCH 31/69] feat: general output transform for the graph path
 (support all fitting keys, not just energy)

call_lower_graph now reduces via the general fit_output_to_model_output (any
fitting output, not hard-coded 'energy'); pt_expt adds
fit_output_to_model_output_graph mirroring take_deriv but differentiating
w.r.t. edge_vec per scalar component via edge_energy_deriv, so
force/virial/atom_virial generalize to any r_differentiable output. Fixes the
KeyError on non-energy models (dos/dipole/polar/property) with attn_layer=0
descriptors.
---
 deepmd/dpmodel/model/make_model.py            | 156 ++++++++++-------
 deepmd/pt_expt/model/edge_transform_output.py | 120 +++++++++++++
 deepmd/pt_expt/model/make_model.py            |  65 ++++---
 .../common/dpmodel/test_call_lower_graph.py   |  13 +-
 source/tests/pt_expt/model/test_dos_graph.py  | 161 ++++++++++++++++++
 5 files changed, 410 insertions(+), 105 deletions(-)
 create mode 100644 source/tests/pt_expt/model/test_dos_graph.py

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index c87da1a8b3..297669fc37 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -46,7 +46,6 @@
     NeighborGraph,
     build_neighbor_graph,
     build_neighbor_graph_ase,
-    segment_sum,
 )
 from deepmd.utils.path import (
     DPPath,
@@ -377,13 +376,15 @@ def _call_common_graph(
             method: str,
             do_atomic_virial: bool = False,
         ) -> dict[str, Array]:
-            """Carry-all graph energy forward (opt-in, Option B).
+            """Carry-all graph forward (opt-in, Option B).
 
             Builds a carry-all :class:`NeighborGraph` from ``cc``/``atype``/``bb``
-            and routes the ENERGY forward through :meth:`call_lower_graph`. The
-            returned dict mirrors the dense ``call_common`` energy keys
-            (``atom_energy``, ``energy``, ``mask``). Input type-casting is done
-            by the caller; output type-casting is also applied by the caller.
+            and routes the forward through the OUTPUT-AGNOSTIC
+            :meth:`call_lower_graph`. The returned dict mirrors the dense
+            ``call_common`` keys (``<var>`` per-atom, ``<var>_redu`` reduced,
+            derivative name-holders ``None``, plus ``mask``). Input type-casting
+            is done by the caller; output type-casting is also applied by the
+            caller.
             """
             descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
@@ -403,7 +404,9 @@ def _call_common_graph(
             xp = array_api_compat.array_namespace(atype)
             dev = array_api_compat.device(atype)
             nf, nloc = atype.shape[:2]
-            graph_ret = self.call_lower_graph(
+            # OUTPUT-AGNOSTIC standard model dict (``<var>``, ``<var>_redu``,
+            # derivative name-holders ``None``), like the dense ``call_common``.
+            model_predict = self.call_lower_graph(
                 atype=xp.reshape(atype, (nf * nloc,)),
                 n_node=ng.n_node,
                 edge_index=ng.edge_index,
@@ -412,16 +415,9 @@ def _call_common_graph(
                 fparam=fp,
                 aparam=ap,
             )
-            # mirror the dense ``call_common`` energy keys: ``energy`` is the
-            # per-atom energy (nf, nloc, 1); ``energy_redu`` is the per-frame
-            # reduction (nf, 1); ``mask`` is the (nf, nloc) realness mask.
-            model_predict = {
-                "energy": xp.reshape(graph_ret["atom_energy"], (nf, nloc, 1)),
-                "energy_redu": graph_ret["energy"],
-                # carry-all graph: all local atoms are real -> all-ones int mask,
-                # matching the dense path (base_atomic_model: mask = int32 atom_mask).
-                "mask": xp.ones((nf, nloc), dtype=xp.int32, device=dev),
-            }
+            # carry-all graph: all local atoms are real -> all-ones int mask,
+            # matching the dense path (base_atomic_model: mask = int32 atom_mask).
+            model_predict["mask"] = xp.ones((nf, nloc), dtype=xp.int32, device=dev)
             return model_predict
 
         def call_common_lower(
@@ -531,6 +527,54 @@ def forward_common_atomic(
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
+        def _graph_descriptor_fitting(
+            self,
+            atype: Array,
+            n_node: Array,
+            edge_index: Array,
+            edge_vec: Array,
+            edge_mask: Array,
+            fparam: Array | None = None,
+            aparam: Array | None = None,
+        ) -> tuple[dict[str, Array], Array]:
+            """Run the graph descriptor + fitting forward.
+
+            Returns the raw rectangular ``fit_ret`` (``(nf, nloc, *shape)`` per
+            fitting output) plus the descriptor output ``gg`` (used by callers as
+            the array-namespace carrier).  ``edge_vec`` is consumed as-is so that
+            callers can make it the autograd leaf (pt_expt) before invoking this.
+            """
+            xp = array_api_compat.array_namespace(edge_vec)
+            dev = array_api_compat.device(edge_vec)
+            graph = NeighborGraph(
+                n_node=n_node,
+                edge_index=edge_index,
+                edge_vec=edge_vec,
+                edge_mask=edge_mask,
+            )
+            nf = n_node.shape[0]
+            nloc = int(n_node[0])
+            descriptor = self.atomic_model.descriptor
+            fitting_net = self.atomic_model.fitting_net
+            # dpa1 call_graph requires the type-embedding table explicitly
+            type_embedding = descriptor.type_embedding.call()
+            gg, rot_mat = descriptor.call_graph(
+                graph, atype, type_embedding=type_embedding
+            )
+            g2 = h2 = None
+            # the fitting expects atype shaped (nf, nloc)
+            atype_2d = xp.reshape(xp.asarray(atype, device=dev), (nf, nloc))
+            fit_ret = fitting_net(
+                gg,
+                atype_2d,
+                gr=rot_mat,
+                g2=g2,
+                h2=h2,
+                fparam=fparam,
+                aparam=aparam,
+            )
+            return fit_ret, gg
+
         def call_lower_graph(
             self,
             atype: Array,
@@ -543,13 +587,18 @@ def call_lower_graph(
             aparam: Array | None = None,
             comm_dict: dict | None = None,
         ) -> dict[str, Array]:
-            """Graph-native ENERGY lower (PR-A: dpa1 ``attn_layer == 0``).
-
-            Energy-level only: returns the per-atom ``atom_energy`` and the
-            per-frame reduced ``energy``. Force/virial are produced by the
-            pt_expt autograd path (a later task). Must match the dense
-            :meth:`call_common_lower` energy and atom-energy on the SAME
-            neighbor list.
+            """Graph-native lower (PR-A: dpa1 ``attn_layer == 0``).
+
+            OUTPUT-AGNOSTIC, like the dense
+            :func:`~deepmd.dpmodel.model.transform_output.fit_output_to_model_output`:
+            runs the graph descriptor + fitting forward to obtain the rectangular
+            ``fit_ret`` (``(nf, nloc, *shape)``), then reduces EVERY reducible
+            fitting output (``xp.sum``/``xp.mean`` over the atom axis, cast to
+            energy precision) and sets derivative name-holders to ``None``.  This
+            makes any fitting (energy/dos/dipole/polar/property/...) flow through
+            the graph path with no change on the fitting side.  Force/virial are
+            produced by the pt_expt autograd path.  Must match the dense
+            :meth:`call_common_lower` reduction on the SAME neighbor list.
 
             Parameters
             ----------
@@ -577,57 +626,32 @@ def call_lower_graph(
             Returns
             -------
             dict
-                ``{"atom_energy": (nf, nloc, 1), "energy": (nf, 1)}``.
+                The standard model dict (``<var>`` per-atom, ``<var>_redu``
+                reduced, derivative name-holders ``None``), matching
+                :func:`fit_output_to_model_output`.
             """
             xp = array_api_compat.array_namespace(edge_vec)
             dev = array_api_compat.device(edge_vec)
-            graph = NeighborGraph(
-                n_node=n_node,
-                edge_index=edge_index,
-                edge_vec=edge_vec,
-                edge_mask=edge_mask,
-            )
             nf = n_node.shape[0]
             nloc = int(n_node[0])
-            descriptor = self.atomic_model.descriptor
-            fitting_net = self.atomic_model.fitting_net
-            # dpa1 call_graph requires the type-embedding table explicitly
-            type_embedding = descriptor.type_embedding.call()
-            gg, rot_mat = descriptor.call_graph(
-                graph, atype, type_embedding=type_embedding
-            )
-            g2 = h2 = None
-            # the fitting expects atype shaped (nf, nloc)
-            atype_2d = xp.reshape(xp.asarray(atype, device=dev), (nf, nloc))
-            fit_ret = fitting_net(
-                gg,
-                atype_2d,
-                gr=rot_mat,
-                g2=g2,
-                h2=h2,
+            fit_ret, gg = self._graph_descriptor_fitting(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
                 fparam=fparam,
                 aparam=aparam,
             )
-            atom_energy = fit_ret["energy"]  # (nf, nloc, 1)
-            # per-frame reduction via segment_sum, mirroring the dense reduction
-            # (cast to energy precision before summing; see
-            # transform_output.fit_output_to_model_output).
-            frame_id = xp.repeat(
-                xp.arange(nf, dtype=edge_index.dtype, device=dev),
-                xp.asarray(n_node, device=dev),
-            )
-            ener_dtype = get_xp_precision(
-                xp, RESERVED_PRECISION_DICT[self.global_ener_float_precision]
-            )
-            energy = segment_sum(
-                xp.reshape(
-                    xp.astype(atom_energy, ener_dtype),
-                    (nf * nloc, 1),
-                ),
-                frame_id,
-                nf,
+            # carry-all graph: every local atom is real -> all-ones int mask.
+            mask = xp.ones((nf, nloc), dtype=xp.int32, device=dev)
+            return fit_output_to_model_output(
+                fit_ret,
+                self.atomic_model.fitting_output_def(),
+                gg,
+                do_atomic_virial=False,
+                mask=mask,
             )
-            return {"atom_energy": atom_energy, "energy": energy}
 
         call = call_common
         call_lower = call_common_lower
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 6d3bf2df4f..98256f2679 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -7,9 +7,17 @@
 
 import torch
 
+from deepmd.dpmodel import (
+    FittingOutputDef,
+    get_deriv_name,
+    get_reduce_name,
+)
 from deepmd.dpmodel.utils.neighbor_graph import (
     edge_force_virial,
 )
+from deepmd.pt.utils import (
+    env,
+)
 
 
 def edge_energy_deriv(
@@ -37,3 +45,115 @@ def edge_energy_deriv(
         g_e, edge_vec, edge_index, edge_mask, n_node
     )
     return force, (atom_virial if do_atomic_virial else None), virial
+
+
+def fit_output_to_model_output_graph(
+    fit_ret: dict[str, torch.Tensor],
+    fit_output_def: FittingOutputDef,
+    edge_vec: torch.Tensor,
+    edge_index: torch.Tensor,
+    edge_mask: torch.Tensor,
+    n_node: torch.Tensor,
+    do_atomic_virial: bool = False,
+    create_graph: bool = True,
+    mask: torch.Tensor | None = None,
+) -> dict[str, torch.Tensor]:
+    """Graph analogue of the dense pt_expt ``fit_output_to_model_output``.
+
+    OUTPUT-AGNOSTIC: reduces EVERY reducible fitting output (cast to energy
+    precision, summed/averaged over the atom axis) and, for every reducible +
+    ``r_differentiable`` output, assembles per-component force -- plus virial /
+    (optional) atom-virial when ``c_differentiable`` -- from :func:`edge_energy_deriv`
+    (one ``grad`` w.r.t. ``edge_vec`` per component, then the full-to-``src`` scatter).
+
+    Mirrors the dense :func:`deepmd.pt_expt.model.transform_output.take_deriv`
+    output shapes -- ``<var>_derv_r`` is ``(nf, nloc, *shape, 3)``,
+    ``<var>_derv_c`` is ``(nf, nloc, *shape, 9)``, ``<var>_derv_c_redu`` is
+    ``(nf, *shape, 9)`` -- except the graph is ghost-free so the dense ``nall``
+    atom axis collapses to ``nloc`` LOCAL atoms.
+
+    Parameters
+    ----------
+    fit_ret
+        Raw rectangular fitting output, ``(nf, nloc, *shape)`` per key.
+    fit_output_def
+        The fitting output definition.
+    edge_vec
+        (E, 3) edge vectors; MUST be the autograd leaf of ``fit_ret``.
+    edge_index
+        (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+    edge_mask
+        (E,) valid-edge mask.
+    n_node
+        (nf,) per-frame local atom counts.
+    do_atomic_virial
+        Whether to also assemble the per-atom virial ``<var>_derv_c``.
+    create_graph
+        Whether the backward retains a graph (training).
+    mask
+        (nf, nloc) realness mask; used only for intensive-output reduction.
+    """
+    redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
+    nf = int(n_node.shape[0])
+    # N == sum(n_node) == nf * nloc here (rectangular carry-all graph).
+    nloc = int(fit_ret[next(iter(fit_ret))].shape[1])
+    model_ret: dict[str, torch.Tensor] = dict(fit_ret.items())
+    for kk, vv in fit_ret.items():
+        vdef = fit_output_def[kk]
+        shap = vdef.shape
+        atom_axis = -(len(shap) + 1)
+        if not vdef.reducible:
+            continue
+        kk_redu = get_reduce_name(kk)
+        if vdef.intensive:
+            if mask is not None:
+                model_ret[kk_redu] = torch.sum(
+                    vv.to(redu_prec), dim=atom_axis
+                ) / torch.sum(mask, dim=-1, keepdim=True)
+            else:
+                model_ret[kk_redu] = torch.mean(vv.to(redu_prec), dim=atom_axis)
+        else:
+            model_ret[kk_redu] = torch.sum(vv.to(redu_prec), dim=atom_axis)
+        if not vdef.r_differentiable:
+            continue
+        kk_derv_r, kk_derv_c = get_deriv_name(kk)
+        size = 1
+        for ii in shap:
+            size *= ii
+        # split the reduced output into ``size`` per-frame scalar components.
+        svv = model_ret[kk_redu].reshape(nf, size)
+        ff_list: list[torch.Tensor] = []
+        av_list: list[torch.Tensor] = []
+        vir_list: list[torch.Tensor] = []
+        for c in range(size):
+            force, atom_vir, vir = edge_energy_deriv(
+                svv[:, c],
+                edge_vec,
+                edge_index,
+                edge_mask,
+                n_node,
+                do_atomic_virial=(vdef.c_differentiable and do_atomic_virial),
+                create_graph=create_graph,
+            )
+            # force (N, 3) -> (nf, nloc, 1, 3)
+            ff_list.append(force.reshape(nf, nloc, 1, 3))
+            if vdef.c_differentiable:
+                # virial (nf, 3, 3) -> (nf, 1, 9)
+                vir_list.append(vir.reshape(nf, 1, 9))
+                if do_atomic_virial:
+                    assert atom_vir is not None
+                    # atom_virial (N, 3, 3) -> (nf, nloc, 1, 9)
+                    av_list.append(atom_vir.reshape(nf, nloc, 1, 9))
+        # (nf, nloc, size, 3) -> (nf, nloc, *shape, 3)
+        model_ret[kk_derv_r] = torch.cat(ff_list, dim=-2).reshape([nf, nloc, *shap, 3])
+        if vdef.c_differentiable:
+            # (nf, size, 9) -> (nf, *shape, 9)
+            model_ret[kk_derv_c + "_redu"] = torch.cat(vir_list, dim=-2).reshape(
+                [nf, *shap, 9]
+            )
+            if do_atomic_virial:
+                # (nf, nloc, size, 9) -> (nf, nloc, *shape, 9)
+                model_ret[kk_derv_c] = torch.cat(av_list, dim=-2).reshape(
+                    [nf, nloc, *shap, 9]
+                )
+    return model_ret
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 7e54d643b4..c22d6efcea 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -25,7 +25,7 @@
 )
 
 from .edge_transform_output import (
-    edge_energy_deriv,
+    fit_output_to_model_output_graph,
 )
 from .transform_output import (
     fit_output_to_model_output,
@@ -293,15 +293,19 @@ def forward_common_lower_graph(
         ) -> dict[str, torch.Tensor]:
             """Graph-native lower with autograd force/virial (PR-A: dpa1 ``attn_layer==0``).
 
-            Runs the dpmodel ENERGY-only :meth:`call_lower_graph` with ``edge_vec``
-            as the autograd leaf, then assembles force / per-frame virial /
-            (optional) atom virial from a SINGLE backward pass via
-            :func:`edge_energy_deriv` (``g_e = dE/d(edge_vec)``, then the shared
-            full-to-``src`` scatter).
+            OUTPUT-AGNOSTIC: runs the graph descriptor + fitting forward with
+            ``edge_vec`` as the autograd leaf (via the inherited
+            :meth:`_graph_descriptor_fitting`), then routes the raw rectangular
+            ``fit_ret`` through :func:`fit_output_to_model_output_graph`, which
+            reduces EVERY reducible output and assembles force / per-frame virial
+            / (optional) atom-virial for every ``r_differentiable`` output from a
+            backward pass w.r.t. ``edge_vec`` (the shared full-to-``src`` scatter).
+            This makes any fitting (energy/dos/dipole/polar/property/...) flow
+            through the graph path with no change on the fitting side.
 
             The returned dict uses the SAME internal key names as the legacy dense
-            :meth:`forward_common_lower` (``energy``, ``energy_redu``,
-            ``energy_derv_r``, ``energy_derv_c_redu``, and ``energy_derv_c`` when
+            :meth:`forward_common_lower` (``<var>``, ``<var>_redu``,
+            ``<var>_derv_r``, ``<var>_derv_c_redu``, and ``<var>_derv_c`` when
             ``do_atomic_virial``).  Unlike the dense lower (which returns EXTENDED
             ``nall`` force/atom-virial), the graph is ghost-free, so force and
             atom-virial here live on the ``nloc`` LOCAL atoms (ghost contributions
@@ -322,7 +326,7 @@ def forward_common_lower_graph(
             edge_mask
                 (E,) valid-edge mask.
             do_atomic_virial
-                Whether to also return the per-atom virial ``energy_derv_c``.
+                Whether to also return the per-atom virial ``<var>_derv_c``.
             fparam
                 Frame parameter, ``(nf, ndf)``.
             aparam
@@ -331,47 +335,38 @@ def forward_common_lower_graph(
             Returns
             -------
             dict
-                ``energy`` (nf, nloc, 1), ``energy_redu`` (nf, 1),
-                ``energy_derv_r`` (nf, nloc, 1, 3),
-                ``energy_derv_c_redu`` (nf, 1, 9), and -- when
-                ``do_atomic_virial`` -- ``energy_derv_c`` (nf, nloc, 1, 9).
+                The standard model dict with ``<var>`` (nf, nloc, *shape),
+                ``<var>_redu`` (nf, *shape), and -- for ``r_differentiable``
+                outputs -- ``<var>_derv_r`` (nf, nloc, *shape, 3),
+                ``<var>_derv_c_redu`` (nf, *shape, 9), and -- when
+                ``do_atomic_virial`` -- ``<var>_derv_c`` (nf, nloc, *shape, 9).
             """
             nf = int(n_node.shape[0])
             nloc = int(n_node[0])
             # make edge_vec the autograd leaf for the energy backward
             edge_vec = edge_vec.detach().requires_grad_(True)
-            ret = self.call_lower_graph(
-                atype=atype,
-                n_node=n_node,
-                edge_index=edge_index,
-                edge_vec=edge_vec,
-                edge_mask=edge_mask,
+            fit_ret, _ = self._graph_descriptor_fitting(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
                 fparam=fparam,
                 aparam=aparam,
             )
-            atom_energy = ret["atom_energy"]  # (nf, nloc, 1)
-            energy = ret["energy"]  # (nf, 1)
-            force, atom_virial, virial = edge_energy_deriv(
-                energy,
+            # carry-all graph: every local atom is real -> all-ones int mask.
+            mask = torch.ones((nf, nloc), dtype=torch.int32, device=edge_vec.device)
+            return fit_output_to_model_output_graph(
+                fit_ret,
+                self.atomic_model.fitting_output_def(),
                 edge_vec,
                 edge_index,
                 edge_mask,
                 n_node,
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
+                mask=mask,
             )
-            out = {
-                "energy": atom_energy,
-                "energy_redu": energy,
-                # force (N, 3) -> (nf, nloc, 1, 3); virial (nf, 3, 3) -> (nf, 1, 9)
-                "energy_derv_r": force.reshape(nf, nloc, 1, 3),
-                "energy_derv_c_redu": virial.reshape(nf, 1, 9),
-            }
-            if do_atomic_virial:
-                assert atom_virial is not None
-                # atom_virial (N, 3, 3) -> (nf, nloc, 1, 9)
-                out["energy_derv_c"] = atom_virial.reshape(nf, nloc, 1, 9)
-            return out
 
         def _resolve_graph_method(
             self, neighbor_graph_method: str | None
diff --git a/source/tests/common/dpmodel/test_call_lower_graph.py b/source/tests/common/dpmodel/test_call_lower_graph.py
index 6ea4e83544..29dd87c55a 100644
--- a/source/tests/common/dpmodel/test_call_lower_graph.py
+++ b/source/tests/common/dpmodel/test_call_lower_graph.py
@@ -72,7 +72,10 @@ def test_graph_lower_matches_dense_lower(self) -> None:
             box=None,
         )
 
-        dense = model.call_lower(ext_coord, ext_atype, nlist, mapping)
+        # dense ``call_common_lower`` returns the INTERNAL model_output_def keys
+        # (``energy`` per-atom, ``energy_redu`` reduced), matching the
+        # OUTPUT-AGNOSTIC graph lower.
+        dense = model.call_common_lower(ext_coord, ext_atype, nlist, mapping)
 
         ng = from_dense_quartet(ext_coord, nlist, mapping)
         nloc = nlist.shape[1]
@@ -84,12 +87,14 @@ def test_graph_lower_matches_dense_lower(self) -> None:
             edge_mask=ng.edge_mask,
         )
 
+        # reduced per-frame energy
         np.testing.assert_allclose(
-            out["energy"], dense["energy"], rtol=1e-12, atol=1e-12
+            out["energy_redu"], dense["energy_redu"], rtol=1e-12, atol=1e-12
         )
+        # per-atom energy
         np.testing.assert_allclose(
-            out["atom_energy"].reshape(dense["atom_energy"].shape),
-            dense["atom_energy"],
+            out["energy"].reshape(dense["energy"].shape),
+            dense["energy"],
             rtol=1e-12,
             atol=1e-12,
         )
diff --git a/source/tests/pt_expt/model/test_dos_graph.py b/source/tests/pt_expt/model/test_dos_graph.py
new file mode 100644
index 0000000000..dc8cfc0685
--- /dev/null
+++ b/source/tests/pt_expt/model/test_dos_graph.py
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""The OUTPUT-AGNOSTIC graph lower supports ANY fitting, not just energy.
+
+A non-energy model with a graph-eligible descriptor (dpa1 ``attn_layer==0``)
+routes into the graph path by default.  Before the general output transform this
+KeyError'd on ``"energy"``; now every fitting (dos/dipole/polar/property/...)
+flows through :func:`fit_output_to_model_output_graph` with no change on the
+fitting side.  Each model's graph forward (default ``neighbor_graph_method``)
+must match the dense path (``neighbor_graph_method="legacy"``) on every shared
+key (carry-all graph at non-binding ``sel`` reproduces the dense neighbor set).
+"""
+
+import pytest
+import torch
+
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt_expt.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.pt_expt.fitting import (
+    DipoleFitting,
+    DOSFittingNet,
+    PolarFitting,
+    PropertyFittingNet,
+)
+from deepmd.pt_expt.model import (
+    DipoleModel,
+    DOSModel,
+    PolarModel,
+    PropertyModel,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
+
+
+def _make_descriptor() -> DescrptDPA1:
+    return DescrptDPA1(
+        4.0,
+        0.5,
+        20,  # non-binding mixed-type single-int sel -> graph == dense neighbors
+        2,
+        attn_layer=0,  # graph lower only supports attn_layer == 0
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+
+
+def _make_dos(ds: DescrptDPA1):
+    return DOSModel(
+        ds,
+        DOSFittingNet(
+            2, ds.get_dim_out(), 5, mixed_types=ds.mixed_types(), seed=GLOBAL_SEED
+        ).to(env.DEVICE),
+        type_map=["a", "b"],
+    ).to(env.DEVICE)
+
+
+def _make_dipole(ds: DescrptDPA1):
+    return DipoleModel(
+        ds,
+        DipoleFitting(
+            2,
+            ds.get_dim_out(),
+            embedding_width=ds.get_dim_emb(),
+            mixed_types=ds.mixed_types(),
+            seed=GLOBAL_SEED,
+        ).to(env.DEVICE),
+        type_map=["a", "b"],
+    ).to(env.DEVICE)
+
+
+def _make_polar(ds: DescrptDPA1):
+    return PolarModel(
+        ds,
+        PolarFitting(
+            2,
+            ds.get_dim_out(),
+            embedding_width=ds.get_dim_emb(),
+            mixed_types=ds.mixed_types(),
+            seed=GLOBAL_SEED,
+        ).to(env.DEVICE),
+        type_map=["a", "b"],
+    ).to(env.DEVICE)
+
+
+def _make_property(ds: DescrptDPA1):
+    return PropertyModel(
+        ds,
+        PropertyFittingNet(
+            2,
+            ds.get_dim_out(),
+            task_dim=3,
+            mixed_types=ds.mixed_types(),
+            seed=GLOBAL_SEED,
+        ).to(env.DEVICE),
+        type_map=["a", "b"],
+    ).to(env.DEVICE)
+
+
+class TestNonEnergyGraph:
+    def setup_method(self) -> None:
+        generator = torch.Generator(device=env.DEVICE).manual_seed(GLOBAL_SEED)
+        self.coord = torch.rand(
+            1, 5, 3, dtype=torch.float64, device=env.DEVICE, generator=generator
+        )
+        self.atype = torch.tensor([[0, 1, 0, 1, 0]], device=env.DEVICE)
+
+    def test_dos_repro(self) -> None:
+        """The exact bug repro: a DOS model's default forward used to KeyError
+        on ``"energy"`` in the graph path; now it succeeds.
+        """
+        ds = _make_descriptor()
+        ft = DOSFittingNet(2, ds.get_dim_out(), 5, mixed_types=ds.mixed_types()).to(
+            env.DEVICE
+        )
+        m = DOSModel(ds, ft, type_map=["a", "b"]).to(env.DEVICE)
+        out = m(self.coord, self.atype, box=None)
+        # standard DOS model keys (no KeyError)
+        assert set(out.keys()) >= {"atom_dos", "dos", "mask"}
+        assert out["atom_dos"].shape == (1, 5, 5)
+        assert out["dos"].shape == (1, 5)
+
+    @pytest.mark.parametrize(
+        "make_model",
+        [_make_dos, _make_dipole, _make_polar, _make_property],
+    )  # one builder per fitting kind
+    def test_graph_matches_dense(self, make_model) -> None:
+        """Graph (default) output matches the dense (``legacy``) path on every
+        shared key, including derivatives for r/c-differentiable fittings.
+        """
+        tol = (
+            {"rtol": 1e-11, "atol": 1e-11}
+            if env.DEVICE.type == "cpu"
+            else {"rtol": 1e-9, "atol": 1e-9}
+        )
+        ds = _make_descriptor()
+        m = make_model(ds)
+        graph = m.call_common(self.coord, self.atype, None, do_atomic_virial=True)
+        # the dense path differentiates w.r.t. coord -> needs a coord leaf.
+        dense = m.call_common(
+            self.coord.detach().requires_grad_(True),
+            self.atype,
+            None,
+            do_atomic_virial=True,
+            neighbor_graph_method="legacy",
+        )
+        shared = [
+            k
+            for k in graph
+            if k in dense and graph[k] is not None and dense[k] is not None
+        ]
+        # at least the reduced + per-atom output must be present and shared
+        assert len(shared) >= 2
+        for k in shared:
+            torch.testing.assert_close(
+                graph[k].to(torch.float64), dense[k].to(torch.float64), **tol
+            )

From 58ef3fb560b332e77f23aedbaaff8cbdf6c75d9c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 22:12:59 +0800
Subject: [PATCH 32/69] fix: address CodeRabbit review on #5583
 (charge_spin/virtual-atom fallthrough, env protection parity, ASE/from_ijs
 device)

---
 deepmd/dpmodel/descriptor/dpa1.py             |  1 +
 deepmd/dpmodel/model/make_model.py            | 74 +++++++++++++------
 .../utils/neighbor_graph/ase_builder.py       | 12 ++-
 deepmd/dpmodel/utils/neighbor_graph/env.py    | 24 ++++--
 .../dpmodel/utils/neighbor_graph/from_ijs.py  |  1 +
 deepmd/pt_expt/model/make_model.py            | 22 +++---
 .../dpmodel/test_dpa1_graph_model_energy.py   | 40 ++++++++++
 7 files changed, 135 insertions(+), 39 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 768a04a2e7..1849e7a2a5 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -1472,6 +1472,7 @@ def _call_graph(
             self.rcut,
             self.rcut_smth,
             protection=self.env_protection,
+            edge_mask=graph.edge_mask,
         )  # (E, 4)
         # radial channel
         ss = rr[:, 0:1]  # (E, 1)
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 297669fc37..7c67b49c15 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -315,6 +315,10 @@ def call_common(
             )
             del coord, box, fparam, aparam, charge_spin
             graph_method = self._resolve_graph_method(neighbor_graph_method)
+            # the graph lower does not consume charge_spin yet -> keep those
+            # models on dense (a None check, so it stays jit/export-safe)
+            if cs is not None:
+                graph_method = None
             if graph_method is not None:
                 # carry-all NeighborGraph energy forward (Option B / decision #17)
                 model_predict = self._call_common_graph(
@@ -402,10 +406,11 @@ def _call_common_graph(
                     f"unknown neighbor_graph_method {method!r}; use 'dense' or 'ase'"
                 )
             xp = array_api_compat.array_namespace(atype)
-            dev = array_api_compat.device(atype)
             nf, nloc = atype.shape[:2]
             # OUTPUT-AGNOSTIC standard model dict (``<var>``, ``<var>_redu``,
-            # derivative name-holders ``None``), like the dense ``call_common``.
+            # derivative name-holders ``None``, plus int ``mask``), like the
+            # dense ``call_common``.  ``call_lower_graph`` masks virtual atoms
+            # (atype<0) and sets the real int mask.
             model_predict = self.call_lower_graph(
                 atype=xp.reshape(atype, (nf * nloc,)),
                 n_node=ng.n_node,
@@ -415,9 +420,6 @@ def _call_common_graph(
                 fparam=fp,
                 aparam=ap,
             )
-            # carry-all graph: all local atoms are real -> all-ones int mask,
-            # matching the dense path (base_atomic_model: mask = int32 atom_mask).
-            model_predict["mask"] = xp.ones((nf, nloc), dtype=xp.int32, device=dev)
             return model_predict
 
         def call_common_lower(
@@ -536,13 +538,22 @@ def _graph_descriptor_fitting(
             edge_mask: Array,
             fparam: Array | None = None,
             aparam: Array | None = None,
-        ) -> tuple[dict[str, Array], Array]:
+        ) -> tuple[dict[str, Array], Array, Array]:
             """Run the graph descriptor + fitting forward.
 
             Returns the raw rectangular ``fit_ret`` (``(nf, nloc, *shape)`` per
-            fitting output) plus the descriptor output ``gg`` (used by callers as
-            the array-namespace carrier).  ``edge_vec`` is consumed as-is so that
-            callers can make it the autograd leaf (pt_expt) before invoking this.
+            fitting output), the descriptor output ``gg`` (used by callers as
+            the array-namespace carrier), and the ``(nf, nloc)`` boolean
+            ``atom_mask`` (True for real atoms, False for virtual ``atype<0``).
+            ``edge_vec`` is consumed as-is so that callers can make it the
+            autograd leaf (pt_expt) before invoking this.
+
+            Virtual atoms (``atype < 0``) are masked exactly like the dense
+            :meth:`base_atomic_model.forward_common_atomic`: the atype fed to
+            the descriptor/fitting is clamped to 0 (so ``take`` never sees a
+            negative index) and the per-atom ``fit_ret`` of masked atoms is
+            zeroed BEFORE any reduction, so virtual atoms contribute no
+            type-embedding/bias energy.
             """
             xp = array_api_compat.array_namespace(edge_vec)
             dev = array_api_compat.device(edge_vec)
@@ -556,24 +567,45 @@ def _graph_descriptor_fitting(
             nloc = int(n_node[0])
             descriptor = self.atomic_model.descriptor
             fitting_net = self.atomic_model.fitting_net
+            atype = xp.asarray(atype, device=dev)
+            atype_2d = xp.reshape(atype, (nf, nloc))
+            # virtual-atom mask (True for real atoms); mirror the dense
+            # base_atomic_model.make_atom_mask (atype >= 0).
+            make_atom_mask = getattr(self.atomic_model, "make_atom_mask", None)
+            if make_atom_mask is not None:
+                atom_mask = make_atom_mask(atype_2d)
+            else:
+                atom_mask = atype_2d >= 0
+            # clamp negative (virtual) types to 0 so take(...) never indexes
+            # out of range; virtual atoms have no edges so this only touches
+            # their (subsequently zeroed) node entries.
+            zeros_atype = xp.zeros_like(atype)
+            atype_safe = xp.where(atype >= 0, atype, zeros_atype)
+            atype_2d_safe = xp.reshape(atype_safe, (nf, nloc))
             # dpa1 call_graph requires the type-embedding table explicitly
             type_embedding = descriptor.type_embedding.call()
             gg, rot_mat = descriptor.call_graph(
-                graph, atype, type_embedding=type_embedding
+                graph, atype_safe, type_embedding=type_embedding
             )
             g2 = h2 = None
-            # the fitting expects atype shaped (nf, nloc)
-            atype_2d = xp.reshape(xp.asarray(atype, device=dev), (nf, nloc))
             fit_ret = fitting_net(
                 gg,
-                atype_2d,
+                atype_2d_safe,
                 gr=rot_mat,
                 g2=g2,
                 h2=h2,
                 fparam=fparam,
                 aparam=aparam,
             )
-            return fit_ret, gg
+            # zero the per-atom output of masked (virtual) atoms BEFORE the
+            # reduction -- mirror base_atomic_model lines 315-320.
+            for kk in fit_ret.keys():
+                vv = fit_ret[kk]
+                out_shape = vv.shape
+                flat = xp.reshape(vv, (out_shape[0], out_shape[1], -1))
+                flat = xp.where(atom_mask[:, :, None], flat, xp.zeros_like(flat))
+                fit_ret[kk] = xp.reshape(flat, out_shape)
+            return fit_ret, gg, atom_mask
 
         def call_lower_graph(
             self,
@@ -631,10 +663,7 @@ def call_lower_graph(
                 :func:`fit_output_to_model_output`.
             """
             xp = array_api_compat.array_namespace(edge_vec)
-            dev = array_api_compat.device(edge_vec)
-            nf = n_node.shape[0]
-            nloc = int(n_node[0])
-            fit_ret, gg = self._graph_descriptor_fitting(
+            fit_ret, gg, atom_mask = self._graph_descriptor_fitting(
                 atype,
                 n_node,
                 edge_index,
@@ -643,15 +672,18 @@ def call_lower_graph(
                 fparam=fparam,
                 aparam=aparam,
             )
-            # carry-all graph: every local atom is real -> all-ones int mask.
-            mask = xp.ones((nf, nloc), dtype=xp.int32, device=dev)
-            return fit_output_to_model_output(
+            # int mask of real atoms (matches the dense base_atomic_model).
+            mask = xp.astype(atom_mask, xp.int32)
+            model_predict = fit_output_to_model_output(
                 fit_ret,
                 self.atomic_model.fitting_output_def(),
                 gg,
                 do_atomic_virial=False,
                 mask=mask,
             )
+            # fit_output_to_model_output does not add "mask"; set it here.
+            model_predict["mask"] = mask
+            return model_predict
 
         call = call_common
         call_lower = call_common_lower
diff --git a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
index 364c1a3c0b..2d7543a0d1 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
@@ -16,6 +16,7 @@
 
 from typing import (
     TYPE_CHECKING,
+    Any,
 )
 
 import numpy as np
@@ -83,10 +84,17 @@ def build_neighbor_graph_ase(
             "install ase or use neighbor-graph method 'dense'."
         ) from e
 
-    coord_np = np.asarray(coord)
+    # The ASE topology search runs on the CPU in numpy; convert safely from a
+    # CUDA / grad-requiring torch tensor (the original coord/box are still
+    # passed to neighbor_graph_from_ijs below, which recomputes edge_vec
+    # differentiably on the native backend/device).
+    def _to_cpu_numpy(x: Any) -> np.ndarray:
+        return np.asarray(x.detach().cpu()) if hasattr(x, "detach") else np.asarray(x)
+
+    coord_np = _to_cpu_numpy(coord)
     nf, nloc = coord_np.shape[:2]
     coord_np = coord_np.reshape(nf, nloc, 3)
-    box_np = np.asarray(box).reshape(nf, 3, 3) if box is not None else None
+    box_np = _to_cpu_numpy(box).reshape(nf, 3, 3) if box is not None else None
     periodic = box is not None
 
     i_parts = []
diff --git a/deepmd/dpmodel/utils/neighbor_graph/env.py b/deepmd/dpmodel/utils/neighbor_graph/env.py
index e5bec3f04b..55bbe1b02f 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/env.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/env.py
@@ -40,6 +40,7 @@ def edge_env_mat(
     rcut: float,
     rcut_smth: float,
     protection: float = 0.0,
+    edge_mask: Array | None = None,
 ) -> Array:
     """Compute the per-edge environment-matrix 4-vector.
 
@@ -64,6 +65,13 @@ def edge_env_mat(
     protection
         Small additive offset to avoid exact division-by-zero on
         atoms that are numerically at the same position (default 0).
+    edge_mask
+        (E,) boolean valid-edge mask. When provided, the length of INVALID
+        (padding) edges has 1 added to it before adding ``protection`` ---
+        matching the dense ``_make_env_mat`` (``length = length + ~mask``),
+        which guards padding by mask rather than by a length threshold.
+        When ``None``, fall back to the ``length < 1e-10`` zero-guard
+        (back-compat for callers without a mask).
 
     Returns
     -------
@@ -79,11 +87,17 @@ def edge_env_mat(
     # (E, 1) lengths; safe_for_vector_norm returns 0 for zero vectors
     length = safe_for_vector_norm(edge_vec, axis=-1, keepdims=True)
 
-    # Guard against exact zero to avoid 1/0 (happens on padding edges where
-    # edge_vec = 0).  Real edges always have length > 0.
-    safe_len = xp.where(length < 1e-10, xp.ones_like(length), length)
-
-    denom = safe_len + protection  # (E, 1)
+    # Guard against 1/0 on padding edges.  When an edge_mask is provided,
+    # match the dense _make_env_mat exactly: add 1 to the length of INVALID
+    # (padding) edges by mask (not by a length threshold), so a real edge and
+    # a padding edge never share the same protection arithmetic.  Otherwise
+    # fall back to the length<1e-10 zero-guard (back-compat).
+    if edge_mask is not None:
+        length = length + xp.astype(xp.logical_not(edge_mask)[:, None], length.dtype)
+    else:
+        length = xp.where(length < 1e-10, xp.ones_like(length), length)
+
+    denom = length + protection  # (E, 1)
     t0 = 1.0 / denom  # (E, 1)  — radial component
     t1 = edge_vec / (denom**2)  # (E, 3) — angular components
 
diff --git a/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py b/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py
index 8aa533c61e..0136dd4bf4 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/from_ijs.py
@@ -95,6 +95,7 @@ def neighbor_graph_from_ijs(
     r_j = xp.take(coord_flat, j_flat, axis=0)
     edge_vec = r_j - r_i
     if box is not None:
+        box = xp.asarray(box, device=dev)
         box = xp.reshape(box, (nf, 3, 3))
         box_per_edge = xp.take(box, nframe_id, axis=0)  # (E, 3, 3)
         S = xp.astype(xp.asarray(S, device=dev), box.dtype)
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index c22d6efcea..e340ba5cc8 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -341,11 +341,9 @@ def forward_common_lower_graph(
                 ``<var>_derv_c_redu`` (nf, *shape, 9), and -- when
                 ``do_atomic_virial`` -- ``<var>_derv_c`` (nf, nloc, *shape, 9).
             """
-            nf = int(n_node.shape[0])
-            nloc = int(n_node[0])
             # make edge_vec the autograd leaf for the energy backward
             edge_vec = edge_vec.detach().requires_grad_(True)
-            fit_ret, _ = self._graph_descriptor_fitting(
+            fit_ret, _, atom_mask = self._graph_descriptor_fitting(
                 atype,
                 n_node,
                 edge_index,
@@ -354,9 +352,10 @@ def forward_common_lower_graph(
                 fparam=fparam,
                 aparam=aparam,
             )
-            # carry-all graph: every local atom is real -> all-ones int mask.
-            mask = torch.ones((nf, nloc), dtype=torch.int32, device=edge_vec.device)
-            return fit_output_to_model_output_graph(
+            # int mask of real atoms (virtual atype<0 already zeroed in
+            # _graph_descriptor_fitting); matches the dense base_atomic_model.
+            mask = atom_mask.to(torch.int32)
+            model_predict = fit_output_to_model_output_graph(
                 fit_ret,
                 self.atomic_model.fitting_output_def(),
                 edge_vec,
@@ -367,6 +366,9 @@ def forward_common_lower_graph(
                 create_graph=self.training,
                 mask=mask,
             )
+            # fit_output_to_model_output_graph does not add "mask"; set it here.
+            model_predict["mask"] = mask
+            return model_predict
 
         def _resolve_graph_method(
             self, neighbor_graph_method: str | None
@@ -435,11 +437,9 @@ def _call_common_graph(
                 fparam=fp,
                 aparam=ap,
             )
-            # carry-all graph: every local atom is real -> all-ones mask, matching
-            # the dense ``call_common`` output (which carries a ``mask`` key).
-            model_predict["mask"] = torch.ones(
-                (nf, nloc), dtype=torch.int32, device=atype.device
-            )
+            # forward_common_lower_graph already masks virtual atoms (atype<0)
+            # and carries the real int ``mask`` key, matching the dense
+            # ``call_common`` output.
             return model_predict
 
         def forward_common_atomic(
diff --git a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
index 8baac16818..d1604c08b7 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
@@ -116,6 +116,46 @@ def test_energy_parity_multiframe_periodic(method) -> None:
     assert not np.array_equal(dense["energy_redu"][0], dense["energy_redu"][1])
 
 
+def test_virtual_atom_masked() -> None:
+    """A virtual atom (``atype == -1``) must contribute ZERO energy and have a
+    ZERO mask in the carry-all graph path, matching the dense path exactly.
+
+    Regression for the leak where the graph path fed the raw (negative) atype
+    to the descriptor/fitting and stamped an all-ones mask, so virtual atoms
+    picked up a type-embedding + bias energy that the dense path masks out.
+
+    Uses the in-tree ``"dense"`` builder, which shares the EXACT same quartet
+    neighbor list as the ``"legacy"`` dense path, so the parity is bit-tight
+    (the ``"ase"`` builder has its own near-cutoff boundary quirks, covered by
+    the other tests).
+    """
+    method = "dense"
+    rng = np.random.default_rng(7)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    # one local virtual atom (atype == -1); the rest are real
+    atype = np.array([[0, 1, -1, 1, 0, 1]], dtype=np.int64)
+    box = None
+    # LARGE sel -> non-binding (no truncation) so dense == graph on real atoms
+    model = _make_model([200])
+
+    dense = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    graph = model.call_common(coord, atype, box, neighbor_graph_method=method)
+
+    # graph energy (reduced + per-atom) must match the dense path exactly
+    np.testing.assert_allclose(
+        graph["energy_redu"], dense["energy_redu"], rtol=1e-12, atol=1e-12
+    )
+    np.testing.assert_allclose(graph["energy"], dense["energy"], rtol=1e-12, atol=1e-12)
+    # the virtual atom (index 2) contributes ZERO per-atom energy
+    np.testing.assert_allclose(graph["energy"][0, 2], 0.0, rtol=0, atol=0)
+    # mask must be 0 at the virtual atom and match the dense int mask
+    assert int(graph["mask"][0, 2]) == 0
+    np.testing.assert_array_equal(graph["mask"], dense["mask"])
+    expected_mask = np.array([[1, 1, 0, 1, 1, 1]], dtype=np.int32)
+    np.testing.assert_array_equal(graph["mask"], expected_mask)
+
+
 def test_binding_sel_carries_more_than_dense() -> None:
     """At binding sel the carry-all graph includes neighbors the dense path
     truncates, so energy DIFFERS (intended, decision #17 / Option B).

From 99c707a5169f995926226a94bc266539f08d5000 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Thu, 25 Jun 2026 23:39:48 +0800
Subject: [PATCH 33/69] fix(dpmodel): graph dense-bridge uses neighbor's actual
 extended type

The dpa1 graph dense->graph adapter (from_dense_quartet -> call_graph)
derived the per-edge neighbor type from the local owner
atype[mapping[neighbor]]. That equals the neighbor's true extended type
atype_ext[neighbor] for every PHYSICAL input (a ghost is a periodic
image of its owner), but diverges for an arbitrary external quartet
whose mapping is inconsistent -- e.g. the naively-permuted mapping in
the universal descriptor fixture. The dense path reads
atype_ext[neighbor] directly, so test_exclude_types (which routes the
graph-eligible attn_layer=0 dpa1 through the adapter) failed
graph-vs-dense parity.

Thread an optional per-edge nei_type override into call_graph/_call_graph;
the adapter computes it from atype_ext[neighbor] (matching the converter's
compact=False edge ordering). Geometry-native (carry-all) builders pass
None and keep using atype[src], correct for all real inputs.

Adds a non-vacuous regression test (corrupts ghost types so ghost type
!= owner type and asserts a ghost is actually a neighbor).
---
 deepmd/dpmodel/descriptor/dpa1.py             | 41 ++++++++++++++++++-
 .../test_dpa1_call_graph_descriptor.py        | 34 +++++++++++++++
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 1849e7a2a5..fe74ee8287 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -606,10 +606,28 @@ def _call_graph_adapter(
         )
         # local atom types, flat (nf * nloc,)
         atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
+        # per-edge neighbor type from the ACTUAL extended types (not atype[src]),
+        # so the dense bridge stays byte-faithful to the dense path for any
+        # mapping. Matches from_dense_quartet(compact=False) edge ordering:
+        # row-major (frame, center, slot), neighbor flat index = ff * nall + j.
+        nl_flat = xp.reshape(nlist, (-1,))  # (E,) neighbor ext idx or -1
+        valid = nl_flat >= 0
+        j_safe = xp.where(valid, nl_flat, xp.zeros_like(nl_flat))  # clamp -1 -> 0
+        ff = xp.reshape(
+            xp.broadcast_to(
+                xp.reshape(xp.arange(nf, dtype=nl_flat.dtype, device=dev), (nf, 1, 1)),
+                (nf, nloc, nnei),
+            ),
+            (-1,),
+        )
+        nei_type = xp.take(
+            xp.reshape(atype_ext, (nf * nall,)), ff * nall + j_safe, axis=0
+        )  # (E,)
         grrg, rot_mat = self.call_graph(
             graph,
             atype_local,
             type_embedding=self.type_embedding.call(),
+            nei_type=nei_type,
         )
         # reconstruct the dense-shaped sw the dense way (env_mat switch masked
         # where nlist == -1; the graph path forbids exclude_types, so nlist_mask
@@ -667,6 +685,7 @@ def call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
+        nei_type: Array | None = None,
     ) -> tuple[Array, Array]:
         """Descriptor-level graph-native forward (``attn_layer == 0``).
 
@@ -687,6 +706,10 @@ def call_graph(
             (nf * nloc,) flat LOCAL atom types.
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
+        nei_type
+            (E,) per-edge neighbor type override; see
+            :meth:`DescrptBlockSeAtten._call_graph`. ``None`` derives it from
+            ``atype[src]`` (correct for every physical input).
 
         Returns
         -------
@@ -698,7 +721,7 @@ def call_graph(
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
         grrg_node, rot_mat_node = self.se_atten._call_graph(
-            graph, atype, type_embedding=type_embedding
+            graph, atype, type_embedding=type_embedding, nei_type=nei_type
         )
         nf = graph.n_node.shape[0]
         # atype is the flat (nf*nloc,) node axis; derive nloc from the STATIC shape
@@ -1396,6 +1419,7 @@ def _call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
+        nei_type: Array | None = None,
     ) -> tuple[Array, Array]:
         """Graph-native forward (``attn_layer=0`` only).
 
@@ -1413,6 +1437,14 @@ def _call_graph(
             (N,) flat node atom types (``N = sum(graph.n_node)``).
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
+        nei_type
+            (E,) per-edge neighbor (src) type override. ``None`` (geometry-native
+            builders) gathers the type from the local owner ``atype[src]``, which
+            equals the neighbor's extended type for every physical input (a ghost
+            is a periodic image of its owner). The dense-quartet bridge supplies
+            the neighbor's ACTUAL extended type ``atype_ext[neighbor]`` so it stays
+            byte-faithful to the dense path even for an arbitrary external
+            ``mapping`` where ``atype[src] != atype_ext[neighbor]``.
 
         Returns
         -------
@@ -1460,7 +1492,12 @@ def _call_graph(
         dst = graph.edge_index[1, :]
         atype = xp.asarray(atype, device=dev)
         center_type = xp.take(atype, dst, axis=0)  # (E,)
-        nei_type = xp.take(atype, src, axis=0)  # (E,)
+        if nei_type is None:
+            # geometry-native: owner type == neighbor extended type (consistent
+            # mapping). The dense bridge overrides with atype_ext[neighbor].
+            nei_type = xp.take(atype, src, axis=0)  # (E,)
+        else:
+            nei_type = xp.asarray(nei_type, device=dev)
         # per-edge env-mat 4-vector, normalized by the center (dst) atom type.
         # self.mean/self.stddev are slot-independent (ntypes, nnei, 4); slot 0 is
         # the canonical per-type vector.
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index aa9fc23575..4fc33dbcbe 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -143,3 +143,37 @@ def test_eligible_no_mapping_with_ghosts_falls_back(self) -> None:
         ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
         out = dd.call(ext_coord, ext_atype, nlist, mapping=None)  # must not IndexError
         np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
+
+    def test_inconsistent_mapping_stays_faithful_to_dense(self) -> None:
+        """The dense->graph bridge must reproduce the dense 5-tuple even when the
+        supplied ``mapping`` is INCONSISTENT (a ghost's extended type differs from
+        its local owner's type). Real periodic systems never produce this -- a
+        ghost is a periodic image of its owner -- but a synthetic external quartet
+        can (e.g. the permuted ``mapping`` in the universal descriptor fixture).
+        The dense path reads ``atype_ext[neighbor]`` directly, so the graph bridge
+        must too (regression: it used ``atype[mapping[neighbor]]`` and diverged).
+        """
+        dd = self._make([30])
+        box = np.eye(3, dtype=np.float64)[None] * 6.0
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=box,
+        )
+        assert ext_atype.shape[1] > self.nloc  # ghosts present
+        # Corrupt EVERY ghost's extended type so ghost type != owner type, making
+        # atype_ext[neighbor] != atype[mapping[neighbor]] for every ghost edge.
+        ext_atype = np.array(ext_atype, copy=True)
+        ext_atype[:, self.nloc :] = 1 - ext_atype[:, self.nloc :]
+        # the corruption must actually be exercised: some ghost must be a neighbor
+        ghost_in_nlist = np.any(nlist[nlist >= 0] >= self.nloc)
+        assert ghost_in_nlist, "test is vacuous: no ghost appears in the nlist"
+        # dense reference uses the corrupted atype_ext[neighbor] directly
+        ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
+        out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)
+        np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
+        np.testing.assert_allclose(out[1], ref[1], rtol=1e-12, atol=1e-12)
+        np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)

From 53ec9a0cecb439086c546cbc130a28b27176e584 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 10:13:13 +0800
Subject: [PATCH 34/69] fix(tests): correct invalid permuted mapping; revert
 graph nei_type to ghost-free

The test_exclude_types failure on #5583 was NOT a graph bug -- it was an
invalid test fixture. extend_coord_with_ghosts tiles the local atype and
aidx (nlist.py:369-370), so atype_ext[k] == atype[mapping[k]] is a hard
single-rank invariant: a ghost is a periodic image of its owner and shares
its type. The ghost-free graph (src = mapping[neighbor], nei_type =
atype[src]) is therefore correct.

The shared TestCaseSingleFrameWithNlist fixtures permuted `mapping` as
`mapping[:, perm]` WITHOUT remapping the local-index VALUES through
inv_perm (which they DO apply to nlist), desyncing atype_ext from mapping
and even breaking local self-mapping -- a quartet extend_coord_with_ghosts
can never emit. The ghost-free graph (correct) then diverged from dense on
this impossible input.

- Fix the 4 duplicated fixture copies: mapping permutation now mirrors the
  nlist one, `inv_perm[mapping[:, perm]]` (universal/common/cases/cases.py
  -- the copy the universal CI test imports -- plus common/dpmodel,
  common/test_mixins, pd/model/test_env_mat).
- Revert the per-edge nei_type channel (commit 99c707a51) back to the
  ghost-free atype[src] gather; it was treating a symptom of the bad fixture.
- Replace the now-invalid inconsistent-mapping regression test with
  test_single_rank_extension_keeps_type_invariant, which pins
  atype_ext[k]==atype[mapping[k]] on extend_coord_with_ghosts output.

Multi-rank (real halo ghosts with independent types) needs the extended
atype table + comm fold and is the documented PR-B follow-on.
---
 deepmd/dpmodel/descriptor/dpa1.py             | 41 +------------------
 .../dpmodel/case_single_frame_with_nlist.py   |  2 +-
 .../test_dpa1_call_graph_descriptor.py        | 31 +++++++-------
 source/tests/common/test_mixins.py            |  2 +-
 source/tests/pd/model/test_env_mat.py         |  2 +-
 source/tests/universal/common/cases/cases.py  |  2 +-
 6 files changed, 21 insertions(+), 59 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index fe74ee8287..1849e7a2a5 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -606,28 +606,10 @@ def _call_graph_adapter(
         )
         # local atom types, flat (nf * nloc,)
         atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
-        # per-edge neighbor type from the ACTUAL extended types (not atype[src]),
-        # so the dense bridge stays byte-faithful to the dense path for any
-        # mapping. Matches from_dense_quartet(compact=False) edge ordering:
-        # row-major (frame, center, slot), neighbor flat index = ff * nall + j.
-        nl_flat = xp.reshape(nlist, (-1,))  # (E,) neighbor ext idx or -1
-        valid = nl_flat >= 0
-        j_safe = xp.where(valid, nl_flat, xp.zeros_like(nl_flat))  # clamp -1 -> 0
-        ff = xp.reshape(
-            xp.broadcast_to(
-                xp.reshape(xp.arange(nf, dtype=nl_flat.dtype, device=dev), (nf, 1, 1)),
-                (nf, nloc, nnei),
-            ),
-            (-1,),
-        )
-        nei_type = xp.take(
-            xp.reshape(atype_ext, (nf * nall,)), ff * nall + j_safe, axis=0
-        )  # (E,)
         grrg, rot_mat = self.call_graph(
             graph,
             atype_local,
             type_embedding=self.type_embedding.call(),
-            nei_type=nei_type,
         )
         # reconstruct the dense-shaped sw the dense way (env_mat switch masked
         # where nlist == -1; the graph path forbids exclude_types, so nlist_mask
@@ -685,7 +667,6 @@ def call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
-        nei_type: Array | None = None,
     ) -> tuple[Array, Array]:
         """Descriptor-level graph-native forward (``attn_layer == 0``).
 
@@ -706,10 +687,6 @@ def call_graph(
             (nf * nloc,) flat LOCAL atom types.
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
-        nei_type
-            (E,) per-edge neighbor type override; see
-            :meth:`DescrptBlockSeAtten._call_graph`. ``None`` derives it from
-            ``atype[src]`` (correct for every physical input).
 
         Returns
         -------
@@ -721,7 +698,7 @@ def call_graph(
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
         grrg_node, rot_mat_node = self.se_atten._call_graph(
-            graph, atype, type_embedding=type_embedding, nei_type=nei_type
+            graph, atype, type_embedding=type_embedding
         )
         nf = graph.n_node.shape[0]
         # atype is the flat (nf*nloc,) node axis; derive nloc from the STATIC shape
@@ -1419,7 +1396,6 @@ def _call_graph(
         graph: Any,
         atype: Array,
         type_embedding: Array | None = None,
-        nei_type: Array | None = None,
     ) -> tuple[Array, Array]:
         """Graph-native forward (``attn_layer=0`` only).
 
@@ -1437,14 +1413,6 @@ def _call_graph(
             (N,) flat node atom types (``N = sum(graph.n_node)``).
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
-        nei_type
-            (E,) per-edge neighbor (src) type override. ``None`` (geometry-native
-            builders) gathers the type from the local owner ``atype[src]``, which
-            equals the neighbor's extended type for every physical input (a ghost
-            is a periodic image of its owner). The dense-quartet bridge supplies
-            the neighbor's ACTUAL extended type ``atype_ext[neighbor]`` so it stays
-            byte-faithful to the dense path even for an arbitrary external
-            ``mapping`` where ``atype[src] != atype_ext[neighbor]``.
 
         Returns
         -------
@@ -1492,12 +1460,7 @@ def _call_graph(
         dst = graph.edge_index[1, :]
         atype = xp.asarray(atype, device=dev)
         center_type = xp.take(atype, dst, axis=0)  # (E,)
-        if nei_type is None:
-            # geometry-native: owner type == neighbor extended type (consistent
-            # mapping). The dense bridge overrides with atype_ext[neighbor].
-            nei_type = xp.take(atype, src, axis=0)  # (E,)
-        else:
-            nei_type = xp.asarray(nei_type, device=dev)
+        nei_type = xp.take(atype, src, axis=0)  # (E,)
         # per-edge env-mat 4-vector, normalized by the center (dst) atom type.
         # self.mean/self.stddev are slot-independent (ntypes, nnei, 4); slot 0 is
         # the canonical per-type vector.
diff --git a/source/tests/common/dpmodel/case_single_frame_with_nlist.py b/source/tests/common/dpmodel/case_single_frame_with_nlist.py
index 7ec92a1de1..3995bd20f4 100644
--- a/source/tests/common/dpmodel/case_single_frame_with_nlist.py
+++ b/source/tests/common/dpmodel/case_single_frame_with_nlist.py
@@ -69,7 +69,7 @@ def setUp(self) -> None:
             [self.atype_ext, self.atype_ext[:, self.perm]], axis=0
         )
         self.mapping = np.concatenate(
-            [self.mapping, self.mapping[:, self.perm]], axis=0
+            [self.mapping, inv_perm[self.mapping[:, self.perm]]], axis=0
         )
         # permute the nlist
         nlist1 = self.nlist[:, self.perm[: self.nloc], :]
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index 4fc33dbcbe..423f8e0308 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -144,14 +144,15 @@ def test_eligible_no_mapping_with_ghosts_falls_back(self) -> None:
         out = dd.call(ext_coord, ext_atype, nlist, mapping=None)  # must not IndexError
         np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
 
-    def test_inconsistent_mapping_stays_faithful_to_dense(self) -> None:
-        """The dense->graph bridge must reproduce the dense 5-tuple even when the
-        supplied ``mapping`` is INCONSISTENT (a ghost's extended type differs from
-        its local owner's type). Real periodic systems never produce this -- a
-        ghost is a periodic image of its owner -- but a synthetic external quartet
-        can (e.g. the permuted ``mapping`` in the universal descriptor fixture).
-        The dense path reads ``atype_ext[neighbor]`` directly, so the graph bridge
-        must too (regression: it used ``atype[mapping[neighbor]]`` and diverged).
+    def test_single_rank_extension_keeps_type_invariant(self) -> None:
+        """The ghost-free graph types a neighbor as ``atype[mapping[neighbor]]``
+        (its local owner). This is correct because a real single-rank extension
+        is type-consistent: ``extend_coord_with_ghosts`` tiles the local atype, so
+        ``atype_ext[k] == atype[mapping[k]]`` for every extended atom -- a ghost is
+        a periodic image of its owner and shares its type. This test pins that
+        invariant (an inconsistent ``mapping`` like the universal fixture's old
+        buggy permutation is NOT a valid single-rank extension) and confirms the
+        graph-routed ``call`` matches dense on the resulting quartet.
         """
         dd = self._make([30])
         box = np.eye(3, dtype=np.float64)[None] * 6.0
@@ -164,14 +165,12 @@ def test_inconsistent_mapping_stays_faithful_to_dense(self) -> None:
             box=box,
         )
         assert ext_atype.shape[1] > self.nloc  # ghosts present
-        # Corrupt EVERY ghost's extended type so ghost type != owner type, making
-        # atype_ext[neighbor] != atype[mapping[neighbor]] for every ghost edge.
-        ext_atype = np.array(ext_atype, copy=True)
-        ext_atype[:, self.nloc :] = 1 - ext_atype[:, self.nloc :]
-        # the corruption must actually be exercised: some ghost must be a neighbor
-        ghost_in_nlist = np.any(nlist[nlist >= 0] >= self.nloc)
-        assert ghost_in_nlist, "test is vacuous: no ghost appears in the nlist"
-        # dense reference uses the corrupted atype_ext[neighbor] directly
+        # the single-rank type invariant the ghost-free graph relies on
+        nf, nall = ext_atype.shape
+        for f in range(nf):
+            np.testing.assert_array_equal(
+                ext_atype[f], ext_atype[f][mapping[f]]
+            )  # atype_ext[k] == atype[mapping[k]]
         ref = self._dense_reference(dd, ext_coord, ext_atype, nlist)
         out = dd.call(ext_coord, ext_atype, nlist, mapping=mapping)
         np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
diff --git a/source/tests/common/test_mixins.py b/source/tests/common/test_mixins.py
index e311baf5cf..5dd907ded4 100644
--- a/source/tests/common/test_mixins.py
+++ b/source/tests/common/test_mixins.py
@@ -54,7 +54,7 @@ def setUp(self) -> None:
             [self.atype_ext, self.atype_ext[:, self.perm]], axis=0
         )
         self.mapping = np.concatenate(
-            [self.mapping, self.mapping[:, self.perm]], axis=0
+            [self.mapping, inv_perm[self.mapping[:, self.perm]]], axis=0
         )
 
         # permute the nlist
diff --git a/source/tests/pd/model/test_env_mat.py b/source/tests/pd/model/test_env_mat.py
index bbdb7c75a3..b5b9e0bee6 100644
--- a/source/tests/pd/model/test_env_mat.py
+++ b/source/tests/pd/model/test_env_mat.py
@@ -63,7 +63,7 @@ def setUp(self) -> None:
             [self.atype_ext, self.atype_ext[:, self.perm]], axis=0
         )
         self.mapping = np.concatenate(
-            [self.mapping, self.mapping[:, self.perm]], axis=0
+            [self.mapping, inv_perm[self.mapping[:, self.perm]]], axis=0
         )
 
         # permute the nlist
diff --git a/source/tests/universal/common/cases/cases.py b/source/tests/universal/common/cases/cases.py
index d625ef0d35..421cbf4556 100644
--- a/source/tests/universal/common/cases/cases.py
+++ b/source/tests/universal/common/cases/cases.py
@@ -51,7 +51,7 @@ def setUp(self) -> None:
             [self.atype_ext, self.atype_ext[:, self.perm]], axis=0
         )
         self.mapping = np.concatenate(
-            [self.mapping, self.mapping[:, self.perm]], axis=0
+            [self.mapping, inv_perm[self.mapping[:, self.perm]]], axis=0
         )
         self.mock_descriptor = np.concatenate(
             [self.mock_descriptor, self.mock_descriptor[:, self.perm[: self.nloc], :]],

From 48fe4b16439b2e5c374d64add0c9035f0dc85feb Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 10:29:33 +0800
Subject: [PATCH 35/69] refactor(dpmodel): extract _finalize_atomic_ret from
 forward_common_atomic

---
 .../dpmodel/atomic_model/base_atomic_model.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index cf59af94db..b319d2c3dd 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -303,15 +303,25 @@ def forward_common_atomic(
             comm_dict=comm_dict,
             charge_spin=charge_spin,
         )
-        ret_dict = self.apply_out_stat(ret_dict, atype)
-
-        # nf x nloc
         atom_mask = xp_take_first_n(ext_atom_mask, 1, nloc)
+        return self._finalize_atomic_ret(ret_dict, atom_mask, atype)
+
+    def _finalize_atomic_ret(
+        self, ret_dict: dict, atom_mask: Array, atype: Array
+    ) -> dict:
+        """Out-stat + atom-exclusion + virtual-atom zeroing + ``mask`` key.
+
+        Shared by the dense (:meth:`forward_common_atomic`) and graph
+        (:meth:`forward_common_atomic_graph`) wrappers. ``atom_mask`` is the
+        (nf, nloc) real-atom mask (from ``make_atom_mask``); ``atype`` is the
+        (nf, nloc) LOCAL atom types (used for out-stat and ``atom_excl``).
+        """
+        xp = array_api_compat.array_namespace(atype)
+        ret_dict = self.apply_out_stat(ret_dict, atype)
         if self.atom_excl is not None:
             atom_mask = xp.logical_and(
                 atom_mask, self.atom_excl.build_type_exclude_mask(atype)
             )
-
         for kk in ret_dict.keys():
             out_shape = ret_dict[kk].shape
             out_shape2 = math.prod(out_shape[2:])
@@ -319,7 +329,6 @@ def forward_common_atomic(
             tmp_arr = xp.where(atom_mask[:, :, None], tmp_arr, xp.zeros_like(tmp_arr))
             ret_dict[kk] = xp.reshape(tmp_arr, out_shape)
         ret_dict["mask"] = xp.astype(atom_mask, xp.int32)
-
         return ret_dict
 
     def call(

From d136eea5abc4c048a457a43894ded4bee95b23f2 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 10:36:06 +0800
Subject: [PATCH 36/69] feat(dpmodel): add DPAtomicModel.forward_atomic_graph

---
 .../dpmodel/atomic_model/dp_atomic_model.py   | 46 +++++++++++++++++++
 .../dpmodel/test_graph_atomic_parity.py       | 30 ++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_graph_atomic_parity.py

diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
index a2b49f47e3..cf4052368e 100644
--- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -3,9 +3,15 @@
     Callable,
 )
 from typing import (
+    TYPE_CHECKING,
     Any,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        NeighborGraph,
+    )
+
 from deepmd.dpmodel.array_api import (
     Array,
     xp_take_first_n,
@@ -248,6 +254,46 @@ def forward_atomic(
         )
         return ret
 
+    def forward_atomic_graph(
+        self,
+        graph: "NeighborGraph",
+        atype: Array,
+        fparam: Array | None = None,
+        aparam: Array | None = None,
+    ) -> dict[str, Array]:
+        """Graph analogue of :meth:`forward_atomic`: descriptor ``call_graph`` ->
+        fitting. ``atype`` is flat LOCAL types (N,). Returns the raw fitting dict
+        (no reduction, no masking -- the wrapper handles those).
+
+        Parameters
+        ----------
+        graph
+            NeighborGraph for the local atoms (ghost-free).
+        atype
+            Flat local atom types, shape (nf*nloc,).
+        fparam
+            Frame parameters. nf x ndf
+        aparam
+            Atomic parameters. nf x nloc x nda
+
+        Returns
+        -------
+        result_dict
+            the result dict, defined by the `FittingOutputDef`.
+        """
+        import array_api_compat
+
+        xp = array_api_compat.array_namespace(graph.edge_vec)
+        nf = graph.n_node.shape[0]
+        nloc = atype.shape[0] // nf
+        descriptor = self.descriptor
+        type_embedding = descriptor.type_embedding.call()
+        gg, rot_mat = descriptor.call_graph(graph, atype, type_embedding=type_embedding)
+        atype_2d = xp.reshape(atype, (nf, nloc))
+        return self.fitting_net(
+            gg, atype_2d, gr=rot_mat, g2=None, h2=None, fparam=fparam, aparam=aparam
+        )
+
     def compute_or_load_stat(
         self,
         sampled_func: Callable[[], list[dict]],
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
new file mode 100644
index 0000000000..3cd454b37f
--- /dev/null
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import numpy as np
+
+from deepmd.dpmodel.atomic_model.dp_atomic_model import DPAtomicModel
+from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
+from deepmd.dpmodel.fitting import InvarFitting
+from deepmd.dpmodel.utils.neighbor_graph import from_dense_quartet
+from deepmd.dpmodel.utils.nlist import extend_input_and_build_neighbor_list
+
+
+def _atomic_model(sel=(30,), **kw):
+    ds = DescrptDPA1(
+        rcut=4.0, rcut_smth=0.5, sel=list(sel), ntypes=2, attn_layer=0, **kw
+    )
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    return DPAtomicModel(ds, ft, type_map=["a", "b"])
+
+
+def test_forward_atomic_graph_matches_dense():
+    rng = np.random.default_rng(0)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    am = _atomic_model()
+    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+        coord, atype, 4.0, [30], mixed_types=True, box=None
+    )
+    dense = am.forward_atomic(ext_coord, ext_atype, nlist, mapping=mapping)
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    graph = am.forward_atomic_graph(ng, atype.reshape(-1))
+    np.testing.assert_allclose(graph["energy"], dense["energy"], rtol=1e-12, atol=1e-12)

From 10af4727a7b8b3e03db513c35e60f82f866bb4f0 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 10:42:29 +0800
Subject: [PATCH 37/69] feat(dpmodel): add
 BaseAtomicModel.forward_common_atomic_graph

---
 .../dpmodel/atomic_model/base_atomic_model.py | 33 +++++++++++++++++++
 .../dpmodel/test_graph_atomic_parity.py       | 17 ++++++++++
 2 files changed, 50 insertions(+)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index b319d2c3dd..94d9340537 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -5,9 +5,15 @@
     Callable,
 )
 from typing import (
+    TYPE_CHECKING,
     Any,
 )
 
+if TYPE_CHECKING:
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        NeighborGraph,
+    )
+
 import array_api_compat
 import numpy as np
 
@@ -306,6 +312,33 @@ def forward_common_atomic(
         atom_mask = xp_take_first_n(ext_atom_mask, 1, nloc)
         return self._finalize_atomic_ret(ret_dict, atom_mask, atype)
 
+    def forward_common_atomic_graph(
+        self,
+        graph: "NeighborGraph",
+        atype: Array,
+        fparam: Array | None = None,
+        aparam: Array | None = None,
+    ) -> dict:
+        """Graph analogue of :meth:`forward_common_atomic`.
+
+        The graph is ghost-free
+        (atype is LOCAL), so masking/out-stat operate directly on the nloc atoms.
+        Reuses :meth:`_finalize_atomic_ret`, so virtual-atom masking, ``atom_excl``
+        and ``apply_out_stat`` match the dense path. (Pair ``exclude_types`` is not
+        supported on the graph path -- those models keep ``uses_graph_lower()==False``
+        and route to the dense path.)
+        """
+        xp = array_api_compat.array_namespace(graph.edge_vec)
+        nf = graph.n_node.shape[0]
+        nloc = atype.shape[0] // nf
+        atype_2d = xp.reshape(atype, (nf, nloc))
+        atom_mask = self.make_atom_mask(atype_2d)
+        atype_clamped = xp.where(atom_mask, atype_2d, xp.zeros_like(atype_2d))
+        ret_dict = self.forward_atomic_graph(
+            graph, xp.reshape(atype_clamped, (nf * nloc,)), fparam=fparam, aparam=aparam
+        )
+        return self._finalize_atomic_ret(ret_dict, atom_mask, atype_2d)
+
     def _finalize_atomic_ret(
         self, ret_dict: dict, atom_mask: Array, atype: Array
     ) -> dict:
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 3cd454b37f..d4191c98d6 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -28,3 +28,20 @@ def test_forward_atomic_graph_matches_dense():
     ng = from_dense_quartet(ext_coord, nlist, mapping)
     graph = am.forward_atomic_graph(ng, atype.reshape(-1))
     np.testing.assert_allclose(graph["energy"], dense["energy"], rtol=1e-12, atol=1e-12)
+
+
+def test_forward_common_atomic_graph_matches_dense():
+    rng = np.random.default_rng(1)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    am = _atomic_model()
+    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+        coord, atype, 4.0, [30], mixed_types=True, box=None
+    )
+    dense = am.forward_common_atomic(ext_coord, ext_atype, nlist, mapping=mapping)
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    graph = am.forward_common_atomic_graph(ng, atype.reshape(-1))
+    for k in ("energy", "mask"):
+        np.testing.assert_allclose(
+            np.asarray(graph[k]), np.asarray(dense[k]), rtol=1e-12, atol=1e-12
+        )

From e182ed9a09d9d0f72f6220a41a04817426619415 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 10:51:48 +0800
Subject: [PATCH 38/69] refactor(dpmodel): call_lower_graph reuses
 forward_common_atomic_graph

---
 deepmd/dpmodel/model/make_model.py | 32 ++++++++++++------------------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 7c67b49c15..97ee0e2b10 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -662,28 +662,22 @@ def call_lower_graph(
                 reduced, derivative name-holders ``None``), matching
                 :func:`fit_output_to_model_output`.
             """
-            xp = array_api_compat.array_namespace(edge_vec)
-            fit_ret, gg, atom_mask = self._graph_descriptor_fitting(
-                atype,
-                n_node,
-                edge_index,
-                edge_vec,
-                edge_mask,
-                fparam=fparam,
-                aparam=aparam,
+            graph = NeighborGraph(
+                n_node=n_node,
+                edge_index=edge_index,
+                edge_vec=edge_vec,
+                edge_mask=edge_mask,
             )
-            # int mask of real atoms (matches the dense base_atomic_model).
-            mask = xp.astype(atom_mask, xp.int32)
-            model_predict = fit_output_to_model_output(
-                fit_ret,
-                self.atomic_model.fitting_output_def(),
-                gg,
+            atomic_ret = self.atomic_model.forward_common_atomic_graph(
+                graph, atype, fparam=fparam, aparam=aparam
+            )
+            return fit_output_to_model_output(
+                atomic_ret,
+                self.atomic_output_def(),
+                edge_vec,
                 do_atomic_virial=False,
-                mask=mask,
+                mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
-            # fit_output_to_model_output does not add "mask"; set it here.
-            model_predict["mask"] = mask
-            return model_predict
 
         call = call_common
         call_lower = call_common_lower

From f597547e6e65b289489afe1feb35940068b4f46e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 11:00:47 +0800
Subject: [PATCH 39/69] refactor(pt_expt): forward_common_lower_graph reuses
 forward_common_atomic_graph; drop _graph_descriptor_fitting

---
 deepmd/dpmodel/model/make_model.py | 78 ------------------------------
 deepmd/pt_expt/model/make_model.py | 33 ++++++-------
 2 files changed, 16 insertions(+), 95 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 97ee0e2b10..0c68a90662 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -529,84 +529,6 @@ def forward_common_atomic(
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
-        def _graph_descriptor_fitting(
-            self,
-            atype: Array,
-            n_node: Array,
-            edge_index: Array,
-            edge_vec: Array,
-            edge_mask: Array,
-            fparam: Array | None = None,
-            aparam: Array | None = None,
-        ) -> tuple[dict[str, Array], Array, Array]:
-            """Run the graph descriptor + fitting forward.
-
-            Returns the raw rectangular ``fit_ret`` (``(nf, nloc, *shape)`` per
-            fitting output), the descriptor output ``gg`` (used by callers as
-            the array-namespace carrier), and the ``(nf, nloc)`` boolean
-            ``atom_mask`` (True for real atoms, False for virtual ``atype<0``).
-            ``edge_vec`` is consumed as-is so that callers can make it the
-            autograd leaf (pt_expt) before invoking this.
-
-            Virtual atoms (``atype < 0``) are masked exactly like the dense
-            :meth:`base_atomic_model.forward_common_atomic`: the atype fed to
-            the descriptor/fitting is clamped to 0 (so ``take`` never sees a
-            negative index) and the per-atom ``fit_ret`` of masked atoms is
-            zeroed BEFORE any reduction, so virtual atoms contribute no
-            type-embedding/bias energy.
-            """
-            xp = array_api_compat.array_namespace(edge_vec)
-            dev = array_api_compat.device(edge_vec)
-            graph = NeighborGraph(
-                n_node=n_node,
-                edge_index=edge_index,
-                edge_vec=edge_vec,
-                edge_mask=edge_mask,
-            )
-            nf = n_node.shape[0]
-            nloc = int(n_node[0])
-            descriptor = self.atomic_model.descriptor
-            fitting_net = self.atomic_model.fitting_net
-            atype = xp.asarray(atype, device=dev)
-            atype_2d = xp.reshape(atype, (nf, nloc))
-            # virtual-atom mask (True for real atoms); mirror the dense
-            # base_atomic_model.make_atom_mask (atype >= 0).
-            make_atom_mask = getattr(self.atomic_model, "make_atom_mask", None)
-            if make_atom_mask is not None:
-                atom_mask = make_atom_mask(atype_2d)
-            else:
-                atom_mask = atype_2d >= 0
-            # clamp negative (virtual) types to 0 so take(...) never indexes
-            # out of range; virtual atoms have no edges so this only touches
-            # their (subsequently zeroed) node entries.
-            zeros_atype = xp.zeros_like(atype)
-            atype_safe = xp.where(atype >= 0, atype, zeros_atype)
-            atype_2d_safe = xp.reshape(atype_safe, (nf, nloc))
-            # dpa1 call_graph requires the type-embedding table explicitly
-            type_embedding = descriptor.type_embedding.call()
-            gg, rot_mat = descriptor.call_graph(
-                graph, atype_safe, type_embedding=type_embedding
-            )
-            g2 = h2 = None
-            fit_ret = fitting_net(
-                gg,
-                atype_2d_safe,
-                gr=rot_mat,
-                g2=g2,
-                h2=h2,
-                fparam=fparam,
-                aparam=aparam,
-            )
-            # zero the per-atom output of masked (virtual) atoms BEFORE the
-            # reduction -- mirror base_atomic_model lines 315-320.
-            for kk in fit_ret.keys():
-                vv = fit_ret[kk]
-                out_shape = vv.shape
-                flat = xp.reshape(vv, (out_shape[0], out_shape[1], -1))
-                flat = xp.where(atom_mask[:, :, None], flat, xp.zeros_like(flat))
-                fit_ret[kk] = xp.reshape(flat, out_shape)
-            return fit_ret, gg, atom_mask
-
         def call_lower_graph(
             self,
             atype: Array,
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index e340ba5cc8..d8f0e84eb2 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -295,8 +295,8 @@ def forward_common_lower_graph(
 
             OUTPUT-AGNOSTIC: runs the graph descriptor + fitting forward with
             ``edge_vec`` as the autograd leaf (via the inherited
-            :meth:`_graph_descriptor_fitting`), then routes the raw rectangular
-            ``fit_ret`` through :func:`fit_output_to_model_output_graph`, which
+            :meth:`forward_common_atomic_graph`), then routes the raw rectangular
+            ``atomic_ret`` through :func:`fit_output_to_model_output_graph`, which
             reduces EVERY reducible output and assembles force / per-frame virial
             / (optional) atom-virial for every ``r_differentiable`` output from a
             backward pass w.r.t. ``edge_vec`` (the shared full-to-``src`` scatter).
@@ -341,34 +341,33 @@ def forward_common_lower_graph(
                 ``<var>_derv_c_redu`` (nf, *shape, 9), and -- when
                 ``do_atomic_virial`` -- ``<var>_derv_c`` (nf, nloc, *shape, 9).
             """
+            from deepmd.dpmodel.utils.neighbor_graph import NeighborGraph
+
             # make edge_vec the autograd leaf for the energy backward
             edge_vec = edge_vec.detach().requires_grad_(True)
-            fit_ret, _, atom_mask = self._graph_descriptor_fitting(
+            graph = NeighborGraph(
+                n_node=n_node,
+                edge_index=edge_index,
+                edge_vec=edge_vec,
+                edge_mask=edge_mask,
+            )
+            atomic_ret = self.atomic_model.forward_common_atomic_graph(
+                graph,
                 atype,
-                n_node,
-                edge_index,
-                edge_vec,
-                edge_mask,
                 fparam=fparam,
                 aparam=aparam,
             )
-            # int mask of real atoms (virtual atype<0 already zeroed in
-            # _graph_descriptor_fitting); matches the dense base_atomic_model.
-            mask = atom_mask.to(torch.int32)
-            model_predict = fit_output_to_model_output_graph(
-                fit_ret,
-                self.atomic_model.fitting_output_def(),
+            return fit_output_to_model_output_graph(
+                atomic_ret,
+                self.atomic_output_def(),
                 edge_vec,
                 edge_index,
                 edge_mask,
                 n_node,
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
-                mask=mask,
+                mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
-            # fit_output_to_model_output_graph does not add "mask"; set it here.
-            model_predict["mask"] = mask
-            return model_predict
 
         def _resolve_graph_method(
             self, neighbor_graph_method: str | None

From 15fa2455f2f19e946dd8324063d4ef90c5af4a8f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 11:09:08 +0800
Subject: [PATCH 40/69] test(dpmodel): feature-flag graph-vs-dense parity
 matrix + protection + exclude fallback

---
 .../tests/common/dpmodel/test_edge_env_mat.py | 60 +++++++++++++++++++
 .../dpmodel/test_graph_atomic_parity.py       | 47 +++++++++++++++
 2 files changed, 107 insertions(+)

diff --git a/source/tests/common/dpmodel/test_edge_env_mat.py b/source/tests/common/dpmodel/test_edge_env_mat.py
index ca6884de39..f70a44e0fd 100644
--- a/source/tests/common/dpmodel/test_edge_env_mat.py
+++ b/source/tests/common/dpmodel/test_edge_env_mat.py
@@ -2,6 +2,7 @@
 import unittest
 
 import numpy as np
+import pytest
 
 from deepmd.dpmodel.utils.env_mat import (
     EnvMat,
@@ -105,3 +106,62 @@ def test_slot_broadcast_stats(self) -> None:
                 atol=0,
                 err_msg=f"stddev slot {k} != slot 0",
             )
+
+
+# ── Protection parity (Task 6) ────────────────────────────────────────────────
+
+
+@pytest.mark.parametrize("protection", [0.0, 1e-2])  # env-mat protection offset
+def test_edge_env_mat_protection_parity(protection):
+    """edge_env_mat(protection=p, edge_mask=...) must match EnvMat(protection=p).call slice."""
+    rng = np.random.default_rng(7)
+    rcut, rcut_smth = 4.0, 0.5
+    nf, nloc, nnei = 1, 4, 6
+    nt = 2
+
+    ext_coord = rng.normal(size=(nf, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+
+    # Each atom lists its nloc-1 = 3 other atoms; slots 3-5 stay padding (-1).
+    nlist = -np.ones((nf, nloc, nnei), dtype=np.int64)
+    for i in range(nloc):
+        ns = [j for j in range(nloc) if j != i][:nnei]
+        nlist[0, i, : len(ns)] = ns
+    mapping = np.arange(nloc, dtype=np.int64)[None]
+
+    davg = rng.normal(size=(nt, 4))
+    dstd = np.abs(rng.normal(size=(nt, 4))) + 0.5
+
+    # ── dense reference (EnvMat.call) ──────────────────────────────────────
+    davg_dense = np.broadcast_to(davg[:, None, :], (nt, nnei, 4)).copy()
+    dstd_dense = np.broadcast_to(dstd[:, None, :], (nt, nnei, 4)).copy()
+    dmat, _, _ = EnvMat(rcut, rcut_smth, protection=protection).call(
+        ext_coord, atype, nlist, davg_dense, dstd_dense
+    )
+
+    # ── graph path (edge_env_mat with edge_mask) ───────────────────────────
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    center_type = atype.reshape(-1)[ng.edge_index[1]]
+    em = edge_env_mat(
+        ng.edge_vec,
+        center_type,
+        davg,
+        dstd,
+        rcut,
+        rcut_smth,
+        protection=protection,
+        edge_mask=ng.edge_mask,
+    )
+
+    # Compare valid edges only, matched to their dense (frame, dst, slot) position.
+    ei = ng.edge_index[:, ng.edge_mask]
+    for k in range(ei.shape[1]):
+        src, dst = int(ei[0, k]), int(ei[1, k])
+        slot = list(nlist[0, dst]).index(src)
+        np.testing.assert_allclose(
+            em[ng.edge_mask][k],
+            dmat[0, dst, slot],
+            rtol=1e-12,
+            atol=1e-12,
+            err_msg=f"protection={protection}, edge {k} (src={src}, dst={dst}, slot={slot})",
+        )
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index d4191c98d6..4fd2eb3727 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import numpy as np
+import pytest
 
 from deepmd.dpmodel.atomic_model.dp_atomic_model import DPAtomicModel
 from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
 from deepmd.dpmodel.fitting import InvarFitting
+from deepmd.dpmodel.model.ener_model import EnergyModel
 from deepmd.dpmodel.utils.neighbor_graph import from_dense_quartet
 from deepmd.dpmodel.utils.nlist import extend_input_and_build_neighbor_list
 
@@ -45,3 +47,48 @@ def test_forward_common_atomic_graph_matches_dense():
         np.testing.assert_allclose(
             np.asarray(graph[k]), np.asarray(dense[k]), rtol=1e-12, atol=1e-12
         )
+
+
+# ── Feature-flag parity matrix (Task 6) ──────────────────────────────────────
+
+
+def _ener_model(sel, type_one_side=False, exclude_types=None):
+    ds = DescrptDPA1(
+        rcut=4.0,
+        rcut_smth=0.5,
+        sel=list(sel),
+        ntypes=2,
+        attn_layer=0,
+        type_one_side=type_one_side,
+        exclude_types=exclude_types or [],
+    )
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    return EnergyModel(ds, ft, type_map=["a", "b"])
+
+
+@pytest.mark.parametrize("virtual", [False, True])  # one local atype == -1
+@pytest.mark.parametrize("type_one_side", [False, True])  # tebd concat content
+@pytest.mark.parametrize("nf", [1, 2])  # single- and multi-frame
+def test_graph_matches_dense_over_flags(virtual, type_one_side, nf):
+    rng = np.random.default_rng(2)
+    nloc = 6
+    coord = rng.normal(size=(nf, nloc, 3)) * 1.5
+    atype = np.tile(np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64), (nf, 1))
+    if virtual:
+        atype[:, -1] = -1  # mark one local atom virtual
+    box = np.tile(np.eye(3).reshape(1, 9) * 20.0, (nf, 1))
+    model = _ener_model([200], type_one_side=type_one_side)  # non-binding sel
+    g = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    d = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    for k in ("energy", "energy_redu", "mask"):
+        np.testing.assert_allclose(
+            np.asarray(g[k]), np.asarray(d[k]), rtol=1e-12, atol=1e-12
+        )
+    if virtual:
+        assert int(np.asarray(g["mask"])[0, -1]) == 0  # virtual atom masked
+
+
+def test_pair_exclude_types_falls_back_to_dense():
+    """Pair exclude_types is unsupported on the graph -> uses_graph_lower False."""
+    m = _ener_model([30], exclude_types=[(0, 1)])
+    assert m.atomic_model.descriptor.uses_graph_lower() is False

From 52997c14b9ff627ebc7bfa7e9a6caf89f6290a39 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 11:15:52 +0800
Subject: [PATCH 41/69] test(dpmodel): guard protection env-mat test against
 vacuous pass

---
 .../tests/common/dpmodel/test_edge_env_mat.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/source/tests/common/dpmodel/test_edge_env_mat.py b/source/tests/common/dpmodel/test_edge_env_mat.py
index f70a44e0fd..fd7a51deeb 100644
--- a/source/tests/common/dpmodel/test_edge_env_mat.py
+++ b/source/tests/common/dpmodel/test_edge_env_mat.py
@@ -165,3 +165,60 @@ def test_edge_env_mat_protection_parity(protection):
             atol=1e-12,
             err_msg=f"protection={protection}, edge {k} (src={src}, dst={dst}, slot={slot})",
         )
+
+
+def test_protection_actually_changes_env_mat() -> None:
+    """Guard against a vacuous pass: verify that changing the ``protection``
+    parameter actually modifies the edge_env_mat output. If this test fails
+    (outputs identical for protection=0 and protection=1e-2), protection is
+    silently ignored and the parity test cannot validate the protection path.
+    """
+    rng = np.random.default_rng(7)
+    rcut, rcut_smth = 4.0, 0.5
+    nf, nloc, nnei = 1, 4, 6
+    nt = 2
+
+    ext_coord = rng.normal(size=(nf, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1]], dtype=np.int64)
+
+    # Each atom lists its nloc-1 = 3 other atoms; slots 3-5 stay padding (-1).
+    nlist = -np.ones((nf, nloc, nnei), dtype=np.int64)
+    for i in range(nloc):
+        ns = [j for j in range(nloc) if j != i][:nnei]
+        nlist[0, i, : len(ns)] = ns
+    mapping = np.arange(nloc, dtype=np.int64)[None]
+
+    davg = rng.normal(size=(nt, 4))
+    dstd = np.abs(rng.normal(size=(nt, 4))) + 0.5
+
+    # Build the graph once
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    center_type = atype.reshape(-1)[ng.edge_index[1]]
+
+    # Evaluate edge_env_mat with two different protection values
+    em_p0 = edge_env_mat(
+        ng.edge_vec,
+        center_type,
+        davg,
+        dstd,
+        rcut,
+        rcut_smth,
+        protection=0.0,
+        edge_mask=ng.edge_mask,
+    )
+    em_p1 = edge_env_mat(
+        ng.edge_vec,
+        center_type,
+        davg,
+        dstd,
+        rcut,
+        rcut_smth,
+        protection=1e-2,
+        edge_mask=ng.edge_mask,
+    )
+
+    # Assert they differ: protection must affect the output
+    assert not np.allclose(em_p0, em_p1), (
+        "protection parameter has no effect on edge_env_mat output; "
+        "parity test cannot validate protection path"
+    )

From 5b0ec90e2964817583f3e5c7158dfb22699491c6 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 11:19:56 +0800
Subject: [PATCH 42/69] test(dpmodel): graph applies out-stat (out-bias)
 identically to dense

---
 .../dpmodel/test_graph_atomic_parity.py       | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 4fd2eb3727..0d1c87acb6 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -92,3 +92,29 @@ def test_pair_exclude_types_falls_back_to_dense():
     """Pair exclude_types is unsupported on the graph -> uses_graph_lower False."""
     m = _ener_model([30], exclude_types=[(0, 1)])
     assert m.atomic_model.descriptor.uses_graph_lower() is False
+
+
+def test_graph_matches_dense_with_out_bias():
+    """The graph path applies apply_out_stat (per-type out-bias) identically
+    to the dense path. With a non-zero bias, graph == dense at 1e-12, and the
+    bias actually shifts the graph energy (non-vacuous).
+    """
+    rng = np.random.default_rng(3)
+    nloc = 5
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    box = np.eye(3).reshape(1, 9) * 20.0
+    model = _ener_model([200])
+    # energy BEFORE setting bias (zero out-bias), graph path
+    g_zero = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    # set a non-zero per-type energy out-bias
+    model.atomic_model.out_bias[0, :, 0] = np.array([0.3, -0.7])
+    g = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    d = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    # graph applies out-stat exactly like dense
+    for k in ("energy", "energy_redu"):
+        np.testing.assert_allclose(
+            np.asarray(g[k]), np.asarray(d[k]), rtol=1e-12, atol=1e-12
+        )
+    # non-vacuous: the bias actually shifted the graph energy
+    assert not np.allclose(np.asarray(g["energy"]), np.asarray(g_zero["energy"]))

From 2c82dafa9026d7cfe9e39254619dda5624ad33f5 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 11:37:10 +0800
Subject: [PATCH 43/69] fix: gate model-level pair_exclude_types out of the
 graph path + tests

---
 .../dpmodel/atomic_model/base_atomic_model.py | 10 ++-
 .../dpmodel/atomic_model/dp_atomic_model.py   |  5 ++
 deepmd/dpmodel/model/make_model.py            |  8 +-
 deepmd/pt_expt/model/make_model.py            |  3 +-
 .../dpmodel/test_graph_atomic_parity.py       | 77 +++++++++++++++++++
 5 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 94d9340537..0a0857c98c 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -324,9 +324,13 @@ def forward_common_atomic_graph(
         The graph is ghost-free
         (atype is LOCAL), so masking/out-stat operate directly on the nloc atoms.
         Reuses :meth:`_finalize_atomic_ret`, so virtual-atom masking, ``atom_excl``
-        and ``apply_out_stat`` match the dense path. (Pair ``exclude_types`` is not
-        supported on the graph path -- those models keep ``uses_graph_lower()==False``
-        and route to the dense path.)
+        and ``apply_out_stat`` match the dense path.
+
+        Models with model-level ``pair_exclude_types`` are gated OUT of the graph
+        path by the model routing logic (``_resolve_graph_method`` in pt_expt and
+        ``_call_common_graph`` in dpmodel both require ``pair_excl is None``);
+        descriptor-level ``exclude_types`` is gated by ``uses_graph_lower()==False``
+        on the descriptor itself.
         """
         xp = array_api_compat.array_namespace(graph.edge_vec)
         nf = graph.n_node.shape[0]
diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
index cf4052368e..c72317a152 100644
--- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -265,6 +265,11 @@ def forward_atomic_graph(
         fitting. ``atype`` is flat LOCAL types (N,). Returns the raw fitting dict
         (no reduction, no masking -- the wrapper handles those).
 
+        This method calls ``self.descriptor.type_embedding.call()`` internally and
+        is therefore valid only for graph-eligible descriptors (e.g. DPA1 with a
+        type embedding).  The graph routing (``_resolve_graph_method`` /
+        ``_call_common_graph``) guarantees this via ``uses_graph_lower()==True``.
+
         Parameters
         ----------
         graph
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 0c68a90662..49c44f249e 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -392,10 +392,12 @@ def _call_common_graph(
             """
             descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
-            if not (self.mixed_types() and uses_graph_lower()):
+            pair_excl = getattr(self.atomic_model, "pair_excl", None)
+            if not (self.mixed_types() and uses_graph_lower() and pair_excl is None):
                 raise NotImplementedError(
-                    "neighbor_graph_method requires a mixed_types descriptor "
-                    "with a graph lower (e.g. dpa1 attn_layer=0)"
+                    "neighbor_graph_method requires a mixed_types descriptor with a "
+                    "graph lower (e.g. dpa1 attn_layer=0) and no model-level "
+                    "pair_exclude_types (pair exclusion is not supported on the graph path)"
                 )
             if method == "dense":
                 ng = build_neighbor_graph(cc, atype, bb, self.get_rcut())
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index d8f0e84eb2..fc4224d564 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -385,7 +385,8 @@ def _resolve_graph_method(
             # Linear/ZBL atomic models have no single ``descriptor`` -> dense.
             descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
-            if self.mixed_types() and uses_graph_lower():
+            pair_excl = getattr(self.atomic_model, "pair_excl", None)
+            if self.mixed_types() and uses_graph_lower() and pair_excl is None:
                 return "dense"
             return None
 
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 0d1c87acb6..375bf1f78d 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -94,6 +94,83 @@ def test_pair_exclude_types_falls_back_to_dense():
     assert m.atomic_model.descriptor.uses_graph_lower() is False
 
 
+def test_model_pair_exclude_types_disables_graph():
+    """A dpa1(attn_layer=0) model WITH model-level pair_exclude_types must NOT
+    route to the graph path: the explicit dpmodel graph request must raise
+    NotImplementedError (pair exclusion is unsupported on the graph path).
+    This also documents that without the gate, the old predicate (uses_graph_lower
+    only) would have WRONGLY allowed the graph, producing silent wrong energies.
+    """
+    ds = DescrptDPA1(
+        rcut=4.0,
+        rcut_smth=0.5,
+        sel=[30],
+        ntypes=2,
+        attn_layer=0,
+        # no descriptor-level exclude_types -> uses_graph_lower() == True
+    )
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    am = DPAtomicModel(ds, ft, type_map=["a", "b"], pair_exclude_types=[(0, 1)])
+    model = EnergyModel(atomic_model_=am)
+    # Preconditions that document the old-gate failure mode:
+    assert model.atomic_model.pair_excl is not None, (
+        "pair_excl must be set so the old gate would have wrongly allowed graph"
+    )
+    assert model.atomic_model.descriptor.uses_graph_lower() is True, (
+        "uses_graph_lower must be True so the old gate would have wrongly allowed graph"
+    )
+    # The fixed gate must raise when pair_exclude_types is present:
+    rng = np.random.default_rng(10)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    box = np.eye(3).reshape(1, 9) * 20.0
+    with pytest.raises(NotImplementedError, match="pair_exclude_types"):
+        model.call_common(coord, atype, box, neighbor_graph_method="dense")
+
+
+def test_graph_matches_dense_with_atom_exclude():
+    """Model-level atom_exclude_types IS supported on the graph path (applied
+    via _finalize_atomic_ret's atom_excl).  Graph == dense at rtol/atol 1e-12.
+    Also proves atom-level exclusion is correctly inherited and non-vacuous.
+    """
+    rng = np.random.default_rng(11)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+    box = np.eye(3).reshape(1, 9) * 20.0
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    am = DPAtomicModel(ds, ft, type_map=["a", "b"], atom_exclude_types=[0])
+    model = EnergyModel(atomic_model_=am)
+    g = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    d = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    for k in ("energy", "energy_redu", "mask"):
+        g_arr = np.asarray(g[k])
+        d_arr = np.asarray(d[k])
+        max_diff = float(np.max(np.abs(g_arr - d_arr)))
+        np.testing.assert_allclose(
+            g_arr,
+            d_arr,
+            rtol=1e-12,
+            atol=1e-12,
+            err_msg=f"graph vs dense mismatch for '{k}': max_diff={max_diff}",
+        )
+    # non-vacuous: type-0 atoms have zero energy (excluded), type-1 have nonzero
+    g_energy = np.asarray(g["energy"])
+    g_mask = np.asarray(g["mask"])
+    type0_indices = atype[0] == 0
+    assert np.allclose(g_energy[0, type0_indices], 0.0), (
+        "excluded type-0 atoms must have zero energy"
+    )
+    assert not np.allclose(g_energy[0, ~type0_indices], 0.0), (
+        "non-excluded type-1 atoms must have nonzero energy"
+    )
+    # also check mask: excluded type-0 atoms should have mask==0
+    assert np.all(g_mask[0, type0_indices] == 0), (
+        "excluded type-0 atoms must have mask==0"
+    )
+
+
 def test_graph_matches_dense_with_out_bias():
     """The graph path applies apply_out_stat (per-type out-bias) identically
     to the dense path. With a non-zero bias, graph == dense at 1e-12, and the

From e04bdc209728c43453dfb6874b3d6ca874837b48 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 26 Jun 2026 03:39:45 +0000
Subject: [PATCH 44/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt_expt/model/make_model.py            |  4 +++-
 .../dpmodel/test_graph_atomic_parity.py       | 24 ++++++++++++++-----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index fc4224d564..474e90b71c 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -341,7 +341,9 @@ def forward_common_lower_graph(
                 ``<var>_derv_c_redu`` (nf, *shape, 9), and -- when
                 ``do_atomic_virial`` -- ``<var>_derv_c`` (nf, nloc, *shape, 9).
             """
-            from deepmd.dpmodel.utils.neighbor_graph import NeighborGraph
+            from deepmd.dpmodel.utils.neighbor_graph import (
+                NeighborGraph,
+            )
 
             # make edge_vec the autograd leaf for the energy backward
             edge_vec = edge_vec.detach().requires_grad_(True)
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 375bf1f78d..7a212ddd69 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -2,12 +2,24 @@
 import numpy as np
 import pytest
 
-from deepmd.dpmodel.atomic_model.dp_atomic_model import DPAtomicModel
-from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
-from deepmd.dpmodel.fitting import InvarFitting
-from deepmd.dpmodel.model.ener_model import EnergyModel
-from deepmd.dpmodel.utils.neighbor_graph import from_dense_quartet
-from deepmd.dpmodel.utils.nlist import extend_input_and_build_neighbor_list
+from deepmd.dpmodel.atomic_model.dp_atomic_model import (
+    DPAtomicModel,
+)
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.fitting import (
+    InvarFitting,
+)
+from deepmd.dpmodel.model.ener_model import (
+    EnergyModel,
+)
+from deepmd.dpmodel.utils.neighbor_graph import (
+    from_dense_quartet,
+)
+from deepmd.dpmodel.utils.nlist import (
+    extend_input_and_build_neighbor_list,
+)
 
 
 def _atomic_model(sel=(30,), **kw):

From 051f66ba203e185756fa3483941297b450c3e81f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 12:41:19 +0800
Subject: [PATCH 45/69] docs(dpmodel): align graph atomic-wrapper docstrings
 with NumPy style

forward_atomic_graph / forward_common_atomic_graph / _finalize_atomic_ret
now use the project's NumPy-style docstrings (concise summary + Parameters
+ Returns sections, lowercase descriptions, nf x nloc shapes), matching
forward_atomic / forward_common_atomic.
---
 .../dpmodel/atomic_model/base_atomic_model.py | 53 ++++++++++++++-----
 .../dpmodel/atomic_model/dp_atomic_model.py   | 23 ++++----
 2 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 0a0857c98c..d3217ff6a1 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -321,16 +321,31 @@ def forward_common_atomic_graph(
     ) -> dict:
         """Graph analogue of :meth:`forward_common_atomic`.
 
-        The graph is ghost-free
-        (atype is LOCAL), so masking/out-stat operate directly on the nloc atoms.
-        Reuses :meth:`_finalize_atomic_ret`, so virtual-atom masking, ``atom_excl``
-        and ``apply_out_stat`` match the dense path.
-
-        Models with model-level ``pair_exclude_types`` are gated OUT of the graph
-        path by the model routing logic (``_resolve_graph_method`` in pt_expt and
-        ``_call_common_graph`` in dpmodel both require ``pair_excl is None``);
+        The graph is ghost-free (``atype`` is local), so masking and out-stat
+        operate directly on the ``nloc`` atoms. Reuses :meth:`_finalize_atomic_ret`,
+        so virtual-atom masking, ``atom_excl`` and ``apply_out_stat`` match the
+        dense path. Models with model-level ``pair_exclude_types`` are gated out of
+        the graph path by the model routing (``_resolve_graph_method`` in pt_expt
+        and ``_call_common_graph`` in dpmodel both require ``pair_excl is None``);
         descriptor-level ``exclude_types`` is gated by ``uses_graph_lower()==False``
         on the descriptor itself.
+
+        Parameters
+        ----------
+        graph
+            neighbor graph for the local atoms (ghost-free)
+        atype
+            flat local atom types. nf * nloc
+        fparam
+            frame parameter. nf x ndf
+        aparam
+            atomic parameter. nf x nloc x nda
+
+        Returns
+        -------
+        result_dict
+            the result dict, defined by the `FittingOutputDef`.
+
         """
         xp = array_api_compat.array_namespace(graph.edge_vec)
         nf = graph.n_node.shape[0]
@@ -346,12 +361,26 @@ def forward_common_atomic_graph(
     def _finalize_atomic_ret(
         self, ret_dict: dict, atom_mask: Array, atype: Array
     ) -> dict:
-        """Out-stat + atom-exclusion + virtual-atom zeroing + ``mask`` key.
+        """Apply out-stat, atom exclusion and virtual-atom zeroing; set ``mask``.
 
         Shared by the dense (:meth:`forward_common_atomic`) and graph
-        (:meth:`forward_common_atomic_graph`) wrappers. ``atom_mask`` is the
-        (nf, nloc) real-atom mask (from ``make_atom_mask``); ``atype`` is the
-        (nf, nloc) LOCAL atom types (used for out-stat and ``atom_excl``).
+        (:meth:`forward_common_atomic_graph`) wrappers.
+
+        Parameters
+        ----------
+        ret_dict
+            raw per-atom result dict from ``forward_atomic`` / ``forward_atomic_graph``
+        atom_mask
+            the real-atom mask, True for real and False for virtual atoms. nf x nloc
+        atype
+            the local atom types, used for out-stat and ``atom_excl``. nf x nloc
+
+        Returns
+        -------
+        result_dict
+            ``ret_dict`` with out-stat applied, virtual and excluded atoms zeroed,
+            and the integer ``mask`` key set.
+
         """
         xp = array_api_compat.array_namespace(atype)
         ret_dict = self.apply_out_stat(ret_dict, atype)
diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
index c72317a152..6f0ee97fcf 100644
--- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -261,30 +261,31 @@ def forward_atomic_graph(
         fparam: Array | None = None,
         aparam: Array | None = None,
     ) -> dict[str, Array]:
-        """Graph analogue of :meth:`forward_atomic`: descriptor ``call_graph`` ->
-        fitting. ``atype`` is flat LOCAL types (N,). Returns the raw fitting dict
-        (no reduction, no masking -- the wrapper handles those).
+        """Graph analogue of :meth:`forward_atomic`.
 
-        This method calls ``self.descriptor.type_embedding.call()`` internally and
-        is therefore valid only for graph-eligible descriptors (e.g. DPA1 with a
-        type embedding).  The graph routing (``_resolve_graph_method`` /
-        ``_call_common_graph``) guarantees this via ``uses_graph_lower()==True``.
+        Runs the descriptor ``call_graph`` then the fitting net and returns the
+        raw fitting dict (no reduction or masking; the wrapper handles those).
+        Calls ``self.descriptor.type_embedding.call()`` internally and is therefore
+        valid only for graph-eligible descriptors (e.g. DPA1 with a type embedding);
+        the graph routing (``_resolve_graph_method`` / ``_call_common_graph``)
+        guarantees this via ``uses_graph_lower()==True``.
 
         Parameters
         ----------
         graph
-            NeighborGraph for the local atoms (ghost-free).
+            neighbor graph for the local atoms (ghost-free)
         atype
-            Flat local atom types, shape (nf*nloc,).
+            flat local atom types. nf * nloc
         fparam
-            Frame parameters. nf x ndf
+            frame parameter. nf x ndf
         aparam
-            Atomic parameters. nf x nloc x nda
+            atomic parameter. nf x nloc x nda
 
         Returns
         -------
         result_dict
             the result dict, defined by the `FittingOutputDef`.
+
         """
         import array_api_compat
 

From 8d3a882922e0fe17ea938b61f6e130e149633293 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 13:31:57 +0800
Subject: [PATCH 46/69] feat(dpmodel): frame_id_from_n_node node->frame map for
 flat-N graph path

---
 .../dpmodel/utils/neighbor_graph/__init__.py  |  2 ++
 deepmd/dpmodel/utils/neighbor_graph/graph.py  | 25 +++++++++++++++++++
 .../tests/common/dpmodel/test_graph_ragged.py | 23 +++++++++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_graph_ragged.py

diff --git a/deepmd/dpmodel/utils/neighbor_graph/__init__.py b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
index b20c321dd4..6e041805b2 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/__init__.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/__init__.py
@@ -28,6 +28,7 @@
 from .graph import (
     GraphLayout,
     NeighborGraph,
+    frame_id_from_n_node,
     node_validity_mask,
     pad_and_guard_edges,
 )
@@ -43,6 +44,7 @@
     "build_neighbor_graph_ase",
     "edge_env_mat",
     "edge_force_virial",
+    "frame_id_from_n_node",
     "from_dense_quartet",
     "neighbor_graph_from_ijs",
     "node_validity_mask",
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index eb0e4f7c38..4487d910b2 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -102,6 +102,31 @@ def pad_and_guard_edges(
     return ei, ev, edge_mask
 
 
+def frame_id_from_n_node(n_node: Array) -> Array:
+    """Node->frame map for a flat node axis: ``repeat(arange(nf), n_node)``.
+
+    Implemented via ``searchsorted(cumulative_sum(n_node), arange(N), side="right")``
+    -- the same primitives used in ``edge_force_virial`` for per-frame virial.
+
+    Parameters
+    ----------
+    n_node
+        Per-frame node counts.  Shape ``(nf,)``.
+
+    Returns
+    -------
+    frame_id
+        Frame index of each flat node, compact-prefix frame-major.
+        Shape ``(N,)`` int64, where ``N = sum(n_node)``.
+    """
+    xp = array_api_compat.array_namespace(n_node)
+    dev = array_api_compat.device(n_node)
+    n_total = int(xp.sum(n_node))
+    idx = xp.arange(n_total, dtype=n_node.dtype, device=dev)
+    boundaries = xp.cumulative_sum(n_node)  # (nf,) upper bounds, exclusive
+    return xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64)
+
+
 def node_validity_mask(n_node: Array, n_total: int) -> Array:
     """Derive the (n_total,) real-vs-padding node mask from per-frame counts.
 
diff --git a/source/tests/common/dpmodel/test_graph_ragged.py b/source/tests/common/dpmodel/test_graph_ragged.py
new file mode 100644
index 0000000000..143d48ddc4
--- /dev/null
+++ b/source/tests/common/dpmodel/test_graph_ragged.py
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Flat-N ragged-native graph path: nodes on a flat (N,) axis, N = sum(n_node);
+per-frame reductions use segment_sum over frame_id. Covers UNEQUAL per-frame node
+counts (ragged) -- the case the old rectangular (nf,nloc) path could not represent.
+"""
+
+import numpy as np
+
+from deepmd.dpmodel.utils.neighbor_graph import frame_id_from_n_node
+
+
+def test_frame_id_ragged():
+    fid = frame_id_from_n_node(np.array([3, 5, 2], dtype=np.int64))  # N=10
+    np.testing.assert_array_equal(
+        fid, np.array([0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=fid.dtype)
+    )
+
+
+def test_frame_id_rectangular():
+    fid = frame_id_from_n_node(np.array([4, 4], dtype=np.int64))
+    np.testing.assert_array_equal(
+        fid, np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype=fid.dtype)
+    )

From deb524ae9d966fa457c83b4b208f10a87e23cbf1 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 13:37:43 +0800
Subject: [PATCH 47/69] refactor(dpmodel): call_graph returns flat (N,...) node
 axis

---
 deepmd/dpmodel/descriptor/dpa1.py             | 34 ++++++++-----------
 .../test_dpa1_call_graph_descriptor.py        | 22 ++++++++++++
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 1849e7a2a5..ebb6b456bc 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -606,11 +606,15 @@ def _call_graph_adapter(
         )
         # local atom types, flat (nf * nloc,)
         atype_local = xp.reshape(xp_take_first_n(atype_ext, 1, nloc), (nf * nloc,))
-        grrg, rot_mat = self.call_graph(
+        grrg_flat, rot_mat_flat = self.call_graph(
             graph,
             atype_local,
             type_embedding=self.type_embedding.call(),
         )
+        # call_graph returns flat (N, ...) node axis; reshape to (nf, nloc, ...)
+        # for the dense 5-tuple ABI -- this reshape is LOCAL to the adapter shim.
+        grrg = xp.reshape(grrg_flat, (nf, nloc, *grrg_flat.shape[1:]))
+        rot_mat = xp.reshape(rot_mat_flat, (nf, nloc, *rot_mat_flat.shape[1:]))
         # reconstruct the dense-shaped sw the dense way (env_mat switch masked
         # where nlist == -1; the graph path forbids exclude_types, so nlist_mask
         # == nlist != -1, matching DescrptBlockSeAtten.call). A dense-layout
@@ -672,8 +676,9 @@ def call_graph(
 
         Wraps the private block kernel
         :meth:`DescrptBlockSeAtten._call_graph`, adds the descriptor-level
-        ``concat_output_tebd`` step, and reshapes the per-node outputs back to
-        the dense ABI shapes ``(nf, nloc, ...)``.
+        ``concat_output_tebd`` step, and returns the outputs on the flat ``(N,
+        ...)`` node axis (ragged-native; no rectangular ``(nf, nloc)``
+        reshape).
 
         This method is graph-native: it takes no dense quartet inputs and does
         not produce the dense ``sw`` (that lives in the dense :meth:`call`
@@ -684,37 +689,28 @@ def call_graph(
         graph
             A :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`.
         atype
-            (nf * nloc,) flat LOCAL atom types.
+            (N,) flat LOCAL atom types where ``N = sum(n_node)``.
         type_embedding
             (ntypes_with_padding, tebd_dim) type-embedding table.
 
         Returns
         -------
         grrg : Array
-            (nf, nloc, ng * axis_neuron [+ tebd_dim]) descriptor.
+            (N, ng * axis_neuron [+ tebd_dim]) descriptor, flat node axis.
         rot_mat : Array
-            (nf, nloc, ng, 3) equivariant single-particle representation.
+            (N, ng, 3) equivariant single-particle representation, flat node
+            axis.
         """
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
-        grrg_node, rot_mat_node = self.se_atten._call_graph(
+        grrg, rot_mat = self.se_atten._call_graph(
             graph, atype, type_embedding=type_embedding
         )
-        nf = graph.n_node.shape[0]
-        # atype is the flat (nf*nloc,) node axis; derive nloc from the STATIC shape
-        # (n_node[i] == nloc for all frames by contract) so this adapter stays
-        # jit/export-traceable (no concretize of n_node).
-        nloc = atype.shape[0] // nf
-        ng = self.se_atten.neuron[-1]
-        axis = self.se_atten.axis_neuron
-        grrg = xp.reshape(grrg_node, (nf, nloc, ng * axis))
-        rot_mat = xp.reshape(rot_mat_node, (nf, nloc, ng, 3))
-        # descriptor-level concat_output_tebd
+        # keep the FLAT node axis (N, ...): no (nf, nloc) reshape, so ragged frames work
         if self.concat_output_tebd:
             tebd = xp.asarray(type_embedding, device=dev)
             atype_local = xp.asarray(atype, device=dev)
-            atype_embd = xp.take(tebd, atype_local, axis=0)  # (nf*nloc, tebd_dim)
-            atype_embd = xp.reshape(atype_embd, (nf, nloc, self.tebd_dim))
+            atype_embd = xp.take(tebd, atype_local, axis=0)  # (N, tebd_dim)
             grrg = xp.concat([grrg, atype_embd], axis=-1)
         return grrg, rot_mat
 
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index 423f8e0308..6f1cceff8c 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -176,3 +176,25 @@ def test_single_rank_extension_keeps_type_invariant(self) -> None:
         np.testing.assert_allclose(out[0], ref[0], rtol=1e-12, atol=1e-12)
         np.testing.assert_allclose(out[1], ref[1], rtol=1e-12, atol=1e-12)
         np.testing.assert_allclose(out[4], ref[4], rtol=1e-12, atol=1e-12)
+
+    def test_call_graph_returns_flat_node_axis(self) -> None:
+        """call_graph output lives on the flat (N,) node axis, not (nf, nloc)."""
+        from deepmd.dpmodel.utils.neighbor_graph import from_dense_quartet
+
+        dd = self._make([30])
+        ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+            self.coord,
+            self.atype,
+            dd.get_rcut(),
+            dd.get_sel(),
+            mixed_types=dd.mixed_types(),
+            box=None,
+        )
+        graph = from_dense_quartet(ext_coord, nlist, mapping, compact=True)
+        atype_local = self.atype.reshape(-1)
+        grrg, rot_mat = dd.call_graph(
+            graph, atype_local, type_embedding=dd.type_embedding.call()
+        )
+        n = atype_local.shape[0]
+        assert grrg.shape[0] == n and grrg.ndim == 2
+        assert rot_mat.shape[0] == n and rot_mat.ndim == 3

From 117cfe58a302e21d54891d155c29e9472e6f1b3e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 13:44:59 +0800
Subject: [PATCH 48/69] feat(dpmodel): GeneralFitting.call_graph graph-native
 (flat-N) fitting forward

---
 deepmd/dpmodel/fitting/general_fitting.py     | 48 +++++++++++++++++++
 .../common/dpmodel/test_fitting_call_graph.py | 34 +++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 source/tests/common/dpmodel/test_fitting_call_graph.py

diff --git a/deepmd/dpmodel/fitting/general_fitting.py b/deepmd/dpmodel/fitting/general_fitting.py
index b9129a4364..75de52470f 100644
--- a/deepmd/dpmodel/fitting/general_fitting.py
+++ b/deepmd/dpmodel/fitting/general_fitting.py
@@ -790,3 +790,51 @@ def _call_common(
         if self.eval_return_middle_output and len(self.neuron) > 0:
             results["middle_output"] = middle_outs
         return results
+
+    def call_graph(
+        self,
+        descriptor: Array,
+        atype: Array,
+        gr: Array | None = None,
+        g2: Array | None = None,
+        h2: Array | None = None,
+        fparam: Array | None = None,
+        aparam: Array | None = None,
+    ) -> dict[str, Array]:
+        """Graph-native (flat node axis) fitting forward.
+
+        The node axis is flat ``(N,)``. This reuses the dense forward by treating
+        the node axis as ``nf'=N`` single-atom frames (``nloc'=1``) -- an internal,
+        encapsulated workaround, tested against the dense call (rtol/atol 1e-12).
+
+        Parameters
+        ----------
+        descriptor
+            input descriptor. N x nd
+        atype
+            the atom type. N
+        gr
+            equivariant single-particle representation. N x ng x 3
+        fparam
+            NODE-level frame parameter (already gathered by frame_id). N x nfp
+        aparam
+            atomic parameter. N x nap
+
+        Returns
+        -------
+        result_dict
+            the fitting result on the flat node axis. each value N x *shape
+
+        """
+        import array_api_compat
+
+        xp = array_api_compat.array_namespace(descriptor, atype)
+        n, nd = descriptor.shape
+        d1 = xp.reshape(descriptor, (n, 1, nd))
+        a1 = xp.reshape(atype, (n, 1))
+        g1 = None if gr is None else xp.reshape(gr, (n, 1, gr.shape[-2], 3))
+        ap1 = None if aparam is None else xp.reshape(aparam, (n, 1, aparam.shape[-1]))
+        # fparam: dense API expects (nf, nfp); here nf'=N single-atom frames, so the
+        # node-level (N, nfp) IS the per-(pseudo)frame param -- tiled over nloc'=1.
+        ret = self.__call__(d1, a1, gr=g1, g2=g2, h2=h2, fparam=fparam, aparam=ap1)
+        return {kk: xp.reshape(vv, (n, *vv.shape[2:])) for kk, vv in ret.items()}
diff --git a/source/tests/common/dpmodel/test_fitting_call_graph.py b/source/tests/common/dpmodel/test_fitting_call_graph.py
new file mode 100644
index 0000000000..dc44d9e6d2
--- /dev/null
+++ b/source/tests/common/dpmodel/test_fitting_call_graph.py
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""GeneralFitting.call_graph is the graph-native (flat-N) fitting API. Its result
+must be bit-identical to the dense __call__ raveled over (nf, nloc) -- it reuses
+the dense net via the (N,1,nd) single-atom-frame workaround. fparam is node-level
+(N, ndf) (the caller gathers per-frame fparam by frame_id).
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.fitting import InvarFitting
+
+
+@pytest.mark.parametrize("ndf", [0, 3])  # numb_fparam: no-fparam AND fparam
+def test_call_graph_matches_dense_raveled(ndf):
+    rng = np.random.default_rng(0)
+    nf, nloc, nd, ntypes, ng = 2, 4, 8, 2, 5
+    ft = InvarFitting("energy", ntypes, nd, 1, mixed_types=True, numb_fparam=ndf)
+    desc = rng.normal(size=(nf, nloc, nd))
+    atype = rng.integers(0, ntypes, size=(nf, nloc))
+    gr = rng.normal(size=(nf, nloc, ng, 3))
+    fparam = rng.normal(size=(nf, ndf)) if ndf else None
+    dense = ft(desc, atype, gr=gr, fparam=fparam)["energy"]  # (nf, nloc, 1)
+    N = nf * nloc
+    frame_id = np.repeat(np.arange(nf), nloc)
+    fparam_node = fparam[frame_id] if ndf else None  # (N, ndf)
+    flat = ft.call_graph(
+        desc.reshape(N, nd),
+        atype.reshape(N),
+        gr=gr.reshape(N, ng, 3),
+        fparam=fparam_node,
+    )["energy"]  # (N, 1)
+    assert flat.shape == (N, 1)
+    np.testing.assert_allclose(flat, dense.reshape(N, 1), rtol=1e-12, atol=1e-12)

From 5076c3ac8f05aef6fd1f2875b2b041a35eee7811 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 13:52:39 +0800
Subject: [PATCH 49/69] refactor(dpmodel): forward_atomic_graph uses
 fitting.call_graph on the flat (N,) axis

---
 .../dpmodel/atomic_model/dp_atomic_model.py   | 40 ++++++++++---------
 .../dpmodel/test_graph_atomic_parity.py       | 22 +++++++++-
 2 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
index 6f0ee97fcf..d17adf2821 100644
--- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -261,43 +261,47 @@ def forward_atomic_graph(
         fparam: Array | None = None,
         aparam: Array | None = None,
     ) -> dict[str, Array]:
-        """Graph analogue of :meth:`forward_atomic`.
+        """Graph analogue of :meth:`forward_atomic` on the flat node axis.
 
-        Runs the descriptor ``call_graph`` then the fitting net and returns the
-        raw fitting dict (no reduction or masking; the wrapper handles those).
-        Calls ``self.descriptor.type_embedding.call()`` internally and is therefore
-        valid only for graph-eligible descriptors (e.g. DPA1 with a type embedding);
-        the graph routing (``_resolve_graph_method`` / ``_call_common_graph``)
-        guarantees this via ``uses_graph_lower()==True``.
+        Runs the descriptor ``call_graph`` then the fitting ``call_graph`` PER NODE
+        and returns the raw fitting dict on the flat ``(N, *)`` axis (no reduction
+        or masking; the wrapper handles those). ``fparam`` is gathered to nodes by
+        ``frame_id`` so each node sees its frame's parameter.
 
         Parameters
         ----------
         graph
             neighbor graph for the local atoms (ghost-free)
         atype
-            flat local atom types. nf * nloc
+            flat local atom types. N
         fparam
             frame parameter. nf x ndf
         aparam
-            atomic parameter. nf x nloc x nda
+            atomic parameter. N x nda
 
         Returns
         -------
         result_dict
-            the result dict, defined by the `FittingOutputDef`.
+            the result dict on the flat node axis, defined by the `FittingOutputDef`.
 
         """
         import array_api_compat
 
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            frame_id_from_n_node,
+        )
+
         xp = array_api_compat.array_namespace(graph.edge_vec)
-        nf = graph.n_node.shape[0]
-        nloc = atype.shape[0] // nf
-        descriptor = self.descriptor
-        type_embedding = descriptor.type_embedding.call()
-        gg, rot_mat = descriptor.call_graph(graph, atype, type_embedding=type_embedding)
-        atype_2d = xp.reshape(atype, (nf, nloc))
-        return self.fitting_net(
-            gg, atype_2d, gr=rot_mat, g2=None, h2=None, fparam=fparam, aparam=aparam
+        type_embedding = self.descriptor.type_embedding.call()
+        gg, rot_mat = self.descriptor.call_graph(
+            graph, atype, type_embedding=type_embedding
+        )
+        fparam_node = None
+        if fparam is not None:
+            frame_id = frame_id_from_n_node(graph.n_node)
+            fparam_node = xp.take(fparam, frame_id, axis=0)  # (N, ndf)
+        return self.fitting_net.call_graph(
+            gg, atype, gr=rot_mat, g2=None, h2=None, fparam=fparam_node, aparam=aparam
         )
 
     def compute_or_load_stat(
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 7a212ddd69..1dc2efe651 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -41,7 +41,27 @@ def test_forward_atomic_graph_matches_dense():
     dense = am.forward_atomic(ext_coord, ext_atype, nlist, mapping=mapping)
     ng = from_dense_quartet(ext_coord, nlist, mapping)
     graph = am.forward_atomic_graph(ng, atype.reshape(-1))
-    np.testing.assert_allclose(graph["energy"], dense["energy"], rtol=1e-12, atol=1e-12)
+    np.testing.assert_allclose(
+        graph["energy"], dense["energy"].reshape(-1, 1), rtol=1e-12, atol=1e-12
+    )
+
+
+def test_forward_atomic_graph_flat_shape_and_parity():
+    """Flat (N, *) output, matching dense forward_atomic raveled over (nf, nloc)."""
+    rng = np.random.default_rng(0)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    am = _atomic_model()
+    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+        coord, atype, 4.0, [30], mixed_types=True, box=None
+    )
+    dense = am.forward_atomic(ext_coord, ext_atype, nlist, mapping=mapping)
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    graph = am.forward_atomic_graph(ng, atype.reshape(-1))
+    assert graph["energy"].shape == (5, 1)  # FLAT (N, 1)
+    np.testing.assert_allclose(
+        graph["energy"], dense["energy"].reshape(5, 1), rtol=1e-12, atol=1e-12
+    )
 
 
 def test_forward_common_atomic_graph_matches_dense():

From 613756f22d1c05a8760ba852654fa0052eb003b8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 14:25:00 +0800
Subject: [PATCH 50/69] refactor(dpmodel): forward_common_atomic_graph +
 _finalize_atomic_ret go flat-N (ragged-native)

---
 .../dpmodel/atomic_model/base_atomic_model.py | 57 +++++++++----------
 deepmd/dpmodel/model/make_model.py            | 18 +++++-
 deepmd/dpmodel/utils/exclude_mask.py          | 11 ++--
 deepmd/pt_expt/model/make_model.py            | 17 +++++-
 .../dpmodel/test_graph_atomic_parity.py       | 21 ++++++-
 .../tests/common/dpmodel/test_graph_ragged.py | 28 +++++++++
 6 files changed, 109 insertions(+), 43 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index d3217ff6a1..71a6947629 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import functools
-import math
 from collections.abc import (
     Callable,
 )
@@ -319,61 +318,59 @@ def forward_common_atomic_graph(
         fparam: Array | None = None,
         aparam: Array | None = None,
     ) -> dict:
-        """Graph analogue of :meth:`forward_common_atomic`.
+        """Graph analogue of :meth:`forward_common_atomic` on the flat node axis.
 
-        The graph is ghost-free (``atype`` is local), so masking and out-stat
-        operate directly on the ``nloc`` atoms. Reuses :meth:`_finalize_atomic_ret`,
-        so virtual-atom masking, ``atom_excl`` and ``apply_out_stat`` match the
-        dense path. Models with model-level ``pair_exclude_types`` are gated out of
-        the graph path by the model routing (``_resolve_graph_method`` in pt_expt
-        and ``_call_common_graph`` in dpmodel both require ``pair_excl is None``);
-        descriptor-level ``exclude_types`` is gated by ``uses_graph_lower()==False``
-        on the descriptor itself.
+        The node axis is flat ``(N,)`` (``N = sum(graph.n_node)``); masking and
+        out-stat operate per node. Reuses :meth:`_finalize_atomic_ret`, so
+        virtual-atom masking, ``atom_excl`` and ``apply_out_stat`` match the dense
+        path. Model-level ``pair_exclude_types`` is gated out of the graph path by
+        the model routing (``_resolve_graph_method`` / the ``_call_common_graph``
+        gate require ``pair_excl is None``); descriptor-level ``exclude_types`` is
+        gated by ``uses_graph_lower()==False``.
 
         Parameters
         ----------
         graph
             neighbor graph for the local atoms (ghost-free)
         atype
-            flat local atom types. nf * nloc
+            flat local atom types. N
         fparam
             frame parameter. nf x ndf
         aparam
-            atomic parameter. nf x nloc x nda
+            atomic parameter. N x nda
 
         Returns
         -------
         result_dict
-            the result dict, defined by the `FittingOutputDef`.
+            the result dict on the flat node axis, defined by the `FittingOutputDef`.
 
         """
         xp = array_api_compat.array_namespace(graph.edge_vec)
-        nf = graph.n_node.shape[0]
-        nloc = atype.shape[0] // nf
-        atype_2d = xp.reshape(atype, (nf, nloc))
-        atom_mask = self.make_atom_mask(atype_2d)
-        atype_clamped = xp.where(atom_mask, atype_2d, xp.zeros_like(atype_2d))
+        atype = xp.asarray(atype, device=array_api_compat.device(graph.edge_vec))
+        atom_mask = self.make_atom_mask(atype)  # (N,) bool
+        atype_clamped = xp.where(atom_mask, atype, xp.zeros_like(atype))
         ret_dict = self.forward_atomic_graph(
-            graph, xp.reshape(atype_clamped, (nf * nloc,)), fparam=fparam, aparam=aparam
+            graph, atype_clamped, fparam=fparam, aparam=aparam
         )
-        return self._finalize_atomic_ret(ret_dict, atom_mask, atype_2d)
+        return self._finalize_atomic_ret(ret_dict, atom_mask, atype)
 
     def _finalize_atomic_ret(
         self, ret_dict: dict, atom_mask: Array, atype: Array
     ) -> dict:
         """Apply out-stat, atom exclusion and virtual-atom zeroing; set ``mask``.
 
-        Shared by the dense (:meth:`forward_common_atomic`) and graph
-        (:meth:`forward_common_atomic_graph`) wrappers.
+        Shared by the dense (:meth:`forward_common_atomic`, ``(nf, nloc)`` leading
+        dims) and graph (:meth:`forward_common_atomic_graph`, flat ``(N,)`` leading
+        dim) wrappers -- leading-dim-agnostic.
 
         Parameters
         ----------
         ret_dict
-            the raw per-atom result dict from ``forward_atomic``
+            the raw per-atom result dict from ``forward_atomic``/``forward_atomic_graph``
         atom_mask
-            the real-atom mask, True for real and False for virtual atoms. nf x nloc
+            the real-atom mask, True for real and False for virtual atoms. leading dims
         atype
-            the local atom types, used for out-stat and ``atom_excl``. nf x nloc
+            the local atom types, used for out-stat and ``atom_excl``. leading dims
 
         Returns
         -------
@@ -388,12 +385,12 @@ def _finalize_atomic_ret(
             atom_mask = xp.logical_and(
                 atom_mask, self.atom_excl.build_type_exclude_mask(atype)
             )
+        lead = atom_mask.shape  # (nf, nloc) dense | (N,) graph
         for kk in ret_dict.keys():
-            out_shape = ret_dict[kk].shape
-            out_shape2 = math.prod(out_shape[2:])
-            tmp_arr = ret_dict[kk].reshape([out_shape[0], out_shape[1], out_shape2])
-            tmp_arr = xp.where(atom_mask[:, :, None], tmp_arr, xp.zeros_like(tmp_arr))
-            ret_dict[kk] = xp.reshape(tmp_arr, out_shape)
+            out = ret_dict[kk]
+            # pad the mask with unit axes: a ``-1`` reshape is ambiguous on size 0
+            keep = xp.reshape(atom_mask, (*lead, *([1] * (out.ndim - len(lead)))))
+            ret_dict[kk] = xp.where(keep, out, xp.zeros_like(out))
         ret_dict["mask"] = xp.astype(atom_mask, xp.int32)
         return ret_dict
 
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 49c44f249e..fc0682c1f2 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -595,12 +595,26 @@ def call_lower_graph(
             atomic_ret = self.atomic_model.forward_common_atomic_graph(
                 graph, atype, fparam=fparam, aparam=aparam
             )
+            # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output
+            # (N = sum(n_node)). Reshape to rectangular ``(nf, nloc, *)`` so
+            # that ``fit_output_to_model_output`` can reduce over the atom axis
+            # (axis=-len(shap)-1) exactly as the dense path does.
+            # This reshape is valid for rectangular frames (uniform nloc per
+            # frame); ragged support is deferred to PR-B segment_sum reduction.
+            xp = array_api_compat.array_namespace(atype)
+            nf = n_node.shape[0]
+            N = atype.shape[0]
+            nloc = N // nf
+            atomic_ret_rect = {
+                kk: xp.reshape(vv, (nf, nloc, *vv.shape[1:]))
+                for kk, vv in atomic_ret.items()
+            }
             return fit_output_to_model_output(
-                atomic_ret,
+                atomic_ret_rect,
                 self.atomic_output_def(),
                 edge_vec,
                 do_atomic_virial=False,
-                mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
+                mask=atomic_ret_rect["mask"] if "mask" in atomic_ret_rect else None,
             )
 
         call = call_common
diff --git a/deepmd/dpmodel/utils/exclude_mask.py b/deepmd/dpmodel/utils/exclude_mask.py
index 80153129cc..2fafbe5293 100644
--- a/deepmd/dpmodel/utils/exclude_mask.py
+++ b/deepmd/dpmodel/utils/exclude_mask.py
@@ -42,25 +42,24 @@ def build_type_exclude_mask(
         Parameters
         ----------
         atype
-            The extended aotm types. shape: nf x natom
+            The atom types. shape: nf x natom (dense) or N (graph / flat)
 
         Returns
         -------
         mask
-            The type exclusion mask for atoms. shape: nf x natom
-            Element [ff,ii] being 0 if type(ii) is excluded,
-            otherwise being 1.
+            The type exclusion mask for atoms, same shape as ``atype``.
+            An element is 0 if the atom's type is excluded, and 1 otherwise.
 
         """
         xp = array_api_compat.array_namespace(atype)
-        nf, natom = atype.shape
+        lead = atype.shape  # (nf, natom) dense | (N,) graph
         return xp.reshape(
             xp.take(
                 xp.asarray(self.type_mask[...], device=array_api_compat.device(atype)),
                 xp.reshape(atype, (-1,)),
                 axis=0,
             ),
-            (nf, natom),
+            lead,
         )
 
 
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 474e90b71c..fa8952d8cb 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -359,8 +359,21 @@ def forward_common_lower_graph(
                 fparam=fparam,
                 aparam=aparam,
             )
+            # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output
+            # (N = sum(n_node)). Reshape to rectangular ``(nf, nloc, *)`` so
+            # that ``fit_output_to_model_output_graph`` can reduce over the
+            # atom axis and compute force/virial via autograd exactly as the
+            # dense path does.  This reshape is valid for rectangular frames
+            # (uniform nloc per frame); ragged support is deferred to PR-B.
+            nf = int(n_node.shape[0])
+            N = int(atype.shape[0])
+            nloc = N // nf
+            atomic_ret_rect = {
+                kk: vv.reshape(nf, nloc, *vv.shape[1:])
+                for kk, vv in atomic_ret.items()
+            }
             return fit_output_to_model_output_graph(
-                atomic_ret,
+                atomic_ret_rect,
                 self.atomic_output_def(),
                 edge_vec,
                 edge_index,
@@ -368,7 +381,7 @@ def forward_common_lower_graph(
                 n_node,
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
-                mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
+                mask=atomic_ret_rect["mask"] if "mask" in atomic_ret_rect else None,
             )
 
         def _resolve_graph_method(
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 1dc2efe651..85caf4271e 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -75,10 +75,11 @@ def test_forward_common_atomic_graph_matches_dense():
     dense = am.forward_common_atomic(ext_coord, ext_atype, nlist, mapping=mapping)
     ng = from_dense_quartet(ext_coord, nlist, mapping)
     graph = am.forward_common_atomic_graph(ng, atype.reshape(-1))
+    # graph returns flat (N,*); reshape dense (nf,nloc,*) -> flat for comparison
     for k in ("energy", "mask"):
-        np.testing.assert_allclose(
-            np.asarray(graph[k]), np.asarray(dense[k]), rtol=1e-12, atol=1e-12
-        )
+        g_arr = np.asarray(graph[k])
+        d_arr = np.asarray(dense[k]).reshape(g_arr.shape)
+        np.testing.assert_allclose(g_arr, d_arr, rtol=1e-12, atol=1e-12)
 
 
 # ── Feature-flag parity matrix (Task 6) ──────────────────────────────────────
@@ -203,6 +204,20 @@ def test_graph_matches_dense_with_atom_exclude():
     )
 
 
+def test_forward_common_atomic_graph_flat_shape():
+    rng = np.random.default_rng(1)
+    coord = rng.normal(size=(1, 5, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    am = _atomic_model()
+    ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
+        coord, atype, 4.0, [30], mixed_types=True, box=None
+    )
+    ng = from_dense_quartet(ext_coord, nlist, mapping)
+    out = am.forward_common_atomic_graph(ng, atype.reshape(-1))
+    assert out["energy"].shape == (5, 1)  # flat (N, 1)
+    assert out["mask"].shape == (5,)  # flat (N,)
+
+
 def test_graph_matches_dense_with_out_bias():
     """The graph path applies apply_out_stat (per-type out-bias) identically
     to the dense path. With a non-zero bias, graph == dense at 1e-12, and the
diff --git a/source/tests/common/dpmodel/test_graph_ragged.py b/source/tests/common/dpmodel/test_graph_ragged.py
index 143d48ddc4..7d3d87b070 100644
--- a/source/tests/common/dpmodel/test_graph_ragged.py
+++ b/source/tests/common/dpmodel/test_graph_ragged.py
@@ -16,6 +16,34 @@ def test_frame_id_ragged():
     )
 
 
+def test_forward_common_atomic_graph_ragged():
+    """Two frames with DIFFERENT node counts (3 and 2) share one flat node axis.
+
+    The old rectangular path (nloc = N // nf) could not represent this.
+    """
+    import numpy as np
+
+    from deepmd.dpmodel.atomic_model.dp_atomic_model import DPAtomicModel
+    from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
+    from deepmd.dpmodel.fitting import InvarFitting
+    from deepmd.dpmodel.utils.neighbor_graph import NeighborGraph
+
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0)
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    am = DPAtomicModel(ds, ft, type_map=["a", "b"])
+    n_node = np.array([3, 2], dtype=np.int64)  # RAGGED, N=5
+    atype = np.array([0, 1, 0, 1, 0], dtype=np.int64)
+    edge_index = np.array([[1, 0, 4], [0, 1, 3]], dtype=np.int64)  # within-frame
+    edge_vec = np.array([[1.0, 0, 0], [-1.0, 0, 0], [0.5, 0, 0]], dtype=np.float64)
+    edge_mask = np.array([True, True, True])
+    g = NeighborGraph(
+        n_node=n_node, edge_index=edge_index, edge_vec=edge_vec, edge_mask=edge_mask
+    )
+    out = am.forward_common_atomic_graph(g, atype)
+    assert out["energy"].shape == (5, 1) and out["mask"].shape == (5,)
+    assert np.all(np.isfinite(out["energy"]))
+
+
 def test_frame_id_rectangular():
     fid = frame_id_from_n_node(np.array([4, 4], dtype=np.int64))
     np.testing.assert_array_equal(

From 1103264a1f80cb40546fff6eb4ae09f8c83fa643 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 14:33:14 +0800
Subject: [PATCH 51/69] refactor(dpmodel): flat-N graph output transform
 (segment_sum over frame_id); reshape only at I/O boundary

---
 deepmd/dpmodel/model/make_model.py            | 33 ++++++-------
 deepmd/dpmodel/model/transform_output.py      | 46 +++++++++++++++++++
 .../tests/common/dpmodel/test_graph_ragged.py | 39 ++++++++++++++++
 3 files changed, 100 insertions(+), 18 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index fc0682c1f2..2c706ea411 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -54,6 +54,7 @@
 from .transform_output import (
     communicate_extended_output,
     fit_output_to_model_output,
+    fit_output_to_model_output_graph,
 )
 
 
@@ -422,6 +423,12 @@ def _call_common_graph(
                 fparam=fp,
                 aparam=ap,
             )
+            # Public ABI is rectangular (nf, nloc, *); the lower is flat (N, *).
+            # Skip per-frame ``*_redu`` keys: with nloc == 1 they also lead with N.
+            for k, v in list(model_predict.items()):
+                flat_atom = v is not None and not k.endswith("_redu")
+                if flat_atom and v.shape[:1] == (nf * nloc,):
+                    model_predict[k] = xp.reshape(v, (nf, nloc, *v.shape[1:]))
             return model_predict
 
         def call_common_lower(
@@ -596,25 +603,15 @@ def call_lower_graph(
                 graph, atype, fparam=fparam, aparam=aparam
             )
             # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output
-            # (N = sum(n_node)). Reshape to rectangular ``(nf, nloc, *)`` so
-            # that ``fit_output_to_model_output`` can reduce over the atom axis
-            # (axis=-len(shap)-1) exactly as the dense path does.
-            # This reshape is valid for rectangular frames (uniform nloc per
-            # frame); ragged support is deferred to PR-B segment_sum reduction.
-            xp = array_api_compat.array_namespace(atype)
-            nf = n_node.shape[0]
-            N = atype.shape[0]
-            nloc = N // nf
-            atomic_ret_rect = {
-                kk: xp.reshape(vv, (nf, nloc, *vv.shape[1:]))
-                for kk, vv in atomic_ret.items()
-            }
-            return fit_output_to_model_output(
-                atomic_ret_rect,
+            # (N = sum(n_node)). Reduce per-frame via segment_sum over
+            # frame_id — supports ragged frames without any nloc = N // nf
+            # reshape. The I/O boundary unravel to (nf, nloc, *) happens
+            # in ``_call_common_graph`` for the public ABI.
+            return fit_output_to_model_output_graph(
+                atomic_ret,
                 self.atomic_output_def(),
-                edge_vec,
-                do_atomic_virial=False,
-                mask=atomic_ret_rect["mask"] if "mask" in atomic_ret_rect else None,
+                n_node,
+                mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
         call = call_common
diff --git a/deepmd/dpmodel/model/transform_output.py b/deepmd/dpmodel/model/transform_output.py
index b8ca99469f..71b0fd37a2 100644
--- a/deepmd/dpmodel/model/transform_output.py
+++ b/deepmd/dpmodel/model/transform_output.py
@@ -66,6 +66,52 @@ def fit_output_to_model_output(
     return model_ret
 
 
+def fit_output_to_model_output_graph(
+    fit_ret: dict[str, Array],
+    fit_output_def: FittingOutputDef,
+    n_node: Array,
+    mask: Array | None = None,
+) -> dict[str, Array]:
+    """Flat-N analogue of :func:`fit_output_to_model_output`.
+
+    The atom axis is flat ``(N,)``; reducible outputs reduce per frame via
+    ``segment_sum`` over ``frame_id = repeat(arange(nf), n_node)`` (intensive ⇒
+    divide by the per-frame real-node count). Derivative keys are set to ``None``.
+    """
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        frame_id_from_n_node,
+        segment_sum,
+    )
+
+    xp = array_api_compat.get_namespace(n_node)
+    nf = n_node.shape[0]
+    frame_id = frame_id_from_n_node(n_node)
+    model_ret = dict(fit_ret.items())
+    for kk, vv in fit_ret.items():
+        vdef = fit_output_def[kk]
+        if not vdef.reducible:
+            continue
+        kk_redu = get_reduce_name(kk)
+        vv_e = xp.astype(vv, GLOBAL_ENER_FLOAT_PRECISION)
+        redu = segment_sum(vv_e, frame_id, nf)  # (nf, *shape)
+        if vdef.intensive:
+            if mask is not None:
+                cnt = segment_sum(
+                    xp.astype(mask, GLOBAL_ENER_FLOAT_PRECISION), frame_id, nf
+                )
+            else:
+                cnt = xp.astype(n_node, GLOBAL_ENER_FLOAT_PRECISION)
+            redu = redu / xp.reshape(cnt, (nf, *([1] * (redu.ndim - 1))))
+        model_ret[kk_redu] = redu
+        if vdef.r_differentiable:
+            kk_derv_r, _ = get_deriv_name(kk)
+            model_ret[kk_derv_r] = None
+        if vdef.c_differentiable:
+            _, kk_derv_c = get_deriv_name(kk)
+            model_ret[kk_derv_c] = None
+    return model_ret
+
+
 def get_leading_dims(
     vv: Array,
     vdef: OutputVariableDef,
diff --git a/source/tests/common/dpmodel/test_graph_ragged.py b/source/tests/common/dpmodel/test_graph_ragged.py
index 7d3d87b070..dedd1fc34e 100644
--- a/source/tests/common/dpmodel/test_graph_ragged.py
+++ b/source/tests/common/dpmodel/test_graph_ragged.py
@@ -49,3 +49,42 @@ def test_frame_id_rectangular():
     np.testing.assert_array_equal(
         fid, np.array([0, 0, 0, 0, 1, 1, 1, 1], dtype=fid.dtype)
     )
+
+
+def test_call_lower_graph_ragged_energy_reduction():
+    """Per-frame energy_redu = segment_sum of the frame's atom energies; ragged."""
+    import numpy as np
+
+    from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
+    from deepmd.dpmodel.fitting import InvarFitting
+    from deepmd.dpmodel.model.ener_model import EnergyModel
+
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0)
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    m = EnergyModel(ds, ft, type_map=["a", "b"])
+    n_node = np.array([3, 2], dtype=np.int64)
+    atype = np.array([0, 1, 0, 1, 0], dtype=np.int64)
+    edge_index = np.array([[1, 0, 4], [0, 1, 3]], dtype=np.int64)
+    edge_vec = np.array([[1.0, 0, 0], [-1.0, 0, 0], [0.5, 0, 0]], dtype=np.float64)
+    edge_mask = np.array([True, True, True])
+    out = m.call_lower_graph(
+        atype=atype,
+        n_node=n_node,
+        edge_index=edge_index,
+        edge_vec=edge_vec,
+        edge_mask=edge_mask,
+    )
+    assert out["energy"].shape == (5, 1)  # flat node energy
+    assert out["energy_redu"].shape == (2, 1)  # per-FRAME reduced
+    np.testing.assert_allclose(
+        out["energy_redu"][0, 0],
+        out["energy"][0:3, 0].sum(),
+        rtol=1e-12,
+        atol=1e-12,
+    )
+    np.testing.assert_allclose(
+        out["energy_redu"][1, 0],
+        out["energy"][3:5, 0].sum(),
+        rtol=1e-12,
+        atol=1e-12,
+    )

From 1a9174650a8b369007c0e131e48dd7cd298ffca8 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 15:51:33 +0800
Subject: [PATCH 52/69] refactor(pt_expt): flat-N graph output transform +
 lower; reshape only at I/O boundary

---
 .../atomic_model/polar_atomic_model.py        |  15 +-
 deepmd/pt_expt/model/edge_transform_output.py |  66 ++++----
 deepmd/pt_expt/model/make_model.py            |  70 ++++-----
 .../pt_expt/model/test_dpa1_graph_lower.py    |  19 ++-
 .../tests/pt_expt/model/test_graph_ragged.py  | 144 ++++++++++++++++++
 5 files changed, 234 insertions(+), 80 deletions(-)
 create mode 100644 source/tests/pt_expt/model/test_graph_ragged.py

diff --git a/deepmd/dpmodel/atomic_model/polar_atomic_model.py b/deepmd/dpmodel/atomic_model/polar_atomic_model.py
index 76a221de46..0da87e5c27 100644
--- a/deepmd/dpmodel/atomic_model/polar_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/polar_atomic_model.py
@@ -46,7 +46,6 @@ def apply_out_stat(
         out_bias, out_std = self._fetch_out_stat(self.bias_keys)
 
         if self.fitting_net.shift_diag:
-            nframes, nloc = atype.shape
             dtype = out_bias[self.bias_keys[0]].dtype
             device = array_api_compat.device(out_bias[self.bias_keys[0]])
             for kk in self.bias_keys:
@@ -57,16 +56,22 @@ def apply_out_stat(
                 )
                 modified_bias = temp[atype]
 
-                # (nframes, nloc, 1)
+                # (..., 1)  -- (nframes, nloc, 1) or (N, 1)
                 modified_bias = (
                     modified_bias[..., xp.newaxis] * (self.fitting_net.scale[atype])
                 )
 
                 eye = xp.eye(3, dtype=dtype, device=device)
-                eye = xp.tile(eye, (nframes, nloc, 1, 1))
-                # (nframes, nloc, 3, 3)
+                if atype.ndim == 2:
+                    nframes, nloc = atype.shape
+                    eye = xp.tile(eye, (nframes, nloc, 1, 1))
+                else:
+                    # flat graph path: atype is (N,)
+                    N = atype.shape[0]
+                    eye = xp.tile(eye, (N, 1, 1))
+                # (..., 3, 3)
                 modified_bias = modified_bias[..., xp.newaxis] * eye
 
-                # nf x nloc x odims, out_bias: ntypes x odims
+                # nf x nloc x odims (rect) or N x odims (flat), out_bias: ntypes x odims
                 ret[kk] = ret[kk] + modified_bias
         return ret
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 98256f2679..d3d3ec343b 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -14,6 +14,8 @@
 )
 from deepmd.dpmodel.utils.neighbor_graph import (
     edge_force_virial,
+    frame_id_from_n_node,
+    segment_sum,
 )
 from deepmd.pt.utils import (
     env,
@@ -61,25 +63,26 @@ def fit_output_to_model_output_graph(
     """Graph analogue of the dense pt_expt ``fit_output_to_model_output``.
 
     OUTPUT-AGNOSTIC: reduces EVERY reducible fitting output (cast to energy
-    precision, summed/averaged over the atom axis) and, for every reducible +
-    ``r_differentiable`` output, assembles per-component force / virial /
-    (optional) atom-virial from :func:`edge_energy_deriv` (one ``grad`` w.r.t.
-    ``edge_vec`` per scalar component, then the shared full-to-``src`` scatter).
+    precision, summed/averaged per frame via ``segment_sum`` over ``frame_id``)
+    and, for every reducible + ``r_differentiable`` output, assembles
+    per-component force / virial / (optional) atom-virial from
+    :func:`edge_energy_deriv` (one ``grad`` w.r.t. ``edge_vec`` per scalar
+    component, then the shared full-to-``src`` scatter).
 
-    Mirrors the dense :func:`deepmd.pt_expt.model.transform_output.take_deriv`
-    output shapes -- ``<var>_derv_r`` is ``(nf, nloc, *shape, 3)``,
-    ``<var>_derv_c`` is ``(nf, nloc, *shape, 9)``, ``<var>_derv_c_redu`` is
-    ``(nf, *shape, 9)`` -- except the graph is ghost-free so the dense ``nall``
-    atom axis collapses to ``nloc`` LOCAL atoms.
+    All per-atom outputs stay FLAT with leading dimension ``N = sum(n_node)``:
+    ``<var>`` is ``(N, *shape)``, ``<var>_derv_r`` is ``(N, *shape, 3)``,
+    ``<var>_derv_c`` is ``(N, *shape, 9)``.  Per-frame reductions have leading
+    dimension ``nf``: ``<var>_redu`` is ``(nf, *shape)``,
+    ``<var>_derv_c_redu`` is ``(nf, *shape, 9)``.
 
     Parameters
     ----------
     fit_ret
-        Raw rectangular fitting output, ``(nf, nloc, *shape)`` per key.
+        Raw flat fitting output, ``(N, *shape)`` per key (``N = sum(n_node)``).
     fit_output_def
         The fitting output definition.
     edge_vec
-        (E, 3) edge vectors; MUST be the autograd leaf of ``fit_ret``.
+        (E, 3) edge vectors; MUST be the autograd leaf for ``fit_ret``.
     edge_index
         (2, E) ``[src, dst]`` edge endpoints (flat local indices).
     edge_mask
@@ -91,29 +94,32 @@ def fit_output_to_model_output_graph(
     create_graph
         Whether the backward retains a graph (training).
     mask
-        (nf, nloc) realness mask; used only for intensive-output reduction.
+        (N,) flat realness mask; used only for intensive-output reduction.
     """
     redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
     nf = int(n_node.shape[0])
-    # N == sum(n_node) == nf * nloc here (rectangular carry-all graph).
-    nloc = int(fit_ret[next(iter(fit_ret))].shape[1])
+    N = int(n_node.sum())
+    frame_id = frame_id_from_n_node(n_node)  # (N,) int64 frame index per atom
     model_ret: dict[str, torch.Tensor] = dict(fit_ret.items())
     for kk, vv in fit_ret.items():
         vdef = fit_output_def[kk]
         shap = vdef.shape
-        atom_axis = -(len(shap) + 1)
         if not vdef.reducible:
             continue
         kk_redu = get_reduce_name(kk)
+        # segment_sum reduces axis 0 (the flat atom axis) per frame
+        vv_e = vv.to(redu_prec)  # (N, *shape)
+        redu = segment_sum(vv_e, frame_id, nf)  # (nf, *shape)
         if vdef.intensive:
             if mask is not None:
-                model_ret[kk_redu] = torch.sum(
-                    vv.to(redu_prec), dim=atom_axis
-                ) / torch.sum(mask, dim=-1, keepdim=True)
+                # real-atom count per frame: segment_sum of the mask
+                cnt = segment_sum(mask.to(redu_prec), frame_id, nf)  # (nf,)
+                # broadcast cnt to (nf, 1, ..., 1) to match redu shape
+                cnt = cnt.reshape(nf, *([1] * (redu.ndim - 1)))
             else:
-                model_ret[kk_redu] = torch.mean(vv.to(redu_prec), dim=atom_axis)
-        else:
-            model_ret[kk_redu] = torch.sum(vv.to(redu_prec), dim=atom_axis)
+                cnt = n_node.to(redu_prec).reshape(nf, *([1] * (redu.ndim - 1)))
+            redu = redu / cnt
+        model_ret[kk_redu] = redu
         if not vdef.r_differentiable:
             continue
         kk_derv_r, kk_derv_c = get_deriv_name(kk)
@@ -135,25 +141,23 @@ def fit_output_to_model_output_graph(
                 do_atomic_virial=(vdef.c_differentiable and do_atomic_virial),
                 create_graph=create_graph,
             )
-            # force (N, 3) -> (nf, nloc, 1, 3)
-            ff_list.append(force.reshape(nf, nloc, 1, 3))
+            # force (N, 3) -> (N, 1, 3)  [flat; caller unravels at I/O boundary]
+            ff_list.append(force.reshape(N, 1, 3))
             if vdef.c_differentiable:
                 # virial (nf, 3, 3) -> (nf, 1, 9)
                 vir_list.append(vir.reshape(nf, 1, 9))
                 if do_atomic_virial:
                     assert atom_vir is not None
-                    # atom_virial (N, 3, 3) -> (nf, nloc, 1, 9)
-                    av_list.append(atom_vir.reshape(nf, nloc, 1, 9))
-        # (nf, nloc, size, 3) -> (nf, nloc, *shape, 3)
-        model_ret[kk_derv_r] = torch.cat(ff_list, dim=-2).reshape([nf, nloc, *shap, 3])
+                    # atom_virial (N, 3, 3) -> (N, 1, 9)  [flat]
+                    av_list.append(atom_vir.reshape(N, 1, 9))
+        # (N, size, 3) -> (N, *shape, 3)
+        model_ret[kk_derv_r] = torch.cat(ff_list, dim=-2).reshape([N, *shap, 3])
         if vdef.c_differentiable:
             # (nf, size, 9) -> (nf, *shape, 9)
             model_ret[kk_derv_c + "_redu"] = torch.cat(vir_list, dim=-2).reshape(
                 [nf, *shap, 9]
             )
             if do_atomic_virial:
-                # (nf, nloc, size, 9) -> (nf, nloc, *shape, 9)
-                model_ret[kk_derv_c] = torch.cat(av_list, dim=-2).reshape(
-                    [nf, nloc, *shap, 9]
-                )
+                # (N, size, 9) -> (N, *shape, 9)
+                model_ret[kk_derv_c] = torch.cat(av_list, dim=-2).reshape([N, *shap, 9])
     return model_ret
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index fa8952d8cb..3e97e973f4 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -295,23 +295,20 @@ def forward_common_lower_graph(
 
             OUTPUT-AGNOSTIC: runs the graph descriptor + fitting forward with
             ``edge_vec`` as the autograd leaf (via the inherited
-            :meth:`forward_common_atomic_graph`), then routes the raw rectangular
+            :meth:`forward_common_atomic_graph`), then routes the raw flat
             ``atomic_ret`` through :func:`fit_output_to_model_output_graph`, which
-            reduces EVERY reducible output and assembles force / per-frame virial
-            / (optional) atom-virial for every ``r_differentiable`` output from a
-            backward pass w.r.t. ``edge_vec`` (the shared full-to-``src`` scatter).
-            This makes any fitting (energy/dos/dipole/polar/property/...) flow
-            through the graph path with no change on the fitting side.
-
-            The returned dict uses the SAME internal key names as the legacy dense
-            :meth:`forward_common_lower` (``<var>``, ``<var>_redu``,
-            ``<var>_derv_r``, ``<var>_derv_c_redu``, and ``<var>_derv_c`` when
-            ``do_atomic_virial``).  Unlike the dense lower (which returns EXTENDED
-            ``nall`` force/atom-virial), the graph is ghost-free, so force and
-            atom-virial here live on the ``nloc`` LOCAL atoms (ghost contributions
-            are already folded onto their local owner via ``src = mapping[neighbor]``).
-            They equal the dense lower's extended quantities once the latter are
-            folded onto local atoms via ``mapping`` (i.e. ``communicate_extended_output``).
+            reduces EVERY reducible output via ``segment_sum`` and assembles
+            force / per-frame virial / (optional) atom-virial for every
+            ``r_differentiable`` output from a backward pass w.r.t. ``edge_vec``
+            (the shared full-to-``src`` scatter).  This makes any fitting
+            (energy/dos/dipole/polar/property/...) flow through the graph path
+            with no change on the fitting side.
+
+            All per-atom outputs stay FLAT with leading dimension
+            ``N = sum(n_node)``; per-frame reductions have leading dimension
+            ``nf``.  Callers that need rectangular ``(nf, nloc, *)`` output
+            (e.g. :meth:`_call_common_graph` where ``atype`` is rectangular)
+            unravel at the public I/O boundary.
 
             Parameters
             ----------
@@ -335,11 +332,11 @@ def forward_common_lower_graph(
             Returns
             -------
             dict
-                The standard model dict with ``<var>`` (nf, nloc, *shape),
-                ``<var>_redu`` (nf, *shape), and -- for ``r_differentiable``
-                outputs -- ``<var>_derv_r`` (nf, nloc, *shape, 3),
-                ``<var>_derv_c_redu`` (nf, *shape, 9), and -- when
-                ``do_atomic_virial`` -- ``<var>_derv_c`` (nf, nloc, *shape, 9).
+                Flat model dict: ``<var>`` (N, *shape), ``<var>_redu``
+                (nf, *shape), and -- for ``r_differentiable`` outputs --
+                ``<var>_derv_r`` (N, *shape, 3), ``<var>_derv_c_redu``
+                (nf, *shape, 9), and -- when ``do_atomic_virial`` --
+                ``<var>_derv_c`` (N, *shape, 9).
             """
             from deepmd.dpmodel.utils.neighbor_graph import (
                 NeighborGraph,
@@ -359,21 +356,10 @@ def forward_common_lower_graph(
                 fparam=fparam,
                 aparam=aparam,
             )
-            # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output
-            # (N = sum(n_node)). Reshape to rectangular ``(nf, nloc, *)`` so
-            # that ``fit_output_to_model_output_graph`` can reduce over the
-            # atom axis and compute force/virial via autograd exactly as the
-            # dense path does.  This reshape is valid for rectangular frames
-            # (uniform nloc per frame); ragged support is deferred to PR-B.
-            nf = int(n_node.shape[0])
-            N = int(atype.shape[0])
-            nloc = N // nf
-            atomic_ret_rect = {
-                kk: vv.reshape(nf, nloc, *vv.shape[1:])
-                for kk, vv in atomic_ret.items()
-            }
+            # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output.
+            # Pass directly to the flat-N transform; no rectangular reshape needed.
             return fit_output_to_model_output_graph(
-                atomic_ret_rect,
+                atomic_ret,
                 self.atomic_output_def(),
                 edge_vec,
                 edge_index,
@@ -381,7 +367,7 @@ def forward_common_lower_graph(
                 n_node,
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
-                mask=atomic_ret_rect["mask"] if "mask" in atomic_ret_rect else None,
+                mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
         def _resolve_graph_method(
@@ -452,9 +438,15 @@ def _call_common_graph(
                 fparam=fp,
                 aparam=ap,
             )
-            # forward_common_lower_graph already masks virtual atoms (atype<0)
-            # and carries the real int ``mask`` key, matching the dense
-            # ``call_common`` output.
+            # ``forward_common_lower_graph`` returns flat ``(N, *)`` per-atom
+            # outputs (N = nf * nloc for a carry-all rectangular graph).
+            # Unravel to rectangular ``(nf, nloc, *)`` at the public I/O boundary
+            # so that callers receive the same shape as the dense ``call_common``.
+            N = nf * nloc
+            for k in list(model_predict.keys()):
+                v = model_predict[k]
+                if v is not None and v.shape[:1] == torch.Size([N]):
+                    model_predict[k] = v.reshape(nf, nloc, *v.shape[1:])
             return model_predict
 
         def forward_common_atomic(
diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index ddaf450faf..8001213e1c 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -204,21 +204,30 @@ def test_force_virial_parity_vs_legacy(self, periodic, do_av) -> None:
             do_atomic_virial=do_av,
         )
 
-        # energy / reduced energy / reduced virial: direct compare
-        torch.testing.assert_close(graph["energy"], legacy["energy"], **tol)
+        # forward_common_lower_graph returns flat (N,*) per-atom outputs.
+        # Reshape to (nf, nloc, *) for comparison against the legacy dense lower.
+        N = nf * nloc
+
+        # per-atom energy: flat (N, 1) -> (nf, nloc, 1)
+        graph_energy = graph["energy"].reshape(nf, nloc, 1)
+        torch.testing.assert_close(graph_energy, legacy["energy"], **tol)
+
+        # reduced energy and virial: already per-frame (nf, *)
         torch.testing.assert_close(graph["energy_redu"], legacy["energy_redu"], **tol)
         torch.testing.assert_close(
             graph["energy_derv_c_redu"], legacy["energy_derv_c_redu"], **tol
         )
 
-        # force: fold legacy extended (nall) -> local (nloc)
+        # force: graph is flat (N, 1, 3); fold legacy extended (nall) -> local (nloc)
         legacy_force_local = _fold_extended_to_local(
             legacy["energy_derv_r"], mapping, nloc
         )
-        torch.testing.assert_close(graph["energy_derv_r"], legacy_force_local, **tol)
+        graph_force = graph["energy_derv_r"].reshape(nf, nloc, 1, 3)
+        torch.testing.assert_close(graph_force, legacy_force_local, **tol)
 
         if do_av:
             legacy_av_local = _fold_extended_to_local(
                 legacy["energy_derv_c"], mapping, nloc
             )
-            torch.testing.assert_close(graph["energy_derv_c"], legacy_av_local, **tol)
+            graph_av = graph["energy_derv_c"].reshape(nf, nloc, 1, 9)
+            torch.testing.assert_close(graph_av, legacy_av_local, **tol)
diff --git a/source/tests/pt_expt/model/test_graph_ragged.py b/source/tests/pt_expt/model/test_graph_ragged.py
new file mode 100644
index 0000000000..5f78be5ff1
--- /dev/null
+++ b/source/tests/pt_expt/model/test_graph_ragged.py
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Ragged n_node test for forward_common_lower_graph.
+
+Verifies that the flat-N graph transform correctly handles ragged frames
+(n_node=[3,2], N=5): energy shape (5,1), energy_redu shape (2,1),
+energy_derv_r leading dim 5.  All entries must be finite.
+"""
+
+import torch
+
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt_expt.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.pt_expt.fitting import (
+    InvarFitting,
+)
+from deepmd.pt_expt.model import (
+    EnergyModel,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
+
+_RCUT = 3.0
+_NT = 2
+
+
+def _make_model() -> EnergyModel:
+    ds = DescrptDPA1(
+        _RCUT,
+        0.5,
+        10,
+        _NT,
+        neuron=[3, 6],
+        axis_neuron=2,
+        attn=4,
+        attn_layer=0,
+        attn_dotr=True,
+        attn_mask=False,
+        activation_function="tanh",
+        set_davg_zero=True,
+        type_one_side=True,
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+    ft = InvarFitting(
+        "energy",
+        _NT,
+        ds.get_dim_out(),
+        1,
+        mixed_types=ds.mixed_types(),
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+    return EnergyModel(ds, ft, type_map=["A", "B"]).to(env.DEVICE)
+
+
+def _make_ragged_graph(device: torch.device) -> tuple:
+    """Build a ragged graph with n_node=[3,2] (N=5).
+
+    Frame 0: atoms 0,1,2 — fully connected (6 directed edges within rcut).
+    Frame 1: atoms 3,4   — fully connected (2 directed edges within rcut).
+    Edge vectors are chosen to be small enough to fall within _RCUT.
+    """
+    rng = torch.Generator(device=device).manual_seed(GLOBAL_SEED)
+    # flat atom types (N=5)
+    atype = torch.tensor([0, 1, 0, 1, 0], dtype=torch.int64, device=device)
+    # n_node per frame
+    n_node = torch.tensor([3, 2], dtype=torch.int64, device=device)
+    # edge_index: all pairs within each frame (flat indices into [0,4])
+    # frame 0: 0↔1, 0↔2, 1↔2  (both directions = 6 edges)
+    # frame 1: 3↔4             (both directions = 2 edges)
+    src = torch.tensor([0, 1, 0, 2, 1, 2, 3, 4], dtype=torch.int64, device=device)
+    dst = torch.tensor([1, 0, 2, 0, 2, 1, 4, 3], dtype=torch.int64, device=device)
+    edge_index = torch.stack([src, dst], dim=0)  # (2, 8)
+    # edge_vec: random small vectors well within rcut
+    edge_vec = (
+        torch.rand(8, 3, dtype=torch.float64, device=device, generator=rng) * 0.5
+    ).detach()
+    edge_mask = torch.ones(8, dtype=torch.bool, device=device)
+    return atype, n_node, edge_index, edge_vec, edge_mask
+
+
+class TestGraphRagged:
+    def setup_method(self) -> None:
+        self.model = _make_model()
+        self.model.eval()
+        self.device = env.DEVICE
+        self.atype, self.n_node, self.edge_index, self.edge_vec, self.edge_mask = (
+            _make_ragged_graph(self.device)
+        )
+
+    def test_flat_energy_shapes(self) -> None:
+        """forward_common_lower_graph returns flat (N,1) energy, (nf,1) energy_redu."""
+        ret = self.model.forward_common_lower_graph(
+            self.atype,
+            self.n_node,
+            self.edge_index,
+            self.edge_vec,
+            self.edge_mask,
+            do_atomic_virial=False,
+        )
+        N = int(self.n_node.sum())  # 5
+        nf = int(self.n_node.shape[0])  # 2
+        # per-atom energy: flat (N, *shap) = (5, 1)
+        assert ret["energy"].shape == (N, 1), (
+            f"expected (5,1) got {ret['energy'].shape}"
+        )
+        # reduced energy: per-frame (nf, *shap) = (2, 1)
+        assert ret["energy_redu"].shape == (nf, 1), (
+            f"expected (2,1) got {ret['energy_redu'].shape}"
+        )
+        # force: flat leading dim N
+        assert ret["energy_derv_r"].shape[0] == N, (
+            f"expected leading dim 5 got {ret['energy_derv_r'].shape}"
+        )
+        # all finite
+        assert torch.isfinite(ret["energy"]).all()
+        assert torch.isfinite(ret["energy_redu"]).all()
+        assert torch.isfinite(ret["energy_derv_r"]).all()
+
+    def test_flat_atom_virial_shapes(self) -> None:
+        """With do_atomic_virial=True, atom_virial is also flat (N,1,9)."""
+        ret = self.model.forward_common_lower_graph(
+            self.atype,
+            self.n_node,
+            self.edge_index,
+            self.edge_vec,
+            self.edge_mask,
+            do_atomic_virial=True,
+        )
+        N = int(self.n_node.sum())  # 5
+        nf = int(self.n_node.shape[0])  # 2
+        assert ret["energy"].shape == (N, 1)
+        assert ret["energy_redu"].shape == (nf, 1)
+        assert ret["energy_derv_r"].shape[0] == N
+        assert ret["energy_derv_c"].shape[0] == N
+        assert ret["energy_derv_c_redu"].shape[0] == nf
+        assert torch.isfinite(ret["energy_derv_c"]).all()
+        assert torch.isfinite(ret["energy_derv_c_redu"]).all()

From 71fcb4195fde919e8de3b9b8a882caa92213bf04 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 16:03:03 +0800
Subject: [PATCH 53/69] fix(dpmodel,pt_expt): graph I/O unravel skips _redu
 keys (nloc==1 N==nf ambiguity)

---
 deepmd/dpmodel/model/make_model.py            |  7 ++++-
 deepmd/pt_expt/model/make_model.py            |  7 ++++-
 .../dpmodel/test_graph_atomic_parity.py       | 30 +++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 2c706ea411..1a2597f096 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -427,7 +427,12 @@ def _call_common_graph(
             # (N=nf*nloc, *).  Unravel per-atom keys here at the boundary.
             for k in list(model_predict.keys()):
                 v = model_predict[k]
-                if v is not None and v.shape[:1] == (nf * nloc,):
+                # per-frame reduced keys (..._redu) keep their (nf, *) shape; only node-level (N,*) keys unravel — guards the nloc==1 case where N == nf.
+                if (
+                    v is not None
+                    and not k.endswith("_redu")
+                    and v.shape[:1] == (nf * nloc,)
+                ):
                     model_predict[k] = xp.reshape(v, (nf, nloc, *v.shape[1:]))
             return model_predict
 
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 3e97e973f4..3ab0c9535d 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -445,7 +445,12 @@ def _call_common_graph(
             N = nf * nloc
             for k in list(model_predict.keys()):
                 v = model_predict[k]
-                if v is not None and v.shape[:1] == torch.Size([N]):
+                # per-frame reduced keys (..._redu) keep their (nf, *) shape; only node-level (N,*) keys unravel — guards the nloc==1 case where N == nf.
+                if (
+                    v is not None
+                    and not k.endswith("_redu")
+                    and v.shape[:1] == torch.Size([N])
+                ):
                     model_predict[k] = v.reshape(nf, nloc, *v.shape[1:])
             return model_predict
 
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index 85caf4271e..f62ed07434 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -218,6 +218,36 @@ def test_forward_common_atomic_graph_flat_shape():
     assert out["mask"].shape == (5,)  # flat (N,)
 
 
+def test_graph_nloc1_unravel_shapes():
+    """Regression: when nloc==1, N==nf so per-frame _redu keys must NOT be
+    reshaped to (nf,1,*).  Before the fix, energy_redu came out (nf,1,1) instead
+    of (nf,1).  Checks both shapes and value parity against the dense (legacy) path.
+    """
+    nf = 2
+    rng = np.random.default_rng(42)
+    coord = rng.normal(size=(nf, 1, 3)) * 1.5
+    atype = np.zeros((nf, 1), dtype=np.int64)
+    box = np.tile(np.eye(3).reshape(1, 9) * 20.0, (nf, 1))
+    model = _ener_model([200])  # non-binding sel
+    g = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    d = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    # shape assertions — the critical regression check
+    assert g["energy"].shape == (nf, 1, 1), f"energy shape {g['energy'].shape}"
+    assert g["energy_redu"].shape == (nf, 1), (
+        f"energy_redu shape {g['energy_redu'].shape}"
+    )
+    assert g["mask"].shape == (nf, 1), f"mask shape {g['mask'].shape}"
+    # value parity: graph path (g) vs legacy dense path (d)
+    for k in ("energy", "energy_redu", "mask"):
+        np.testing.assert_allclose(
+            np.asarray(g[k]),
+            np.asarray(d[k]),
+            rtol=1e-12,
+            atol=1e-12,
+            err_msg=f"graph vs legacy mismatch for key '{k}'",
+        )
+
+
 def test_graph_matches_dense_with_out_bias():
     """The graph path applies apply_out_stat (per-type out-bias) identically
     to the dense path. With a non-zero bias, graph == dense at 1e-12, and the

From 1320ca158485d49ae783215902b9090a7018095f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 16:08:19 +0800
Subject: [PATCH 54/69] test(dpmodel): ragged-native gate + rectangular
 free-view equivalence

---
 deepmd/dpmodel/model/make_model.py            |   1 +
 deepmd/pt_expt/model/make_model.py            |   1 +
 .../tests/common/dpmodel/test_graph_ragged.py | 125 ++++++++++++++++++
 3 files changed, 127 insertions(+)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 1a2597f096..c4f4bcb882 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -425,6 +425,7 @@ def _call_common_graph(
             )
             # Public ABI is rectangular (nf, nloc, *); the lower is flat
             # (N=nf*nloc, *).  Unravel per-atom keys here at the boundary.
+            # public call_common always passes rectangular (nf,nloc) coord/atype (N == nf*nloc), so this unravel always applies; ragged graphs reach call_lower_graph/forward_common_lower_graph directly (no unravel) and stay flat (N,*).
             for k in list(model_predict.keys()):
                 v = model_predict[k]
                 # per-frame reduced keys (..._redu) keep their (nf, *) shape; only node-level (N,*) keys unravel — guards the nloc==1 case where N == nf.
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 3ab0c9535d..f1939289d0 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -443,6 +443,7 @@ def _call_common_graph(
             # Unravel to rectangular ``(nf, nloc, *)`` at the public I/O boundary
             # so that callers receive the same shape as the dense ``call_common``.
             N = nf * nloc
+            # public call_common always passes rectangular (nf,nloc) coord/atype (N == nf*nloc), so this unravel always applies; ragged graphs reach call_lower_graph/forward_common_lower_graph directly (no unravel) and stay flat (N,*).
             for k in list(model_predict.keys()):
                 v = model_predict[k]
                 # per-frame reduced keys (..._redu) keep their (nf, *) shape; only node-level (N,*) keys unravel — guards the nloc==1 case where N == nf.
diff --git a/source/tests/common/dpmodel/test_graph_ragged.py b/source/tests/common/dpmodel/test_graph_ragged.py
index dedd1fc34e..40987e86df 100644
--- a/source/tests/common/dpmodel/test_graph_ragged.py
+++ b/source/tests/common/dpmodel/test_graph_ragged.py
@@ -6,6 +6,9 @@
 
 import numpy as np
 
+from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
+from deepmd.dpmodel.fitting import InvarFitting
+from deepmd.dpmodel.model.ener_model import EnergyModel
 from deepmd.dpmodel.utils.neighbor_graph import frame_id_from_n_node
 
 
@@ -88,3 +91,125 @@ def test_call_lower_graph_ragged_energy_reduction():
         rtol=1e-12,
         atol=1e-12,
     )
+
+
+def _ener_model_ragged(sel=(200,)):
+    """Build a dpa1(attn_layer=0) EnergyModel for gate tests."""
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=list(sel), ntypes=2, attn_layer=0)
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    return EnergyModel(ds, ft, type_map=["a", "b"])
+
+
+def test_rectangular_free_view_equivalence():
+    """GATE: rectangular nf=2, nloc=5 graph path matches legacy dense path to 1e-12.
+
+    Proves the flat-N rewrite does not perturb the rectangular special case.
+    Public call_common with neighbor_graph_method='dense' must match 'legacy'
+    on energy / energy_redu / mask at rtol/atol 1e-12 (non-binding sel=[200]).
+    """
+    nf, nloc = 2, 5
+    rng = np.random.default_rng(7)
+    coord = rng.normal(size=(nf, nloc, 3)) * 1.5
+    atype = np.tile(np.array([[0, 1, 0, 1, 0]], dtype=np.int64), (nf, 1))
+    box = np.tile(np.eye(3).reshape(1, 9) * 20.0, (nf, 1))  # large PBC box
+    model = _ener_model_ragged(sel=[200])  # non-binding sel
+    g = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    d = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    for k in ("energy", "energy_redu", "mask"):
+        np.testing.assert_allclose(
+            np.asarray(g[k]),
+            np.asarray(d[k]),
+            rtol=1e-12,
+            atol=1e-12,
+            err_msg=f"graph vs legacy mismatch for key '{k}'",
+        )
+
+
+def test_ragged_frames_independent():
+    """GATE: ragged n_node=[3,2] per-frame energies equal two single-frame runs.
+
+    Proves frames do not leak through segment_sum on the flat axis: the ragged
+    energy_redu[i] must match running the i-th frame's atoms+edges in isolation
+    through call_lower_graph.  The SAME model weights are used for all three
+    calls so the comparison is meaningful.
+
+    Frame 0: nodes 0-2 (atype [0,1,0]), edges 0<->1, 1<->2.
+    Frame 1: nodes 3-4 (atype [1,0]), edges 3<->4 (global) = 0<->1 (local).
+    """
+    model = _ener_model_ragged()
+
+    # ── Ragged graph (both frames in one flat call) ────────────────────────
+    atype5 = np.array([0, 1, 0, 1, 0], dtype=np.int64)
+    #  frame-0 edges (global indices 0,1,2): 0↔1, 1↔2
+    #  frame-1 edges (global indices 3,4):  3↔4
+    edge_index_rag = np.array([[0, 1, 1, 2, 3, 4], [1, 0, 2, 1, 4, 3]], dtype=np.int64)
+    edge_vec_rag = np.array(
+        [
+            [1.0, 0.0, 0.0],
+            [-1.0, 0.0, 0.0],
+            [1.5, 0.0, 0.0],
+            [-1.5, 0.0, 0.0],
+            [0.5, 0.0, 0.0],
+            [-0.5, 0.0, 0.0],
+        ],
+        dtype=np.float64,
+    )
+    edge_mask_rag = np.ones(6, dtype=bool)
+    ragged = model.call_lower_graph(
+        atype=atype5,
+        n_node=np.array([3, 2], dtype=np.int64),
+        edge_index=edge_index_rag,
+        edge_vec=edge_vec_rag,
+        edge_mask=edge_mask_rag,
+    )
+
+    # ── Single-frame 0 (nodes 0-2) ─────────────────────────────────────────
+    atype_f0 = atype5[:3]  # [0, 1, 0]
+    edge_index_f0 = np.array([[0, 1, 1, 2], [1, 0, 2, 1]], dtype=np.int64)
+    edge_vec_f0 = np.array(
+        [
+            [1.0, 0.0, 0.0],
+            [-1.0, 0.0, 0.0],
+            [1.5, 0.0, 0.0],
+            [-1.5, 0.0, 0.0],
+        ],
+        dtype=np.float64,
+    )
+    edge_mask_f0 = np.ones(4, dtype=bool)
+    f0 = model.call_lower_graph(
+        atype=atype_f0,
+        n_node=np.array([3], dtype=np.int64),
+        edge_index=edge_index_f0,
+        edge_vec=edge_vec_f0,
+        edge_mask=edge_mask_f0,
+    )
+
+    # ── Single-frame 1 (nodes 3-4, remapped to local indices 0-1) ──────────
+    atype_f1 = atype5[3:]  # [1, 0]  (atype of global nodes 3,4)
+    # global edge 3→4 becomes local 0→1; global 4→3 becomes local 1→0
+    edge_index_f1 = np.array([[0, 1], [1, 0]], dtype=np.int64)
+    edge_vec_f1 = np.array([[0.5, 0.0, 0.0], [-0.5, 0.0, 0.0]], dtype=np.float64)
+    edge_mask_f1 = np.ones(2, dtype=bool)
+    f1 = model.call_lower_graph(
+        atype=atype_f1,
+        n_node=np.array([2], dtype=np.int64),
+        edge_index=edge_index_f1,
+        edge_vec=edge_vec_f1,
+        edge_mask=edge_mask_f1,
+    )
+
+    # ── Gate assertions ────────────────────────────────────────────────────
+    np.testing.assert_allclose(
+        np.asarray(ragged["energy_redu"][0]),
+        np.asarray(f0["energy_redu"][0]),
+        rtol=1e-12,
+        atol=1e-12,
+        err_msg="ragged frame-0 energy_redu must equal single-frame-0 energy_redu",
+    )
+    np.testing.assert_allclose(
+        np.asarray(ragged["energy_redu"][1]),
+        np.asarray(f1["energy_redu"][0]),
+        rtol=1e-12,
+        atol=1e-12,
+        err_msg="ragged frame-1 energy_redu must equal single-frame-1 energy_redu",
+    )

From 32a9d4c57e282a6846f37bae789942fd43afe6a6 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 16:14:12 +0800
Subject: [PATCH 55/69] feat(dpmodel): graph-native pair-exclude edge mask;
 graph supports pair_exclude_types (drop dense fallback)

---
 .../dpmodel/atomic_model/base_atomic_model.py | 17 +++++-
 deepmd/dpmodel/model/make_model.py            |  6 +-
 deepmd/dpmodel/utils/exclude_mask.py          | 32 ++++++++++
 deepmd/pt_expt/model/make_model.py            |  3 +-
 .../dpmodel/test_graph_atomic_parity.py       | 59 +++++++++----------
 5 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 71a6947629..2c094ca030 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -323,9 +323,10 @@ def forward_common_atomic_graph(
         The node axis is flat ``(N,)`` (``N = sum(graph.n_node)``); masking and
         out-stat operate per node. Reuses :meth:`_finalize_atomic_ret`, so
         virtual-atom masking, ``atom_excl`` and ``apply_out_stat`` match the dense
-        path. Model-level ``pair_exclude_types`` is gated out of the graph path by
-        the model routing (``_resolve_graph_method`` / the ``_call_common_graph``
-        gate require ``pair_excl is None``); descriptor-level ``exclude_types`` is
+        path. Model-level ``pair_exclude_types`` is graph-native: when
+        ``self.pair_excl is not None``, an edge-keep mask is ANDed into
+        ``graph.edge_mask`` before the descriptor forward, so excluded type-pairs
+        contribute zero to the segment_sum. Descriptor-level ``exclude_types`` is
         gated by ``uses_graph_lower()==False``.
 
         Parameters
@@ -345,10 +346,20 @@ def forward_common_atomic_graph(
             the result dict on the flat node axis, defined by the `FittingOutputDef`.
 
         """
+        import dataclasses
+
         xp = array_api_compat.array_namespace(graph.edge_vec)
         atype = xp.asarray(atype, device=array_api_compat.device(graph.edge_vec))
         atom_mask = self.make_atom_mask(atype)  # (N,) bool
         atype_clamped = xp.where(atom_mask, atype, xp.zeros_like(atype))
+        if self.pair_excl is not None:
+            keep = self.pair_excl.build_edge_exclude_mask(
+                graph.edge_index, atype_clamped
+            )
+            graph = dataclasses.replace(
+                graph,
+                edge_mask=graph.edge_mask * xp.astype(keep, graph.edge_mask.dtype),
+            )
         ret_dict = self.forward_atomic_graph(
             graph, atype_clamped, fparam=fparam, aparam=aparam
         )
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index c4f4bcb882..82010da04d 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -393,12 +393,10 @@ def _call_common_graph(
             """
             descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
-            pair_excl = getattr(self.atomic_model, "pair_excl", None)
-            if not (self.mixed_types() and uses_graph_lower() and pair_excl is None):
+            if not (self.mixed_types() and uses_graph_lower()):
                 raise NotImplementedError(
                     "neighbor_graph_method requires a mixed_types descriptor with a "
-                    "graph lower (e.g. dpa1 attn_layer=0) and no model-level "
-                    "pair_exclude_types (pair exclusion is not supported on the graph path)"
+                    "graph lower (e.g. dpa1 attn_layer=0)"
                 )
             if method == "dense":
                 ng = build_neighbor_graph(cc, atype, bb, self.get_rcut())
diff --git a/deepmd/dpmodel/utils/exclude_mask.py b/deepmd/dpmodel/utils/exclude_mask.py
index 2fafbe5293..1cc4ea7479 100644
--- a/deepmd/dpmodel/utils/exclude_mask.py
+++ b/deepmd/dpmodel/utils/exclude_mask.py
@@ -158,5 +158,37 @@ def build_type_exclude_mask(
         )
         return mask
 
+    def build_edge_exclude_mask(self, edge_index: Array, atype: Array) -> Array:
+        """Graph-native pair exclusion: per-edge keep mask (1 keep, 0 exclude).
+
+        Parameters
+        ----------
+        edge_index
+            (2, E) rows [src, dst]; src = neighbor, dst = center; indices into [0, N).
+        atype
+            (N,) flat local node types (clamped >= 0); gathered through ``edge_index``.
+
+        Returns
+        -------
+        mask
+            (E,) int. ``type_mask[atype[dst]*(ntypes+1) + atype[src]]``.
+
+        """
+        xp = array_api_compat.array_namespace(atype)
+        if len(self.exclude_types) == 0:  # nothing excluded: keep every edge
+            return xp.ones(
+                (edge_index.shape[1],),
+                dtype=xp.int32,
+                device=array_api_compat.device(atype),
+            )
+        src_t = xp.take(atype, edge_index[0, :], axis=0)  # neighbor (src) types
+        dst_t = xp.take(atype, edge_index[1, :], axis=0)  # center (dst) types
+        type_ij = dst_t * (self.ntypes + 1) + src_t  # flat index, row stride ntypes+1
+        return xp.take(
+            xp.asarray(self.type_mask[...], device=array_api_compat.device(atype)),
+            type_ij,
+            axis=0,
+        )
+
     def __contains__(self, item: tuple[int, int]) -> bool:
         return item in self.exclude_types
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index f1939289d0..b3b89b6025 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -386,8 +386,7 @@ def _resolve_graph_method(
             # Linear/ZBL atomic models have no single ``descriptor`` -> dense.
             descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
-            pair_excl = getattr(self.atomic_model, "pair_excl", None)
-            if self.mixed_types() and uses_graph_lower() and pair_excl is None:
+            if self.mixed_types() and uses_graph_lower():
                 return "dense"
             return None
 
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index f62ed07434..f172896493 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -127,38 +127,37 @@ def test_pair_exclude_types_falls_back_to_dense():
     assert m.atomic_model.descriptor.uses_graph_lower() is False
 
 
-def test_model_pair_exclude_types_disables_graph():
-    """A dpa1(attn_layer=0) model WITH model-level pair_exclude_types must NOT
-    route to the graph path: the explicit dpmodel graph request must raise
-    NotImplementedError (pair exclusion is unsupported on the graph path).
-    This also documents that without the gate, the old predicate (uses_graph_lower
-    only) would have WRONGLY allowed the graph, producing silent wrong energies.
+def test_model_pair_exclude_types_graph_matches_dense():
+    """Model-level pair_exclude_types is now graph-native (edge mask): graph ==
+    dense at 1e-12 (was: gated to dense / raises NotImplementedError).
     """
-    ds = DescrptDPA1(
-        rcut=4.0,
-        rcut_smth=0.5,
-        sel=[30],
-        ntypes=2,
-        attn_layer=0,
-        # no descriptor-level exclude_types -> uses_graph_lower() == True
-    )
-    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
-    am = DPAtomicModel(ds, ft, type_map=["a", "b"], pair_exclude_types=[(0, 1)])
-    model = EnergyModel(atomic_model_=am)
-    # Preconditions that document the old-gate failure mode:
-    assert model.atomic_model.pair_excl is not None, (
-        "pair_excl must be set so the old gate would have wrongly allowed graph"
-    )
-    assert model.atomic_model.descriptor.uses_graph_lower() is True, (
-        "uses_graph_lower must be True so the old gate would have wrongly allowed graph"
-    )
-    # The fixed gate must raise when pair_exclude_types is present:
-    rng = np.random.default_rng(10)
-    coord = rng.normal(size=(1, 5, 3)) * 1.5
-    atype = np.array([[0, 1, 0, 1, 0]], dtype=np.int64)
+    rng = np.random.default_rng(4)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
     box = np.eye(3).reshape(1, 9) * 20.0
-    with pytest.raises(NotImplementedError, match="pair_exclude_types"):
-        model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
+    ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
+    model = EnergyModel(ds, ft, type_map=["a", "b"], pair_exclude_types=[(0, 1)])
+    assert model.atomic_model.pair_excl is not None
+    g = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    d = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    for k in ("energy", "energy_redu", "mask"):
+        np.testing.assert_allclose(
+            np.asarray(g[k]), np.asarray(d[k]), rtol=1e-12, atol=1e-12
+        )
+    # non-vacuous: pair exclusion actually changed the energy vs no exclusion
+    # (different network weights for model0, but same geometry — just confirm
+    # pair exclusion is not a no-op on the dense path)
+    ds0 = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
+    ft0 = InvarFitting("energy", 2, ds0.get_dim_out(), 1, mixed_types=True)
+    model0 = EnergyModel(ds0, ft0, type_map=["a", "b"])
+    d0 = model0.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    d_excl = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    # The two models have different weights AND exclusion, so energies differ
+    assert not np.allclose(
+        np.asarray(d_excl["energy_redu"]), np.asarray(d0["energy_redu"])
+    ), "pair exclusion + different weights must produce different total energy"
 
 
 def test_graph_matches_dense_with_atom_exclude():

From 3d3d65c393c4aa3a209046958db420ade4c2b504 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 16:21:37 +0800
Subject: [PATCH 56/69] style(dpmodel): hoist dataclasses import to module top
 (Task 9 review)

---
 deepmd/dpmodel/atomic_model/base_atomic_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 2c094ca030..92184598a3 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+import dataclasses
 import functools
 from collections.abc import (
     Callable,
@@ -346,7 +347,6 @@ def forward_common_atomic_graph(
             the result dict on the flat node axis, defined by the `FittingOutputDef`.
 
         """
-        import dataclasses
 
         xp = array_api_compat.array_namespace(graph.edge_vec)
         atype = xp.asarray(atype, device=array_api_compat.device(graph.edge_vec))

From 22ba5090c4568a42f9dae488f2967ffc4e98f85d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 16:22:38 +0800
Subject: [PATCH 57/69] style(dpmodel): drop blank line after
 forward_common_atomic_graph docstring (D202)

---
 deepmd/dpmodel/atomic_model/base_atomic_model.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 92184598a3..1819d2ffd3 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -347,7 +347,6 @@ def forward_common_atomic_graph(
             the result dict on the flat node axis, defined by the `FittingOutputDef`.
 
         """
-
         xp = array_api_compat.array_namespace(graph.edge_vec)
         atype = xp.asarray(atype, device=array_api_compat.device(graph.edge_vec))
         atom_mask = self.make_atom_mask(atype)  # (N,) bool

From c247a74a0f057e508fc8c0536633a0f316d48f9f Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 16:42:20 +0800
Subject: [PATCH 58/69] fix(dpmodel): _finalize_atomic_ret zero-atom reshape
 (explicit trailing product, not -1)

The leading-dim-agnostic refactor used xp.reshape(out, (*lead, -1)), but
numpy cannot infer -1 for a size-0 array (a zero-atom forward, nloc==0).
Restore the explicit trailing product math.prod(out.shape[len(lead):]).
Fixes 101 universal model test_zero_forward failures across all
descriptors (SeA/DPA3/...); dense path now byte-unchanged. Re-add the
'import math' dropped in the Task-5 refactor.
---
 deepmd/dpmodel/atomic_model/base_atomic_model.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 1819d2ffd3..1301084d98 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import dataclasses
+import math
 import functools
 from collections.abc import (
     Callable,
@@ -398,7 +399,10 @@ def _finalize_atomic_ret(
         lead = atom_mask.shape  # (nf, nloc) dense | (N,) graph
         for kk in ret_dict.keys():
             out = ret_dict[kk]
-            flat = xp.reshape(out, (*lead, -1))
+            # explicit trailing product (NOT -1): a zero-atom forward (nloc==0)
+            # has size 0, and numpy cannot infer -1 for a size-0 array.
+            trail = math.prod(out.shape[len(lead) :])
+            flat = xp.reshape(out, (*lead, trail))
             flat = xp.where(atom_mask[..., None], flat, xp.zeros_like(flat))
             ret_dict[kk] = xp.reshape(flat, out.shape)
         ret_dict["mask"] = xp.astype(atom_mask, xp.int32)

From e8e9885a6df2e585d54d25ce9309b258c20fb001 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 26 Jun 2026 08:57:05 +0000
Subject: [PATCH 59/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../dpmodel/atomic_model/base_atomic_model.py |  2 +-
 .../test_dpa1_call_graph_descriptor.py        |  4 +-
 .../common/dpmodel/test_fitting_call_graph.py |  4 +-
 .../tests/common/dpmodel/test_graph_ragged.py | 44 ++++++++++++++-----
 4 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 1301084d98..610218a0bf 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import dataclasses
-import math
 import functools
+import math
 from collections.abc import (
     Callable,
 )
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
index 6f1cceff8c..dc1d51da91 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_descriptor.py
@@ -179,7 +179,9 @@ def test_single_rank_extension_keeps_type_invariant(self) -> None:
 
     def test_call_graph_returns_flat_node_axis(self) -> None:
         """call_graph output lives on the flat (N,) node axis, not (nf, nloc)."""
-        from deepmd.dpmodel.utils.neighbor_graph import from_dense_quartet
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            from_dense_quartet,
+        )
 
         dd = self._make([30])
         ext_coord, ext_atype, mapping, nlist = extend_input_and_build_neighbor_list(
diff --git a/source/tests/common/dpmodel/test_fitting_call_graph.py b/source/tests/common/dpmodel/test_fitting_call_graph.py
index dc44d9e6d2..2e143046eb 100644
--- a/source/tests/common/dpmodel/test_fitting_call_graph.py
+++ b/source/tests/common/dpmodel/test_fitting_call_graph.py
@@ -8,7 +8,9 @@
 import numpy as np
 import pytest
 
-from deepmd.dpmodel.fitting import InvarFitting
+from deepmd.dpmodel.fitting import (
+    InvarFitting,
+)
 
 
 @pytest.mark.parametrize("ndf", [0, 3])  # numb_fparam: no-fparam AND fparam
diff --git a/source/tests/common/dpmodel/test_graph_ragged.py b/source/tests/common/dpmodel/test_graph_ragged.py
index 40987e86df..1c4dca7332 100644
--- a/source/tests/common/dpmodel/test_graph_ragged.py
+++ b/source/tests/common/dpmodel/test_graph_ragged.py
@@ -6,10 +6,18 @@
 
 import numpy as np
 
-from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
-from deepmd.dpmodel.fitting import InvarFitting
-from deepmd.dpmodel.model.ener_model import EnergyModel
-from deepmd.dpmodel.utils.neighbor_graph import frame_id_from_n_node
+from deepmd.dpmodel.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.dpmodel.fitting import (
+    InvarFitting,
+)
+from deepmd.dpmodel.model.ener_model import (
+    EnergyModel,
+)
+from deepmd.dpmodel.utils.neighbor_graph import (
+    frame_id_from_n_node,
+)
 
 
 def test_frame_id_ragged():
@@ -26,10 +34,18 @@ def test_forward_common_atomic_graph_ragged():
     """
     import numpy as np
 
-    from deepmd.dpmodel.atomic_model.dp_atomic_model import DPAtomicModel
-    from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
-    from deepmd.dpmodel.fitting import InvarFitting
-    from deepmd.dpmodel.utils.neighbor_graph import NeighborGraph
+    from deepmd.dpmodel.atomic_model.dp_atomic_model import (
+        DPAtomicModel,
+    )
+    from deepmd.dpmodel.descriptor.dpa1 import (
+        DescrptDPA1,
+    )
+    from deepmd.dpmodel.fitting import (
+        InvarFitting,
+    )
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        NeighborGraph,
+    )
 
     ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0)
     ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)
@@ -58,9 +74,15 @@ def test_call_lower_graph_ragged_energy_reduction():
     """Per-frame energy_redu = segment_sum of the frame's atom energies; ragged."""
     import numpy as np
 
-    from deepmd.dpmodel.descriptor.dpa1 import DescrptDPA1
-    from deepmd.dpmodel.fitting import InvarFitting
-    from deepmd.dpmodel.model.ener_model import EnergyModel
+    from deepmd.dpmodel.descriptor.dpa1 import (
+        DescrptDPA1,
+    )
+    from deepmd.dpmodel.fitting import (
+        InvarFitting,
+    )
+    from deepmd.dpmodel.model.ener_model import (
+        EnergyModel,
+    )
 
     ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[30], ntypes=2, attn_layer=0)
     ft = InvarFitting("energy", 2, ds.get_dim_out(), 1, mixed_types=True)

From fd9c158f19ec7df85b079039674e9d12b5179a35 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 26 Jun 2026 18:53:36 +0800
Subject: [PATCH 60/69] test+refactor(dpmodel): fparam-through-graph test;
 stronger pair-exclude non-vacuity; unify polar eye tiling

- forward_atomic_graph fparam-by-frame_id dispatch is now unit-tested
  (graph==dense at 1e-12 + per-frame fparam differs) [review #2]
- pair-exclude non-vacuity toggles pair_excl on the SAME model weights
  (isolates exclusion from weights) [review #1]
- polar apply_out_stat eye tiling unified to xp.tile(eye, (*atype.shape,1,1))
  (drops the ndim==2 if/else) [review #3]
---
 .../atomic_model/polar_atomic_model.py        |  9 +---
 .../dpmodel/test_graph_atomic_parity.py       | 52 +++++++++++++++----
 2 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/polar_atomic_model.py b/deepmd/dpmodel/atomic_model/polar_atomic_model.py
index 0da87e5c27..293eae62c4 100644
--- a/deepmd/dpmodel/atomic_model/polar_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/polar_atomic_model.py
@@ -62,13 +62,8 @@ def apply_out_stat(
                 )
 
                 eye = xp.eye(3, dtype=dtype, device=device)
-                if atype.ndim == 2:
-                    nframes, nloc = atype.shape
-                    eye = xp.tile(eye, (nframes, nloc, 1, 1))
-                else:
-                    # flat graph path: atype is (N,)
-                    N = atype.shape[0]
-                    eye = xp.tile(eye, (N, 1, 1))
+                # leading-dim-agnostic: (nf, nloc) dense or (N,) flat graph path
+                eye = xp.tile(eye, (*atype.shape, 1, 1))
                 # (..., 3, 3)
                 modified_bias = modified_bias[..., xp.newaxis] * eye
 
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index f172896493..f95851f2d1 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -146,18 +146,48 @@ def test_model_pair_exclude_types_graph_matches_dense():
         np.testing.assert_allclose(
             np.asarray(g[k]), np.asarray(d[k]), rtol=1e-12, atol=1e-12
         )
-    # non-vacuous: pair exclusion actually changed the energy vs no exclusion
-    # (different network weights for model0, but same geometry — just confirm
-    # pair exclusion is not a no-op on the dense path)
-    ds0 = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
-    ft0 = InvarFitting("energy", 2, ds0.get_dim_out(), 1, mixed_types=True)
-    model0 = EnergyModel(ds0, ft0, type_map=["a", "b"])
-    d0 = model0.call_common(coord, atype, box, neighbor_graph_method="legacy")
-    d_excl = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
-    # The two models have different weights AND exclusion, so energies differ
+    # non-vacuous: toggle pair exclusion OFF on the SAME model (same weights),
+    # so any energy difference is due solely to the exclusion (not weights).
+    g_excl = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    model.atomic_model.reinit_pair_exclude([])  # clear pair exclusion
+    assert model.atomic_model.pair_excl is None
+    g_noexcl = model.call_common(coord, atype, box, neighbor_graph_method="dense")
     assert not np.allclose(
-        np.asarray(d_excl["energy_redu"]), np.asarray(d0["energy_redu"])
-    ), "pair exclusion + different weights must produce different total energy"
+        np.asarray(g_excl["energy_redu"]), np.asarray(g_noexcl["energy_redu"])
+    ), "pair exclusion must change the graph energy (same weights)"
+
+
+def test_graph_matches_dense_with_fparam():
+    """Frame parameter is gathered to nodes by frame_id in forward_atomic_graph
+    and fed to the fitting's call_graph; the graph path must match dense at 1e-12
+    with a non-zero fparam (exercises the frame_id gather + xp.take dispatch).
+    """
+    rng = np.random.default_rng(7)
+    nf, nloc, ndf = 2, 5, 3  # frames, atoms per frame, fparam width
+    coord = np.tile(rng.normal(size=(1, nloc, 3)) * 1.5, (nf, 1, 1))  # same geometry
+    atype = np.tile(np.array([[0, 1, 0, 1, 0]], dtype=np.int64), (nf, 1))
+    box = np.tile(np.eye(3).reshape(1, 9) * 20.0, (nf, 1))
+    fparam = rng.normal(size=(nf, ndf))  # per-frame, differs across frames
+    ds = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[200], ntypes=2, attn_layer=0)
+    ft = InvarFitting(
+        "energy", 2, ds.get_dim_out(), 1, mixed_types=True, numb_fparam=ndf
+    )
+    model = EnergyModel(ds, ft, type_map=["a", "b"])
+    g = model.call_common(
+        coord, atype, box, fparam=fparam, neighbor_graph_method="dense"
+    )
+    d = model.call_common(
+        coord, atype, box, fparam=fparam, neighbor_graph_method="legacy"
+    )
+    for k in ("energy", "energy_redu"):
+        np.testing.assert_allclose(
+            np.asarray(g[k]), np.asarray(d[k]), rtol=1e-12, atol=1e-12
+        )
+    # non-vacuous: both frames share one geometry and differ ONLY in fparam, so a
+    # mis-gathered fparam (e.g. frame 0's for every node) gives equal frame energies.
+    assert not np.allclose(
+        np.asarray(g["energy_redu"][0]), np.asarray(g["energy_redu"][1])
+    )
 
 
 def test_graph_matches_dense_with_atom_exclude():

From a95e3972f3e10983cbf41466eb3d4bf2d4d1871a Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 27 Jun 2026 09:05:51 +0800
Subject: [PATCH 61/69] refactor(dpmodel): rename
 DescrptBlockSeAtten._call_graph -> call_graph (align with .call) [review #5]

---
 deepmd/dpmodel/descriptor/dpa1.py                         | 8 ++++----
 source/tests/common/dpmodel/test_dpa1_call_graph_block.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index ebb6b456bc..eb94b148cc 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -674,8 +674,8 @@ def call_graph(
     ) -> tuple[Array, Array]:
         """Descriptor-level graph-native forward (``attn_layer == 0``).
 
-        Wraps the private block kernel
-        :meth:`DescrptBlockSeAtten._call_graph`, adds the descriptor-level
+        Wraps the block kernel
+        :meth:`DescrptBlockSeAtten.call_graph`, adds the descriptor-level
         ``concat_output_tebd`` step, and returns the outputs on the flat ``(N,
         ...)`` node axis (ragged-native; no rectangular ``(nf, nloc)``
         reshape).
@@ -703,7 +703,7 @@ def call_graph(
         """
         xp = array_api_compat.array_namespace(graph.edge_vec)
         dev = array_api_compat.device(graph.edge_vec)
-        grrg, rot_mat = self.se_atten._call_graph(
+        grrg, rot_mat = self.se_atten.call_graph(
             graph, atype, type_embedding=type_embedding
         )
         # FLAT node axis (N, ...): no (nf, nloc) reshape -- ragged-native, spec.
@@ -1387,7 +1387,7 @@ def call(
             xp.reshape(sw, (nf, nloc, nnei, 1)),
         )
 
-    def _call_graph(
+    def call_graph(
         self,
         graph: Any,
         atype: Array,
diff --git a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
index d00abedd94..e8930101dd 100644
--- a/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
+++ b/source/tests/common/dpmodel/test_dpa1_call_graph_block.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Bit-exact parity between the graph-native ``DescrptBlockSeAtten._call_graph``
+"""Bit-exact parity between the graph-native ``DescrptBlockSeAtten.call_graph``
 (attn_layer=0) and the legacy dense ``DescrptBlockSeAtten.call`` on the SAME
 neighbor list, for binding AND non-binding ``sel``.
 """
@@ -78,7 +78,7 @@ def test_block_graph_equals_dense_any_sel(self, sel, type_one_side) -> None:
             type_embedding=tebd,
         )
         ng = from_dense_quartet(ext_coord, nlist, mapping)
-        graph_g, _rot_mat = blk._call_graph(
+        graph_g, _rot_mat = blk.call_graph(
             ng,
             np.reshape(ext_atype, (-1,)),
             type_embedding=tebd,
@@ -94,7 +94,7 @@ def test_attn_layer_gt0_raises(self) -> None:
         """The graph block kernel fail-fasts for attn_layer > 0 (unsupported)."""
         dd = DescrptDPA1(rcut=4.0, rcut_smth=0.5, sel=[20], ntypes=2, attn_layer=2)
         with pytest.raises(NotImplementedError):
-            dd.se_atten._call_graph(None, np.array([0], dtype=np.int64))
+            dd.se_atten.call_graph(None, np.array([0], dtype=np.int64))
 
     def test_exclude_types_raises(self) -> None:
         """The graph block kernel fail-fasts for exclude_types (not yet applied)."""
@@ -114,6 +114,6 @@ def test_exclude_types_raises(self) -> None:
             np.arange(self.nloc, dtype=np.int64)[None],
         )
         with pytest.raises(NotImplementedError):
-            dd.se_atten._call_graph(
+            dd.se_atten.call_graph(
                 ng, self.atype.reshape(-1), type_embedding=dd.type_embedding.call()
             )

From ff70bac0616c6a3e28d52b49c139b586cb5972f9 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 27 Jun 2026 09:13:44 +0800
Subject: [PATCH 62/69] refactor(dpmodel,pt_expt): graph output transform takes
 NeighborGraph; move dpmodel transform to edge_transform_output.py [review
 #8,#10]

- fit_output_to_model_output_graph now takes the NeighborGraph instead of
  n_node (dpmodel) / edge_vec+edge_index+edge_mask+n_node (pt_expt); the
  pt_expt autograd leaf is graph.edge_vec. Unifies the two signatures.
- dpmodel fit_output_to_model_output_graph moved transform_output.py ->
  new edge_transform_output.py (mirrors the pt_expt file layout).
- tighten pair-exclude non-vacuity tolerance (1e-9; the (0,1) effect is ~2e-6).
---
 deepmd/dpmodel/model/edge_transform_output.py | 101 ++++++++++++++++++
 deepmd/dpmodel/model/make_model.py            |   6 +-
 deepmd/dpmodel/model/transform_output.py      |  46 --------
 deepmd/pt_expt/model/edge_transform_output.py |  23 ++--
 deepmd/pt_expt/model/make_model.py            |   5 +-
 .../dpmodel/test_graph_atomic_parity.py       |   7 +-
 6 files changed, 123 insertions(+), 65 deletions(-)
 create mode 100644 deepmd/dpmodel/model/edge_transform_output.py

diff --git a/deepmd/dpmodel/model/edge_transform_output.py b/deepmd/dpmodel/model/edge_transform_output.py
new file mode 100644
index 0000000000..f9cc49f874
--- /dev/null
+++ b/deepmd/dpmodel/model/edge_transform_output.py
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Flat-N (ragged-native) graph output transform for the dpmodel backend.
+
+The graph lower produces per-node outputs on the flat ``(N,)`` node axis
+(``N = sum(graph.n_node)``); this reduces every reducible fitting output per
+frame via ``segment_sum`` over ``frame_id``.  dpmodel is energy-only (no
+autograd force on the graph path), so derivative name-holders are ``None`` --
+the pt_expt backend (:mod:`deepmd.pt_expt.model.edge_transform_output`) assembles
+force/virial from the same ``NeighborGraph`` via ``edge_energy_deriv``.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    TYPE_CHECKING,
+)
+
+import array_api_compat
+
+from deepmd.dpmodel.common import (
+    GLOBAL_ENER_FLOAT_PRECISION,
+)
+from deepmd.dpmodel.output_def import (
+    get_deriv_name,
+    get_reduce_name,
+)
+
+if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+    from deepmd.dpmodel.output_def import (
+        FittingOutputDef,
+    )
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        NeighborGraph,
+    )
+
+
+def fit_output_to_model_output_graph(
+    fit_ret: dict[str, Array],
+    fit_output_def: FittingOutputDef,
+    graph: NeighborGraph,
+    mask: Array | None = None,
+) -> dict[str, Array]:
+    """Flat-N analogue of :func:`~deepmd.dpmodel.model.transform_output.fit_output_to_model_output`.
+
+    Parameters
+    ----------
+    fit_ret
+        the raw per-node fitting dict, each value ``(N, *shape)``.
+    fit_output_def
+        the fitting output def (drives the per-key reduction).
+    graph
+        the :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`; only
+        ``graph.n_node`` is used (the node->frame map for the reduction).
+    mask
+        the ``(N,)`` real-node mask (intensive denominator); ``None`` => use ``n_node``.
+
+    Returns
+    -------
+    model_ret
+        ``fit_ret`` plus, for each reducible key, ``<var>_redu (nf, *shape)`` via
+        ``segment_sum`` over ``frame_id`` (intensive ⇒ divide by the per-frame
+        real-node count); derivative name-holders are ``None``.
+    """
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        frame_id_from_n_node,
+        segment_sum,
+    )
+
+    n_node = graph.n_node
+    xp = array_api_compat.get_namespace(n_node)
+    nf = n_node.shape[0]  # number of frames
+    frame_id = frame_id_from_n_node(n_node)
+    model_ret = dict(fit_ret.items())  # shallow copy; fit_ret is left unmodified
+    for kk, vv in fit_ret.items():
+        vdef = fit_output_def[kk]
+        if not vdef.reducible:
+            continue  # non-reducible outputs pass through unchanged
+        kk_redu = get_reduce_name(kk)
+        vv_e = xp.astype(vv, GLOBAL_ENER_FLOAT_PRECISION)  # reduce in energy precision
+        redu = segment_sum(vv_e, frame_id, nf)  # (nf, *shape)
+        if vdef.intensive:
+            if mask is not None:
+                cnt = segment_sum(
+                    xp.astype(mask, GLOBAL_ENER_FLOAT_PRECISION), frame_id, nf
+                )
+            else:
+                cnt = xp.astype(n_node, GLOBAL_ENER_FLOAT_PRECISION)  # count every node
+            redu = redu / xp.reshape(cnt, (nf, *([1] * (redu.ndim - 1))))  # broadcast
+        model_ret[kk_redu] = redu
+        if vdef.r_differentiable:
+            kk_derv_r, _ = get_deriv_name(kk)
+            model_ret[kk_derv_r] = None  # name-holder only
+        if vdef.c_differentiable:
+            _, kk_derv_c = get_deriv_name(kk)
+            model_ret[kk_derv_c] = None  # name-holder only
+    return model_ret
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 82010da04d..ddc542a389 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -51,10 +51,12 @@
     DPPath,
 )
 
+from .edge_transform_output import (
+    fit_output_to_model_output_graph,
+)
 from .transform_output import (
     communicate_extended_output,
     fit_output_to_model_output,
-    fit_output_to_model_output_graph,
 )
 
 
@@ -614,7 +616,7 @@ def call_lower_graph(
             return fit_output_to_model_output_graph(
                 atomic_ret,
                 self.atomic_output_def(),
-                n_node,
+                graph,
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
diff --git a/deepmd/dpmodel/model/transform_output.py b/deepmd/dpmodel/model/transform_output.py
index 71b0fd37a2..b8ca99469f 100644
--- a/deepmd/dpmodel/model/transform_output.py
+++ b/deepmd/dpmodel/model/transform_output.py
@@ -66,52 +66,6 @@ def fit_output_to_model_output(
     return model_ret
 
 
-def fit_output_to_model_output_graph(
-    fit_ret: dict[str, Array],
-    fit_output_def: FittingOutputDef,
-    n_node: Array,
-    mask: Array | None = None,
-) -> dict[str, Array]:
-    """Flat-N analogue of :func:`fit_output_to_model_output`.
-
-    The atom axis is flat ``(N,)``; reducible outputs reduce per frame via
-    ``segment_sum`` over ``frame_id = repeat(arange(nf), n_node)`` (intensive ⇒
-    divide by the per-frame real-node count). Derivative name-holders are ``None``.
-    """
-    from deepmd.dpmodel.utils.neighbor_graph import (
-        frame_id_from_n_node,
-        segment_sum,
-    )
-
-    xp = array_api_compat.get_namespace(n_node)
-    nf = n_node.shape[0]
-    frame_id = frame_id_from_n_node(n_node)
-    model_ret = dict(fit_ret.items())
-    for kk, vv in fit_ret.items():
-        vdef = fit_output_def[kk]
-        if not vdef.reducible:
-            continue
-        kk_redu = get_reduce_name(kk)
-        vv_e = xp.astype(vv, GLOBAL_ENER_FLOAT_PRECISION)
-        redu = segment_sum(vv_e, frame_id, nf)  # (nf, *shape)
-        if vdef.intensive:
-            if mask is not None:
-                cnt = segment_sum(
-                    xp.astype(mask, GLOBAL_ENER_FLOAT_PRECISION), frame_id, nf
-                )
-            else:
-                cnt = xp.astype(n_node, GLOBAL_ENER_FLOAT_PRECISION)
-            redu = redu / xp.reshape(cnt, (nf, *([1] * (redu.ndim - 1))))
-        model_ret[kk_redu] = redu
-        if vdef.r_differentiable:
-            kk_derv_r, _ = get_deriv_name(kk)
-            model_ret[kk_derv_r] = None
-        if vdef.c_differentiable:
-            _, kk_derv_c = get_deriv_name(kk)
-            model_ret[kk_derv_c] = None
-    return model_ret
-
-
 def get_leading_dims(
     vv: Array,
     vdef: OutputVariableDef,
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index d3d3ec343b..a2c6b2144d 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -13,6 +13,7 @@
     get_reduce_name,
 )
 from deepmd.dpmodel.utils.neighbor_graph import (
+    NeighborGraph,
     edge_force_virial,
     frame_id_from_n_node,
     segment_sum,
@@ -52,10 +53,7 @@ def edge_energy_deriv(
 def fit_output_to_model_output_graph(
     fit_ret: dict[str, torch.Tensor],
     fit_output_def: FittingOutputDef,
-    edge_vec: torch.Tensor,
-    edge_index: torch.Tensor,
-    edge_mask: torch.Tensor,
-    n_node: torch.Tensor,
+    graph: NeighborGraph,
     do_atomic_virial: bool = False,
     create_graph: bool = True,
     mask: torch.Tensor | None = None,
@@ -81,14 +79,11 @@ def fit_output_to_model_output_graph(
         Raw flat fitting output, ``(N, *shape)`` per key (``N = sum(n_node)``).
     fit_output_def
         The fitting output definition.
-    edge_vec
-        (E, 3) edge vectors; MUST be the autograd leaf for ``fit_ret``.
-    edge_index
-        (2, E) ``[src, dst]`` edge endpoints (flat local indices).
-    edge_mask
-        (E,) valid-edge mask.
-    n_node
-        (nf,) per-frame local atom counts.
+    graph
+        the :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`. Its
+        ``edge_vec`` MUST be the autograd leaf for ``fit_ret`` (the force backward
+        differentiates the reduced energy w.r.t. it); ``edge_index``/``edge_mask``
+        define the scatter, ``n_node`` the node->frame map.
     do_atomic_virial
         Whether to also assemble the per-atom virial ``<var>_derv_c``.
     create_graph
@@ -96,6 +91,10 @@ def fit_output_to_model_output_graph(
     mask
         (N,) flat realness mask; used only for intensive-output reduction.
     """
+    edge_vec = graph.edge_vec
+    edge_index = graph.edge_index
+    edge_mask = graph.edge_mask
+    n_node = graph.n_node
     redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
     nf = int(n_node.shape[0])
     N = int(n_node.sum())
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index b3b89b6025..bb8a6b6b70 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -361,10 +361,7 @@ def forward_common_lower_graph(
             return fit_output_to_model_output_graph(
                 atomic_ret,
                 self.atomic_output_def(),
-                edge_vec,
-                edge_index,
-                edge_mask,
-                n_node,
+                graph,
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
diff --git a/source/tests/common/dpmodel/test_graph_atomic_parity.py b/source/tests/common/dpmodel/test_graph_atomic_parity.py
index f95851f2d1..7de084a25f 100644
--- a/source/tests/common/dpmodel/test_graph_atomic_parity.py
+++ b/source/tests/common/dpmodel/test_graph_atomic_parity.py
@@ -152,8 +152,13 @@ def test_model_pair_exclude_types_graph_matches_dense():
     model.atomic_model.reinit_pair_exclude([])  # clear pair exclusion
     assert model.atomic_model.pair_excl is None
     g_noexcl = model.call_common(coord, atype, box, neighbor_graph_method="dense")
+    # tight tolerance: the excluded (0,1) pairs contribute a small but real
+    # amount; default rtol=1e-5 is too loose to register it.
     assert not np.allclose(
-        np.asarray(g_excl["energy_redu"]), np.asarray(g_noexcl["energy_redu"])
+        np.asarray(g_excl["energy_redu"]),
+        np.asarray(g_noexcl["energy_redu"]),
+        rtol=1e-9,
+        atol=1e-9,
     ), "pair exclusion must change the graph energy (same weights)"
 
 

From be0fc97900d752a5a0ab5c55aeafe54f760ab660 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 27 Jun 2026 09:18:32 +0800
Subject: [PATCH 63/69] refactor(dpmodel): symmetric public graph lower
 (casting + model wrapper + alias) [review #6,#7,#9]

Mirror the dense lower structure for the graph path:
- NEW model-level forward_common_atomic_graph (builds NeighborGraph + atomic
  forward_common_atomic_graph + flat-N output transform) -- analogue of the
  dense forward_common_atomic; the graph build is no longer inlined in the
  lower [#6].
- call_lower_graph -> public call_common_lower_graph WITH _input/_output_type_cast
  (edge_vec is the geometry in place of coord), making it a directly-callable
  PRIMARY interface per spec decision #14 [#7].
- call_lower_graph = call_common_lower_graph alias (mirrors call_lower =
  call_common_lower) [#9].
---
 deepmd/dpmodel/model/make_model.py | 93 ++++++++++++++++++++++--------
 1 file changed, 68 insertions(+), 25 deletions(-)

diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index ddc542a389..2e98f1e9aa 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -544,7 +544,7 @@ def forward_common_atomic(
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
-        def call_lower_graph(
+        def forward_common_atomic_graph(
             self,
             atype: Array,
             n_node: Array,
@@ -556,18 +556,16 @@ def call_lower_graph(
             aparam: Array | None = None,
             comm_dict: dict | None = None,
         ) -> dict[str, Array]:
-            """Graph-native lower (PR-A: dpa1 ``attn_layer == 0``).
-
-            OUTPUT-AGNOSTIC, like the dense
-            :func:`~deepmd.dpmodel.model.transform_output.fit_output_to_model_output`:
-            runs the graph descriptor + fitting forward to obtain the rectangular
-            ``fit_ret`` (``(nf, nloc, *shape)``), then reduces EVERY reducible
-            fitting output (``xp.sum``/``xp.mean`` over the atom axis, cast to
-            energy precision) and sets derivative name-holders to ``None``.  This
-            makes any fitting (energy/dos/dipole/polar/property/...) flow through
-            the graph path with no change on the fitting side.  Force/virial are
-            produced by the pt_expt autograd path.  Must match the dense
-            :meth:`call_common_lower` reduction on the SAME neighbor list.
+            """Model-level graph forward (no type cast). Analogue of the dense
+            :meth:`forward_common_atomic`.
+
+            Builds a :class:`NeighborGraph` from the flat edge fields, runs the
+            atomic model's :meth:`forward_common_atomic_graph` (flat ``(N, *)``
+            per-node output), then the flat-N output transform (per-frame
+            ``segment_sum`` reduction; derivative name-holders ``None`` --
+            force/virial come from the pt_expt autograd lower). The
+            ``(nf, nloc)`` unravel for the public ABI happens in the caller
+            (:meth:`_call_common_graph`).
 
             Parameters
             ----------
@@ -582,22 +580,22 @@ def call_lower_graph(
             edge_mask
                 (E,) boolean/0-1 valid-edge mask.
             n_local
-                Per-rank local atom counts for multi-rank inference.
-                Ignored in PR-A (single-rank); accepted for ABI stability.
+                Per-rank local atom counts for multi-rank inference. Ignored in
+                PR-A (single-rank); accepted for ABI stability.
             fparam
                 Frame parameter, ``(nf, ndf)``.
             aparam
-                Atomic parameter, ``(nf, nloc, nda)``.
+                Atomic parameter, ``(N, nda)``.
             comm_dict
-                MPI communication metadata. Ignored in PR-A; accepted for
-                ABI stability.
+                MPI communication metadata. Ignored in PR-A; accepted for ABI
+                stability.
 
             Returns
             -------
             dict
-                The standard model dict (``<var>`` per-atom, ``<var>_redu``
+                The standard model dict (``<var>`` per-node, ``<var>_redu``
                 reduced, derivative name-holders ``None``), matching
-                :func:`fit_output_to_model_output`.
+                :func:`fit_output_to_model_output_graph`.
             """
             graph = NeighborGraph(
                 n_node=n_node,
@@ -608,11 +606,6 @@ def call_lower_graph(
             atomic_ret = self.atomic_model.forward_common_atomic_graph(
                 graph, atype, fparam=fparam, aparam=aparam
             )
-            # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output
-            # (N = sum(n_node)). Reduce per-frame via segment_sum over
-            # frame_id — supports ragged frames without any nloc = N // nf
-            # reshape. The I/O boundary unravel to (nf, nloc, *) happens
-            # in ``_call_common_graph`` for the public ABI.
             return fit_output_to_model_output_graph(
                 atomic_ret,
                 self.atomic_output_def(),
@@ -620,6 +613,56 @@ def call_lower_graph(
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
             )
 
+        def call_common_lower_graph(
+            self,
+            atype: Array,
+            n_node: Array,
+            edge_index: Array,
+            edge_vec: Array,
+            edge_mask: Array,
+            n_local: Array | None = None,
+            fparam: Array | None = None,
+            aparam: Array | None = None,
+            comm_dict: dict | None = None,
+        ) -> dict[str, Array]:
+            """Graph-native PUBLIC lower (PR-A: dpa1 ``attn_layer == 0``).
+
+            The PRIMARY directly-callable graph interface (spec decision #14).
+            Casts inputs/outputs to/from the model precision exactly like the
+            dense :meth:`call_common_lower` (``edge_vec`` is the geometry, in
+            place of ``coord``), then runs :meth:`forward_common_atomic_graph`.
+            OUTPUT-AGNOSTIC: every fitting (energy/dos/dipole/polar/property/...)
+            flows through with no change on the fitting side; force/virial are
+            produced by the pt_expt autograd lower. Must match the dense
+            :meth:`call_common_lower` reduction on the SAME neighbor set.
+
+            Parameters mirror :meth:`forward_common_atomic_graph`.
+
+            Returns
+            -------
+            dict
+                The standard model dict in the INPUT precision.
+            """
+            edge_vec, _, fparam, aparam, _, input_prec = self._input_type_cast(
+                edge_vec, fparam=fparam, aparam=aparam
+            )
+            model_predict = self.forward_common_atomic_graph(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                n_local=n_local,
+                fparam=fparam,
+                aparam=aparam,
+                comm_dict=comm_dict,
+            )
+            model_predict = self._output_type_cast(model_predict, input_prec)
+            return model_predict
+
+        # backward-compat alias (mirrors ``call_lower = call_common_lower``)
+        call_lower_graph = call_common_lower_graph
+
         call = call_common
         call_lower = call_common_lower
 

From 6987b50374a42acbf4f3843309309ab40cb774ec Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 28 Jun 2026 01:45:29 +0800
Subject: [PATCH 64/69] fix(pt_expt): dpa1 varying-natoms compile test compares
 dense-vs-dense

The TestCompiledVaryingNatoms dpa1(attn_layer=0) case failed: the uncompiled
reference uses the pt_expt carry-all GRAPH forward (default-flip #17) while the
compiled forward_lower uses the sel-capped DENSE forward. Those are two
different force computations -- even at non-binding sel the forward matches to
~1e-16 but their backward gradients agree only to fp64 accumulation (~1e-12),
which the optimizer amplifies into a diverging training trajectory (weight
drift ~1e-3 after one step). It is NOT sel-binding and NOT a torch.compile
dynamic-shape bug.

Pin BOTH sides to the legacy dense env-mat path via force_legacy_descriptor=True
(monkeypatch descriptor.uses_graph_lower -> False, killing both the default-flip
and the _call_graph_adapter), so this stays a true compile-correctness check on
the path it actually compiles. Compiling the GRAPH lower so eager==compiled is
tracked for PR-B.
---
 source/tests/pt_expt/test_training.py | 41 +++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/source/tests/pt_expt/test_training.py b/source/tests/pt_expt/test_training.py
index 6e3f0b97a7..45061c084a 100644
--- a/source/tests/pt_expt/test_training.py
+++ b/source/tests/pt_expt/test_training.py
@@ -1352,7 +1352,9 @@ def _make_varying_config(
         config = normalize(config)
         return config
 
-    def _check_varying_natoms(self, descriptor: dict | None = None) -> None:
+    def _check_varying_natoms(
+        self, descriptor: dict | None = None, force_legacy_descriptor: bool = False
+    ) -> None:
         """Per-step compiled-vs-uncompiled comparison for the given descriptor.
 
         The loss config has ``start_pref_f=1000`` and ``start_pref_v=1.0``,
@@ -1367,6 +1369,18 @@ def _check_varying_natoms(self, descriptor: dict | None = None) -> None:
         ``atol=rtol=1e-10`` tolerance; if a descriptor's compiled path
         cannot meet that on float64 the descriptor has a real numerical
         problem (see the DPA1 limitation note where this happened).
+
+        ``force_legacy_descriptor`` makes a graph-eligible descriptor (dpa1
+        ``attn_layer==0``) take the legacy *dense* (env-mat) path on BOTH the
+        compiled and uncompiled sides, so this stays a true compile-correctness
+        check (same computation, compiled vs eager).  The pt_expt eager default
+        for such a descriptor is the carry-all GRAPH forward while the compiled
+        ``forward_lower`` is the sel-capped DENSE forward; those are two
+        *different* force computations whose parameter gradients agree only to
+        fp64 accumulation (~1e-12), which the optimizer then amplifies into a
+        diverging training trajectory.  Compiling the GRAPH lower (so
+        eager==compiled) is tracked for PR-B; until then this test exercises the
+        dense path it actually compiles.
         """
         from deepmd.pt_expt.train.training import (
             _CompiledModel,
@@ -1386,6 +1400,16 @@ def _check_varying_natoms(self, descriptor: dict | None = None) -> None:
                 compiled_model = trainer_c.wrapper.model["Default"]
                 self.assertIsInstance(compiled_model, _CompiledModel)
 
+                if force_legacy_descriptor:
+                    # Pin BOTH sides to the legacy dense (env-mat) path so the
+                    # uncompiled reference matches the dense ``forward_lower``
+                    # that gets compiled (must happen before the first forward,
+                    # i.e. before the lazy compile trace).  See the docstring /
+                    # PR-B note: the graph forward vs dense forward differ in the
+                    # backward at fp64 precision, which the optimizer amplifies.
+                    for _m in (trainer_uc.model, compiled_model.original_model):
+                        _m.get_descriptor().uses_graph_lower = lambda: False
+
                 # Sync weights so predictions can be compared exactly
                 compiled_model.original_model.load_state_dict(
                     trainer_uc.model.state_dict()
@@ -1458,14 +1482,25 @@ def test_compiled_matches_uncompiled_varying_natoms_dpa3(self) -> None:
         self._check_varying_natoms(_DESCRIPTOR_DPA3)
 
     def test_compiled_matches_uncompiled_varying_natoms_dpa1_no_attn(self) -> None:
-        """DPA1 (attn_layer=0): compiled vs uncompiled match.
+        """DPA1 (attn_layer=0): compiled vs uncompiled match (dense path).
+
+        ``force_legacy_descriptor=True`` pins both sides to the legacy dense
+        (env-mat) forward -- the path the compiled ``forward_lower`` actually
+        uses.  The pt_expt eager default for dpa1(attn_layer=0) is the carry-all
+        GRAPH forward, a *different* force computation from the compiled dense
+        forward; their backward gradients agree only to fp64 accumulation, which
+        the optimizer amplifies, so comparing graph-vs-dense through training is
+        ill-posed.  Making the compiled path the GRAPH lower (eager==compiled)
+        is tracked for PR-B (graph .pt2/export).
 
         DPA1 with attention layers is intentionally not covered: the
         compiled se_atten path is hardware-sensitive on multi-threaded
         CPUs (parallel reduction order diverges from eager above the
         1e-10 tolerance).  ``_compile_model`` warns the user instead.
         """
-        self._check_varying_natoms(_DESCRIPTOR_DPA1_NO_ATTN)
+        self._check_varying_natoms(
+            _DESCRIPTOR_DPA1_NO_ATTN, force_legacy_descriptor=True
+        )
 
     def test_compile_warns_dpa1_with_attention(self) -> None:
         """DPA1 (attn_layer>0) under compile must emit a warning.

From 8d21609a510f01e5db00f52c5e804fd001997a3d Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 28 Jun 2026 01:45:47 +0800
Subject: [PATCH 65/69] docs(dpmodel,pt_expt): conform graph-lower docstrings
 to NumPy convention

Add the missing Parameters/Returns sections (and fill incomplete ones) on the
NeighborGraph / graph-lower functions so they match the package numpydoc style:

- dpa1: _call_graph_adapter, _call_dense (Parameters+Returns)
- general_fitting.call_graph: add missing g2, h2 params
- neighbor_graph: pad_and_guard_edges, node_validity_mask (Parameters+Returns);
  from_dense_quartet, build_neighbor_graph_ase (Returns); edge_force_virial
  (add g_e/edge_vec/edge_index/edge_mask params)
- dpmodel/pt_expt make_model: _resolve_graph_method, _call_common_graph
  (Parameters+Returns); call_common_lower_graph (replace "Parameters mirror ..."
  cross-ref with an explicit Parameters section)
- pt_expt edge_transform_output: edge_energy_deriv (Parameters+Returns);
  fit_output_to_model_output_graph (Returns)

Docstring-only; no behavior change.
---
 deepmd/dpmodel/descriptor/dpa1.py             | 49 ++++++++++++++
 deepmd/dpmodel/fitting/general_fitting.py     |  6 ++
 deepmd/dpmodel/model/make_model.py            | 64 +++++++++++++++++--
 .../utils/neighbor_graph/ase_builder.py       |  7 ++
 .../dpmodel/utils/neighbor_graph/builder.py   |  7 ++
 .../utils/neighbor_graph/derivatives.py       |  9 +++
 deepmd/dpmodel/utils/neighbor_graph/graph.py  | 41 ++++++++++--
 deepmd/pt_expt/model/edge_transform_output.py | 39 ++++++++++-
 deepmd/pt_expt/model/make_model.py            | 43 +++++++++++--
 9 files changed, 249 insertions(+), 16 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index eb94b148cc..27e2d68bfc 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -584,6 +584,32 @@ def _call_graph_adapter(
         ``sw``. Preserves the dense 5-tuple ABI exactly; masked invalid edges
         contribute zero in ``call_graph``'s ``segment_sum`` so the output is
         identical to the legacy dense body.
+
+        Parameters
+        ----------
+        coord_ext
+            The extended coordinates of atoms. shape: nf x (nall x 3)
+        atype_ext
+            The extended atom types. shape: nf x nall
+        nlist
+            The neighbor list. shape: nf x nloc x nnei
+        mapping
+            The index mapping from extended to local region. shape: nf x nall.
+            ``None`` is allowed only when nall == nloc (identity mapping).
+
+        Returns
+        -------
+        descriptor
+            The descriptor. shape: nf x nloc x (ng x axis_neuron)
+        gr
+            The rotationally equivariant single-particle representation.
+            shape: nf x nloc x ng x 3
+        g2
+            ``None`` for this descriptor.
+        h2
+            ``None`` for this descriptor.
+        sw
+            The smooth switch function. shape: nf x nloc x nnei x 1
         """
         from deepmd.dpmodel.utils.neighbor_graph import (
             from_dense_quartet,
@@ -639,6 +665,29 @@ def _call_dense(
     ) -> Array:
         """Legacy dense descriptor body (the ineligible ``call`` path: attention,
         strip tebd, exclude_types, or the no-mapping ghost case).
+
+        Parameters
+        ----------
+        coord_ext
+            The extended coordinates of atoms. shape: nf x (nall x 3)
+        atype_ext
+            The extended atom types. shape: nf x nall
+        nlist
+            The neighbor list. shape: nf x nloc x nnei
+
+        Returns
+        -------
+        descriptor
+            The descriptor. shape: nf x nloc x (ng x axis_neuron)
+        gr
+            The rotationally equivariant single-particle representation.
+            shape: nf x nloc x ng x 3
+        g2
+            ``None`` for this descriptor.
+        h2
+            ``None`` for this descriptor.
+        sw
+            The smooth switch function. shape: nf x nloc x nnei x 1
         """
         xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
         nf, nloc = nlist.shape[:2]
diff --git a/deepmd/dpmodel/fitting/general_fitting.py b/deepmd/dpmodel/fitting/general_fitting.py
index 75de52470f..4734a6be66 100644
--- a/deepmd/dpmodel/fitting/general_fitting.py
+++ b/deepmd/dpmodel/fitting/general_fitting.py
@@ -815,6 +815,12 @@ def call_graph(
             the atom type. N
         gr
             equivariant single-particle representation. N x ng x 3
+        g2
+            the rotationally invariant pair-particle representation.
+            unused by this fitting; passed through to the dense call.
+        h2
+            the rotationally equivariant pair-particle representation.
+            unused by this fitting; passed through to the dense call.
         fparam
             NODE-level frame parameter (already gathered by frame_id). N x nfp
         aparam
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 2e98f1e9aa..50c140005a 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -368,6 +368,18 @@ def _resolve_graph_method(
             pt_expt OVERRIDES this so ``None`` defaults graph-eligible mixed_types
             descriptors to the carry-all graph (decision #17) -- pt_expt has the
             autograd ``forward_common_lower_graph`` that produces force/virial.
+
+            Parameters
+            ----------
+            neighbor_graph_method
+                The user-requested method: ``None`` (default), ``"legacy"``
+                (force dense), or ``"dense"``/``"ase"`` (force the graph builder).
+
+            Returns
+            -------
+            method
+                The resolved method passed to :meth:`_call_common_graph`, or
+                ``None`` to take the dense path.
             """
             if neighbor_graph_method == "legacy":
                 return None
@@ -387,11 +399,32 @@ def _call_common_graph(
 
             Builds a carry-all :class:`NeighborGraph` from ``cc``/``atype``/``bb``
             and routes the forward through the OUTPUT-AGNOSTIC
-            :meth:`call_lower_graph`. The returned dict mirrors the dense
-            ``call_common`` keys (``<var>`` per-atom, ``<var>_redu`` reduced,
-            derivative name-holders ``None``, plus ``mask``). Input type-casting
-            is done by the caller; output type-casting is also applied by the
+            :meth:`call_lower_graph`. Input/output type-casting is done by the
             caller.
+
+            Parameters
+            ----------
+            cc
+                coordinates. nf x nloc x 3 (or nf x (nloc x 3))
+            atype
+                the atom types. nf x nloc
+            bb
+                the simulation cell. nf x 3 x 3, or ``None`` for non-periodic.
+            fp
+                the frame parameter. nf x ndf
+            ap
+                the atomic parameter. nf x nloc x nda
+            method
+                the carry-all builder, ``"dense"`` or ``"ase"``.
+            do_atomic_virial
+                whether to calculate the atomic virial.
+
+            Returns
+            -------
+            model_predict
+                the standard model dict mirroring the dense ``call_common`` keys
+                (``<var>`` per-atom, ``<var>_redu`` reduced, derivative
+                name-holders ``None``, plus the int ``mask``).
             """
             descriptor = getattr(self.atomic_model, "descriptor", None)
             uses_graph_lower = getattr(descriptor, "uses_graph_lower", lambda: False)
@@ -636,7 +669,28 @@ def call_common_lower_graph(
             produced by the pt_expt autograd lower. Must match the dense
             :meth:`call_common_lower` reduction on the SAME neighbor set.
 
-            Parameters mirror :meth:`forward_common_atomic_graph`.
+            Parameters
+            ----------
+            atype
+                (N,) flat LOCAL atom types, ``N == sum(n_node)``.
+            n_node
+                (nf,) per-frame local atom counts.
+            edge_index
+                (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+            edge_vec
+                (E, 3) neighbor-minus-center edge vectors.
+            edge_mask
+                (E,) boolean/0-1 valid-edge mask.
+            n_local
+                Per-rank local atom counts for multi-rank inference. Ignored in
+                PR-A (single-rank); accepted for ABI stability.
+            fparam
+                Frame parameter, ``(nf, ndf)``.
+            aparam
+                Atomic parameter, ``(N, nda)``.
+            comm_dict
+                MPI communication metadata. Ignored in PR-A; accepted for ABI
+                stability.
 
             Returns
             -------
diff --git a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
index 2d7543a0d1..bc7312fcab 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/ase_builder.py
@@ -66,6 +66,13 @@ def build_neighbor_graph_ase(
     layout
         edge-axis length policy; ``None`` => dynamic (torch) with ``min_edges`` guards.
 
+    Returns
+    -------
+    graph
+        The carry-all :class:`NeighborGraph` over the LOCAL atoms
+        (``n_node = nloc`` per frame), with ``edge_vec`` recomputed
+        differentiably from ``coord``/``box``.
+
     Raises
     ------
     ImportError
diff --git a/deepmd/dpmodel/utils/neighbor_graph/builder.py b/deepmd/dpmodel/utils/neighbor_graph/builder.py
index a0343ecf68..003911598a 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/builder.py
@@ -95,6 +95,13 @@ def from_dense_quartet(
         ``src`` pointing at the center (in-range, masked) -- so no ``nonzero`` is
         used and the converter is jit/export-traceable. The masked edges contribute
         zero in a downstream ``segment_sum``, so the descriptor output is unchanged.
+
+    Returns
+    -------
+    graph
+        The :class:`NeighborGraph` over the LOCAL atoms (``n_node = nloc`` per
+        frame): ``edge_index`` ``[src, dst]`` in local indices, ``edge_vec`` the
+        neighbor-minus-center displacement, and ``edge_mask`` flagging real edges.
     """
     if layout is None:
         layout = GraphLayout()
diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index a7c73eaaf5..494e97a0c9 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -49,6 +49,15 @@ def edge_force_virial(
 
     Parameters
     ----------
+    g_e
+        (E, 3) per-edge gradient ``dE/d(edge_vec)``.
+    edge_vec
+        (E, 3) per-edge displacement ``r_src - r_dst``; padding edges are zero.
+    edge_index
+        (2, E) ``[src, dst]`` node endpoints of each edge.
+    edge_mask
+        (E,) boolean valid-edge mask; padding/guard edges (``False``) are zeroed
+        before any scatter.
     n_node
         (nf,) per-frame REAL node counts. Real nodes occupy the compact prefix
         ``[0, sum(n_node))`` frame-major; ``nf = n_node.shape[0]``.
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index 4487d910b2..4b2f1e9ab3 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -75,11 +75,32 @@ def pad_and_guard_edges(
     """Append padding/guard edges as a contiguous suffix and build edge_mask.
 
     Real edges (``edge_index``/``edge_vec``) stay at the front (compact layout).
-    - ``capacity is None`` (torch dynamic): append exactly ``min_edges`` masked
-      dummy edges so the edge axis has a known lower bound and shape-stable
-      guards for export.
-    - ``capacity`` set (jax static): pad to ``E_max = capacity``; raise on overflow.
     Dummy edges point at node ``pad_value`` (in-range) with zero ``edge_vec``.
+
+    Parameters
+    ----------
+    edge_index
+        (2, E_real) ``[src, dst]`` node endpoints of the real edges.
+    edge_vec
+        (E_real, 3) per-edge displacement of the real edges.
+    capacity
+        Target edge-axis length ``E_max``. ``None`` (torch dynamic) appends
+        exactly ``min_edges`` masked dummy edges so the axis has a known lower
+        bound and shape-stable guards for export; an int (jax static) pads to
+        ``E_max = capacity`` and raises ``ValueError`` on overflow.
+    min_edges
+        Number of dummy edges appended when ``capacity is None``.
+    pad_value
+        Node index the dummy edges point at (must be in range).
+
+    Returns
+    -------
+    edge_index
+        (2, target) padded edge endpoints.
+    edge_vec
+        (target, 3) padded edge displacements (dummy rows zero).
+    edge_mask
+        (target,) boolean mask, ``True`` for the real-edge prefix.
     """
     xp = array_api_compat.array_namespace(edge_index)
     dev = array_api_compat.device(edge_index)
@@ -132,6 +153,18 @@ def node_validity_mask(n_node: Array, n_total: int) -> Array:
 
     Compact-prefix layout: the first ``sum(n_node)`` nodes are real, the rest
     are padding. jit-safe (no Python ``int`` cast on the traced sum).
+
+    Parameters
+    ----------
+    n_node
+        (nf,) per-frame REAL node counts.
+    n_total
+        Size of the (possibly padded) flat node axis ``N``.
+
+    Returns
+    -------
+    mask
+        (n_total,) boolean mask, ``True`` for the real-node compact prefix.
     """
     xp = array_api_compat.array_namespace(n_node)
     idx = xp.arange(n_total, dtype=n_node.dtype, device=array_api_compat.device(n_node))
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index a2c6b2144d..565e155157 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -35,8 +35,33 @@ def edge_energy_deriv(
     """Return (force, atom_virial_or_None, virial) from a graph energy.
 
     g_e = dE/d(edge_vec) via one torch.autograd.grad, then the shared
-    edge_force_virial scatter. ``virial`` (per-frame) is always computed;
-    ``atom_virial`` is materialized only when do_atomic_virial=True.
+    edge_force_virial scatter.
+
+    Parameters
+    ----------
+    energy
+        the reduced per-frame energy to differentiate. ``(nf,)`` (or scalar).
+    edge_vec
+        (E, 3) per-edge displacement; the autograd leaf of ``energy``.
+    edge_index
+        (2, E) ``[src, dst]`` edge endpoints.
+    edge_mask
+        (E,) valid-edge mask.
+    n_node
+        (nf,) per-frame node counts.
+    do_atomic_virial
+        whether to materialize the per-atom virial (else ``None`` is returned).
+    create_graph
+        whether the backward retains a graph (training, for second-order grad).
+
+    Returns
+    -------
+    force
+        (N, 3) per-node force.
+    atom_virial
+        (N, 3, 3) per-node virial when ``do_atomic_virial`` else ``None``.
+    virial
+        (nf, 3, 3) per-frame virial (always computed).
     """
     (g_e,) = torch.autograd.grad(
         energy.sum() if energy.dim() else energy,
@@ -90,6 +115,16 @@ def fit_output_to_model_output_graph(
         Whether the backward retains a graph (training).
     mask
         (N,) flat realness mask; used only for intensive-output reduction.
+
+    Returns
+    -------
+    model_ret
+        ``fit_ret`` plus, for each reducible key, the per-frame reduction
+        ``<var>_redu`` ``(nf, *shape)`` and -- for ``r_differentiable`` keys --
+        the FLAT per-atom force ``<var>_derv_r`` ``(N, *shape, 3)``, the
+        per-frame virial ``<var>_derv_c_redu`` ``(nf, *shape, 9)``, and (when
+        ``do_atomic_virial``) the per-atom virial ``<var>_derv_c``
+        ``(N, *shape, 9)``.
     """
     edge_vec = graph.edge_vec
     edge_index = graph.edge_index
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index bb8a6b6b70..517086f200 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -375,6 +375,18 @@ def _resolve_graph_method(
             pt_expt has the autograd ``forward_common_lower_graph`` that produces
             force/virial on the graph, so the graph can be the DEFAULT here.
             ``"legacy"`` forces dense; explicit ``"dense"``/``"ase"`` force the graph.
+
+            Parameters
+            ----------
+            neighbor_graph_method
+                The user-requested method: ``None`` (default-flip), ``"legacy"``
+                (force dense), or ``"dense"``/``"ase"`` (force the graph builder).
+
+            Returns
+            -------
+            method
+                The resolved method passed to :meth:`_call_common_graph`, or
+                ``None`` to take the dense path.
             """
             if neighbor_graph_method == "legacy":
                 return None
@@ -402,11 +414,32 @@ def _call_common_graph(
             Builds the carry-all :class:`NeighborGraph` in TORCH (the array-API
             builder runs natively and yields a differentiable ``edge_vec``), then
             routes through :meth:`forward_common_lower_graph` so force / virial /
-            (optional) atom-virial are produced via autograd.  The returned dict
-            uses the SAME internal key names as the legacy dense
-            :meth:`call_common` output (``energy``, ``energy_redu``,
-            ``energy_derv_r``, ``energy_derv_c_redu``, and ``energy_derv_c`` when
-            ``do_atomic_virial``).
+            (optional) atom-virial are produced via autograd.
+
+            Parameters
+            ----------
+            cc
+                coordinates. nf x nloc x 3 (or nf x (nloc x 3))
+            atype
+                the atom types. nf x nloc
+            bb
+                the simulation cell. nf x 3 x 3, or ``None`` for non-periodic.
+            fp
+                the frame parameter. nf x ndf
+            ap
+                the atomic parameter. nf x nloc x nda
+            method
+                the carry-all builder, ``"dense"`` or ``"ase"``.
+            do_atomic_virial
+                whether to calculate the atomic virial.
+
+            Returns
+            -------
+            model_predict
+                the standard model dict using the SAME internal key names as the
+                legacy dense :meth:`call_common` output (``energy``,
+                ``energy_redu``, ``energy_derv_r``, ``energy_derv_c_redu``, and
+                ``energy_derv_c`` when ``do_atomic_virial``).
             """
             from deepmd.dpmodel.utils.neighbor_graph import (
                 build_neighbor_graph,

From 5271edb088f2de8e9ee2e6060f98079a9682be24 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 28 Jun 2026 15:58:57 +0800
Subject: [PATCH 66/69] fix(dpmodel,pt_expt): address iProzd review on #5583

- call_common: an explicit `neighbor_list` (a dense-nlist strategy) is no longer
  silently ignored by the graph default. Raise on `neighbor_list` + an explicit
  graph `neighbor_graph_method` ("dense"/"ase"); otherwise (`None` or "legacy")
  honor the nlist by taking the dense route.
- frame_id_from_n_node: accept an optional static `n_total` (jax/export
  trace-friendly, avoids `int(sum(n_node))`); clamp padding nodes to the last
  frame so a padded node axis stays in range for segment_sum.
- thread `charge_spin` (accept-for-ABI-stability, like comm_dict/n_local)
  through the graph interface: forward_atomic_graph, forward_common_atomic_graph,
  call_common_lower_graph, forward_common_lower_graph.
- docs: list neighbor_graph_method options one per line incl. "legacy", clarify
  "dense"/"ase" are carry-all GRAPH builders (not the dense nlist lower);
  contrast from_dense_quartet (legacy-quartet adapter, keeps sel truncation) vs
  the carry-all builders.

Tests: neighbor_list conflict-raise + dense-route fallback; frame_id static
n_total (exact + padded).
---
 .../dpmodel/atomic_model/base_atomic_model.py | 10 ++-
 .../dpmodel/atomic_model/dp_atomic_model.py   |  4 ++
 deepmd/dpmodel/model/make_model.py            | 66 ++++++++++++++-----
 .../dpmodel/utils/neighbor_graph/builder.py   |  5 +-
 deepmd/dpmodel/utils/neighbor_graph/graph.py  | 22 +++++--
 deepmd/pt_expt/model/make_model.py            |  5 ++
 .../dpmodel/test_dpa1_graph_model_energy.py   | 49 ++++++++++++++
 .../tests/common/dpmodel/test_graph_ragged.py | 18 +++++
 8 files changed, 157 insertions(+), 22 deletions(-)

diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py
index 610218a0bf..bf41735f89 100644
--- a/deepmd/dpmodel/atomic_model/base_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py
@@ -319,6 +319,7 @@ def forward_common_atomic_graph(
         atype: Array,
         fparam: Array | None = None,
         aparam: Array | None = None,
+        charge_spin: Array | None = None,
     ) -> dict:
         """Graph analogue of :meth:`forward_common_atomic` on the flat node axis.
 
@@ -341,6 +342,9 @@ def forward_common_atomic_graph(
             frame parameter. nf x ndf
         aparam
             atomic parameter. N x nda
+        charge_spin
+            charge/spin conditioning. Unused by the dpa1 graph path; accepted so
+            the interface stays stable for charge/spin-conditioned descriptors.
 
         Returns
         -------
@@ -361,7 +365,11 @@ def forward_common_atomic_graph(
                 edge_mask=graph.edge_mask * xp.astype(keep, graph.edge_mask.dtype),
             )
         ret_dict = self.forward_atomic_graph(
-            graph, atype_clamped, fparam=fparam, aparam=aparam
+            graph,
+            atype_clamped,
+            fparam=fparam,
+            aparam=aparam,
+            charge_spin=charge_spin,
         )
         return self._finalize_atomic_ret(ret_dict, atom_mask, atype)
 
diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
index d17adf2821..440eb75284 100644
--- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py
+++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py
@@ -260,6 +260,7 @@ def forward_atomic_graph(
         atype: Array,
         fparam: Array | None = None,
         aparam: Array | None = None,
+        charge_spin: Array | None = None,
     ) -> dict[str, Array]:
         """Graph analogue of :meth:`forward_atomic` on the flat node axis.
 
@@ -278,6 +279,9 @@ def forward_atomic_graph(
             frame parameter. nf x ndf
         aparam
             atomic parameter. N x nda
+        charge_spin
+            charge/spin conditioning. Unused by the dpa1 graph path; accepted so
+            the interface stays stable for charge/spin-conditioned descriptors.
 
         Returns
         -------
diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py
index 50c140005a..24cba63558 100644
--- a/deepmd/dpmodel/model/make_model.py
+++ b/deepmd/dpmodel/model/make_model.py
@@ -290,20 +290,33 @@ def call_common(
                 The coordinates correction for virial.
                 shape: nf x (nloc x 3)
             neighbor_list
-                The neighbor-list construction strategy.  ``None`` uses the
-                default all-pairs builder; an alternative strategy (e.g. an O(N)
-                cell list) may be injected to speed up neighbor-list construction
-                without changing model outputs.
+                Neighbor-list construction strategy for the DENSE-nlist path
+                only.  ``None`` uses the default all-pairs builder; an
+                alternative strategy (e.g. an O(N) cell list) may be injected to
+                speed up nlist construction without changing model outputs.  It
+                is consumed by the dense lower; supplying it forces the dense
+                route (see below) and it is rejected together with a graph
+                ``neighbor_graph_method`` (``"dense"`` or ``"ase"``).
             neighbor_graph_method
-                Opt-in CARRY-ALL graph energy forward (Option B). ``None``
-                (default) keeps the existing dense nlist path UNCHANGED. When
-                set to ``"dense"`` (in-tree all-pairs search) or ``"ase"``
-                (O(N) ASE cell list), the model builds a carry-all
-                :class:`NeighborGraph` and routes the ENERGY forward through
-                :meth:`call_lower_graph`. Requires a ``mixed_types`` descriptor
-                with a graph lower (dpa1 ``attn_layer == 0``). At non-binding
-                ``sel`` this matches the dense path exactly; at binding ``sel``
-                the carry-all graph keeps neighbors the dense path truncates, so
+                Selects the lower the model routes through.  The option strings
+                refer to the neighbor-GRAPH builder, NOT the legacy dense nlist:
+
+                - ``None`` -- default.  dpmodel/jax keep the dense nlist path;
+                  pt_expt default-flips graph-eligible mixed_types descriptors to
+                  the carry-all graph (decision #17).
+                - ``"legacy"`` -- force the dense nlist path (opt out of the
+                  default-flip).
+                - ``"dense"`` -- build a carry-all :class:`NeighborGraph` with the
+                  in-tree O(N^2) ALL-PAIRS search (this is NOT the dense nlist
+                  lower; "dense" = the all-pairs graph builder).
+                - ``"ase"`` -- build the carry-all graph with the O(N) ASE cell
+                  list.
+
+                The graph routes (``"dense"``/``"ase"``, and the pt_expt
+                default-flip) require a ``mixed_types`` descriptor with a graph
+                lower (dpa1 ``attn_layer == 0``).  At non-binding ``sel`` the
+                graph matches the dense path exactly; at binding ``sel`` the
+                carry-all graph keeps neighbors the dense path truncates, so the
                 energy intentionally differs.
 
             Returns
@@ -318,6 +331,18 @@ def call_common(
             )
             del coord, box, fparam, aparam, charge_spin
             graph_method = self._resolve_graph_method(neighbor_graph_method)
+            # ``neighbor_list`` is a DENSE-nlist strategy; the graph path cannot
+            # consume it. Reject an explicit graph+nlist combination, and
+            # otherwise honor the supplied nlist by taking the dense route
+            # (don't let the pt_expt default-flip silently ignore it).
+            if neighbor_list is not None:
+                if neighbor_graph_method not in (None, "legacy"):
+                    raise ValueError(
+                        "neighbor_list is a dense-nlist strategy and cannot be "
+                        f"combined with neighbor_graph_method={neighbor_graph_method!r}; "
+                        "pass one or the other"
+                    )
+                graph_method = None
             # the graph lower does not consume charge_spin yet -> keep those
             # models on dense (a None check, so it stays jit/export-safe)
             if cs is not None:
@@ -588,6 +613,7 @@ def forward_common_atomic_graph(
             fparam: Array | None = None,
             aparam: Array | None = None,
             comm_dict: dict | None = None,
+            charge_spin: Array | None = None,
         ) -> dict[str, Array]:
             """Model-level graph forward (no type cast). Analogue of the dense
             :meth:`forward_common_atomic`.
@@ -622,6 +648,9 @@ def forward_common_atomic_graph(
             comm_dict
                 MPI communication metadata. Ignored in PR-A; accepted for ABI
                 stability.
+            charge_spin
+                charge/spin conditioning. Ignored in PR-A; accepted for ABI
+                stability with charge/spin-conditioned descriptors.
 
             Returns
             -------
@@ -637,7 +666,7 @@ def forward_common_atomic_graph(
                 edge_mask=edge_mask,
             )
             atomic_ret = self.atomic_model.forward_common_atomic_graph(
-                graph, atype, fparam=fparam, aparam=aparam
+                graph, atype, fparam=fparam, aparam=aparam, charge_spin=charge_spin
             )
             return fit_output_to_model_output_graph(
                 atomic_ret,
@@ -657,6 +686,7 @@ def call_common_lower_graph(
             fparam: Array | None = None,
             aparam: Array | None = None,
             comm_dict: dict | None = None,
+            charge_spin: Array | None = None,
         ) -> dict[str, Array]:
             """Graph-native PUBLIC lower (PR-A: dpa1 ``attn_layer == 0``).
 
@@ -691,14 +721,17 @@ def call_common_lower_graph(
             comm_dict
                 MPI communication metadata. Ignored in PR-A; accepted for ABI
                 stability.
+            charge_spin
+                charge/spin conditioning. Ignored in PR-A; accepted for ABI
+                stability with charge/spin-conditioned descriptors.
 
             Returns
             -------
             dict
                 The standard model dict in the INPUT precision.
             """
-            edge_vec, _, fparam, aparam, _, input_prec = self._input_type_cast(
-                edge_vec, fparam=fparam, aparam=aparam
+            edge_vec, _, fparam, aparam, cs, input_prec = self._input_type_cast(
+                edge_vec, fparam=fparam, aparam=aparam, charge_spin=charge_spin
             )
             model_predict = self.forward_common_atomic_graph(
                 atype,
@@ -710,6 +743,7 @@ def call_common_lower_graph(
                 fparam=fparam,
                 aparam=aparam,
                 comm_dict=comm_dict,
+                charge_spin=cs,
             )
             model_predict = self._output_type_cast(model_predict, input_prec)
             return model_predict
diff --git a/deepmd/dpmodel/utils/neighbor_graph/builder.py b/deepmd/dpmodel/utils/neighbor_graph/builder.py
index 003911598a..71ca699e1b 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/builder.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/builder.py
@@ -63,7 +63,10 @@ def from_dense_quartet(
     This is a backward-compat CONVERTER (World 1 -> graph): it performs NO neighbor
     search and INHERITS the ``sel`` truncation already baked into ``nlist``. Use it
     only when a caller (an MD code, or the legacy dense path) already holds a
-    built quartet; for the carry-all graph use :func:`build_neighbor_graph`.
+    built quartet. In contrast, the carry-all graph builders search from RAW
+    coordinates and apply NO ``sel`` truncation: :func:`build_neighbor_graph`
+    (the ``neighbor_graph_method="dense"`` all-pairs route) and
+    :func:`build_neighbor_graph_ase` (the ``"ase"`` O(N) cell-list route).
 
     For each valid neighbor slot it emits one edge with ``src = mapping[neighbor]``
     (the neighbor's LOCAL owner -> ghost-free index), ``dst = center`` (local), and
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index 4b2f1e9ab3..e527a84bf0 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -123,7 +123,7 @@ def pad_and_guard_edges(
     return ei, ev, edge_mask
 
 
-def frame_id_from_n_node(n_node: Array) -> Array:
+def frame_id_from_n_node(n_node: Array, n_total: int | None = None) -> Array:
     """Node->frame map for a flat node axis: ``repeat(arange(nf), n_node)``.
 
     Implemented via ``searchsorted(cumulative_sum(n_node), arange(N), side="right")``
@@ -133,19 +133,33 @@ def frame_id_from_n_node(n_node: Array) -> Array:
     ----------
     n_node
         Per-frame node counts.  Shape ``(nf,)``.
+    n_total
+        Size of the (possibly padded) flat node axis ``N``.  ``None`` (the
+        numpy/eager default) falls back to ``int(sum(n_node))``; pass a STATIC
+        value to keep the function trace-friendly under jax.jit / export, where
+        ``int()`` on the traced sum is not allowed (mirrors
+        :func:`node_validity_mask`).  Padding nodes ``[sum(n_node), n_total)``
+        are CLAMPED to the last frame (``nf - 1``) so a downstream
+        ``segment_sum(..., num_segments=nf)`` stays in range; they carry no real
+        edge, so this assignment is unused downstream.
 
     Returns
     -------
     frame_id
         Frame index of each flat node, compact-prefix frame-major.
-        Shape ``(N,)`` int64, where ``N = sum(n_node)``.
+        Shape ``(n_total,)`` int64 (``n_total = sum(n_node)`` when not padded).
     """
     xp = array_api_compat.array_namespace(n_node)
     dev = array_api_compat.device(n_node)
-    n_total = int(xp.sum(n_node))
+    if n_total is None:
+        n_total = int(xp.sum(n_node))
+    nf = n_node.shape[0]
     idx = xp.arange(n_total, dtype=n_node.dtype, device=dev)
     boundaries = xp.cumulative_sum(n_node)  # (nf,) upper bounds, exclusive
-    return xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64)
+    frame_id = xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64)
+    # padding nodes (idx >= sum(n_node)) land at frame ``nf`` (OOB); clamp them to
+    # the last real frame so the per-frame scatter never indexes out of range.
+    return xp.minimum(frame_id, xp.asarray(nf - 1, dtype=xp.int64, device=dev))
 
 
 def node_validity_mask(n_node: Array, n_total: int) -> Array:
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 517086f200..50ede240e4 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -290,6 +290,7 @@ def forward_common_lower_graph(
             do_atomic_virial: bool = False,
             fparam: torch.Tensor | None = None,
             aparam: torch.Tensor | None = None,
+            charge_spin: torch.Tensor | None = None,
         ) -> dict[str, torch.Tensor]:
             """Graph-native lower with autograd force/virial (PR-A: dpa1 ``attn_layer==0``).
 
@@ -328,6 +329,9 @@ def forward_common_lower_graph(
                 Frame parameter, ``(nf, ndf)``.
             aparam
                 Atomic parameter, ``(nf, nloc, nda)``.
+            charge_spin
+                charge/spin conditioning. Ignored in PR-A; accepted for ABI
+                stability with charge/spin-conditioned descriptors.
 
             Returns
             -------
@@ -355,6 +359,7 @@ def forward_common_lower_graph(
                 atype,
                 fparam=fparam,
                 aparam=aparam,
+                charge_spin=charge_spin,
             )
             # ``forward_common_atomic_graph`` returns flat ``(N, *)`` output.
             # Pass directly to the flat-N transform; no rectangular reshape needed.
diff --git a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
index d1604c08b7..91507bb0d8 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
@@ -173,3 +173,52 @@ def test_binding_sel_carries_more_than_dense() -> None:
     graph = model.call_common(coord, atype, box, neighbor_graph_method="dense")
 
     assert not np.allclose(graph["energy_redu"], dense["energy_redu"])
+
+
+def test_neighbor_list_conflicts_with_graph_method() -> None:
+    """An explicit ``neighbor_list`` (a dense-nlist strategy) cannot be combined
+    with an explicit graph ``neighbor_graph_method``; passing both raises.
+    """
+    from deepmd.dpmodel.utils.default_neighbor_list import (
+        DefaultNeighborList,
+    )
+
+    rng = np.random.default_rng(2)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+    model = _make_model([200])
+
+    with pytest.raises(ValueError, match="cannot be combined"):
+        model.call_common(
+            coord,
+            atype,
+            None,
+            neighbor_list=DefaultNeighborList(),
+            neighbor_graph_method="dense",
+        )
+
+
+def test_neighbor_list_takes_dense_route() -> None:
+    """Supplying ``neighbor_list`` (without an explicit graph method) takes the
+    dense route -- it is NOT silently ignored by the graph path. With the
+    default builder the result matches the legacy dense path exactly.
+    """
+    from deepmd.dpmodel.utils.default_neighbor_list import (
+        DefaultNeighborList,
+    )
+
+    rng = np.random.default_rng(3)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+    box = np.eye(3).reshape(1, 9) * 20.0
+    model = _make_model([200])
+
+    legacy = model.call_common(coord, atype, box, neighbor_graph_method="legacy")
+    with_nlist = model.call_common(
+        coord, atype, box, neighbor_list=DefaultNeighborList()
+    )
+    np.testing.assert_allclose(
+        with_nlist["energy_redu"], legacy["energy_redu"], rtol=1e-12, atol=1e-12
+    )
diff --git a/source/tests/common/dpmodel/test_graph_ragged.py b/source/tests/common/dpmodel/test_graph_ragged.py
index 1c4dca7332..a651d245ea 100644
--- a/source/tests/common/dpmodel/test_graph_ragged.py
+++ b/source/tests/common/dpmodel/test_graph_ragged.py
@@ -27,6 +27,24 @@ def test_frame_id_ragged():
     )
 
 
+def test_frame_id_static_n_total():
+    """A static ``n_total`` (jax/export trace-friendly path) matches the default
+    ``int(sum(n_node))`` path exactly when ``n_total == sum(n_node)``; a PADDED
+    ``n_total`` assigns the padding tail to the last frame.
+    """
+    n_node = np.array([3, 5, 2], dtype=np.int64)  # sum = 10
+    # exact n_total reproduces the default (None) path
+    np.testing.assert_array_equal(
+        frame_id_from_n_node(n_node, n_total=10),
+        frame_id_from_n_node(n_node),
+    )
+    # padded n_total -> padding nodes [10, 12) map to the last frame (nf-1 == 2)
+    np.testing.assert_array_equal(
+        frame_id_from_n_node(n_node, n_total=12),
+        np.array([0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64),
+    )
+
+
 def test_forward_common_atomic_graph_ragged():
     """Two frames with DIFFERENT node counts (3 and 2) share one flat node axis.
 

From 83583a31b13407b460edc6e9a0f39799955363ae Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 28 Jun 2026 16:08:06 +0800
Subject: [PATCH 67/69] test(dpmodel,pt_expt): pin dpa1 graph lower invariance
 to charge_spin

dpa1 does not consume charge_spin (get_dim_chg_spin()==0; the dense atomic model
passes None to the descriptor since add_chg_spin_ebd is False). charge_spin is
accepted on the graph lower only for ABI stability with charge/spin-conditioned
descriptors (dpa3/dpa4, PR-G). Pin that the dpa1 graph lower output is INVARIANT
to charge_spin:
- dpmodel call_common_lower_graph: energy/atom_energy/mask unchanged.
- pt_expt forward_common_lower_graph: energy/force/virial/atom_virial unchanged.

With the existing graph==dense parity at non-binding sel this gives the full
claim graph(charge_spin) == graph(None) == dense. Guards against a future
regression where charge_spin leaks into the dpa1 graph path.
---
 .../dpmodel/test_dpa1_graph_model_energy.py   | 45 +++++++++++++++++++
 .../tests/pt_expt/model/test_graph_ragged.py  | 30 +++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
index 91507bb0d8..37cb18808e 100644
--- a/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
+++ b/source/tests/common/dpmodel/test_dpa1_graph_model_energy.py
@@ -222,3 +222,48 @@ def test_neighbor_list_takes_dense_route() -> None:
     np.testing.assert_allclose(
         with_nlist["energy_redu"], legacy["energy_redu"], rtol=1e-12, atol=1e-12
     )
+
+
+def test_graph_lower_invariant_to_charge_spin() -> None:
+    """dpa1 does NOT consume charge_spin (``get_dim_chg_spin() == 0``); the dense
+    atomic model passes ``None`` to the dpa1 descriptor regardless. The graph
+    lower accepts ``charge_spin`` only for ABI stability with charge/spin
+    descriptors (dpa3/dpa4, PR-G), so its output must be INVARIANT to it.
+
+    Combined with the graph==dense parity at non-binding sel
+    (:func:`test_energy_parity_non_binding_sel`), this gives the full claim:
+    ``graph(charge_spin) == graph(None) == dense``.
+    """
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        build_neighbor_graph,
+    )
+
+    rng = np.random.default_rng(4)
+    nloc = 6
+    coord = rng.normal(size=(1, nloc, 3)) * 1.5
+    atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+    box = np.eye(3).reshape(1, 9) * 20.0
+    model = _make_model([200])
+    assert model.get_descriptor().get_dim_chg_spin() == 0  # dpa1: no chg/spin
+
+    ng = build_neighbor_graph(coord, atype, box, model.get_rcut())
+    atype_flat = atype.reshape(-1)
+    base = model.call_common_lower_graph(
+        atype_flat, ng.n_node, ng.edge_index, ng.edge_vec, ng.edge_mask
+    )
+    # arbitrary non-None charge/spin -> must NOT change the dpa1 graph output
+    cs = np.array([[1.0, 2.0]], dtype=coord.dtype)
+    with_cs = model.call_common_lower_graph(
+        atype_flat,
+        ng.n_node,
+        ng.edge_index,
+        ng.edge_vec,
+        ng.edge_mask,
+        charge_spin=cs,
+    )
+    assert set(base) == set(with_cs)
+    for k, v in base.items():
+        if v is None:
+            assert with_cs[k] is None
+        else:
+            np.testing.assert_array_equal(with_cs[k], v)
diff --git a/source/tests/pt_expt/model/test_graph_ragged.py b/source/tests/pt_expt/model/test_graph_ragged.py
index 5f78be5ff1..e6636cd742 100644
--- a/source/tests/pt_expt/model/test_graph_ragged.py
+++ b/source/tests/pt_expt/model/test_graph_ragged.py
@@ -142,3 +142,33 @@ def test_flat_atom_virial_shapes(self) -> None:
         assert ret["energy_derv_c_redu"].shape[0] == nf
         assert torch.isfinite(ret["energy_derv_c"]).all()
         assert torch.isfinite(ret["energy_derv_c_redu"]).all()
+
+    def test_invariant_to_charge_spin(self) -> None:
+        """dpa1 does NOT consume charge_spin (``get_dim_chg_spin() == 0``);
+        forward_common_lower_graph accepts it only for ABI stability with
+        charge/spin descriptors (dpa3/dpa4, PR-G), so energy / force / virial /
+        atom-virial must be INVARIANT to it.
+        """
+        assert self.model.get_descriptor().get_dim_chg_spin() == 0  # dpa1
+        args = (
+            self.atype,
+            self.n_node,
+            self.edge_index,
+            self.edge_vec,
+            self.edge_mask,
+        )
+        base = self.model.forward_common_lower_graph(*args, do_atomic_virial=True)
+        nf = int(self.n_node.shape[0])
+        # arbitrary non-None charge/spin -> must NOT change any dpa1 graph output
+        cs = torch.tensor(
+            [[1.0, 2.0]] * nf, dtype=torch.float64, device=self.device
+        )
+        with_cs = self.model.forward_common_lower_graph(
+            *args, do_atomic_virial=True, charge_spin=cs
+        )
+        assert set(base) == set(with_cs)
+        for k, v in base.items():
+            if v is None:
+                assert with_cs[k] is None
+            else:
+                torch.testing.assert_close(with_cs[k], v, rtol=1e-12, atol=1e-12)

From 52e6c8b40793f8bab60a5d83c84d1b7b8278ab91 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 28 Jun 2026 08:08:59 +0000
Subject: [PATCH 68/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/tests/pt_expt/model/test_graph_ragged.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/source/tests/pt_expt/model/test_graph_ragged.py b/source/tests/pt_expt/model/test_graph_ragged.py
index e6636cd742..efe2ffeaec 100644
--- a/source/tests/pt_expt/model/test_graph_ragged.py
+++ b/source/tests/pt_expt/model/test_graph_ragged.py
@@ -160,9 +160,7 @@ def test_invariant_to_charge_spin(self) -> None:
         base = self.model.forward_common_lower_graph(*args, do_atomic_virial=True)
         nf = int(self.n_node.shape[0])
         # arbitrary non-None charge/spin -> must NOT change any dpa1 graph output
-        cs = torch.tensor(
-            [[1.0, 2.0]] * nf, dtype=torch.float64, device=self.device
-        )
+        cs = torch.tensor([[1.0, 2.0]] * nf, dtype=torch.float64, device=self.device)
         with_cs = self.model.forward_common_lower_graph(
             *args, do_atomic_virial=True, charge_spin=cs
         )

From 3789dbd7f5bc7af54e41f55a26f9449eb5617e1e Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sun, 28 Jun 2026 20:46:41 +0800
Subject: [PATCH 69/69] test(pt_expt): drop unused N in test_dpa1_graph_lower
 (CodeQL)

CodeQL flagged the unused local `N = nf * nloc`; fold it into the comment.
---
 source/tests/pt_expt/model/test_dpa1_graph_lower.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/source/tests/pt_expt/model/test_dpa1_graph_lower.py b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
index 8001213e1c..e274a1bcec 100644
--- a/source/tests/pt_expt/model/test_dpa1_graph_lower.py
+++ b/source/tests/pt_expt/model/test_dpa1_graph_lower.py
@@ -204,9 +204,8 @@ def test_force_virial_parity_vs_legacy(self, periodic, do_av) -> None:
             do_atomic_virial=do_av,
         )
 
-        # forward_common_lower_graph returns flat (N,*) per-atom outputs.
-        # Reshape to (nf, nloc, *) for comparison against the legacy dense lower.
-        N = nf * nloc
+        # forward_common_lower_graph returns flat (N = nf * nloc, *) per-atom
+        # outputs. Reshape to (nf, nloc, *) to compare against the dense lower.
 
         # per-atom energy: flat (N, 1) -> (nf, nloc, 1)
         graph_energy = graph["energy"].reshape(nf, nloc, 1)