diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py
index 27e2d68bfc..3f49b6c04f 100644
--- a/deepmd/dpmodel/descriptor/dpa1.py
+++ b/deepmd/dpmodel/descriptor/dpa1.py
@@ -757,9 +757,13 @@ def call_graph(
         )
         # FLAT node axis (N, ...): no (nf, nloc) reshape -- ragged-native, spec.
         if self.concat_output_tebd:
-            tebd = xp.asarray(type_embedding, device=dev)
+            # Use type_embedding directly (mirrors the dense path's
+            # ``xp.take(type_embedding, ...)``): ``xp.asarray(..., device=dev)``
+            # DETACHES under torch, silently severing the type-embedding weight
+            # gradient so the tebd net never trains; type_embedding already lives
+            # on the model device, so the device cast was redundant anyway.
             atype_local = xp.asarray(atype, device=dev)
-            atype_embd = xp.take(tebd, atype_local, axis=0)  # (N, tebd_dim)
+            atype_embd = xp.take(type_embedding, atype_local, axis=0)  # (N, tebd_dim)
             grrg = xp.concat([grrg, atype_embd], axis=-1)
         return grrg, rot_mat
 
@@ -1523,7 +1527,10 @@ def call_graph(
         ss = rr[:, 0:1]  # (E, 1)
         # neighbor / center type embeddings (concat mode); ghost type == owner type
         # so gathering by the LOCAL owner (src) reproduces the dense neighbor tebd.
-        tebd = xp.asarray(type_embedding, device=dev)
+        # NB: do NOT wrap in ``xp.asarray(..., device=dev)`` -- that DETACHES under
+        # torch and severs the type-embedding weight gradient (the tebd net would
+        # never train); type_embedding already lives on the model device.
+        tebd = type_embedding
         atype_embd_nlist = xp.take(tebd, nei_type, axis=0)  # (E, tebd_dim)
         if not self.type_one_side:
             atype_embd_nnei = xp.take(tebd, center_type, axis=0)  # (E, tebd_dim)
diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
index 494e97a0c9..599877d7e2 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py
@@ -80,15 +80,42 @@ def edge_force_virial(
         frame via the frame of their ``dst`` node.
     """
     xp = array_api_compat.array_namespace(g_e)
-    # node-axis size; when a static ``node_capacity`` is supplied (the jax/export
-    # path) short-circuit so we never call int() on the traced ``sum(n_node)``.
-    n_out = int(node_capacity) if node_capacity is not None else int(xp.sum(n_node))
+    # node-axis size; when a ``node_capacity`` is supplied (the jax/export path)
+    # use it AS-IS so we never call int() on the traced ``sum(n_node)`` -- and,
+    # crucially, never on ``node_capacity`` itself: under symbolic make_fx /
+    # torch.export it is a SymInt (``atype.shape[0]``); ``int(SymInt)`` would
+    # SPECIALIZE the node axis to the trace-time sample size, baking a constant
+    # ``N`` into the scatter and breaking dynamic-``N`` inference.
+    n_out = node_capacity if node_capacity is not None else int(xp.sum(n_node))
     nf = n_node.shape[0]
     # zero padding/guard contributions; cast mask to g's dtype (array-API pure,
     # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict)
     g = g_e * xp.astype(edge_mask[:, None], g_e.dtype)
-    src = edge_index[0]
-    dst = edge_index[1]
+    # Wrap node indices into ``[0, n_out)`` so every scatter address is provably
+    # in-bounds. For a well-formed graph every real edge already has
+    # ``index < n_out`` (== ``atype.shape[0]``), so this modulo is the IDENTITY on
+    # real edges (pinned by test_modulo_clamp_leaves_real_edges_unchanged) -- a
+    # correctness-preserving guard, not a value fixup.
+    #
+    # Why it is needed (root cause, GPU-confirmed): under the dynamic-edge graph
+    # ``torch.export`` path the node count is traced as several equal-but-distinct
+    # symbols (``atype.shape[0]``, ``fit_ret.shape[0]``, ...), tied only by
+    # ``aten._assert_scalar(Eq(...))`` nodes. ``_strip_shape_assertions``
+    # (pt_expt/utils/serialization.py) neutralises ALL such asserts so export can
+    # trace -- which also drops those node-count equalities, so inductor can no
+    # longer prove the scatter index and its bound ``ks0 == n_out`` share a symbol
+    # and emits ``tl.device_assert(idx < ks0)`` (fatal on CUDA; unchecked on CPU,
+    # which is why all CPU dev/CI was green). ``% n_out`` discharges that guard
+    # unconditionally. This is the PERMANENT fix: the upstream alternative --
+    # making the SHARED, spin-export-critical ``_strip_shape_assertions``
+    # selective -- risks re-triggering the torch.export bugs it exists to bypass
+    # and the spin ``.pt2`` path, so it is deliberately NOT taken.
+    #
+    # Pure arithmetic => torch.export-safe, unlike ``xp.clip`` (SymInt bound
+    # breaks array_api_compat's clip) and unlike a mask-multiply (which misses the
+    # ``edge_mask == 1`` indices the stripped guard mis-bounds).
+    src = edge_index[0] % n_out
+    dst = edge_index[1] % n_out
     # force (output sized to the node axis, incl. any padding tail)
     force = segment_sum(g, dst, n_out) - segment_sum(g, src, n_out)
     # per-edge virial w_e[k, j] = -g_e[k] * edge_vec[j]  (broadcast, no einsum)
@@ -101,6 +128,8 @@ def edge_force_virial(
     boundaries = xp.cumulative_sum(n_node)  # (nf,) per-frame node upper bounds
     edge_frame = xp.astype(
         xp.searchsorted(boundaries, dst, side="right"), xp.int64
-    )  # (E,) in [0, nf)
+    )  # (E,) in [0, nf]
+    # wrap into [0, nf) for the same CUDA-bounds reason (export-safe modulo)
+    edge_frame = edge_frame % nf
     virial = segment_sum(w_edge, edge_frame, nf)  # (nf, 3, 3)
     return force, atom_virial, virial
diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py
index e527a84bf0..0ce10efdf6 100644
--- a/deepmd/dpmodel/utils/neighbor_graph/graph.py
+++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py
@@ -153,13 +153,18 @@ def frame_id_from_n_node(n_node: Array, n_total: int | None = None) -> Array:
     dev = array_api_compat.device(n_node)
     if n_total is None:
         n_total = int(xp.sum(n_node))
-    nf = n_node.shape[0]
     idx = xp.arange(n_total, dtype=n_node.dtype, device=dev)
     boundaries = xp.cumulative_sum(n_node)  # (nf,) upper bounds, exclusive
     frame_id = xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64)
     # padding nodes (idx >= sum(n_node)) land at frame ``nf`` (OOB); clamp them to
     # the last real frame so the per-frame scatter never indexes out of range.
-    return xp.minimum(frame_id, xp.asarray(nf - 1, dtype=xp.int64, device=dev))
+    # Derive ``nf - 1`` as a RUNTIME 0-d tensor (sum of ones over the frame axis)
+    # rather than ``xp.asarray(n_node.shape[0] - 1)``: under symbolic make_fx /
+    # torch.export, ``shape[0]`` is a SymInt and materializing it into a constant
+    # tensor SPECIALIZES the frame axis -- baking the trace-time frame count into
+    # every downstream per-frame reduction and breaking dynamic-``nf`` inference.
+    last_frame = xp.sum(xp.ones_like(n_node)) - 1  # 0-d int == nf - 1
+    return xp.minimum(frame_id, xp.astype(last_frame, xp.int64))
 
 
 def node_validity_mask(n_node: Array, n_total: int) -> Array:
diff --git a/deepmd/main.py b/deepmd/main.py
index 43f40dc214..0a4d44137a 100644
--- a/deepmd/main.py
+++ b/deepmd/main.py
@@ -350,6 +350,16 @@ def main_parser() -> argparse.ArgumentParser:
         type=str,
         help="(Supported backend: PyTorch) Task head (alias: model branch) to freeze if in multi-task mode.",
     )
+    parser_frz.add_argument(
+        "--lower-kind",
+        default="nlist",
+        type=str,
+        choices=["nlist", "graph"],
+        help="(Supported backend: PyTorch Exportable) Lower-level export form of the "
+        "frozen .pt2: 'nlist' (default, dense neighbor-list lower) or 'graph' "
+        "(NeighborGraph edge-list lower; only for graph-eligible models, currently "
+        "dpa1 with attn_layer=0). 'graph' selects the C++ graph inference path.",
+    )
 
     # * test script ********************************************************************
     parser_tst = subparsers.add_parser(
diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py
index da28229bf4..5b3b74b6bd 100644
--- a/deepmd/pt_expt/entrypoints/main.py
+++ b/deepmd/pt_expt/entrypoints/main.py
@@ -387,6 +387,7 @@ def freeze(
     model: str,
     output: str = "frozen_model.pte",
     head: str | None = None,
+    lower_kind: str = "nlist",
 ) -> None:
     """Freeze a pt_expt checkpoint into a .pte exported model.
 
@@ -398,6 +399,13 @@ def freeze(
         Path for the output .pte file.
     head : str or None
         Head to freeze in multi-task mode.
+    lower_kind : str
+        Lower-level export form: ``"nlist"`` (default, dense neighbor-list lower)
+        or ``"graph"`` (NeighborGraph edge-list lower). ``"graph"`` is only valid
+        for graph-eligible models (``mixed_types`` and ``uses_graph_lower``,
+        currently dpa1 with ``attn_layer == 0``) and selects the C++ graph
+        inference path; the per-atom virial is enabled for it (near-free in the
+        graph path: one extra scatter off the shared single backward).
     """
     import torch
 
@@ -458,12 +466,34 @@ def freeze(
         single_model_params = model_params
 
     m.eval()
+
+    # The graph lower is opt-in and only valid for graph-eligible models (dpa1
+    # attn_layer==0 today). Fail fast with a clear message rather than emitting a
+    # broken .pt2. Enable the per-atom virial for the graph form -- it is
+    # near-free there (one extra scatter off the single shared backward).
+    do_atomic_virial = False
+    if lower_kind == "graph":
+        from deepmd.pt_expt.train.training import (
+            _model_uses_graph_lower,
+        )
+
+        if not _model_uses_graph_lower(m):
+            raise ValueError(
+                "lower_kind='graph' requires a graph-eligible model "
+                "(mixed_types and a descriptor exposing uses_graph_lower()==True, "
+                "currently dpa1 with attn_layer==0). Use lower_kind='nlist' for "
+                "this model."
+            )
+        do_atomic_virial = True
+
     model_dict_serialized = m.serialize()
     deserialize_to_file(
         output,
         {"model": model_dict_serialized, "model_def_script": single_model_params},
+        do_atomic_virial=do_atomic_virial,
+        lower_kind=lower_kind,
     )
-    log.info("Saved frozen model to %s", output)
+    log.info("Saved frozen model to %s (lower_kind=%s)", output, lower_kind)
 
 
 def change_bias(
@@ -701,9 +731,19 @@ def main(args: list[str] | argparse.Namespace | None = None) -> None:
                     f"Checkpoint path '{model_path}' does not exist."
                 )
             FLAGS.model = str(model_path)
+        _lower_kind = getattr(FLAGS, "lower_kind", "nlist")
         if not FLAGS.output.endswith((".pte", ".pt2")):
-            FLAGS.output = str(Path(FLAGS.output).with_suffix(".pte"))
-        freeze(model=FLAGS.model, output=FLAGS.output, head=FLAGS.head)
+            # Default suffix: .pt2 for the graph export (an AOTI .pt2 archive is
+            # what the C++ graph path consumes), .pte otherwise. Explicit user
+            # .pte / .pt2 suffixes are preserved for both.
+            _default_suffix = ".pt2" if _lower_kind == "graph" else ".pte"
+            FLAGS.output = str(Path(FLAGS.output).with_suffix(_default_suffix))
+        freeze(
+            model=FLAGS.model,
+            output=FLAGS.output,
+            head=FLAGS.head,
+            lower_kind=_lower_kind,
+        )
     elif FLAGS.command == "change-bias":
         change_bias(
             input_file=FLAGS.INPUT,
diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py
index 97bba3d4a5..e03f893040 100644
--- a/deepmd/pt_expt/infer/deep_eval.py
+++ b/deepmd/pt_expt/infer/deep_eval.py
@@ -66,6 +66,20 @@
     import ase.neighborlist
 
 
+# Public output keys emitted by the graph-form AOTI forward
+# (``forward_lower_graph_exportable``) keyed by the output-variable category that
+# ``request_defs`` carries.  The graph path is LOCAL-only (``N == sum(n_node)``
+# nodes, no ghosts), so its outputs are already at local-atom resolution -- no
+# ``communicate_extended_output`` fold-back is needed.
+_GRAPH_CATEGORY_TO_KEY = {
+    OutputVariableCategory.OUT: "atom_energy",
+    OutputVariableCategory.REDU: "energy",
+    OutputVariableCategory.DERV_R: "force",
+    OutputVariableCategory.DERV_C_REDU: "virial",
+    OutputVariableCategory.DERV_C: "atom_virial",
+}
+
+
 def _reshape_charge_spin(
     charge_spin: np.ndarray, nframes: int, dim_chg_spin: int
 ) -> np.ndarray:
@@ -1423,6 +1437,10 @@ def _eval_model(
         request_defs: list[OutputVariableDef],
         charge_spin: np.ndarray | None = None,
     ) -> tuple[np.ndarray, ...]:
+        if self.metadata.get("lower_input_kind") == "graph":
+            return self._eval_model_graph(
+                coords, cells, atom_types, fparam, aparam, request_defs, charge_spin
+            )
         model_inputs, mapping_t, nframes, natoms = self._prepare_inputs(
             coords, cells, atom_types, fparam, aparam, charge_spin
         )
@@ -1621,6 +1639,101 @@ def _eval_model_spin(
                 )
         return tuple(results)
 
+    def _eval_model_graph(
+        self,
+        coords: np.ndarray,
+        cells: np.ndarray | None,
+        atom_types: np.ndarray,
+        fparam: np.ndarray | None,
+        aparam: np.ndarray | None,
+        request_defs: list[OutputVariableDef],
+        charge_spin: np.ndarray | None = None,
+    ) -> tuple[np.ndarray, ...]:
+        """Evaluate a graph-form ``.pt2`` (``lower_input_kind == "graph"``).
+
+        Builds a carry-all :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph`
+        from the eval system at its exact (tight) edge count and feeds the
+        positional schema
+        ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
+        charge_spin)`` to the exported forward.  The AOTI artifact's edge axis
+        is DYNAMIC (B2.0), so no ``edge_capacity`` padding is needed.  The
+        forward returns the LOCAL public keys directly, so results are reshaped
+        without ``communicate_extended_output``.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph,
+        )
+        from deepmd.pt_expt.utils.env import (
+            DEVICE,
+        )
+
+        nframes = coords.shape[0]
+        if len(atom_types.shape) == 1:
+            natoms = len(atom_types)
+            atom_types = np.tile(atom_types, nframes).reshape(nframes, -1)
+        else:
+            natoms = len(atom_types[0])
+
+        coord_input = coords.reshape(nframes, natoms, 3)
+        box_input = cells.reshape(nframes, 9) if cells is not None else None
+        # Dynamic edge axis (B2.0): build the carry-all graph at its exact edge
+        # count (no static padding); the AOTI artifact accepts any E.
+        graph = build_neighbor_graph(
+            coord_input,
+            atom_types,
+            box_input,
+            self._rcut,
+        )
+
+        atype_t = torch.tensor(
+            np.asarray(atom_types).reshape(-1), dtype=torch.int64, device=DEVICE
+        )
+        n_node_t = torch.tensor(
+            np.asarray(graph.n_node), dtype=torch.int64, device=DEVICE
+        )
+        edge_index_t = torch.tensor(
+            np.asarray(graph.edge_index), dtype=torch.int64, device=DEVICE
+        )
+        edge_vec_t = torch.tensor(
+            np.asarray(graph.edge_vec), dtype=torch.float64, device=DEVICE
+        )
+        edge_mask_t = torch.tensor(
+            np.asarray(graph.edge_mask), dtype=torch.bool, device=DEVICE
+        )
+
+        fparam_t, aparam_t = self._prepare_optional_lower_inputs(
+            fparam, aparam, nframes, natoms, DEVICE
+        )
+        charge_spin_t = self._make_charge_spin_input(nframes, charge_spin)
+
+        model_inputs = (
+            atype_t,
+            n_node_t,
+            edge_index_t,
+            edge_vec_t,
+            edge_mask_t,
+            fparam_t,
+            aparam_t,
+            charge_spin_t,
+        )
+        if self._is_pt2:
+            model_ret = self._pt2_runner(*model_inputs)
+        else:
+            model_ret = self.exported_module(*model_inputs)
+
+        results = []
+        for odef in request_defs:
+            shape = self._get_output_shape(odef, nframes, natoms)
+            gkey = _GRAPH_CATEGORY_TO_KEY.get(odef.category)
+            val = model_ret.get(gkey) if gkey is not None else None
+            if val is not None:
+                results.append(val.detach().cpu().numpy().reshape(shape))
+            else:
+                results.append(
+                    np.full(np.abs(shape), np.nan, dtype=GLOBAL_NP_FLOAT_PRECISION)
+                )
+        return tuple(results)
+
     def _get_output_shape(
         self, odef: OutputVariableDef, nframes: int, natoms: int
     ) -> list[int]:
diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py
index 565e155157..653f323404 100644
--- a/deepmd/pt_expt/model/edge_transform_output.py
+++ b/deepmd/pt_expt/model/edge_transform_output.py
@@ -29,6 +29,8 @@ def edge_energy_deriv(
     edge_index: torch.Tensor,
     edge_mask: torch.Tensor,
     n_node: torch.Tensor,
+    node_capacity: int | None = None,
+    *,
     do_atomic_virial: bool = False,
     create_graph: bool = False,
 ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]:
@@ -49,6 +51,10 @@ def edge_energy_deriv(
         (E,) valid-edge mask.
     n_node
         (nf,) per-frame node counts.
+    node_capacity
+        Static node-axis size ``N``.  ``None`` (eager default) falls back to
+        ``int(n_node.sum())``.  Pass a static value (e.g. ``atype.shape[0]``)
+        to keep this function trace-safe under ``make_fx``/``torch.export``.
     do_atomic_virial
         whether to materialize the per-atom virial (else ``None`` is returned).
     create_graph
@@ -70,7 +76,7 @@ def edge_energy_deriv(
         retain_graph=True,
     )
     force, atom_virial, virial = edge_force_virial(
-        g_e, edge_vec, edge_index, edge_mask, n_node
+        g_e, edge_vec, edge_index, edge_mask, n_node, node_capacity=node_capacity
     )
     return force, (atom_virial if do_atomic_virial else None), virial
 
@@ -82,6 +88,7 @@ def fit_output_to_model_output_graph(
     do_atomic_virial: bool = False,
     create_graph: bool = True,
     mask: torch.Tensor | None = None,
+    node_capacity: int | None = None,
 ) -> dict[str, torch.Tensor]:
     """Graph analogue of the dense pt_expt ``fit_output_to_model_output``.
 
@@ -115,6 +122,15 @@ def fit_output_to_model_output_graph(
         Whether the backward retains a graph (training).
     mask
         (N,) flat realness mask; used only for intensive-output reduction.
+    node_capacity
+        Authoritative node-axis size ``N`` = the scatter bound for the
+        per-node force/atom-virial assembly. Pass the INPUT ``atype.shape[0]``
+        (the pristine node-axis symbol that ``edge_index`` indexes into by
+        construction); ``None`` falls back to the descriptor/fitting output's
+        ``fit_ret.shape[0]`` (value-equal). This makes the scatter bound the
+        input node axis rather than a re-derived shape -- hardening; the actual
+        CUDA out-of-bounds device-assert is prevented by the index clamp in
+        :func:`~deepmd.dpmodel.utils.neighbor_graph.derivatives.edge_force_virial`.
 
     Returns
     -------
@@ -131,9 +147,22 @@ def fit_output_to_model_output_graph(
     edge_mask = graph.edge_mask
     n_node = graph.n_node
     redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
-    nf = int(n_node.shape[0])
-    N = int(n_node.sum())
-    frame_id = frame_id_from_n_node(n_node)  # (N,) int64 frame index per atom
+    # Keep ``nf`` as a (possibly symbolic) shape value: under symbolic make_fx /
+    # torch.export ``n_node`` dim-0 is the dynamic frame axis, and ``int()`` on a
+    # SymInt SPECIALIZES it -- baking the trace-time frame count into every
+    # per-frame reduction (energy_redu / virial) and breaking multi-frame infer.
+    nf = n_node.shape[0]
+    # Derive N from the fitting output's leading shape rather than int(n_node.sum()).
+    # shape attributes are always static Python ints (or SymInts in symbolic-mode
+    # tracing) and are trace-safe; reading a tensor VALUE via int() is not.
+    N = (
+        node_capacity
+        if node_capacity is not None
+        else next(iter(fit_ret.values())).shape[0]
+    )
+    frame_id = frame_id_from_n_node(
+        n_node, n_total=N
+    )  # (N,) int64 frame index per atom
     model_ret: dict[str, torch.Tensor] = dict(fit_ret.items())
     for kk, vv in fit_ret.items():
         vdef = fit_output_def[kk]
@@ -172,6 +201,7 @@ def fit_output_to_model_output_graph(
                 edge_index,
                 edge_mask,
                 n_node,
+                node_capacity=N,
                 do_atomic_virial=(vdef.c_differentiable and do_atomic_virial),
                 create_graph=create_graph,
             )
diff --git a/deepmd/pt_expt/model/ener_model.py b/deepmd/pt_expt/model/ener_model.py
index 6347382135..140141dd5e 100644
--- a/deepmd/pt_expt/model/ener_model.py
+++ b/deepmd/pt_expt/model/ener_model.py
@@ -32,6 +32,41 @@
 DPEnergyModel_ = make_model(DPEnergyAtomicModel, T_Bases=(BaseModel,))
 
 
+def _translate_energy_keys(
+    model_ret: dict[str, torch.Tensor],
+    *,
+    do_grad_r: bool,
+    do_grad_c: bool,
+    do_atomic_virial: bool,
+    local: bool,
+) -> dict[str, torch.Tensor]:
+    """Map internal fitting keys -> public energy-model keys (shared by the
+    dense and graph ``forward_lower`` export traces).
+
+    Operates on plain dicts (make_fx-safe). ``local=True`` is the GRAPH path
+    (per-node ``N == sum(n_node)`` local atoms, no ghost/extended region) and
+    emits ``force``/``atom_virial``; ``local=False`` is the DENSE extended-region
+    path and emits ``extended_force``/``extended_virial`` (folded to local by
+    ``communicate_extended_output`` at inference).
+    """
+    out: dict[str, torch.Tensor] = {}
+    out["atom_energy"] = model_ret["energy"]
+    out["energy"] = model_ret["energy_redu"]
+    if do_grad_r:
+        out["force" if local else "extended_force"] = model_ret[
+            "energy_derv_r"
+        ].squeeze(-2)
+    if do_grad_c:
+        out["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2)
+        if do_atomic_virial:
+            out["atom_virial" if local else "extended_virial"] = model_ret[
+                "energy_derv_c"
+            ].squeeze(-2)
+    if "mask" in model_ret:
+        out["mask"] = model_ret["mask"]
+    return out
+
+
 @BaseModel.register("ener")
 @BaseModel.register("sezm_ener")
 @BaseModel.register("dpa4_ener")
@@ -229,21 +264,115 @@ def fn(
                 aparam,
                 charge_spin,
             )
-            model_predict: dict[str, torch.Tensor] = {}
-            model_predict["atom_energy"] = model_ret["energy"]
-            model_predict["energy"] = model_ret["energy_redu"]
-            if do_grad_r:
-                model_predict["extended_force"] = model_ret["energy_derv_r"].squeeze(-2)
-            if do_grad_c:
-                model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2)
-                if do_atomic_virial:
-                    model_predict["extended_virial"] = model_ret[
-                        "energy_derv_c"
-                    ].squeeze(-2)
-            if "mask" in model_ret:
-                model_predict["mask"] = model_ret["mask"]
-            return model_predict
+            return _translate_energy_keys(
+                model_ret,
+                do_grad_r=do_grad_r,
+                do_grad_c=do_grad_c,
+                do_atomic_virial=do_atomic_virial,
+                local=False,
+            )
 
         return make_fx(fn, **make_fx_kwargs)(
             extended_coord, extended_atype, nlist, mapping, fparam, aparam, charge_spin
         )
+
+    def forward_lower_graph_exportable(
+        self,
+        atype: torch.Tensor,
+        n_node: torch.Tensor,
+        edge_index: torch.Tensor,
+        edge_vec: torch.Tensor,
+        edge_mask: torch.Tensor,
+        fparam: torch.Tensor | None = None,
+        aparam: torch.Tensor | None = None,
+        do_atomic_virial: bool = False,
+        charge_spin: torch.Tensor | None = None,
+        **make_fx_kwargs: Any,
+    ) -> torch.nn.Module:
+        """Trace ``forward_common_lower_graph`` into an exportable module with
+        public output keys.
+
+        Delegates to ``forward_common_lower_graph_exportable`` for tracing,
+        then translates the internal keys to the ``forward_lower`` convention.
+
+        Parameters
+        ----------
+        atype
+            (N,) flat local atom types, ``N == sum(n_node)``.
+        n_node
+            (nf,) per-frame local atom counts.
+        edge_index
+            (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+        edge_vec
+            (E, 3) neighbor-minus-center edge vectors (sample for tracing).
+        edge_mask
+            (E,) valid-edge mask (sample for tracing).
+        fparam, aparam, do_atomic_virial, charge_spin
+            As in ``forward_lower``.
+        **make_fx_kwargs
+            Extra keyword arguments forwarded to ``make_fx``
+            (e.g. ``tracing_mode="symbolic"``).
+
+        Returns
+        -------
+        torch.nn.Module
+            A traced module whose ``forward`` accepts
+            ``(atype, n_node, edge_index, edge_vec, edge_mask,
+            fparam, aparam, charge_spin)`` and returns a dict with the
+            public keys: ``atom_energy``, ``energy``, ``force``,
+            ``virial``, ``atom_virial`` (the last only when
+            ``do_atomic_virial``).  Unlike the dense
+            :meth:`forward_lower_exportable` (which emits ``extended_force`` /
+            ``extended_virial`` over the ghost-padded extended region), the
+            graph path is LOCAL-only (``N == sum(n_node)`` nodes, no ghosts),
+            so it emits ``force`` / ``atom_virial`` directly.
+        """
+        traced = self.forward_common_lower_graph_exportable(
+            atype,
+            n_node,
+            edge_index,
+            edge_vec,
+            edge_mask,
+            fparam=fparam,
+            aparam=aparam,
+            charge_spin=charge_spin,
+            do_atomic_virial=do_atomic_virial,
+            **make_fx_kwargs,
+        )
+
+        # Translate internal keys to public convention.
+        # Capture model config at trace time via closures.
+        do_grad_r = self.do_grad_r("energy")
+        do_grad_c = self.do_grad_c("energy")
+
+        def fn(
+            atype: torch.Tensor,
+            n_node: torch.Tensor,
+            edge_index: torch.Tensor,
+            edge_vec: torch.Tensor,
+            edge_mask: torch.Tensor,
+            fparam: torch.Tensor | None,
+            aparam: torch.Tensor | None,
+            charge_spin: torch.Tensor | None,
+        ) -> dict[str, torch.Tensor]:
+            model_ret = traced(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                fparam,
+                aparam,
+                charge_spin,
+            )
+            return _translate_energy_keys(
+                model_ret,
+                do_grad_r=do_grad_r,
+                do_grad_c=do_grad_c,
+                do_atomic_virial=do_atomic_virial,
+                local=True,
+            )
+
+        return make_fx(fn, **make_fx_kwargs)(
+            atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin
+        )
diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py
index 50ede240e4..5b19cb63f1 100644
--- a/deepmd/pt_expt/model/make_model.py
+++ b/deepmd/pt_expt/model/make_model.py
@@ -370,6 +370,11 @@ def forward_common_lower_graph(
                 do_atomic_virial=do_atomic_virial,
                 create_graph=self.training,
                 mask=atomic_ret["mask"] if "mask" in atomic_ret else None,
+                # Bound the per-node scatter by the INPUT node axis (the symbol
+                # ``edge_index`` indexes into), not the re-derived fitting-output
+                # shape -- avoids a CUDA out-of-bounds device-assert under
+                # dynamic-edge torch.export. See fit_output_to_model_output_graph.
+                node_capacity=atype.shape[0],
             )
 
         def _resolve_graph_method(
@@ -630,6 +635,86 @@ def fn(
                 model.need_sorted_nlist_for_lower = _orig_need_sort
             return traced
 
+        def forward_common_lower_graph_exportable(
+            self,
+            atype: torch.Tensor,
+            n_node: torch.Tensor,
+            edge_index: torch.Tensor,
+            edge_vec: torch.Tensor,
+            edge_mask: torch.Tensor,
+            fparam: torch.Tensor | None = None,
+            aparam: torch.Tensor | None = None,
+            do_atomic_virial: bool = False,
+            charge_spin: torch.Tensor | None = None,
+            **make_fx_kwargs: Any,
+        ) -> torch.nn.Module:
+            """make_fx trace of ``forward_common_lower_graph`` with ``edge_vec``
+            as the autograd leaf — the export target for graph-form .pt2 archives.
+
+            Parameters
+            ----------
+            atype
+                (N,) flat local atom types, ``N == sum(n_node)``.
+            n_node
+                (nf,) per-frame local atom counts.
+            edge_index
+                (2, E) ``[src, dst]`` edge endpoints (flat local indices).
+            edge_vec
+                (E, 3) neighbor-minus-center edge vectors (sample for tracing).
+            edge_mask
+                (E,) valid-edge mask (sample for tracing).
+            fparam, aparam, do_atomic_virial, charge_spin
+                As in ``forward_common_lower_graph``.
+            **make_fx_kwargs
+                Extra keyword arguments forwarded to ``make_fx``
+                (e.g. ``tracing_mode="symbolic"``).
+
+            Returns
+            -------
+            torch.nn.Module
+                A traced module whose ``forward`` accepts
+                ``(atype, n_node, edge_index, edge_vec, edge_mask,
+                fparam, aparam, charge_spin)`` and returns a dict with the
+                same internal keys as ``forward_common_lower_graph``.
+            """
+            model = self
+
+            def fn(
+                atype: torch.Tensor,
+                n_node: torch.Tensor,
+                edge_index: torch.Tensor,
+                edge_vec: torch.Tensor,
+                edge_mask: torch.Tensor,
+                fparam: torch.Tensor | None,
+                aparam: torch.Tensor | None,
+                charge_spin: torch.Tensor | None,
+            ) -> dict[str, torch.Tensor]:
+                # forward_common_lower_graph creates the autograd leaf from
+                # edge_vec internally, so no outer detach/requires_grad_ here
+                # (it would only add spurious ops to the traced graph).
+                return model.forward_common_lower_graph(
+                    atype,
+                    n_node,
+                    edge_index,
+                    edge_vec,
+                    edge_mask,
+                    do_atomic_virial=do_atomic_virial,
+                    fparam=fparam,
+                    aparam=aparam,
+                    charge_spin=charge_spin,
+                )
+
+            return make_fx(fn, **make_fx_kwargs)(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                fparam,
+                aparam,
+                charge_spin,
+            )
+
         def forward_common_lower_exportable_with_comm(
             self,
             extended_coord: torch.Tensor,
diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py
index bd6fdb02a3..4ab43fe329 100644
--- a/deepmd/pt_expt/train/training.py
+++ b/deepmd/pt_expt/train/training.py
@@ -274,6 +274,41 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]:
 # ---------------------------------------------------------------------------
 
 
+def _forbidden_dims_from_model(
+    model: torch.nn.Module,
+    task_buf_vals: tuple[torch.Tensor, ...],
+) -> set[int]:
+    """Prime-collision set for trace-dim selection.
+
+    Collects every ``> 1`` dim of the model's parameters/buffers (so
+    ``_next_safe_prime`` never aliases an internal dim like ``g2_dim`` /
+    ``axis_neuron`` / ``attn_head`` without a hardcoded list), plus
+    ``dim_fparam``/``dim_aparam`` and the task-buffer dims.  Shared by the dense
+    :func:`_trace_and_compile` and the graph :func:`_trace_and_compile_graph`;
+    each caller adds its path-specific dims (nall/nloc/nsel for dense,
+    charge_spin for both) on top of this base set.
+    """
+    forbidden: set[int] = {
+        int(_d)
+        for _src in (model.parameters(), model.buffers())
+        for _p in _src
+        for _d in _p.shape
+        if _d > 1
+    }
+    for _getter in (model.get_dim_fparam, model.get_dim_aparam):
+        try:
+            _dim = _getter()
+            if _dim > 1:
+                forbidden.add(int(_dim))
+        except Exception:
+            pass  # best-effort: dim unavailable -> nothing to forbid
+    for _tbv in task_buf_vals:
+        for _d in _tbv.shape:
+            if _d > 1:
+                forbidden.add(int(_d))
+    return forbidden
+
+
 def _trace_and_compile(
     model: torch.nn.Module,
     ext_coord: torch.Tensor,
@@ -397,17 +432,11 @@ def fn(
     # large to alias with any architecture dim and need no adjustment.
     #
     # The prime for nf is chosen by enumerating every dimension that appears
-    # in the model's parameters and buffers, then calling _next_safe_prime to
-    # find the first prime that doesn't collide with any of them.  This
-    # catches internal dims like g2_dim, axis_neuron, attn_head, etc.
-    # without requiring a hardcoded list.
-    _forbidden: set[int] = {
-        int(_d)
-        for _src in (model.parameters(), model.buffers())
-        for _p in _src
-        for _d in _p.shape
-        if _d > 1
-    }
+    # in the model's parameters and buffers (see _forbidden_dims_from_model),
+    # then calling _next_safe_prime to find the first prime that doesn't collide
+    # with any of them -- catching internal dims like g2_dim/axis_neuron/
+    # attn_head without a hardcoded list.  Add the dense-path dims on top.
+    _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace)
     # Also add the real nloc and nall so trace_nf never aliases them.
     _forbidden.add(int(ext_coord.shape[1]))  # nall
     _forbidden.add(int(ext_atype.shape[1]))  # nall (same tensor, defensive)
@@ -416,26 +445,10 @@ def fn(
     _nsel = int(nlist.shape[2])
     if _nsel > 1:
         _forbidden.add(_nsel)
-    try:
-        _dim_fp = model.get_dim_fparam()
-        if _dim_fp > 1:
-            _forbidden.add(_dim_fp)
-    except Exception:
-        pass
-    try:
-        _dim_ap = model.get_dim_aparam()
-        if _dim_ap > 1:
-            _forbidden.add(_dim_ap)
-    except Exception:
-        pass
     if charge_spin is not None:
         _dim_cs = int(charge_spin.shape[1])
         if _dim_cs > 1:
             _forbidden.add(_dim_cs)
-    for _tbv in task_buf_vals_trace:
-        for _d in _tbv.shape:
-            if _d > 1:
-                _forbidden.add(int(_d))
 
     trace_nf = _next_safe_prime(5, _forbidden)
 
@@ -477,6 +490,25 @@ def fn(
         *task_buf_vals_trace,
     )
 
+    return (
+        _finalize_compiled_lower(traced_lower, model, was_training, compile_opts),
+        task_buf_order,
+    )
+
+
+def _finalize_compiled_lower(
+    traced_lower: "torch.fx.GraphModule",
+    model: torch.nn.Module,
+    was_training: bool,
+    compile_opts: dict[str, Any] | None,
+    extra_options: dict[str, Any] | None = None,
+) -> torch.nn.Module:
+    """Shared post-``make_fx`` tail: strip detach, rebuild, inductor-compile.
+
+    Used by both the dense :func:`_trace_and_compile` and the graph
+    :func:`_trace_and_compile_graph` so the second-order-gradient handling
+    (detach removal) and inductor options stay identical on both paths.
+    """
     # make_fx inserts aten.detach.default for saved tensors used in the
     # decomposed autograd.grad backward ops.  These detach nodes break
     # second-order gradient flow (d(force)/d(params) for force training).
@@ -503,6 +535,8 @@ def fn(
         # pytorch/pytorch#174379, #178080, #179494 under
         # data-dependent symbolic shapes.
     }
+    if extra_options:
+        inductor_options.update(extra_options)
     if compile_opts:
         inductor_options.update(compile_opts)
 
@@ -511,7 +545,264 @@ def fn(
         backend="inductor",
         dynamic=True,
         options=inductor_options,
-    ), task_buf_order
+    )
+
+
+def _model_uses_graph_lower(model: torch.nn.Module) -> bool:
+    """Whether ``model``'s eager default-flip routes through the GRAPH lower.
+
+    Mirrors the predicate in
+    :meth:`~deepmd.pt_expt.model.make_model.make_model.<locals>.CM._resolve_graph_method`
+    for ``neighbor_graph_method is None`` (the training default): a model is
+    graph-eligible iff it is ``mixed_types`` AND its single descriptor reports
+    ``uses_graph_lower() == True`` (currently only dpa1 ``attn_layer == 0``).
+
+    When True the compiled lower must be the GRAPH ``forward_common_lower_graph``
+    so the compiled path matches eager training (which already default-flips to
+    the carry-all graph forward); when False the dense ``forward_lower`` is
+    compiled (se_e2_a / dpa2 / dpa3 / linear / zbl).
+
+    ASSUMPTION: training uses the default ``neighbor_graph_method`` (None). If a
+    user-facing ``"legacy"`` opt-out is ever plumbed into the trainer, this gate
+    must also honor it (else eager would run dense while the compiled path runs
+    the graph lower, re-introducing the eager!=compiled divergence this fixes).
+    """
+    if not hasattr(model, "mixed_types"):
+        return False
+    try:
+        if not model.mixed_types():
+            return False
+    except (AttributeError, NotImplementedError):
+        return False
+    # Linear / ZBL atomic models have no single ``descriptor`` -> dense.
+    descriptor = getattr(getattr(model, "atomic_model", None), "descriptor", None)
+    uses_graph = getattr(descriptor, "uses_graph_lower", None)
+    if uses_graph is None:
+        return False
+    try:
+        return bool(uses_graph())
+    except (AttributeError, NotImplementedError):
+        return False
+
+
+def _trace_and_compile_graph(
+    model: torch.nn.Module,
+    fparam: torch.Tensor | None,
+    aparam: torch.Tensor | None,
+    charge_spin: torch.Tensor | None,
+    compile_opts: dict[str, Any] | None = None,
+    task_buffers: dict[str, torch.Tensor] | None = None,
+) -> tuple[torch.nn.Module, tuple[str, ...]]:
+    """Symbolic-trace ``forward_common_lower_graph`` and inductor-compile it.
+
+    The GRAPH analogue of :func:`_trace_and_compile`.  Builds a small synthetic
+    NeighborGraph with prime-controlled ``nf`` / ``N`` / ``E`` axes (so make_fx's
+    duck-shape unification keeps the three dynamic dims as distinct symbols),
+    traces ``model.forward_common_lower_graph`` with ``edge_vec`` as the autograd
+    leaf, and translates the internal fitting keys to the public energy-model
+    keys (``atom_energy`` / ``energy`` / ``force`` / ``virial``).  The compiled
+    callable accepts the positional graph tensors plus the promoted task buffers
+    and returns those public keys on the FLAT node axis (``N == sum(n_node)``);
+    the caller (:meth:`_CompiledModel.forward`) unravels them to ``(nf, nloc, *)``.
+
+    Parameters
+    ----------
+    model
+        The (uncompiled) graph-eligible energy model.
+    fparam, aparam, charge_spin
+        Representative optional inputs (or ``None``) so the traced branch
+        matches what :meth:`_CompiledModel.forward` passes at run time.
+    compile_opts
+        User-supplied inductor options (merged over the built-in defaults).
+    task_buffers
+        Per-task buffers promoted to FX placeholders (see
+        :func:`_detect_task_buffers`).
+    """
+    import math
+
+    from torch._decomp import (
+        get_decompositions,
+    )
+    from torch.fx.experimental.proxy_tensor import (
+        make_fx,
+    )
+
+    from deepmd.pt_expt.model.ener_model import (
+        _translate_energy_keys,
+    )
+
+    was_training = model.training
+    # Trace in train mode so create_graph=True is captured inside the graph
+    # force backward (forward_common_lower_graph passes create_graph=self.training).
+    model.train()
+
+    task_buf_order: tuple[str, ...] = tuple(task_buffers.keys()) if task_buffers else ()
+    task_buf_vals_trace: tuple[torch.Tensor, ...] = (
+        tuple(task_buffers[k] for k in task_buf_order) if task_buffers else ()
+    )
+
+    _fitting: torch.nn.Module | None = None
+    _atomic_model: torch.nn.Module | None = None
+    if task_buf_order:
+        try:
+            _fitting = model.get_fitting_net()
+        except AttributeError:
+            pass  # optional accessor; a model without a fitting net keeps None
+        try:
+            _atomic_model = model.atomic_model
+        except AttributeError:
+            pass  # optional attribute; a model without an atomic model keeps None
+
+    do_grad_r = model.do_grad_r("energy")
+    do_grad_c = model.do_grad_c("energy")
+
+    # ------------------------------------------------------------------
+    # Build the trace-time NeighborGraph with prime-distinct nf / N / E.
+    #
+    # make_fx (tracing_mode="symbolic") unifies dimension symbols that share a
+    # concrete value (duck-shape merging).  The three dynamic axes of the graph
+    # lower must stay distinct symbols, otherwise the per-frame segment_sum
+    # (N -> nf) and the per-edge scatter (E -> N) bake in a false equality:
+    #   * nf  = n_node.shape[0]      (per-frame reductions)
+    #   * N   = atype.shape[0]       (flat node axis = sum(n_node))
+    #   * E   = edge_vec.shape[0]    (edge axis)
+    # They are chosen as collision-free primes vs every parameter/buffer dim
+    # (see _forbidden_dims_from_model) plus charge_spin.
+    # ------------------------------------------------------------------
+    _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace)
+    if charge_spin is not None and charge_spin.shape[-1] > 1:
+        _forbidden.add(int(charge_spin.shape[-1]))
+
+    trace_nf = _next_safe_prime(5, _forbidden)
+    # nloc such that N = trace_nf * nloc is collision-free (and != trace_nf).
+    nloc_trace = 7
+    while (trace_nf * nloc_trace) in (_forbidden | {trace_nf}):
+        nloc_trace += 1
+    trace_N = trace_nf * nloc_trace
+    # Static edge capacity, prime-padded to stay distinct from nf and N.
+    nnei = sum(model.get_sel())
+    e_max_base = max(math.ceil(1.25 * nloc_trace * nnei), 7)
+    e_max = _next_safe_prime(e_max_base, _forbidden | {trace_nf, trace_N})
+
+    # Shared with the .pt2 export trace (serialization.py) so the two graph
+    # traces can never desync on the input schema.  Training uses the run-time
+    # float precision and device; optional tensors match the actual call.
+    from deepmd.pt_expt.utils.serialization import (
+        build_synthetic_graph_inputs,
+    )
+
+    sample = build_synthetic_graph_inputs(
+        model,
+        e_max=e_max,
+        nframes=trace_nf,
+        nloc=nloc_trace,
+        dtype=GLOBAL_PT_FLOAT_PRECISION,
+        device=DEVICE,
+        want_fparam=fparam is not None,
+        want_aparam=aparam is not None,
+        want_charge_spin=charge_spin is not None,
+    )
+    (
+        s_atype,
+        s_n_node,
+        s_edge_index,
+        s_edge_vec,
+        s_edge_mask,
+        s_fparam,
+        s_aparam,
+        s_charge_spin,
+    ) = sample
+
+    def fn(
+        atype: torch.Tensor,
+        n_node: torch.Tensor,
+        edge_index: torch.Tensor,
+        edge_vec: torch.Tensor,
+        edge_mask: torch.Tensor,
+        fparam: torch.Tensor | None,
+        aparam: torch.Tensor | None,
+        charge_spin: torch.Tensor | None,
+        *task_buf_vals: torch.Tensor,
+    ) -> dict[str, torch.Tensor]:
+        # Patch task-specific buffers with the proxy tensors so make_fx records
+        # them as FX placeholders (mirrors the dense ``_trace_and_compile``).
+        originals: dict[str, torch.Tensor | None] = {}
+        if task_buf_order:
+            for name, val in zip(task_buf_order, task_buf_vals, strict=True):
+                if name.startswith(_AM_PREFIX):
+                    actual = name[len(_AM_PREFIX) :]
+                    if _atomic_model is not None:
+                        originals[name] = _atomic_model._buffers.get(actual)
+                        _atomic_model._buffers[actual] = val
+                else:
+                    if _fitting is not None:
+                        originals[name] = _fitting._buffers.get(name)
+                        _fitting._buffers[name] = val
+        try:
+            # forward_common_lower_graph makes edge_vec the autograd leaf
+            # internally, so no outer detach/requires_grad_ here.
+            model_ret = model.forward_common_lower_graph(
+                atype,
+                n_node,
+                edge_index,
+                edge_vec,
+                edge_mask,
+                do_atomic_virial=False,
+                fparam=fparam,
+                aparam=aparam,
+                charge_spin=charge_spin,
+            )
+            return _translate_energy_keys(
+                model_ret,
+                do_grad_r=do_grad_r,
+                do_grad_c=do_grad_c,
+                do_atomic_virial=False,
+                local=True,
+            )
+        finally:
+            for name, orig in originals.items():
+                if name.startswith(_AM_PREFIX):
+                    actual = name[len(_AM_PREFIX) :]
+                    if _atomic_model is not None:
+                        _atomic_model._buffers[actual] = orig
+                else:
+                    if _fitting is not None:
+                        _fitting._buffers[name] = orig
+
+    decomp_table = get_decompositions([torch.ops.aten.silu_backward.default])
+
+    traced_lower = make_fx(
+        fn,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+        decomposition_table=decomp_table,
+    )(
+        s_atype,
+        s_n_node,
+        s_edge_index,
+        s_edge_vec,
+        s_edge_mask,
+        s_fparam,
+        s_aparam,
+        s_charge_spin,
+        *task_buf_vals_trace,
+    )
+
+    # The per-frame virial reduction scatters E edges into the (nf, 3, 3) virial
+    # via an atomic_add; inductor's CPU vectorizer asserts on that scatter's
+    # scalar index (``index.is_vec``).  Disable CPU SIMD for the graph lower so
+    # the scatter is emitted scalar — numerically this only removes a
+    # reduction-order source, keeping eager==compiled within fp64 tolerance.
+    return (
+        _finalize_compiled_lower(
+            traced_lower,
+            model,
+            was_training,
+            compile_opts,
+            extra_options={"cpp.simdlen": 0},
+        ),
+        task_buf_order,
+    )
 
 
 class _CompiledModel(torch.nn.Module):
@@ -546,6 +837,9 @@ def __init__(
         self._compiled_by_structure: dict = (
             compiled_by_structure if compiled_by_structure is not None else {}
         )
+        # Resolved on the first forward: whether to compile the GRAPH lower
+        # (graph-eligible mixed_types descriptors) or the dense forward_lower.
+        self._graph_eligible: bool | None = None
 
     def __getattr__(self, name: str) -> Any:
         # Delegate unknown lookups to original_model so that callers such as
@@ -579,6 +873,18 @@ def forward(
 
         nframes, nloc = atype.shape[:2]
         rcut = self.original_model.get_rcut()
+
+        # Graph-eligible models (dpa1 attn_layer==0) default-flip to the carry-all
+        # GRAPH forward in eager training; the compiled lower must be the GRAPH
+        # lower too, otherwise the eager (graph) and compiled (dense) backward
+        # gradients diverge at fp64 accumulation and the optimizer amplifies it.
+        if self._graph_eligible is None:
+            self._graph_eligible = _model_uses_graph_lower(self.original_model)
+        if self._graph_eligible:
+            return self._forward_graph(
+                coord, atype, box, fparam, aparam, charge_spin, nframes, nloc, rcut
+            )
+
         sel = self.original_model.get_sel()
 
         # coord extension + nlist (data-dependent, run in eager)
@@ -751,6 +1057,163 @@ def forward(
             out["mask"] = result["mask"]
         return out
 
+    def _forward_graph(
+        self,
+        coord: torch.Tensor,
+        atype: torch.Tensor,
+        box: torch.Tensor | None,
+        fparam: torch.Tensor | None,
+        aparam: torch.Tensor | None,
+        charge_spin: torch.Tensor | None,
+        nframes: int,
+        nloc: int,
+        rcut: float,
+    ) -> dict[str, torch.Tensor]:
+        """Carry-all GRAPH forward -> compiled ``forward_common_lower_graph``.
+
+        Builds the carry-all NeighborGraph eagerly (the SAME builder the eager
+        uncompiled default-flip uses, so the graph tensors are bit-identical),
+        then calls the compiled graph lower.  The graph force is per-LOCAL-node
+        ``(N, 3)`` with ``N == nframes * nloc`` for a single-rank carry-all graph,
+        so no extended->local scatter is needed; only the flat ``(N, *)`` node
+        keys are unravelled to ``(nf, nloc, *)`` at the I/O boundary.
+        """
+        from deepmd.dpmodel.utils.neighbor_graph import (
+            build_neighbor_graph,
+        )
+
+        _model = self.original_model
+
+        coord_3d = coord.detach().reshape(nframes, nloc, 3)
+        box_flat = box.detach().reshape(nframes, 9) if box is not None else None
+
+        # Mirror the optional-input defaulting of the dense path / eager
+        # call_common: a model configured with fparam / charge_spin substitutes
+        # its default when the data omits it, so the compiled (frozen) branch
+        # always sees a tensor.
+        _dim_fparam = (
+            _model.get_dim_fparam() if hasattr(_model, "get_dim_fparam") else 0
+        )
+        if fparam is None and _dim_fparam > 0:
+            _default_fparam = _model.get_default_fparam()
+            if _default_fparam is not None:
+                fparam = (
+                    torch.as_tensor(
+                        _default_fparam, dtype=coord_3d.dtype, device=coord_3d.device
+                    )
+                    .reshape(1, _dim_fparam)
+                    .expand(nframes, -1)
+                )
+        _dim_cs = (
+            _model.get_dim_chg_spin() if hasattr(_model, "get_dim_chg_spin") else 0
+        )
+        if charge_spin is None and _dim_cs > 0:
+            _default_cs = _model.get_default_chg_spin()
+            if _default_cs is not None:
+                charge_spin = (
+                    torch.as_tensor(
+                        _default_cs, dtype=coord_3d.dtype, device=coord_3d.device
+                    )
+                    .reshape(1, _dim_cs)
+                    .expand(nframes, -1)
+                )
+
+        # Carry-all graph (dynamic E, no edge_capacity) — identical to the eager
+        # uncompiled ``_call_common_graph`` builder so the two paths match.
+        ng = build_neighbor_graph(coord_3d, atype, box_flat, rcut)
+        atype_flat = atype.reshape(nframes * nloc)
+
+        # Lazy compile of the GRAPH lower (cached per structure key).
+        if self.compiled_forward_lower is None:
+            if self._structure_key in self._compiled_by_structure:
+                compiled_lower, buf_order = self._compiled_by_structure[
+                    self._structure_key
+                ]
+                log.info("Reusing compiled graph lower (shared structure, lazy).")
+            else:
+                log.info(
+                    "Lazy compile (graph lower): tracing on first forward call "
+                    "(structure_key=%s).",
+                    self._structure_key,
+                )
+                compiled_lower, buf_order = _trace_and_compile_graph(
+                    _model,
+                    fparam,
+                    aparam,
+                    charge_spin,
+                    task_buffers=self._task_buffers,
+                    compile_opts=self._compile_opts,
+                )
+                self._compiled_by_structure[self._structure_key] = (
+                    compiled_lower,
+                    buf_order,
+                )
+            self.compiled_forward_lower = compiled_lower
+            self._task_buf_order = buf_order
+            self._task_buffers = None
+
+        # Feed a detached, grad-enabled edge_vec leaf: the traced graph's internal
+        # ``edge_vec.detach()`` is stripped by ``_strip_saved_tensor_detach`` (as
+        # for the dense ext_coord leaf), so the force backward roots at this input.
+        edge_vec = ng.edge_vec.detach().requires_grad_(True)
+
+        if self._task_buf_order:
+            try:
+                _fitting = _model.get_fitting_net()
+                _am = getattr(_model, "atomic_model", None)
+                _vals: list[torch.Tensor] = []
+                for _name in self._task_buf_order:
+                    if _name.startswith(_AM_PREFIX):
+                        _actual = _name[len(_AM_PREFIX) :]
+                        _vals.append(_am._buffers[_actual])
+                    else:
+                        _vals.append(getattr(_fitting, _name))
+                task_buf_vals: tuple = tuple(_vals)
+            except AttributeError as exc:
+                raise RuntimeError(
+                    f"Compiled graph expects task buffers {self._task_buf_order!r} "
+                    "but they could not be retrieved from the model. "
+                    "This is a bug in the compile path."
+                ) from exc
+        else:
+            task_buf_vals = ()
+
+        result = self.compiled_forward_lower(
+            atype_flat,
+            ng.n_node,
+            ng.edge_index,
+            edge_vec,
+            ng.edge_mask,
+            fparam,
+            aparam,
+            charge_spin,
+            *task_buf_vals,
+        )
+
+        # The compiled graph lower emits PUBLIC keys on the FLAT node axis
+        # (``atom_energy`` / ``force`` are (N, *); ``energy`` / ``virial`` are
+        # (nf, *)).  Unravel the node-level keys to rectangular (nf, nloc, *) so
+        # callers receive the same shapes as the dense path.
+        N = nframes * nloc
+        # Node-level (per-atom, lead dim N) public keys emitted by the graph
+        # lower; the remaining keys are frame-level (lead dim nf) and must NOT
+        # be unravelled. Keying on the NAME rather than the ``N != nframes``
+        # shape heuristic keeps the single-atom case (nloc == 1, where
+        # N == nframes) correct -- node-level outputs still reshape to
+        # (nf, 1, *) instead of staying (nf, *).
+        node_level_keys = {"atom_energy", "force", "atom_virial", "mask"}
+        out: dict[str, torch.Tensor] = {}
+        for key, val in result.items():
+            if (
+                key in node_level_keys
+                and val is not None
+                and val.shape[:1] == torch.Size([N])
+            ):
+                out[key] = val.reshape(nframes, nloc, *val.shape[1:])
+            else:
+                out[key] = val
+        return out
+
 
 # ---------------------------------------------------------------------------
 # Trainer
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 9c03783574..e0c08a6f84 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -40,27 +40,32 @@
 
 
 def _strip_shape_assertions(graph_module: torch.nn.Module) -> None:
-    """Neutralise shape-guard assertion nodes in a spin model's exported graph.
-
-    ``torch.export`` inserts ``aten._assert_scalar`` nodes for symbolic shape
-    relationships discovered during tracing.  For the spin model, the atom-
-    doubling logic creates slice patterns that depend on ``(nall - nloc)``,
-    producing guards like ``Ne(nall, nloc)``.  These guards are spurious: the
-    model computes correct results even when ``nall == nloc`` (NoPBC, no ghost
-    atoms).
-
-    This function is **only called for spin models** (guarded by ``if is_spin``
-    in ``_trace_and_export``).  The assertion messages use opaque symbolic
-    variable names (e.g. ``Ne(s22, s96)``) rather than human-readable names,
-    so filtering by message content is not reliable.  Since
-    ``prefer_deferred_runtime_asserts_over_guards=True`` converts all shape
-    guards into these deferred assertions, and the only shape relationships in
-    the spin model involve nall/nloc, neutralising all of them is safe in this
-    context.
-
-    We replace each assertion's condition with ``True`` rather than erasing the
-    node; erasing nodes can disturb the FX graph structure and produce NaN
-    gradients on some Python/torch versions.
+    """Neutralise deferred shape-guard assertion nodes in an exported graph.
+
+    ``torch.export`` (with ``prefer_deferred_runtime_asserts_over_guards=True``)
+    inserts ``aten._assert_scalar`` nodes for symbolic-shape relationships
+    discovered during tracing.  The assertion messages use opaque symbolic names
+    (e.g. ``Ne(s22, s96)``), so filtering by message content is not reliable; we
+    replace each assertion's condition with ``True`` rather than erasing the node
+    (erasing can disturb the FX graph and yield NaN gradients on some torch
+    versions).
+
+    Called from TWO export paths in ``_trace_and_export``:
+
+    * **spin (dense) models** — atom-doubling slice patterns depend on
+      ``(nall - nloc)``, producing spurious guards like ``Ne(nall, nloc)``; the
+      model is correct even when ``nall == nloc`` (NoPBC, no ghosts).
+    * **graph models** — the DYNAMIC edge axis (``Dim("nedge")``) produces guards
+      of the ``nloc_min``/SIGFPE family on the edge count ``E``.  These are the
+      shape-specialization guards the static-``edge_capacity`` path was designed
+      to avoid; neutralising them is what makes one artifact eval any edge count.
+
+    **Safety:** in both contexts every input is constructed well-formed by the
+    builder (spin: valid atom doubling; graph: ``build_neighbor_graph`` /
+    ``buildGraphTensors`` always emit ``E >= min_edges == 2`` with in-range,
+    masked edges), so the neutralised guards would never legitimately fire.  The
+    only cost is that a MALFORMED runtime tensor no longer throws cleanly — the
+    documented AOTI trade-off (CLAUDE.md), accepted identically on both paths.
     """
     graph = graph_module.graph
     for node in list(graph.nodes):
@@ -310,6 +315,158 @@ def _make_sample_inputs(
     return ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin
 
 
+def build_synthetic_graph_inputs(
+    model: torch.nn.Module,
+    e_max: int,
+    nframes: int = 2,
+    nloc: int = 7,
+    *,
+    dtype: torch.dtype,
+    device: torch.device | None = None,
+    want_fparam: bool = True,
+    want_aparam: bool = True,
+    want_charge_spin: bool = True,
+) -> tuple[torch.Tensor | None, ...]:
+    """Build a synthetic carry-all ``NeighborGraph`` for graph-lower tracing.
+
+    Single source of the trace-time graph inputs, shared by ``.pt2`` export
+    (:func:`_trace_and_export`) and compiled training
+    (:func:`deepmd.pt_expt.train.training._trace_and_compile_graph`), so the two
+    traces can never desync on the graph input schema.  Builds a small random
+    system, runs the carry-all
+    :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph` with a
+    STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked static
+    edge axis), and returns tensors in the positional order expected by
+    ``forward_(common_)lower_graph``:
+    ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
+    charge_spin)``.
+
+    The system (``rng(42)``, ``box = rcut*3``, centered coords, ``atype[:, i] =
+    i % ntypes``) is identical for both callers; the only two former differences
+    are now parameters.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The pt_expt energy model (must expose ``get_rcut``/``get_type_map``/...).
+    e_max : int
+        Static edge capacity ``E`` to pad the (masked) edge axis to.
+    nframes : int
+        Number of frames in the sample system.
+    nloc : int
+        Number of local atoms per frame (``N == nframes * nloc``).
+    dtype : torch.dtype
+        Float precision of ``coord``/``edge_vec``/``fparam``/... .  The exported
+        ``.pt2`` is float64-only (C++ ABI); training passes
+        ``GLOBAL_PT_FLOAT_PRECISION``.
+    device : torch.device, optional
+        Target device.  Defaults to ``deepmd.pt_expt.utils.env.DEVICE``; the
+        export path passes ``cpu`` explicitly (make_fx traces on CPU).
+    want_fparam, want_aparam, want_charge_spin : bool
+        Whether to emit the optional conditioning tensor when its ``dim > 0``.
+        Export passes the defaults (``True`` = include if present); training
+        passes ``x is not None`` so the traced branch matches the run-time call.
+    """
+    import deepmd.pt_expt.utils.env as _env
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        GraphLayout,
+        build_neighbor_graph,
+    )
+
+    if device is None:
+        device = _env.DEVICE
+
+    rcut = model.get_rcut()
+    ntypes = len(model.get_type_map())
+    dim_fparam = model.get_dim_fparam()
+    dim_aparam = model.get_dim_aparam()
+    dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0
+
+    # Box large enough to avoid PBC degeneracy; centered coords.
+    box_size = rcut * 3.0
+    box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9)
+    rng = np.random.default_rng(42)
+    coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25
+    atype_np = np.zeros((nframes, nloc), dtype=np.int64)
+    for i in range(nloc):
+        atype_np[:, i] = i % ntypes
+
+    coord_t = torch.tensor(coord_np, dtype=dtype, device=device)
+    atype_t = torch.tensor(atype_np, dtype=torch.int64, device=device)
+    box_t = torch.tensor(np.tile(box_np, (nframes, 1)), dtype=dtype, device=device)
+    graph = build_neighbor_graph(
+        coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max)
+    )
+
+    fparam = (
+        torch.zeros(nframes, dim_fparam, dtype=dtype, device=device)
+        if (want_fparam and dim_fparam > 0)
+        else None
+    )
+    aparam = (
+        torch.zeros(nframes, nloc, dim_aparam, dtype=dtype, device=device)
+        if (want_aparam and dim_aparam > 0)
+        else None
+    )
+    charge_spin = (
+        torch.zeros(nframes, dim_chg_spin, dtype=dtype, device=device)
+        if (want_charge_spin and dim_chg_spin > 0)
+        else None
+    )
+
+    return (
+        atype_t.reshape(-1),
+        graph.n_node,
+        graph.edge_index,
+        graph.edge_vec,
+        graph.edge_mask,
+        fparam,
+        aparam,
+        charge_spin,
+    )
+
+
+def _build_graph_dynamic_shapes(
+    *sample_inputs: torch.Tensor | None,
+) -> tuple:
+    """Build dynamic-shape specifications for the graph-form forward_lower export.
+
+    ``nframes`` (the ``n_node`` axis), ``N`` (the flat node axis) AND the edge
+    axis ``E`` are all dynamic dims (B2.0: the dynamic edge axis replaces the
+    static ``edge_capacity`` of B1).  ``E`` is marked ``Dim("nedge", min=2)`` so
+    the AOTI artifact accepts any system size with no capacity ceiling — the
+    ``min=2`` lower bound mirrors the dense path's ``Dim("nnei", min=...)`` (a
+    dynamic, SIGFPE-tamed axis) and matches the carry-all builder's
+    ``min_edges=2`` guard (every dynamic graph carries >=2 edges).
+
+    Parameters
+    ----------
+    *sample_inputs : torch.Tensor | None
+        ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam,
+        charge_spin)`` — 8 entries matching ``forward_lower_graph_exportable``.
+    """
+    fparam = sample_inputs[5]
+    aparam = sample_inputs[6]
+    charge_spin = sample_inputs[7]
+    nframes_dim = torch.export.Dim("nframes", min=1)
+    n_node_total_dim = torch.export.Dim("n_node_total", min=1)
+    nedge_dim = torch.export.Dim("nedge", min=2)
+    nloc_dim = torch.export.Dim("nloc", min=1)
+    return (
+        {0: n_node_total_dim},  # atype: (N,)
+        {0: nframes_dim},  # n_node: (nf,)
+        {1: nedge_dim},  # edge_index: (2, E) — E dynamic
+        {0: nedge_dim},  # edge_vec: (E, 3) — E dynamic
+        {0: nedge_dim},  # edge_mask: (E,) — E dynamic
+        {0: nframes_dim} if fparam is not None else None,  # fparam: (nf, ndf)
+        # aparam: (nf, nloc, nda) — both the frame AND atom axes are dynamic,
+        # matching the dense ``_build_dynamic_shapes`` (otherwise a dim_aparam>0
+        # graph export specializes nloc to the sample size and breaks at runtime).
+        {0: nframes_dim, 1: nloc_dim} if aparam is not None else None,  # aparam
+        {0: nframes_dim} if charge_spin is not None else None,  # charge_spin
+    )
+
+
 def _build_dynamic_shapes(
     *sample_inputs: torch.Tensor | None,
     has_spin: bool = False,
@@ -416,7 +573,9 @@ def _build_dynamic_shapes(
     return (*base, None, None, None, None, None, None, None, None)
 
 
-def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict:
+def _collect_metadata(
+    model: torch.nn.Module, is_spin: bool = False, lower_kind: str = "nlist"
+) -> dict:
     """Collect metadata from the model for C++ inference.
 
     This metadata is stored as ``metadata.json`` in both .pt2 and .pte archives.
@@ -528,6 +687,12 @@ def _probe_has_message_passing(obj: object) -> bool | None:
         if result is not None:
             break
     meta["has_message_passing"] = result if result is not None else False
+
+    # Which input schema the compiled AOTI forward consumes:
+    #   "nlist" → dense quartet (extended_coord, extended_atype, nlist, mapping)
+    #   "graph" → NeighborGraph (atype, n_node, edge_index, edge_vec, edge_mask)
+    # The C++ loader branches on this to build the matching inputs.
+    meta["lower_input_kind"] = "graph" if lower_kind == "graph" else "nlist"
     return meta
 
 
@@ -599,6 +764,7 @@ def deserialize_to_file(
     data: dict,
     model_json_override: dict | None = None,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> None:
     """Deserialize a dictionary to a .pte or .pt2 model file.
 
@@ -622,14 +788,23 @@ def deserialize_to_file(
     do_atomic_virial : bool
         If True, export with per-atom virial correction (3 extra backward
         passes, ~2.5x slower).  Default False for best performance.
+    lower_kind : str
+        Which lower-forward schema the compiled AOTI graph consumes:
+        ``"nlist"`` (default) traces the dense quartet
+        (``extended_coord``/``extended_atype``/``nlist``/``mapping``);
+        ``"graph"`` traces the NeighborGraph schema
+        (``atype``/``n_node``/``edge_index``/``edge_vec``/``edge_mask``) with a
+        DYNAMIC edge axis ``E`` (``Dim("nedge", min=2)``), so the artifact
+        accepts any system size.  The selected schema is recorded as
+        ``lower_input_kind`` in ``metadata.json``.
     """
     if model_file.endswith(".pt2"):
         _deserialize_to_file_pt2(
-            model_file, data, model_json_override, do_atomic_virial
+            model_file, data, model_json_override, do_atomic_virial, lower_kind
         )
     else:
         _deserialize_to_file_pte(
-            model_file, data, model_json_override, do_atomic_virial
+            model_file, data, model_json_override, do_atomic_virial, lower_kind
         )
 
 
@@ -638,6 +813,7 @@ def _trace_and_export(
     model_json_override: dict | None = None,
     with_comm_dict: bool = False,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> tuple:
     """Common logic: build model, trace, export.
 
@@ -663,6 +839,10 @@ def _trace_and_export(
         If True, the traced graph computes per-atom virial (extra
         autograd.grad backward passes); off by default to keep .pt2
         inference fast. Mirrors PR #5407 in upstream master.
+    lower_kind
+        ``"nlist"`` (default) traces the dense quartet forward; ``"graph"``
+        traces ``forward_lower_graph_exportable`` over the NeighborGraph schema
+        with a dynamic edge axis. Recorded as ``lower_input_kind`` in metadata.
 
     Returns
     -------
@@ -700,7 +880,125 @@ def _trace_and_export(
     model.eval()
 
     # 2. Collect metadata
-    metadata = _collect_metadata(model, is_spin=is_spin)
+    metadata = _collect_metadata(model, is_spin=is_spin, lower_kind=lower_kind)
+
+    # 2b. Graph-form export branch (NeighborGraph schema). The graph path is
+    # LOCAL-only (no ghosts), single-rank, energy-model only in PR-A/PR-B; it
+    # traces ``forward_lower_graph_exportable`` with a DYNAMIC edge axis (B2.0).
+    # The dense (nlist) path below is left byte-unchanged.
+    if lower_kind == "graph":
+        import math
+
+        if is_spin:
+            raise NotImplementedError(
+                "graph-form .pt2 export is not supported for spin models"
+            )
+        if with_comm_dict:
+            raise NotImplementedError(
+                "graph-form .pt2 export does not support the with-comm artifact "
+                "(multi-rank graph message passing is a later PR)"
+            )
+        if not hasattr(model, "forward_lower_graph_exportable"):
+            raise NotImplementedError(
+                f"model {type(model).__name__} has no "
+                "forward_lower_graph_exportable; graph-form .pt2 export "
+                "requires an energy model"
+            )
+
+        # The edge axis is DYNAMIC (B2.0): the AOTI artifact accepts any edge
+        # count, so there is no capacity to bake. The trace sample is built at a
+        # concrete, padded edge size only to keep the trace tensors distinct
+        # from the other dynamic dims (nframes=2, N=14) under torch.export's
+        # duck-sizing; the value itself does NOT constrain runtime.
+        nloc_sample = 7
+        nnei = sum(model.get_sel())
+        e_sample = math.ceil(1.25 * nloc_sample * nnei)
+
+        # make_fx traces on CPU; the .pt2 C++ ABI is float64-only.  Pass device
+        # and dtype explicitly instead of mutating the module-level env.DEVICE.
+        sample_inputs = build_synthetic_graph_inputs(
+            model,
+            e_max=e_sample,
+            nframes=2,
+            nloc=nloc_sample,
+            dtype=torch.float64,
+            device=torch.device("cpu"),
+        )
+
+        (
+            atype_g,
+            n_node_g,
+            edge_index_g,
+            edge_vec_g,
+            edge_mask_g,
+            fparam_g,
+            aparam_g,
+            charge_spin_g,
+        ) = sample_inputs
+
+        # Trace via make_fx on CPU (decomposes autograd.grad into aten ops).
+        traced = model.forward_lower_graph_exportable(
+            atype_g,
+            n_node_g,
+            edge_index_g,
+            edge_vec_g,
+            edge_mask_g,
+            fparam=fparam_g,
+            aparam=aparam_g,
+            do_atomic_virial=do_atomic_virial,
+            charge_spin=charge_spin_g,
+            tracing_mode="symbolic",
+            _allow_non_fake_inputs=True,
+        )
+        sample_out = traced(
+            atype_g,
+            n_node_g,
+            edge_index_g,
+            edge_vec_g,
+            edge_mask_g,
+            fparam_g,
+            aparam_g,
+            charge_spin_g,
+        )
+        output_keys = list(sample_out.keys())
+
+        dynamic_shapes = _build_graph_dynamic_shapes(*sample_inputs)
+        exported = torch.export.export(
+            traced,
+            sample_inputs,
+            dynamic_shapes=dynamic_shapes,
+            strict=False,
+            prefer_deferred_runtime_asserts_over_guards=True,
+        )
+
+        # Neutralise shape-guard assertion nodes on the dynamic edge axis.
+        # ``prefer_deferred_runtime_asserts_over_guards=True`` converts the
+        # symbolic-shape guards discovered while tracing into deferred
+        # ``aten._assert_scalar`` nodes; on the dynamic ``E`` axis these are the
+        # SIGFPE-prone ``nloc_min``-family checks (CLAUDE.md AOTI pitfalls) that
+        # the dense spin path already strips. Replacing each condition with
+        # ``True`` (not erasing the node) keeps the graph well-formed while
+        # letting the AOTI artifact generalise across edge counts.
+        _strip_shape_assertions(exported.graph_module)
+
+        if target_device.type != "cpu":
+            from torch.export.passes import (
+                move_to_device_pass,
+            )
+
+            exported = move_to_device_pass(exported, target_device)
+
+        metadata["do_atomic_virial"] = do_atomic_virial
+        # The edge axis is DYNAMIC (B2.0): the AOTI forward accepts any edge
+        # count, so there is no ``edge_capacity`` to persist. The C++ / Python
+        # conversion hub builds the carry-all graph at its exact (tight) edge
+        # count and feeds it straight through.
+
+        json_source = model_json_override if model_json_override is not None else data
+        data_for_json = deepcopy(json_source)
+        data_for_json = _numpy_to_json_serializable(data_for_json)
+
+        return exported, metadata, data_for_json, output_keys
 
     # 3. Create sample inputs on CPU for tracing
     # torch.export's duck-sizing unifies dimensions with the same sample value,
@@ -917,10 +1215,14 @@ def _deserialize_to_file_pte(
     data: dict,
     model_json_override: dict | None = None,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> None:
     """Deserialize a dictionary to a .pte model file."""
     exported, metadata, data_for_json, output_keys = _trace_and_export(
-        data, model_json_override, do_atomic_virial=do_atomic_virial
+        data,
+        model_json_override,
+        do_atomic_virial=do_atomic_virial,
+        lower_kind=lower_kind,
     )
 
     model_def_script = data.get("model_def_script") or {}
@@ -939,6 +1241,7 @@ def _deserialize_to_file_pt2(
     data: dict,
     model_json_override: dict | None = None,
     do_atomic_virial: bool = False,
+    lower_kind: str = "nlist",
 ) -> None:
     """Deserialize a dictionary to a .pt2 model file (AOTInductor).
 
@@ -976,7 +1279,10 @@ def _deserialize_to_file_pt2(
 
     # First artifact: regular (no comm). Always produced.
     exported, metadata, data_for_json, output_keys = _trace_and_export(
-        data, model_json_override, do_atomic_virial=do_atomic_virial
+        data,
+        model_json_override,
+        do_atomic_virial=do_atomic_virial,
+        lower_kind=lower_kind,
     )
     metadata["output_keys"] = output_keys
 
diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h
index 68a553e29c..ddaea35646 100644
--- a/source/api_cc/include/DeepPotPTExpt.h
+++ b/source/api_cc/include/DeepPotPTExpt.h
@@ -308,10 +308,12 @@ class DeepPotPTExpt : public DeepPotBackend {
   bool do_atomic_virial;  // whether model was exported with atomic virial corr
   int nnei;               // expected nlist nnei dimension (= sum(sel))
   bool lower_input_is_edge_ = false;
+  bool lower_input_is_graph_ = false;
   NeighborListData nlist_data;
-  at::Tensor mapping_tensor;         // cached mapping tensor (LAMMPS path)
-  at::Tensor firstneigh_tensor;      // cached nlist tensor (LAMMPS path)
-  at::Tensor edge_index_tensor;      // cached local edge graph (LAMMPS path)
+  at::Tensor mapping_tensor;           // cached mapping tensor (LAMMPS path)
+  std::vector<std::int64_t> mapping_;  // cached mapping vector (LAMMPS path)
+  at::Tensor firstneigh_tensor;        // cached nlist tensor (LAMMPS path)
+  at::Tensor edge_index_tensor;        // cached local edge graph (LAMMPS path)
   at::Tensor edge_index_ext_tensor;  // cached extended edge graph (LAMMPS path)
   std::unique_ptr<torch::inductor::AOTIModelPackageLoader> loader;
   // Optional second AOTInductor artifact for the multi-rank GNN code
@@ -398,6 +400,30 @@ class DeepPotPTExpt : public DeepPotBackend {
       const torch::Tensor& aparam,
       const torch::Tensor& charge_spin);
 
+  /**
+   * @brief Run a NeighborGraph-schema ``.pt2`` (lower_input_kind="graph").
+   *
+   * Positional AOTI input order matches the Python export ABI:
+   * ``(atype, n_node, edge_index, edge_vec, edge_mask, [fparam], [aparam],
+   * [charge_spin])``.  Unlike the edge schema there is no ``coord`` and no
+   * ``edge_scatter_index`` input; node count is carried by ``n_node`` and the
+   * geometry is fully described by ``edge_vec``.
+   *
+   * @param[in] atype Per-node local types, shape ``(N,)`` int64.
+   * @param[in] n_node Per-frame node count, shape ``(nf,)`` int64.
+   * @param[in] edge_index Folded edge graph ``(2, E)`` int64 [src, dst].
+   * @param[in] edge_vec Edge vectors ``(E, 3)`` (neighbour - center).
+   * @param[in] edge_mask Physical-edge mask ``(E,)`` bool.
+   */
+  std::vector<torch::Tensor> run_model_graph(const torch::Tensor& atype,
+                                             const torch::Tensor& n_node,
+                                             const torch::Tensor& edge_index,
+                                             const torch::Tensor& edge_vec,
+                                             const torch::Tensor& edge_mask,
+                                             const torch::Tensor& fparam,
+                                             const torch::Tensor& aparam,
+                                             const torch::Tensor& charge_spin);
+
   /**
    * @brief Run the with-comm .pt2 artifact with comm tensors appended.
    *
diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h
index 643e53974a..02c25aa047 100644
--- a/source/api_cc/include/commonPT.h
+++ b/source/api_cc/include/commonPT.h
@@ -6,6 +6,8 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <map>
+#include <string>
 #include <type_traits>
 #include <vector>
 
@@ -364,6 +366,233 @@ inline EdgeTensorPack compactEdgeTensors(const torch::Tensor& edge_index,
   return pack;
 }
 
+struct GraphTensorPack {
+  torch::Tensor atype;
+  torch::Tensor n_node;
+  torch::Tensor edge_index;
+  torch::Tensor edge_vec;
+  torch::Tensor edge_mask;
+};
+
+/**
+ * @brief Build NeighborGraph input tensors from a host neighbor list
+ *        (single-rank, dynamic edge axis).
+ *
+ * Mirrors the edge schema but drops ``coord``/``edge_scatter_index`` and adds
+ * ``n_node``.  Edge construction is delegated to the existing
+ * ``createEdgeTensors``/``compactEdgeTensors`` helpers (same rcut filter,
+ * variable edge count and two masked dummy edges that keep the dynamic edge
+ * dimension non-empty); the wrapper then (a) drops the extended scatter index,
+ * (b) emits ``n_node = [nloc]`` for the single frame, and (c) sets the node
+ * types from the local slice of ``atype_ext``.
+ *
+ * @param nlist Neighbor-list rows (local idx into the extended set).
+ * @param coord Extended coordinates shaped as nall x 3.
+ * @param atype_ext Extended atom types, length nall.  Node types are taken from
+ *   the extended types (NOT ``atype[mapping]``); for single-rank ghost-free
+ *   this is just ``atype_ext[0:nloc]``, while multi-rank (B3) passes the halo
+ *   types directly.
+ * @param mapping Extended-to-local atom map, length nall.
+ * @param nloc Number of local atoms.
+ * @param nall Number of extended atoms.
+ * @param rcut Model cutoff (edges with ``rr > rcut**2`` are dropped).
+ * @param device Target device for the returned tensors.
+ * @param row_centers Optional center atom index for each neighbor-list row
+ *   (LAMMPS compacts away empty rows); ``nullptr`` means row i is center i.
+ * @param fold_to_local Whether ghost neighbours are folded onto their local
+ *   owners (single-rank, ``N == nloc``, ``n_node = [nloc]``, node types from
+ *   ``atype_ext[0:nloc]``) or kept as distinct extended nodes (multi-rank,
+ *   ``N == nall``, ``n_node = [nall]``, node types from the full ``atype_ext``
+ *   including the real halo types — the #5583 invariant).  In the multi-rank
+ *   case ``edge_index`` indexes the extended atoms directly, so ghost reaction
+ *   forces land on the ghost rows and are folded to their owners by LAMMPS
+ *   reverse-comm (no with-comm artifact / no border_op — dpa1 is non-MP).
+ */
+template <typename VALUETYPE>
+inline GraphTensorPack buildGraphTensors(
+    const std::vector<std::vector<int>>& nlist,
+    const std::vector<VALUETYPE>& coord,
+    const std::vector<int>& atype_ext,
+    const std::vector<std::int64_t>& mapping,
+    const int nloc,
+    const int nall,
+    const double rcut,
+    const torch::Device& device,
+    const std::vector<int>* row_centers = nullptr,
+    const bool fold_to_local = true) {
+  auto int_options = torch::TensorOptions().dtype(torch::kInt64);
+
+  // 1. Cached-style topology only (no geometry): when fold_to_local=true,
+  //    edge_index folds ghost neighbours onto their local owners (single-rank);
+  //    when false, edge_index indexes the extended atoms directly (multi-rank).
+  //    edge_index_ext always keeps extended indices for the on-device geometry
+  //    recompute.
+  const EdgeTensorPack topo =
+      createEdgeTensors(nlist, coord, mapping, nloc, nall, device,
+                        /*with_geometry=*/false, row_centers, fold_to_local);
+
+  // 2. Recompute geometry from the current coords on-device, filter by rcut and
+  //    append the two masked dummy edges.  The model is compiled for float64
+  //    inputs, so build the coord tensor as float64 to match the edge path.
+  std::vector<double> coord_d(coord.begin(), coord.end());
+  at::Tensor coord_tensor =
+      torch::from_blob(coord_d.data(), {static_cast<std::int64_t>(nall), 3},
+                       torch::TensorOptions().dtype(torch::kFloat64))
+          .clone()
+          .to(device);
+  const EdgeTensorPack edges = compactEdgeTensors(
+      topo.edge_index, topo.edge_index_ext, coord_tensor, rcut);
+
+  GraphTensorPack pack;
+  pack.edge_index = edges.edge_index;  // (2, E): local-folded or extended
+  pack.edge_vec = edges.edge_vec;      // (E, 3) neighbour - center
+  pack.edge_mask = edges.edge_mask;    // (E,) bool
+  // Single-rank: N == nloc (ghosts folded onto owners).  Multi-rank: N == nall
+  // (ghosts are distinct nodes whose features come from their real halo types).
+  const std::int64_t n_node_count = fold_to_local ? nloc : nall;
+  pack.n_node = torch::full({1}, n_node_count, int_options).to(device);
+  // Node types from the extended types (NOT atype[mapping]): the local slice
+  // for single-rank, the full extended set (incl. real halo types) for
+  // multi-rank.
+  std::vector<std::int64_t> atype_nodes(atype_ext.begin(),
+                                        atype_ext.begin() + n_node_count);
+  pack.atype = torch::from_blob(atype_nodes.data(), {n_node_count}, int_options)
+                   .clone()
+                   .to(device);
+  return pack;
+}
+
+/**
+ * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense
+ *        internal-key layout the rest of ``compute`` consumes.
+ *
+ * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and
+ * emits flat-N PUBLIC keys:
+ *   - ``atom_energy`` (N, 1)      per-atom energy        (N == nloc)
+ *   - ``energy``      (nf, 1)     reduced total energy
+ *   - ``force``       (N, 3)      per-atom force (ghosts already folded onto
+ *                                 their local owners via ``edge_index``)
+ *   - ``virial``      (nf, 9)     reduced total virial
+ *   - ``atom_virial`` (N, 9)      per-atom (full-to-src) virial
+ *
+ * The downstream extraction in ``DeepPotPTExpt::compute`` was written for the
+ * dense forward's internal keys with their extra dims:
+ *   ``energy_redu`` (nf,1), ``energy_derv_c_redu`` (nf,1,9),
+ *   ``energy_derv_r`` (nf,nall,1,3), ``energy`` (nf,nloc,1),
+ *   ``energy_derv_c`` (nf,nall,1,9).
+ *
+ * This helper rewrites the public keys into those internal keys (single frame,
+ * nf == 1).  The per-atom force / atom-virial are LOCAL (nloc rows); they are
+ * zero-padded up to the extended length ``nall`` so the existing fold-back
+ * (``fold_back`` / ``select_map``) is a no-op on the ghost rows — the local
+ * rows already carry the folded ghost contributions, so zero ghosts avoid
+ * double counting (and keep LAMMPS reverse-comm correct).
+ *
+ * **Single-rank only.**  Multi-rank inference (B3.2) must NOT call this
+ * function: ghost/halo forces are real cross-rank contributions that must be
+ * returned as-is and folded back via reverse-comm rather than being zeroed.
+ * Calling this function on a multi-rank result would silently zero those forces
+ * and produce wrong energetics. Pass ``single_rank = false`` to get an
+ * explicit exception instead of silent corruption.
+ *
+ * @param[in,out] output_map Output tensor map (public keys in, internal keys
+ *   added).
+ * @param[in] nloc Number of local atoms (== N, the graph node count).
+ * @param[in] nall Extended atom count to pad the per-atom outputs up to.
+ * @param[in] atomic Whether atomic energy / virial were requested.
+ * @param[in] single_rank Must be true; throws deepmd_exception if false.
+ */
+inline void remap_graph_outputs_to_dense_keys(
+    std::map<std::string, torch::Tensor>& output_map,
+    const std::int64_t nloc,
+    const std::int64_t nall,
+    const bool atomic,
+    const bool single_rank = true) {
+  if (!single_rank) {
+    throw deepmd::deepmd_exception(
+        "remap_graph_outputs_to_dense_keys is single-rank-only; multi-rank "
+        "uses the extended-region reverse-comm fold (PR-B3.2)");
+  }
+  using torch::indexing::Slice;
+  const std::int64_t nf = 1;
+  const auto& energy_pub = output_map.at("energy");  // (nf, 1)
+  const auto& force_pub = output_map.at("force");    // (N, 3), N == nloc
+  const auto& virial_pub = output_map.at("virial");  // (nf, 9)
+
+  output_map["energy_redu"] = energy_pub.reshape({nf, 1});
+  output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9});
+
+  // Local force -> (nf, nall, 1, 3) with zero ghost rows.
+  auto force_full = torch::zeros({nf, nall, 1, 3}, force_pub.options());
+  force_full.index_put_({0, Slice(0, nloc), 0}, force_pub);
+  output_map["energy_derv_r"] = force_full;
+
+  if (atomic) {
+    const auto& atom_energy_pub = output_map.at("atom_energy");  // (N, 1)
+    const auto& atom_virial_pub = output_map.at("atom_virial");  // (N, 9)
+    output_map["energy"] = atom_energy_pub.reshape({nf, nloc, 1});
+    auto atom_virial_full =
+        torch::zeros({nf, nall, 1, 9}, atom_virial_pub.options());
+    atom_virial_full.index_put_({0, Slice(0, nloc), 0}, atom_virial_pub);
+    output_map["energy_derv_c"] = atom_virial_full;
+  }
+}
+
+/**
+ * @brief Remap NeighborGraph public outputs onto the dense internal-key layout
+ *        for the MULTI-RANK (extended-region) non-message-passing path.
+ *
+ * Built with ``fold_to_local=false``, the graph has ``N == nall`` nodes: ghost
+ * (halo) atoms are distinct nodes, so the per-node ``force`` is already the
+ * EXTENDED force (one row per extended atom).  Ghost reaction forces stay on
+ * their ghost rows and are folded back to their owning rank by LAMMPS
+ * reverse-comm — exactly as the dense path returns its extended force.  No
+ * zero-padding (unlike the single-rank helper) and no with-comm artifact (dpa1
+ * is non-MP).
+ *
+ * Key differences from the single-rank helper:
+ *   - ``energy_redu`` = sum of the LOCAL atom energies
+ * (``atom_energy[0:nloc]``) ONLY.  The public ``energy`` key reduces over all
+ * ``N == nall`` nodes, which would double-count the bias energy of ghost nodes
+ * that belong to other ranks (ghost nodes have no center edges, so they carry a
+ * bias-only energy and zero force/virial gradient — harmless for force/virial
+ * but wrong for the owned energy).
+ *   - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no
+ * padding).
+ *
+ * @param[in,out] output_map Output tensor map (public keys in, internal keys
+ *   added).
+ * @param[in] nloc Number of local atoms (owned by this rank).
+ * @param[in] nall Extended atom count (== N, the graph node count).
+ * @param[in] atomic Whether atomic energy / virial were requested.
+ */
+inline void remap_graph_outputs_to_dense_keys_extended(
+    std::map<std::string, torch::Tensor>& output_map,
+    const std::int64_t nloc,
+    const std::int64_t nall,
+    const bool atomic) {
+  using torch::indexing::Slice;
+  const std::int64_t nf = 1;
+  const auto& atom_energy_pub = output_map.at("atom_energy");  // (N==nall, 1)
+  const auto& force_pub = output_map.at("force");    // (N==nall, 3) extended
+  const auto& virial_pub = output_map.at("virial");  // (nf, 9)
+
+  // Owned energy = sum over LOCAL atoms only; ghost nodes carry bias-only
+  // energy belonging to other ranks.
+  output_map["energy_redu"] =
+      atom_energy_pub.index({Slice(0, nloc)}).sum().reshape({nf, 1});
+  output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9});
+  // Extended force: ghost rows stay distinct for LAMMPS reverse-comm fold-back.
+  output_map["energy_derv_r"] = force_pub.reshape({nf, nall, 1, 3});
+
+  if (atomic) {
+    const auto& atom_virial_pub = output_map.at("atom_virial");  // (N==nall, 9)
+    output_map["energy"] =
+        atom_energy_pub.index({Slice(0, nloc)}).reshape({nf, nloc, 1});
+    output_map["energy_derv_c"] = atom_virial_pub.reshape({nf, nall, 1, 9});
+  }
+}
+
 }  // namespace deepmd
 
 #endif  // BUILD_PYTORCH
diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc
index 96033fcab4..315f4cc39b 100644
--- a/source/api_cc/src/DeepPotPTExpt.cc
+++ b/source/api_cc/src/DeepPotPTExpt.cc
@@ -155,8 +155,10 @@ void DeepPotPTExpt::init(const std::string& model,
     const std::string lower_input_kind =
         metadata["lower_input_kind"].as_string();
     lower_input_is_edge_ = lower_input_kind == "edge_vec";
+    lower_input_is_graph_ = lower_input_kind == "graph";
   } else {
     lower_input_is_edge_ = false;
+    lower_input_is_graph_ = false;
   }
 
   type_map.clear();
@@ -289,6 +291,31 @@ std::vector<torch::Tensor> DeepPotPTExpt::run_model_edges(
   return loader->run(inputs);
 }
 
+std::vector<torch::Tensor> DeepPotPTExpt::run_model_graph(
+    const torch::Tensor& atype,
+    const torch::Tensor& n_node,
+    const torch::Tensor& edge_index,
+    const torch::Tensor& edge_vec,
+    const torch::Tensor& edge_mask,
+    const torch::Tensor& fparam,
+    const torch::Tensor& aparam,
+    const torch::Tensor& charge_spin) {
+  // NeighborGraph ABI: (atype, n_node, edge_index, edge_vec, edge_mask,
+  // [fparam], [aparam], [charge_spin]).  No coord, no edge_scatter_index.
+  std::vector<torch::Tensor> inputs = {atype, n_node, edge_index, edge_vec,
+                                       edge_mask};
+  if (dfparam > 0) {
+    inputs.push_back(fparam);
+  }
+  if (daparam > 0) {
+    inputs.push_back(aparam);
+  }
+  if (dchgspin > 0) {
+    inputs.push_back(charge_spin);
+  }
+  return loader->run(inputs);
+}
+
 std::vector<torch::Tensor> DeepPotPTExpt::run_model_with_comm(
     const torch::Tensor& coord,
     const torch::Tensor& atype,
@@ -475,6 +502,24 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   bool multi_rank = (lmp_list.nprocs > 1);
   bool atom_map_present = (lmp_list.mapping != nullptr);
   bool use_with_comm = has_comm_artifact_ && multi_rank;
+  // NeighborGraph multi-rank dispatch:
+  //   - NON-message-passing (dpa1, se_e2_a, ...): the SAME single-rank graph
+  //     .pt2 runs on the EXTENDED region (fold_to_local=false; ghosts are
+  //     distinct nodes whose features come from their real halo types).  No
+  //     with-comm artifact / no border_op is needed; ghost reaction forces are
+  //     folded to their owners by LAMMPS reverse-comm.  Handled below.
+  //   - message-passing graph (DPA2/DPA3, PR-G): would need a with-comm graph
+  //     artifact for cross-rank ghost-feature exchange — not yet supported.
+  //     Fail fast before building any tensors so callers get a clear message
+  //     instead of a wrong answer.
+  if (lower_input_is_graph_ && multi_rank && has_message_passing_) {
+    throw deepmd::deepmd_exception(
+        "Multi-rank message-passing graph (NeighborGraph) .pt2 inference is "
+        "not yet supported (PR-G). Non-message-passing graph models (e.g. "
+        "dpa1) run multi-rank on the extended-region single-rank artifact; "
+        "for message-passing models run single-rank, or use a dense/edge "
+        ".pt2 for multi-rank LAMMPS.");
+  }
   // Decision matrix (see PR #5450 description):
   //   non-GNN model (has_message_passing_ == false): regular path is
   //                                                  always safe.
@@ -505,19 +550,25 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   // LAMMPS sets ago=0 on every nlist rebuild (neighbor rebuild, re-partition,
   // atom exchange between subdomains), so `ago > 0` implies the cached
   // mapping and nlist tensors are still valid.  Rebuild only on ago==0.
-  std::vector<std::int64_t> mapping;
   if (ago == 0) {
     nlist_data.copy_from_nlist(lmp_list, nall - nghost);
     nlist_data.shuffle_exclude_empty(fwd_map);
 
-    // Rebuild mapping tensor
+    // Rebuild mapping vector and tensor (cached as members). ``mapping_tensor``
+    // is consumed every step by the dense ``run_model`` (ghost-feature gather);
+    // the ``mapping_`` vector is read only here at ago==0 -- to build that
+    // tensor and, for the edge/graph paths, to fold ghost neighbours onto their
+    // local owners inside ``createEdgeTensors``.  (The graph path used to read
+    // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now
+    // caches the topology at ago==0 like the edge/dense paths, so no per-step
+    // read.)
     if (lmp_list.mapping) {
-      mapping.resize(nall_real);
+      mapping_.resize(nall_real);
       for (int ii = 0; ii < nall_real; ii++) {
-        mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]];
+        mapping_[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]];
       }
       mapping_tensor =
-          torch::from_blob(mapping.data(), {1, nall_real}, int_option)
+          torch::from_blob(mapping_.data(), {1, nall_real}, int_option)
               .clone()
               .to(device);
     } else {
@@ -530,12 +581,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       //     features via border_op and ignores this tensor for ghost
       //     gather — see deepmd/pt_expt/descriptor/
       //     repflows.py::_exchange_ghosts).
-      mapping.resize(nall_real);
+      mapping_.resize(nall_real);
       for (int ii = 0; ii < nall_real; ii++) {
-        mapping[ii] = ii;
+        mapping_[ii] = ii;
       }
       mapping_tensor =
-          torch::from_blob(mapping.data(), {1, nall_real}, int_option)
+          torch::from_blob(mapping_.data(), {1, nall_real}, int_option)
               .clone()
               .to(device);
     }
@@ -551,11 +602,25 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
       // their features can be exchanged across ranks via border_op, instead of
       // being folded onto a local owner that this rank does not own.
       const auto edge_tensors = createEdgeTensors(
-          nlist_data.jlist, dcoord, mapping, nloc, nall_real, device,
+          nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device,
           /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist,
           /*fold_to_local=*/!use_with_comm);
       edge_index_tensor = edge_tensors.edge_index;
       edge_index_ext_tensor = edge_tensors.edge_index_ext;
+    } else if (lower_input_is_graph_) {
+      // Cache only the real skin topology, exactly like the edge path: the
+      // geometry (edge_vec) + rcut filter are recomputed on-device every step
+      // by compactEdgeTensors, so the O(E) host loop + H2D copy in
+      // createEdgeTensors runs ONLY on a LAMMPS nlist rebuild (ago==0), not
+      // every step.  Single-rank folds ghosts onto local owners
+      // (fold_to_local=true); non-MP multi-rank keeps the extended region
+      // (fold_to_local=false) so ghost forces reverse-comm to their owners.
+      const auto edge_tensors = createEdgeTensors(
+          nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device,
+          /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist,
+          /*fold_to_local=*/!multi_rank);
+      edge_index_tensor = edge_tensors.edge_index;
+      edge_index_ext_tensor = edge_tensors.edge_index_ext;
     } else {
       nlist_data.padding();
       firstneigh_tensor = createNlistTensor(nlist_data.jlist, nnei)
@@ -771,6 +836,49 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                           edge_tensors.edge_index, edge_tensors.edge_vec,
                           edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                           fparam_tensor, aparam_tensor, charge_spin_tensor);
+    } else if (lower_input_is_graph_) {
+      if (nall_real == 0) {
+        // Truly-empty rank (no local atoms AND no ghosts): the graph would emit
+        // N == 0 nodes, and edge_force_virial's ``edge_index % node_capacity``
+        // would divide by zero (SIGFPE) -- it also violates the exported
+        // ``Dim("n_node_total", min=1)``.  Such a rank contributes nothing, so
+        // fill zero outputs and return instead of running the model.  (The
+        // tested ``nloc == 0`` empty-subdomain case has ``nall_real > 0`` --
+        // ghosts within rcut -- so it still runs the model normally.)
+        ener.assign(nframes, static_cast<ENERGYTYPE>(0));
+        force.assign(static_cast<size_t>(nframes) * fwd_map.size() * 3,
+                     static_cast<VALUETYPE>(0));
+        virial.assign(static_cast<size_t>(nframes) * 9,
+                      static_cast<VALUETYPE>(0));
+        if (atomic) {
+          atom_energy.assign(static_cast<size_t>(nframes) * fwd_map.size(),
+                             static_cast<VALUETYPE>(0));
+          atom_virial.assign(static_cast<size_t>(nframes) * fwd_map.size() * 9,
+                             static_cast<VALUETYPE>(0));
+        }
+        return;
+      }
+      // NeighborGraph schema: recompute geometry + rcut filter on-device from
+      // the cached skin topology (edge_index[_ext]_tensor built at ago==0),
+      // then assemble the cheap node tensors.  Mirrors the edge path -- no
+      // per-step host rebuild / H2D copy.  Single-rank folds ghosts onto local
+      // owners (N == nloc); multi-rank (non-MP only — the fail-fast above
+      // blocks MP graph multi-rank) keeps the extended region (N == nall_real,
+      // node types from the real halo types) so LAMMPS reverse-comm folds ghost
+      // forces back.  The node types come from the on-device extended
+      // atype_Tensor slice (== atype_ext[0:N]); n_node is a 1-element tensor.
+      const auto edge_tensors =
+          compactEdgeTensors(edge_index_tensor, edge_index_ext_tensor,
+                             coord_Tensor, static_cast<double>(rcut));
+      const std::int64_t n_node_count = multi_rank ? nall_real : nloc;
+      at::Tensor n_node_tensor =
+          torch::full({1}, n_node_count, int_option).to(device);
+      at::Tensor node_atype =
+          atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count});
+      flat_outputs =
+          run_model_graph(node_atype, n_node_tensor, edge_tensors.edge_index,
+                          edge_tensors.edge_vec, edge_tensors.edge_mask,
+                          fparam_tensor, aparam_tensor, charge_spin_tensor);
     } else {
       flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor,
                                mapping_tensor, fparam_tensor, aparam_tensor,
@@ -782,6 +890,24 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   std::map<std::string, torch::Tensor> output_map;
   extract_outputs(output_map, flat_outputs);
 
+  if (lower_input_is_graph_) {
+    // The graph forward emits flat-N PUBLIC keys (atom_energy/energy/force/
+    // virial/atom_virial); rewrite them into the dense internal-key layout the
+    // downstream extraction/fold-back expects.
+    if (multi_rank) {
+      // Extended region (N == nall_real): force is already per-extended-atom,
+      // owned energy = sum over local atom energies, no zero-padding.  Ghost
+      // forces fold back via LAMMPS reverse-comm (no with-comm artifact).
+      deepmd::remap_graph_outputs_to_dense_keys_extended(output_map, nloc,
+                                                         nall_real, atomic);
+    } else {
+      // Single-rank (N == nloc): ghosts folded onto owners; pad the per-atom
+      // force/virial up to nall_real with zero ghost rows.
+      deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real,
+                                                atomic, /*single_rank=*/true);
+    }
+  }
+
   if (phantom_n > 0) {
     // Strip the phantom local prefix and zero the empty rank's energy.  The
     // phantom atoms carry no edges, so their force / per-atom virial are
@@ -1015,9 +1141,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
           .to(device);
   at::Tensor nlist_tensor;
   EdgeTensorPack edge_tensors;
+  GraphTensorPack graph_tensors;
   if (lower_input_is_edge_) {
     edge_tensors = createEdgeTensors(nlist_raw, coord_cpy_d, mapping_64, nloc,
                                      nall, device);
+  } else if (lower_input_is_graph_) {
+    // Standalone (no nlist) graph schema: build_nlist already cut at rcut and
+    // keys row i to center i, so no row_centers remapping is needed.
+    graph_tensors =
+        buildGraphTensors(nlist_raw, coord_cpy_d, atype_cpy, mapping_64, nloc,
+                          nall, static_cast<double>(rcut), device);
   } else {
     nlist_tensor =
         createNlistTensor(nlist_raw, nnei).to(torch::kInt64).to(device);
@@ -1104,6 +1237,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
                         edge_tensors.edge_index, edge_tensors.edge_vec,
                         edge_tensors.edge_index_ext, edge_tensors.edge_mask,
                         fparam_tensor, aparam_tensor, charge_spin_tensor);
+  } else if (lower_input_is_graph_) {
+    flat_outputs = run_model_graph(
+        graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index,
+        graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor,
+        aparam_tensor, charge_spin_tensor);
   } else {
     flat_outputs =
         run_model(coord_Tensor, atype_Tensor, nlist_tensor, mapping_tensor,
@@ -1114,6 +1252,17 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener,
   std::map<std::string, torch::Tensor> output_map;
   extract_outputs(output_map, flat_outputs);
 
+  if (lower_input_is_graph_) {
+    // The graph forward emits LOCAL public keys; rewrite them into the dense
+    // internal-key layout used below.  nloc == N (graph node count); pad the
+    // per-atom force/virial up to the extended nall with zero ghost rows so the
+    // fold-back is a no-op on ghosts.
+    // single_rank=true: the standalone (build_nlist) path is always
+    // single-rank; there is no comm_dict / cross-rank ghost exchange here.
+    deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic,
+                                              /*single_rank=*/true);
+  }
+
   // 7. Extract energy
   torch::Tensor flat_energy_ =
       output_map["energy_redu"].view({-1}).to(torch::kCPU);
diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
new file mode 100644
index 0000000000..c57abaf0a0
--- /dev/null
+++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Test C++ inference for the NeighborGraph (graph-schema) .pt2 path of the
+// pt_expt backend.  The graph model is a dpa1(attn_layer=0) descriptor exported
+// with lower_kind="graph" (gen_dpa1.py section B); this is the FIRST runtime
+// exercise of the C++ graph ingestion added in PR-B Phase B2
+// (lower_input_is_graph_ / run_model_graph / buildGraphTensors / the
+// compute_inner graph branch).
+//
+// Reference values (deeppot_dpa1_graph.expected) come from an INDEPENDENT
+// nlist (dense-quartet) evaluation of the same weights, so a match validates
+// the graph AOTI ABI/geometry, not just self-consistency.  A second, persisted
+// nlist .pt2 of the same weights (deeppot_dpa1_graph_nlist_ref.pt2) is loaded
+// alongside the graph model so arbitrary system sizes (dynamic edge axis) can
+// be cross-checked graph≈dense live without baking more reference blocks.
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "DeepPot.h"
+#include "DeepPotPTExpt.h"
+#include "expected_ref.h"
+#include "neighbor_list.h"
+#include "test_utils.h"
+
+namespace {
+constexpr const char* kGraphModel = "../../tests/infer/deeppot_dpa1_graph.pt2";
+constexpr const char* kNlistRefModel =
+    "../../tests/infer/deeppot_dpa1_graph_nlist_ref.pt2";
+constexpr const char* kRefPath =
+    "../../tests/infer/deeppot_dpa1_graph.expected";
+}  // namespace
+
+template <class VALUETYPE>
+class TestInferDpa1GraphPtExpt : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+  // Per-atom reference (energy/force/virial) loaded from the .expected sidecar.
+  std::vector<VALUETYPE> expected_e;
+  std::vector<VALUETYPE> expected_f;
+  std::vector<VALUETYPE> expected_v;
+  int natoms;
+  double expected_tot_e;
+  std::vector<VALUETYPE> expected_tot_v;
+
+  // Graph-schema model under test.
+  static deepmd::DeepPot dp;
+  // Independent nlist (dense) model with identical weights — used as a live
+  // graph≈dense oracle for arbitrary system sizes.
+  static deepmd::DeepPot dp_ref;
+
+  static void SetUpTestSuite() {
+#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT
+    dp.init(kGraphModel);
+    dp_ref.init(kNlistRefModel);
+#endif
+  }
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT
+    GTEST_SKIP() << "Skip because PyTorch support is not enabled.";
+#endif
+    deepmd_test::ExpectedRef ref;
+    ref.load(kRefPath);
+    expected_e = ref.get<VALUETYPE>("pbc", "expected_e");
+    expected_f = ref.get<VALUETYPE>("pbc", "expected_f");
+    expected_v = ref.get<VALUETYPE>("pbc", "expected_v");
+
+    natoms = expected_e.size();
+    EXPECT_EQ(natoms * 3, static_cast<int>(expected_f.size()));
+    EXPECT_EQ(natoms * 9, static_cast<int>(expected_v.size()));
+    expected_tot_e = 0.;
+    expected_tot_v.assign(9, 0.);
+    for (int ii = 0; ii < natoms; ++ii) {
+      expected_tot_e += expected_e[ii];
+    }
+    for (int ii = 0; ii < natoms; ++ii) {
+      for (int dd = 0; dd < 9; ++dd) {
+        expected_tot_v[dd] += expected_v[ii * 9 + dd];
+      }
+    }
+  };
+
+  void TearDown() override {};
+
+  static void TearDownTestSuite() {
+    dp = deepmd::DeepPot();
+    dp_ref = deepmd::DeepPot();
+  }
+};
+
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1GraphPtExpt<VALUETYPE>::dp;
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDpa1GraphPtExpt<VALUETYPE>::dp_ref;
+
+TYPED_TEST_SUITE(TestInferDpa1GraphPtExpt, ValueTypes);
+
+// Case 1: DeepPot builds its own neighbor list and runs the standalone graph
+// branch (lower_input_is_graph_, build_nlist -> buildGraphTensors).  Validates
+// the graph AOTI ABI/geometry against the independent nlist reference.
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial;
+  dp.compute(ener, force, virial, coord, atype, box);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+// Case 2: a SECOND, larger system (12 atoms, different edge count) through the
+// SAME loaded graph model — proves the dynamic edge axis works in C++.  The
+// graph result is cross-checked against the dense nlist .pt2 (same weights);
+// at non-binding sel they must agree bit-for-bit (fp64 ~1e-10).
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_sys2_dynamic_edges) {
+  using VALUETYPE = TypeParam;
+  deepmd::DeepPot& dp = this->dp;
+  deepmd::DeepPot& dp_ref = this->dp_ref;
+
+  // 12 atoms: original 6 stacked with a +13 z-shifted copy, box doubled in z.
+  // Same local density as the 6-atom fixture, so per-atom neighbor counts stay
+  // far below sel=30 and graph(carry-all) == dense(sel-truncated).
+  std::vector<VALUETYPE> coord2 = {
+      12.83, 2.56, 2.18,  12.09, 2.87, 2.74,  00.25, 3.32, 1.68,
+      3.36,  3.00, 1.81,  3.51,  2.51, 2.60,  4.27,  3.22, 1.56,
+      12.83, 2.56, 15.18, 12.09, 2.87, 15.74, 00.25, 3.32, 14.68,
+      3.36,  3.00, 14.81, 3.51,  2.51, 15.60, 4.27,  3.22, 14.56};
+  std::vector<int> atype2 = {0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box2 = {13., 0., 0., 0., 13., 0., 0., 0., 26.};
+  int natoms2 = atype2.size();
+
+  double ener_g, ener_r;
+  std::vector<VALUETYPE> force_g, virial_g, force_r, virial_r;
+  dp.compute(ener_g, force_g, virial_g, coord2, atype2, box2);
+  dp_ref.compute(ener_r, force_r, virial_r, coord2, atype2, box2);
+
+  EXPECT_EQ(force_g.size(), static_cast<size_t>(natoms2 * 3));
+  EXPECT_EQ(virial_g.size(), 9u);
+
+  EXPECT_LT(fabs(ener_g - ener_r), EPSILON);
+  for (int ii = 0; ii < natoms2 * 3; ++ii) {
+    EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial_g[ii] - virial_r[ii]), EPSILON);
+  }
+}
+
+// Case 3 (CRITICAL): exercise the LAMMPS compute_inner graph branch with an
+// explicit InputNlist and the `ago` cache.  Calling compute twice WITHOUT
+// rebuilding the nlist — first ago=0 (rebuild), then ago=1 (reuse) — must give
+// identical results.  This is the only case that hits compute_inner + the
+// member-cached mapping_ vector; the build-nlist cases above never touch it.
+// Regression guard for the OOB-on-ago>0 bug fixed by caching mapping_ as a
+// member (commit 7c70db47b).
+TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  // The graph branch folds ghost neighbours onto their local owners via the
+  // LAMMPS atom-map; without it periodic (ghost) edges would be dropped.
+  inlist.mapping = mapping.data();
+
+  // ago=0: rebuild the cached nlist/mapping, then run the graph branch.
+  double ener;
+  std::vector<VALUETYPE> force_, virial;
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 0);
+  std::vector<VALUETYPE> force;
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+
+  // ago=1: reuse the cached nlist/mapping (NO rebuild).  Must match again.
+  // This is the path that previously read the local mapping vector OOB.
+  ener = 0.;
+  std::fill(force_.begin(), force_.end(), 0.0);
+  std::fill(virial.begin(), virial.end(), 0.0);
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 1);
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+// Case 5: exercise the DeepPot::compute ATOMIC overload on the graph .pt2.
+// This is the first test to reach the ``if (atomic)`` branch inside
+// remap_graph_outputs_to_dense_keys (the atom_energy/atom_virial remapping).
+// The per-atom reference values are already loaded from
+// deeppot_dpa1_graph.expected into this->expected_e and this->expected_v by
+// SetUp().
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_e = this->expected_e;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  std::vector<VALUETYPE>& expected_v = this->expected_v;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+
+  double ener;
+  std::vector<VALUETYPE> force, virial, atom_energy, atom_virial;
+  // Standalone atomic overload: DeepPot builds its own nlist (graph branch),
+  // then returns per-atom energy + atom-virial alongside total
+  // energy/force/virial.
+  dp.compute(ener, force, virial, atom_energy, atom_virial, coord, atype, box);
+
+  EXPECT_EQ(force.size(), static_cast<size_t>(natoms * 3));
+  EXPECT_EQ(virial.size(), 9u);
+  EXPECT_EQ(atom_energy.size(), static_cast<size_t>(natoms));
+  EXPECT_EQ(atom_virial.size(), static_cast<size_t>(natoms * 9));
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms; ++ii) {
+    EXPECT_LT(fabs(atom_energy[ii] - expected_e[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms * 9; ++ii) {
+    EXPECT_LT(fabs(atom_virial[ii] - expected_v[ii]), EPSILON);
+  }
+}
+
+// Case 4: a tiny system with no in-cutoff neighbors — only the two masked
+// dummy edges survive (nedge_min=2 guard / SIGFPE-edge family).  The graph
+// must run cleanly, produce finite, interaction-free output (zero force/virial)
+// and agree with the dense reference.
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_tiny_no_edges) {
+  using VALUETYPE = TypeParam;
+  deepmd::DeepPot& dp = this->dp;
+  deepmd::DeepPot& dp_ref = this->dp_ref;
+
+  // Two atoms ~33 apart in a 40-box: no neighbor within rcut=6 and no periodic
+  // image either, so the graph sees zero real edges (only the 2 dummy edges).
+  std::vector<VALUETYPE> coord_t = {1.0, 1.0, 1.0, 20.0, 20.0, 20.0};
+  std::vector<int> atype_t = {0, 1};
+  std::vector<VALUETYPE> box_t = {40., 0., 0., 0., 40., 0., 0., 0., 40.};
+  int natoms_t = atype_t.size();
+
+  double ener_g, ener_r;
+  std::vector<VALUETYPE> force_g, virial_g, force_r, virial_r;
+  ASSERT_NO_THROW(
+      dp.compute(ener_g, force_g, virial_g, coord_t, atype_t, box_t));
+  dp_ref.compute(ener_r, force_r, virial_r, coord_t, atype_t, box_t);
+
+  EXPECT_EQ(force_g.size(), static_cast<size_t>(natoms_t * 3));
+  EXPECT_EQ(virial_g.size(), 9u);
+
+  EXPECT_TRUE(std::isfinite(ener_g));
+  // No interactions: force and virial must vanish.
+  for (int ii = 0; ii < natoms_t * 3; ++ii) {
+    EXPECT_TRUE(std::isfinite(force_g[ii]));
+    EXPECT_LT(fabs(force_g[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 9; ++ii) {
+    EXPECT_TRUE(std::isfinite(virial_g[ii]));
+    EXPECT_LT(fabs(virial_g[ii]), EPSILON);
+  }
+  // graph == dense for the isolated-atom limit.
+  EXPECT_LT(fabs(ener_g - ener_r), EPSILON);
+  for (int ii = 0; ii < natoms_t * 3; ++ii) {
+    EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON);
+  }
+}
diff --git a/source/api_cc/tests/test_deeppot_universal.cc b/source/api_cc/tests/test_deeppot_universal.cc
index e0ee6fc8f4..1c599d7d33 100644
--- a/source/api_cc/tests/test_deeppot_universal.cc
+++ b/source/api_cc/tests/test_deeppot_universal.cc
@@ -143,6 +143,28 @@ std::vector<VariantDeepPotCase> variant_deeppot_cases() {
            /*supports_no_pbc_atomic=*/false,
            /*supports_no_pbc_lmp_nlist=*/true,
            /*supports_no_pbc_lmp_nlist_atomic=*/false},
+          {"dpa1_graph_ptexpt",
+           Backend::PTExpt,
+           "../../tests/infer/deeppot_dpa1_graph.pt2",
+           /*convert_pbtxt=*/false,
+           nullptr,
+           nullptr,
+           "../../tests/infer/deeppot_dpa1_graph.expected",
+           "pbc",
+           "nopbc",
+           1e-10,
+           1e-4,
+           /*supports_float=*/true,
+           /*supports_finite_difference=*/true,
+           /*supports_lmp_nlist=*/true,
+           /*supports_lmp_nlist_atomic=*/true,
+           /*supports_lmp_nlist_cutoff_twice=*/true,
+           /*supports_lmp_nlist_type_sel=*/true,
+           /*supports_print_summary=*/true,
+           /*supports_no_pbc_simple=*/true,
+           /*supports_no_pbc_atomic=*/false,
+           /*supports_no_pbc_lmp_nlist=*/true,
+           /*supports_no_pbc_lmp_nlist_atomic=*/false},
           {"dpa2_pytorch_pth",
            Backend::PyTorch,
            "../../tests/infer/deeppot_dpa2.pth",
diff --git a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
new file mode 100644
index 0000000000..ab898e15f6
--- /dev/null
+++ b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py
@@ -0,0 +1,319 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Test LAMMPS with the NeighborGraph (graph-schema) .pt2 DPA1 model.
+
+The model ``deeppot_dpa1_graph.pt2`` is a dpa1(attn_layer=0) descriptor
+exported with ``lower_kind="graph"`` (gen_dpa1.py section B).  dpa1 is
+NON-message-passing, so the SAME single-rank graph .pt2 also drives the
+multi-rank path: the C++ ``DeepPotPTExpt`` builds an EXTENDED-region graph
+(``fold_to_local=False``; ghosts are distinct nodes whose features come from
+their real halo types) and returns per-extended-atom forces, which LAMMPS
+reverse-comm folds back to their owners.  There is NO with-comm artifact and
+NO ``border_op`` (that is the message-passing PR-G path) — hence no
+``use_loc_mapping=False`` variant.
+
+Reference values come from ``source/tests/infer/gen_dpa1.py`` (the same
+``deeppot_dpa1_graph.expected`` the C++ gtest uses); the multi-rank run must
+match the single-rank reference for energy, per-atom force, and per-atom
+virial.  This is the core multi-rank correctness gate for the non-MP graph
+path implemented in B3.1.
+"""
+
+import importlib.util
+import os
+import shutil
+import subprocess as sp
+import sys
+import tempfile
+from pathlib import (
+    Path,
+)
+
+import constants
+import numpy as np
+import pytest
+from expected_ref import (
+    read_expected_ref,
+)
+from lammps import (
+    PyLammps,
+)
+from write_lmp_data import (
+    write_lmp_data,
+)
+
+pb_file = (
+    Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa1_graph.pt2"
+)
+ref_file = (
+    Path(__file__).parent.parent.parent
+    / "tests"
+    / "infer"
+    / "deeppot_dpa1_graph.expected"
+)
+# The MPI runner is backend-agnostic (DATAFILE PB_FILE OUTPUT + flags); reuse
+# the DPA3 driver verbatim rather than duplicate it.
+mpi_runner = Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py"
+
+data_file = Path(__file__).parent / "data_dpa1_graph_pt2.lmp"
+# Elongated-box variant for the empty-subdomain MPI corner: x extended to
+# 30 A while atoms stay in x in [0.25, 12.83]; with ``processors 2 1 1`` the
+# split at x = 15 leaves rank 1 with zero local atoms.
+data_file_empty_subdomain = (
+    Path(__file__).parent / "data_dpa1_graph_pt2_empty_subdomain.lmp"
+)
+
+# Reference values written by source/tests/infer/gen_dpa1.py (PBC case).
+# Guarded with try/except because gen_dpa1.py only runs when PyTorch is built.
+try:
+    _ref = read_expected_ref(ref_file)["pbc"]
+    expected_e = float(np.sum(_ref["expected_e"]))
+    expected_f = _ref["expected_f"].reshape(6, 3)
+    # LAMMPS uses the opposite sign convention for virial vs DeepPot.
+    expected_v = -_ref["expected_v"].reshape(6, 9)
+except FileNotFoundError:
+    expected_e = expected_f = expected_v = None
+
+# Gate the reference-comparison tests on the generated ``.expected`` fixture so
+# they skip cleanly (rather than failing with a ``TypeError`` on ``None``) when
+# gen_dpa1.py has not run (e.g. PyTorch not built). The MPI multi-rank tests
+# compare against a single-rank run of the same archive and do not need it.
+_HAS_REF = expected_e is not None
+
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [0.25, 3.32, 1.68],
+        [3.36, 3.00, 1.81],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+# Model type_map is ["O", "H"]; gtest atype = [0, 1, 1, 0, 1, 1] -> LAMMPS
+# types [1, 2, 2, 1, 2, 2] under identity ``pair_coeff * *``.
+type_OH = np.array([1, 2, 2, 1, 2, 2])
+
+
+def setup_module() -> None:
+    if os.environ.get("ENABLE_PYTORCH", "1") != "1":
+        pytest.skip(
+            "Skip test because PyTorch support is not enabled.",
+        )
+    write_lmp_data(box, coord, type_OH, data_file)
+    box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0])
+    write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain)
+
+
+def teardown_module() -> None:
+    for f in [data_file, data_file_empty_subdomain]:
+        if f.exists():
+            os.remove(f)
+
+
+def _lammps(data_file, units="metal", atom_map: str = "yes") -> PyLammps:
+    lammps = PyLammps()
+    lammps.units(units)
+    lammps.boundary("p p p")
+    lammps.atom_style("atomic")
+    if atom_map != "no":
+        lammps.atom_modify(f"map {atom_map}")
+    lammps.neighbor("2.0 bin")
+    lammps.neigh_modify("every 10 delay 0 check no")
+    lammps.read_data(data_file.resolve())
+    lammps.mass("1 16")
+    lammps.mass("2 2")
+    lammps.timestep(0.0005)
+    lammps.fix("1 all nve")
+    return lammps
+
+
+@pytest.fixture
+def lammps():
+    lmp = _lammps(data_file=data_file)
+    yield lmp
+    lmp.close()
+
+
+@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated")
+def test_pair_deepmd(lammps) -> None:
+    """Single-rank serial run (``atom_modify map yes``): the graph .pt2
+    folds ghosts onto local owners (``fold_to_local=True``) and must match
+    the gen_dpa1.py reference for energy and per-atom force.
+    """
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    lammps.run(1)
+
+
+@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated")
+def test_pair_deepmd_virial(lammps) -> None:
+    """Single-rank per-atom virial via ``centroid/stress/atom``."""
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.compute("virial all centroid/stress/atom NULL pair")
+    for ii in range(9):
+        jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii]
+        lammps.variable(f"virial{jj} atom c_virial[{ii + 1}]")
+    lammps.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1
+    for ii in range(9):
+        assert np.array(
+            lammps.variables[f"virial{ii}"].value
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
+
+
+# ---------------------------------------------------------------------------
+# Multi-rank test (non-MP extended-region graph path; B3.1).
+#
+# dpa1 is non-message-passing, so multi-rank uses the SAME single-rank graph
+# .pt2 on the extended region.  The expected energy/force/virial are the
+# single-rank reference: each rank evaluates its local atoms over the extended
+# graph; ghost reaction forces fold back via LAMMPS reverse-comm.
+# ---------------------------------------------------------------------------
+
+
+def _run_mpi_subprocess(
+    extra_args: list[str] | None = None,
+    nprocs: int = 2,
+    data_path: Path | None = None,
+    processors: str | None = None,
+    runner_args: list[str] | None = None,
+) -> dict:
+    """Invoke the (backend-agnostic) DPA3 MPI runner under
+    ``mpirun -n <nprocs>`` against the dpa1 graph .pt2 and return
+    ``{"pe": float, "forces": (n, 3), "virials": (n, 9)}``.
+
+    ``nprocs == 1`` forces ``--processors 1 1 1`` so the C++ side sees
+    ``nprocs == 1`` and routes to the single-rank graph path — a
+    same-archive reference for the multi-rank comparison.
+    """
+    if data_path is None:
+        data_path = data_file
+    with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f:
+        out_path = f.name
+    try:
+        argv = [
+            "mpirun",
+            "-n",
+            str(nprocs),
+            sys.executable,
+            str(mpi_runner),
+            str(data_path.resolve()),
+            str(pb_file.resolve()),
+            out_path,
+        ]
+        if processors is not None:
+            argv.extend(["--processors", processors])
+        elif nprocs == 1:
+            argv.extend(["--processors", "1 1 1"])
+        if extra_args:
+            argv.extend(extra_args)
+        if runner_args:
+            argv.extend(runner_args)
+        sp.check_call(argv)
+        with open(out_path) as fh:
+            lines = fh.read().strip().splitlines()
+        pe = float(lines[0])
+        rows = np.array(
+            [list(map(float, line.split())) for line in lines[1:]],
+            dtype=np.float64,
+        )
+        forces = rows[:, :3]
+        virials = rows[:, 3:]
+        return {"pe": pe, "forces": forces, "virials": virials}
+    finally:
+        if os.path.exists(out_path):
+            os.remove(out_path)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa1_graph() -> None:
+    """Multi-rank LAMMPS run for the dpa1 graph .pt2 must match the
+    single-rank reference within numerical tolerance for energy, forces,
+    and per-atom virial.
+
+    This is the core correctness gate for the non-MP extended-region
+    multi-rank C++ path (B3.1): the extended graph + reverse-comm
+    fold-back must reproduce the folded single-rank result.
+    """
+    out = _run_mpi_subprocess()
+    assert out["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8)
+    for ii in range(6):
+        np.testing.assert_allclose(out["forces"][ii], expected_f[ii], atol=1e-8, rtol=0)
+    # ``centroid/stress/atom`` column order [xx, yy, zz, xy, xz, yz, yx, zx,
+    # zy]; the inverse permutation maps it back to the expected_v columns.
+    expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2]
+    np.testing.assert_allclose(
+        out["virials"][:, expected_v_to_lammps] / constants.nktv2p,
+        expected_v,
+        atol=1e-8,
+        rtol=0,
+    )
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa1_graph_matches_single_rank() -> None:
+    """Multi-rank (``-n 2``) ≡ single-rank (``-n 1``) on the SAME archive
+    and trajectory — isolates the extended-region multi-rank C++ path from
+    the .pt2 reference values (a wrong-but-finite divergence would show up
+    here even if the hardcoded reference drifted).
+    """
+    out_mpi = _run_mpi_subprocess(nprocs=2)
+    out_ref = _run_mpi_subprocess(nprocs=1)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)
+
+
+@pytest.mark.skipif(
+    shutil.which("mpirun") is None, reason="MPI is not installed on this system"
+)
+@pytest.mark.skipif(
+    importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed"
+)
+def test_pair_deepmd_mpi_dpa1_graph_empty_subdomain() -> None:
+    """Multi-rank with one rank owning zero local atoms (elongated box,
+    ``processors 2 1 1``, split at x = 15).  The extended-region graph path
+    must still produce correct forces/virial on the populated rank and a
+    zero contribution from the empty rank — compared against a same-archive
+    single-rank reference of the same fixture.
+    """
+    # Force ``processors 2 1 1`` so the split is along x at 15 and rank 1 is
+    # genuinely empty -- otherwise LAMMPS may auto-pick a grid where neither
+    # rank is empty and the branch under test is not exercised.
+    out_mpi = _run_mpi_subprocess(
+        nprocs=2, data_path=data_file_empty_subdomain, processors="2 1 1"
+    )
+    out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain)
+    np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0)
+    np.testing.assert_allclose(
+        out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0
+    )
+    assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10)
diff --git a/source/tests/common/dpmodel/test_edge_force_virial.py b/source/tests/common/dpmodel/test_edge_force_virial.py
index fa84ef7ba4..722960f57a 100644
--- a/source/tests/common/dpmodel/test_edge_force_virial.py
+++ b/source/tests/common/dpmodel/test_edge_force_virial.py
@@ -97,6 +97,38 @@ def test_all_edges_masked_gives_zero(self) -> None:
             np.testing.assert_allclose(av, np.zeros((n, 3, 3)))
             np.testing.assert_allclose(vir, np.zeros((nf, 3, 3)))
 
+    def test_modulo_clamp_leaves_real_edges_unchanged(self) -> None:
+        # INVARIANT (iProzd review): the in-bounds index clamp (``% n_out``) that
+        # keeps the CUDA-exported scatter address legal must NEVER alter a real
+        # (edge_mask == True) edge -- only masked/out-of-range guard edges may be
+        # remapped, and those carry zero weight so remapping is harmless. Here a
+        # REAL edge sits on the boundary node ``n_out - 1`` (the largest valid
+        # index, where a wrong wrap would be visible) and a MASKED guard edge
+        # carries deliberately OUT-OF-RANGE indices (>= n_out) with nonzero g/vec.
+        # Correctness requires the result to equal the real-edges-only reference:
+        # the boundary real edge must land on node n_out-1 (not wrapped), and the
+        # out-of-range guard must contribute nothing. If real edges were ever
+        # remapped by the modulo (the shape-binding bug iProzd warned about), the
+        # boundary node's force/virial would be wrong and this test would fail.
+        n_node = np.array([5], dtype=np.int64)  # 1 frame, nodes 0..4 (n_out = 5)
+        # e0: real, src on the boundary node 4 -> node 0 ; e1: real, node 0 -> 4
+        # e2: MASKED guard with out-of-range indices src=99, dst=77 (>= n_out)
+        edge_index = np.array([[4, 0, 99], [0, 4, 77]], dtype=np.int64)
+        edge_vec = np.array([[1.0, 0.0, 0.0], [-1.0, 0.0, 0.0], [9.0, 9.0, 9.0]])
+        edge_mask = np.array([True, True, False])
+        g = np.array([[0.5, 0.2, 0.0], [0.3, 0.0, 0.1], [7.0, 7.0, 7.0]])
+        force, av, vir = edge_force_virial(g, edge_vec, edge_index, edge_mask, n_node)
+
+        # reference: the SAME two real edges only (no guard edge at all)
+        ref_force, ref_av, ref_vir = edge_force_virial(
+            g[:2], edge_vec[:2], edge_index[:, :2], edge_mask[:2], n_node
+        )
+        np.testing.assert_allclose(force, ref_force)
+        np.testing.assert_allclose(av, ref_av)
+        np.testing.assert_allclose(vir, ref_vir)
+        # explicit: the boundary real edge scattered its force to node 4 (unwrapped)
+        self.assertTrue(np.any(force[4] != 0.0))
+
     def test_ragged_multiframe_with_edge_and_node_padding(self) -> None:
         # MOST GENERAL case: 2 frames with DIFFERENT node counts (3 and 5) AND
         # different edge counts (2 and 3), masked guard EDGES, and a padded NODE
diff --git a/source/tests/common/dpmodel/test_graph_static_capacity.py b/source/tests/common/dpmodel/test_graph_static_capacity.py
new file mode 100644
index 0000000000..c9ff982d01
--- /dev/null
+++ b/source/tests/common/dpmodel/test_graph_static_capacity.py
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Tests for static edge_capacity masked padding in build_neighbor_graph.
+
+Codifies the contract: build_neighbor_graph(..., layout=GraphLayout(edge_capacity=E_max))
+returns a NeighborGraph whose edge_index/edge_vec/edge_mask have a STATIC leading
+edge dim E_max (real edges in the compact prefix, edge_mask=False tail), so export
+sees a fixed E.  Edge overflow must raise ValueError.
+"""
+
+import numpy as np
+import pytest
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    GraphLayout,
+    build_neighbor_graph,
+)
+
+
+class TestStaticEdgeCapacity:
+    """Tests for static edge_capacity masked padding via build_neighbor_graph."""
+
+    @pytest.fixture()
+    def small_system(self):
+        """6-atom periodic system with a 20 Å box (atoms well within rcut=4 range)."""
+        rng = np.random.default_rng(0)
+        coord = rng.normal(size=(1, 6, 3)) * 1.5
+        atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64)
+        box = np.eye(3).reshape(1, 9) * 20.0
+        return coord, atype, box
+
+    def test_static_edge_capacity_shape(self, small_system):
+        """Static edge_capacity=64 yields edge_index.shape == (2, 64)."""
+        coord, atype, box = small_system
+        cap = build_neighbor_graph(
+            coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64)
+        )
+        assert cap.edge_index.shape == (2, 64)
+        assert cap.edge_vec.shape == (64, 3)
+        assert cap.edge_mask.shape == (64,)
+
+    def test_static_edge_capacity_matches_dynamic(self, small_system):
+        """Static graph has same real-edge count as dynamic graph."""
+        coord, atype, box = small_system
+        dyn = build_neighbor_graph(coord, atype, box, 4.0)
+        cap = build_neighbor_graph(
+            coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64)
+        )
+        assert cap.edge_index.shape == (2, 64)
+        assert int(cap.edge_mask.sum()) == int(dyn.edge_mask.sum())
+
+    def test_static_edge_capacity_real_prefix_matches_dynamic(self, small_system):
+        """The real-edge prefix of the static graph matches the dynamic graph."""
+        coord, atype, box = small_system
+        dyn = build_neighbor_graph(coord, atype, box, 4.0)
+        cap = build_neighbor_graph(
+            coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64)
+        )
+        n_real = int(dyn.edge_mask.sum())
+        # real prefix must match exactly
+        np.testing.assert_array_equal(
+            cap.edge_index[:, :n_real], dyn.edge_index[:, :n_real]
+        )
+        np.testing.assert_allclose(cap.edge_vec[:n_real], dyn.edge_vec[:n_real])
+        # padding suffix must have edge_mask=False
+        assert not np.any(cap.edge_mask[n_real:])
+
+    def test_overflow_raises(self, small_system):
+        """edge_capacity smaller than real edge count must raise ValueError."""
+        coord, atype, box = small_system
+        # capacity=1 is guaranteed to be smaller than the real edge count
+        with pytest.raises(ValueError, match="edge overflow"):
+            build_neighbor_graph(
+                coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=1)
+            )
diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py
index 7eaaae4ae2..9c743c9f88 100644
--- a/source/tests/infer/gen_dpa1.py
+++ b/source/tests/infer/gen_dpa1.py
@@ -1,10 +1,21 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Generate deeppot_dpa1.pth and deeppot_dpa1.pt2 test models.
+"""Generate deeppot_dpa1.pth, deeppot_dpa1.pt2, and deeppot_dpa1_graph.pt2 test models.
 
-Creates a DPA1 model from dpmodel config, serializes, and exports to both
-.pth (torch.jit) and .pt2 (torch.export) from the same weights.
-Also prints reference values for C++ tests (PBC and NoPbc).
+Creates two DPA1 models from dpmodel configs:
+  - deeppot_dpa1.pt2 / deeppot_dpa1.pth  (attn_layer=2, dense nlist-form export)
+  - deeppot_dpa1_graph.pt2                (attn_layer=0, graph-form export via
+                                            lower_kind="graph"; the graph forward
+                                            is eligible only when attn_layer==0)
+
+Both are serialized and exported to their respective formats from the same weights.
+Reference sidecar files (.expected) consumed by C++ gtests are also written:
+  - deeppot_dpa1.expected   — from the nlist .pt2 eval (existing)
+  - deeppot_dpa1_graph.expected — from an independent NLIST .pt2 eval (NOT the
+      graph .pt2; dpmodel se_atten has no analytical force, so the dense nlist
+      path is the independent ground truth). At non-binding sel the graph and
+      nlist paths see the same neighbor set, so the graph .pt2 is sanity-checked
+      against this reference at ≤1e-5.
 """
 
 import copy
@@ -171,6 +182,157 @@ def main():
     print(f"// .pth NoPbc total energy: {e_pth_np[0, 0]:.18e}")  # noqa: T201
     print(f"// .pth vs .pt2 NoPbc energy diff: {abs(e_np[0, 0] - e_pth_np[0, 0]):.2e}")  # noqa: T201
 
+    # ============================================================
+    # Section B: graph-eligible DPA1 (attn_layer=0) model
+    # ============================================================
+    # attn_layer=0 disables the attention layers, making the descriptor
+    # a plain two-body embedding (se_e2_a-like) that is eligible for the
+    # NeighborGraph forward path (forward_lower_graph_exportable).
+    # Config mirrors DPA1_CONFIG in
+    # source/tests/pt_expt/utils/test_graph_pt2_metadata.py
+    graph_config = {
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "se_atten",
+            "sel": 30,
+            "rcut_smth": 2.0,
+            "rcut": 6.0,
+            "neuron": [2, 4, 8],
+            "axis_neuron": 4,
+            "attn": 5,
+            "attn_layer": 0,
+            "attn_dotr": True,
+            "attn_mask": False,
+            "activation_function": "tanh",
+            "scaling_factor": 1.0,
+            "normalize": True,
+            "temperature": 1.0,
+            "type_one_side": True,
+            "seed": 1,
+        },
+        "fitting_net": {
+            "neuron": [5, 5, 5],
+            "resnet_dt": True,
+            "seed": 1,
+        },
+    }
+
+    print("\n---- Building graph-eligible DPA1 (attn_layer=0) ----")  # noqa: T201
+
+    # ---- B.1  Build dpmodel, serialize ----
+    model_g = get_model(copy.deepcopy(graph_config))
+    model_dict_g = model_g.serialize()
+
+    data_g = {
+        "model": copy.deepcopy(model_dict_g),
+        "model_def_script": graph_config,
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+
+    # ---- B.2  Compute reference via nlist .pt2 (independent of graph path) ----
+    # The reference for deeppot_dpa1_graph.expected comes from the NLIST .pt2
+    # (dense-quartet forward), NOT the graph .pt2.  This ensures the C++ gtest
+    # (B2.5) independently validates the graph AOTI path against a known-good
+    # nlist evaluation.
+    #
+    # The nlist .pt2 is also PERSISTED (deeppot_dpa1_graph_nlist_ref.pt2): the
+    # C++ gtest loads it alongside the graph .pt2 to cross-check graph≈dense at
+    # 1e-9 on arbitrary system sizes (dynamic-edge-axis cases) without baking a
+    # second reference block into the .expected sidecar.  Same weights as the
+    # graph model, so at non-binding sel the two paths must agree.
+    nlist_ref_pt2 = os.path.join(base_dir, "deeppot_dpa1_graph_nlist_ref.pt2")
+    print(f"Exporting reference nlist .pt2 to {nlist_ref_pt2} ...")  # noqa: T201
+    pt_expt_deserialize_to_file(
+        nlist_ref_pt2,
+        copy.deepcopy(data_g),
+        do_atomic_virial=True,
+        lower_kind="nlist",  # independent: dense nlist, NOT graph
+    )
+    dp_nlist_ref = DeepPot(nlist_ref_pt2)
+
+    # PBC reference from nlist path
+    e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval(coord, box, atype, atomic=True)
+    # NoPBC reference from nlist path
+    e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval(
+        coord, None, atype, atomic=True
+    )
+
+    print(f"Nlist ref PBC energy: {e_r1[0, 0]:.18e}")  # noqa: T201
+    print(f"Nlist ref NoPBC energy: {e_rnp[0, 0]:.18e}")  # noqa: T201
+    max_ref_force_pbc = float(np.max(np.abs(f_r1)))
+    max_ref_force_nopbc = float(np.max(np.abs(f_rnp)))
+    print(f"Nlist ref PBC max |force|: {max_ref_force_pbc:.6e}")  # noqa: T201
+    print(f"Nlist ref NoPBC max |force|: {max_ref_force_nopbc:.6e}")  # noqa: T201
+    if max_ref_force_pbc < 1e-10:
+        raise RuntimeError(
+            f"Graph model nlist-ref forces are degenerate "
+            f"(max={max_ref_force_pbc:.2e}); weights may need perturbation."
+        )
+
+    # ---- B.3  Write sidecar reference file ----
+    graph_ref_path = os.path.join(base_dir, "deeppot_dpa1_graph.expected")
+    write_expected_ref(
+        graph_ref_path,
+        sections={
+            "pbc": {
+                "expected_e": ae_r1[0, :, 0],
+                "expected_f": f_r1[0],
+                "expected_v": av_r1[0],
+            },
+            "nopbc": {
+                "expected_e": ae_rnp[0, :, 0],
+                "expected_f": f_rnp[0],
+                "expected_v": av_rnp[0],
+            },
+        },
+        source_script="source/tests/infer/gen_dpa1.py",
+    )
+    print(f"Wrote {graph_ref_path}")  # noqa: T201
+
+    # ---- B.4  Export graph-form .pt2 ----
+    graph_pt2_path = os.path.join(base_dir, "deeppot_dpa1_graph.pt2")
+    print(f"Exporting to {graph_pt2_path} (lower_kind='graph') ...")  # noqa: T201
+    pt_expt_deserialize_to_file(
+        graph_pt2_path,
+        copy.deepcopy(data_g),
+        do_atomic_virial=True,
+        lower_kind="graph",
+    )
+    print("Graph .pt2 export done.")  # noqa: T201
+
+    # ---- B.5  Sanity-check: graph .pt2 vs nlist reference ----
+    # Both use the SAME weights; at non-binding sel the math is equivalent.
+    # Verifies that forward_lower_graph_exportable + edge_energy_deriv match
+    # the nlist forward for this concrete system.
+    dp_graph = DeepPot(graph_pt2_path)
+
+    # PBC sanity check
+    e_g1, f_g1, v_g1, ae_g1, av_g1 = dp_graph.eval(coord, box, atype, atomic=True)
+    force_diff_pbc = float(np.max(np.abs(f_g1[0] - f_r1[0])))
+    print(  # noqa: T201
+        f"Graph .pt2 vs nlist ref PBC force max diff: {force_diff_pbc:.2e}"
+    )
+    if force_diff_pbc > 1e-5:
+        raise RuntimeError(
+            f"BLOCKED: graph .pt2 PBC force differs from nlist reference by "
+            f"{force_diff_pbc:.2e} (threshold 1e-5)."
+        )
+
+    # NoPBC sanity check
+    e_gnp, f_gnp, v_gnp, ae_gnp, av_gnp = dp_graph.eval(coord, None, atype, atomic=True)
+    force_diff_nopbc = float(np.max(np.abs(f_gnp[0] - f_rnp[0])))
+    print(  # noqa: T201
+        f"Graph .pt2 vs nlist ref NoPBC force max diff: {force_diff_nopbc:.2e}"
+    )
+    if force_diff_nopbc > 1e-5:
+        raise RuntimeError(
+            f"BLOCKED: graph .pt2 NoPBC force differs from nlist reference by "
+            f"{force_diff_nopbc:.2e} (threshold 1e-5)."
+        )
+
+    print("\nAll graph sanity checks passed.")  # noqa: T201
     print("\nDone!")  # noqa: T201
 
 
diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py
new file mode 100644
index 0000000000..7fc83b677f
--- /dev/null
+++ b/source/tests/pt_expt/infer/test_graph_deepeval.py
@@ -0,0 +1,239 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-form ``.pt2`` DeepEval parity vs the eager dense reference.
+
+A graph-form ``.pt2`` (exported with ``lower_kind="graph"``) carries the
+NeighborGraph schema ``(atype, n_node, edge_index, edge_vec, edge_mask, ...)``
+in its AOTI forward.  This test verifies that evaluating such an archive
+through the pt_expt :class:`DeepPot` reproduces the eager dpa1 energy / force /
+virial to ``rtol=atol=1e-10`` (fp64), for both PBC and non-PBC.
+
+The graph path is CARRY-ALL (every neighbor within ``rcut``); the eager dense
+reference is sel-capped (``sel=30``, forced via
+``neighbor_graph_method="legacy"``).  They coincide only at NON-BINDING ``sel``
+(max neighbor count ``< sel``), so the test fixture is a small, sparse cluster
+and the non-binding condition is asserted explicitly -- otherwise the parity
+would vacuously compare two different neighbor sets.
+"""
+
+import copy
+import os
+import tempfile
+
+import numpy as np
+import pytest
+import torch
+
+from deepmd.infer import (
+    DeepPot,
+)
+from deepmd.pt_expt.utils.env import (
+    DEVICE,
+)
+from deepmd.pt_expt.utils.serialization import (
+    deserialize_to_file,
+)
+
+# dpa1 with attn_layer == 0 -- the energy model exercised by the graph path.
+DPA1_CONFIG = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "se_atten",
+        "sel": 30,
+        "rcut_smth": 2.0,
+        "rcut": 6.0,
+        "neuron": [2, 4, 8],
+        "axis_neuron": 4,
+        "attn": 5,
+        "attn_layer": 0,
+        "attn_dotr": True,
+        "attn_mask": False,
+        "activation_function": "tanh",
+        "scaling_factor": 1.0,
+        "normalize": True,
+        "temperature": 1.0,
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5, 5],
+        "resnet_dt": True,
+        "seed": 1,
+    },
+}
+
+RCUT = 6.0
+SEL = 30
+
+
+def _build_system(
+    natoms: int = 8, seed: int = 20240626
+) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """A small, sparse cluster: ``natoms`` inside a 5 A blob, centered in an 18 A box.
+
+    The blob keeps every atom within ``rcut`` of at most ``natoms - 1`` others
+    (<< ``sel``), so the carry-all graph neighbor set equals the sel-capped
+    dense one.  Varying ``natoms`` yields a different edge count, exercising the
+    DYNAMIC edge axis of the exported ``.pt2`` (B2.0).
+    """
+    rng = np.random.default_rng(seed)
+    box_size = 18.0
+    blob = rng.random((natoms, 3)) * 5.0 + box_size * 0.5 - 2.5
+    coords = blob.reshape(1, natoms, 3)
+    cells = (np.eye(3) * box_size).reshape(1, 9)
+    # Alternate O/H types; both species present regardless of natoms.
+    atype = np.array([i % 2 for i in range(natoms)], dtype=np.int32)
+    return coords, cells, atype
+
+
+# Two DIFFERENT-size systems evaluated through the SAME exported ``.pt2``.
+# Both are sparse, non-binding clusters but with different edge counts, so the
+# second size FAILS against a static-``E`` artifact (B1) and PASSES only once
+# the edge axis is dynamic (B2.0).
+_SYSTEMS = {
+    "small_8": {"natoms": 8, "seed": 20240626},
+    "large_20": {"natoms": 20, "seed": 20240701},
+}
+
+
+def _max_neighbors(
+    coords: np.ndarray, cells: np.ndarray | None, atype: np.ndarray
+) -> int:
+    """Max carry-all neighbor count per center within ``rcut`` (for the non-binding check)."""
+    from deepmd.dpmodel.utils.neighbor_graph import (
+        build_neighbor_graph,
+    )
+
+    natoms = atype.shape[0]
+    graph = build_neighbor_graph(
+        coords.reshape(1, natoms, 3),
+        atype.reshape(1, natoms),
+        cells.reshape(1, 9) if cells is not None else None,
+        RCUT,
+    )
+    real = np.asarray(graph.edge_mask)
+    dst = np.asarray(graph.edge_index)[1][real]
+    counts = np.bincount(dst, minlength=natoms)
+    return int(counts.max())
+
+
+def _eager_dense_reference(
+    model: torch.nn.Module,
+    coords: np.ndarray,
+    cells: np.ndarray | None,
+    atype: np.ndarray,
+) -> dict[str, np.ndarray]:
+    """Reference energy/force/virial from the eager dense (sel-capped) path."""
+    natoms = atype.shape[0]
+    coord_t = torch.tensor(
+        coords.reshape(1, natoms, 3), dtype=torch.float64, device=DEVICE
+    ).requires_grad_(True)
+    atype_t = torch.tensor(atype.reshape(1, natoms), dtype=torch.int64, device=DEVICE)
+    box_t = (
+        torch.tensor(cells.reshape(1, 9), dtype=torch.float64, device=DEVICE)
+        if cells is not None
+        else None
+    )
+    ret = model.call_common(
+        coord_t,
+        atype_t,
+        box_t,
+        do_atomic_virial=True,
+        neighbor_graph_method="legacy",
+    )
+    out = {
+        "atom_energy": ret["energy"],
+        "energy": ret["energy_redu"],
+        "force": ret["energy_derv_r"].squeeze(-2),
+        "virial": ret["energy_derv_c_redu"].squeeze(-2),
+        "atom_virial": ret["energy_derv_c"].squeeze(-2),
+    }
+    return {k: v.detach().cpu().numpy() for k, v in out.items()}
+
+
+@pytest.fixture(scope="module")
+def graph_pt2():
+    """Build a dpa1(attn_layer=0) model and export it to a graph-form ``.pt2``.
+
+    The AOTI compile is slow (~90 s), so it is done once per module.  The eager
+    pt_expt model is returned alongside the archive path to serve as the dense
+    parity reference.
+    """
+    from deepmd.pt_expt.model import (
+        get_model,
+    )
+
+    model = get_model(copy.deepcopy(DPA1_CONFIG)).to(torch.float64)
+    model.eval()
+    data = {"model": model.serialize()}
+
+    tmpdir = tempfile.mkdtemp()
+    pt2_path = os.path.join(tmpdir, "deeppot_dpa1_graph.pt2")
+    deserialize_to_file(
+        pt2_path,
+        copy.deepcopy(data),
+        do_atomic_virial=True,
+        lower_kind="graph",
+    )
+    yield pt2_path, model
+    os.unlink(pt2_path)
+    os.rmdir(tmpdir)
+
+
+@pytest.mark.parametrize("system", list(_SYSTEMS))  # two different edge counts
+@pytest.mark.parametrize("pbc", [True, False])  # periodic vs non-periodic
+def test_graph_pt2_deepeval_parity(graph_pt2, pbc, system) -> None:
+    """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10.
+
+    Both ``_SYSTEMS`` are fed through the SAME module-scoped ``.pt2``; the
+    differing edge counts prove the exported artifact's edge axis is dynamic
+    (a static-``E`` B1 artifact would reject / mis-shape the larger system).
+    """
+    pt2_path, model = graph_pt2
+    coords, cells, atype = _build_system(**_SYSTEMS[system])
+    box = cells if pbc else None
+
+    # Anti-vacuity: the carry-all graph and the sel-capped dense reference only
+    # agree when no center is sel-bound.  Assert the system is non-binding.
+    max_nn = _max_neighbors(coords, box, atype)
+    assert max_nn < SEL, (
+        f"test system is sel-binding (max neighbors {max_nn} >= sel {SEL}); "
+        "carry-all graph and sel-capped dense reference would diverge"
+    )
+
+    dp = DeepPot(pt2_path)
+    assert dp.deep_eval.metadata["lower_input_kind"] == "graph"
+
+    e, f, v, ae, av = dp.eval(coords, box, atype, atomic=True)
+    ref = _eager_dense_reference(model, coords, box, atype)
+
+    np.testing.assert_allclose(
+        e.reshape(-1),
+        ref["energy"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="energy",
+    )
+    np.testing.assert_allclose(
+        f.reshape(-1), ref["force"].reshape(-1), rtol=1e-10, atol=1e-10, err_msg="force"
+    )
+    np.testing.assert_allclose(
+        v.reshape(-1),
+        ref["virial"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="virial",
+    )
+    np.testing.assert_allclose(
+        ae.reshape(-1),
+        ref["atom_energy"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="atom_energy",
+    )
+    np.testing.assert_allclose(
+        av.reshape(-1),
+        ref["atom_virial"].reshape(-1),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg="atom_virial",
+    )
diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py
new file mode 100644
index 0000000000..6b735aa3d5
--- /dev/null
+++ b/source/tests/pt_expt/model/test_graph_export.py
@@ -0,0 +1,135 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-lower export: forward_common_lower_graph_exportable traces + torch.export."""
+
+import pytest
+import torch
+
+from deepmd.dpmodel.utils.neighbor_graph import (
+    build_neighbor_graph,
+)
+from deepmd.pt.utils import (
+    env,
+)
+from deepmd.pt_expt.descriptor.dpa1 import (
+    DescrptDPA1,
+)
+from deepmd.pt_expt.fitting import (
+    InvarFitting,
+)
+from deepmd.pt_expt.model import (
+    EnergyModel,
+)
+
+from ...seed import (
+    GLOBAL_SEED,
+)
+
+_RCUT, _NT = 4.0, 2
+
+
+def _model():
+    ds = DescrptDPA1(
+        _RCUT,
+        0.5,
+        20,
+        _NT,
+        neuron=[3, 6],
+        axis_neuron=2,
+        attn_layer=0,
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+    ft = InvarFitting(
+        "energy",
+        _NT,
+        ds.get_dim_out(),
+        1,
+        mixed_types=ds.mixed_types(),
+        precision="float64",
+        seed=GLOBAL_SEED,
+    ).to(env.DEVICE)
+    return EnergyModel(ds, ft, type_map=["A", "B"]).to(env.DEVICE)
+
+
+def _graph_inputs(model):
+    rng = torch.Generator(device=env.DEVICE).manual_seed(GLOBAL_SEED)
+    nloc = 6
+    coord = (
+        torch.rand(1, nloc, 3, dtype=torch.float64, device=env.DEVICE, generator=rng)
+        * 3.0
+    )
+    atype = torch.tensor([[0, 1, 0, 1, 0, 1]], dtype=torch.int64, device=env.DEVICE)
+    box = torch.eye(3, dtype=torch.float64, device=env.DEVICE).reshape(1, 9) * 20.0
+    g = build_neighbor_graph(coord, atype, box, model.get_rcut())
+    return (atype.reshape(-1), g.n_node, g.edge_index, g.edge_vec, g.edge_mask)
+
+
+def test_graph_exportable_traces():
+    model = _model().eval()
+    atype, n_node, ei, ev, em = _graph_inputs(model)
+    gm = model.forward_common_lower_graph_exportable(
+        atype,
+        n_node,
+        ei,
+        ev,
+        em,
+        do_atomic_virial=False,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+    )
+    assert isinstance(gm, torch.nn.Module)
+    # the traced module reproduces eager outputs
+    eager = model.forward_common_lower_graph(
+        atype, n_node, ei, ev, em, do_atomic_virial=False
+    )
+    # traced module has placeholders for all 8 fn args (fparam/aparam/charge_spin=None)
+    traced = gm(atype, n_node, ei, ev, em, None, None, None)
+    # traced returns a tuple/dict; compare energy_redu
+    te = traced["energy_redu"] if isinstance(traced, dict) else traced[1]
+    torch.testing.assert_close(te, eager["energy_redu"], rtol=1e-10, atol=1e-10)
+
+
+@pytest.mark.parametrize("do_atomic_virial", [False, True])  # both branches of the bool
+def test_forward_lower_graph_exportable_public_keys(do_atomic_virial):
+    """EnergyModel.forward_lower_graph_exportable: traces the public-key path and
+    reproduces eager energy/force; atom_virial present iff do_atomic_virial.
+    """
+    model = _model().eval()
+    atype, n_node, ei, ev, em = _graph_inputs(model)
+    gm = model.forward_lower_graph_exportable(
+        atype,
+        n_node,
+        ei,
+        ev,
+        em,
+        do_atomic_virial=do_atomic_virial,
+        tracing_mode="symbolic",
+        _allow_non_fake_inputs=True,
+    )
+    assert isinstance(gm, torch.nn.Module)
+    out = gm(atype, n_node, ei, ev, em, None, None, None)
+
+    # public key set (graph path is local-only: force/atom_virial, NOT extended_*)
+    assert "atom_energy" in out and "energy" in out and "force" in out
+    assert "virial" in out
+    assert "extended_force" not in out and "extended_virial" not in out
+    # atom_virial appears ONLY when do_atomic_virial=True
+    assert ("atom_virial" in out) == do_atomic_virial
+
+    # values match the eager graph lower
+    eager = model.forward_common_lower_graph(
+        atype, n_node, ei, ev, em, do_atomic_virial=do_atomic_virial
+    )
+    torch.testing.assert_close(
+        out["energy"], eager["energy_redu"], rtol=1e-10, atol=1e-10
+    )
+    torch.testing.assert_close(
+        out["force"], eager["energy_derv_r"].squeeze(-2), rtol=1e-10, atol=1e-10
+    )
+    if do_atomic_virial:
+        torch.testing.assert_close(
+            out["atom_virial"],
+            eager["energy_derv_c"].squeeze(-2),
+            rtol=1e-10,
+            atol=1e-10,
+        )
diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py
index 7c33f0de81..cdaa4a02c6 100644
--- a/source/tests/pt_expt/test_dp_freeze.py
+++ b/source/tests/pt_expt/test_dp_freeze.py
@@ -41,6 +41,36 @@
     "data_stat_nbatch": 20,
 }
 
+# dpa1 with attn_layer == 0 — the only graph-eligible model family today
+# (mixed_types and uses_graph_lower()==True), used to exercise the
+# ``freeze --lower-kind graph`` public-CLI path.
+model_dpa1_graph = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "se_atten",
+        "sel": 30,
+        "rcut_smth": 2.0,
+        "rcut": 6.0,
+        "neuron": [2, 4, 8],
+        "axis_neuron": 4,
+        "attn": 5,
+        "attn_layer": 0,
+        "attn_dotr": True,
+        "attn_mask": False,
+        "activation_function": "tanh",
+        "scaling_factor": 1.0,
+        "normalize": True,
+        "temperature": 1.0,
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5, 5],
+        "resnet_dt": True,
+        "seed": 1,
+    },
+}
+
 
 class TestDPFreezePtExpt(unittest.TestCase):
     """Test dp freeze for the pt_expt backend."""
@@ -103,6 +133,79 @@ def test_freeze_default_suffix(self) -> None:
         expected = os.path.join(self.tmpdir, "frozen_default_suffix.pte")
         self.assertTrue(os.path.exists(expected))
 
+    def test_freeze_output_suffix_by_lower_kind(self) -> None:
+        """main() defaults a suffix-less output to .pt2 for --lower-kind graph
+        and .pte for nlist, while preserving an explicit .pte/.pt2 (iProzd
+        review). freeze() is mocked so the suffix logic is checked without the
+        AOTInductor compile cost.
+        """
+        from unittest import (
+            mock,
+        )
+
+        cases = [
+            ("graph", "out_g", None, ".pt2"),  # graph, no suffix -> .pt2
+            ("nlist", "out_n", None, ".pte"),  # nlist, no suffix -> .pte
+            ("graph", "out_g_explicit", ".pte", ".pte"),  # explicit .pte kept
+            ("nlist", "out_n_explicit", ".pt2", ".pt2"),  # explicit .pt2 kept
+        ]
+        for lower_kind, stem, explicit, expected_suffix in cases:
+            with self.subTest(lower_kind=lower_kind, explicit=explicit):
+                name = stem + (explicit or "")
+                captured: dict = {}
+
+                def _fake_freeze(model, output, head=None, lower_kind="nlist", **kw):
+                    captured["output"] = output
+                    captured["lower_kind"] = lower_kind
+
+                flags = argparse.Namespace(
+                    command="freeze",
+                    checkpoint_folder=self.ckpt_file,
+                    output=os.path.join(self.tmpdir, name),
+                    head=None,
+                    lower_kind=lower_kind,
+                    log_level=2,
+                    log_path=None,
+                )
+                with mock.patch("deepmd.pt_expt.entrypoints.main.freeze", _fake_freeze):
+                    main(flags)
+                self.assertTrue(captured["output"].endswith(expected_suffix))
+                self.assertEqual(captured["lower_kind"], lower_kind)
+
+    def test_freeze_graph_rejects_ineligible(self) -> None:
+        """``--lower-kind graph`` on a non-graph-eligible model (se_e2_a,
+        mixed_types=False) fails fast rather than emitting a broken .pt2.
+        """
+        output = os.path.join(self.tmpdir, "frozen_graph_reject.pt2")
+        with self.assertRaises(ValueError):
+            freeze(model=self.ckpt_file, output=output, lower_kind="graph")
+
+    def test_freeze_graph_dpa1(self) -> None:
+        """``freeze --lower-kind graph`` on a graph-eligible dpa1(attn_layer=0)
+        model produces a .pt2 whose metadata records the graph lower (the
+        user-facing entry point to the C++ graph inference path).
+        """
+        import json
+        import zipfile
+
+        model_params = deepcopy(model_dpa1_graph)
+        model = get_model(model_params)
+        wrapper = ModelWrapper(model, model_params=model_params)
+        ckpt = os.path.join(self.tmpdir, "dpa1_graph.pt")
+        torch.save({"model": wrapper.state_dict()}, ckpt)
+
+        output = os.path.join(self.tmpdir, "frozen_dpa1_graph.pt2")
+        freeze(model=ckpt, output=output, lower_kind="graph")
+        self.assertTrue(os.path.exists(output))
+
+        # the .pt2 is a zip; metadata.json must record the graph lower
+        with zipfile.ZipFile(output) as zf:
+            meta_name = next(
+                n for n in zf.namelist() if n.endswith("extra/metadata.json")
+            )
+            metadata = json.loads(zf.read(meta_name))
+        self.assertEqual(metadata["lower_input_kind"], "graph")
+
     def test_freeze_pt2(self) -> None:
         """Freeze to .pt2 (AOTInductor) and verify the file is loadable."""
         self.assertTrue(os.path.exists(self.shared_pt2))
diff --git a/source/tests/pt_expt/test_training.py b/source/tests/pt_expt/test_training.py
index 45061c084a..a9764947c0 100644
--- a/source/tests/pt_expt/test_training.py
+++ b/source/tests/pt_expt/test_training.py
@@ -1352,9 +1352,7 @@ def _make_varying_config(
         config = normalize(config)
         return config
 
-    def _check_varying_natoms(
-        self, descriptor: dict | None = None, force_legacy_descriptor: bool = False
-    ) -> None:
+    def _check_varying_natoms(self, descriptor: dict | None = None) -> None:
         """Per-step compiled-vs-uncompiled comparison for the given descriptor.
 
         The loss config has ``start_pref_f=1000`` and ``start_pref_v=1.0``,
@@ -1370,17 +1368,10 @@ def _check_varying_natoms(
         cannot meet that on float64 the descriptor has a real numerical
         problem (see the DPA1 limitation note where this happened).
 
-        ``force_legacy_descriptor`` makes a graph-eligible descriptor (dpa1
-        ``attn_layer==0``) take the legacy *dense* (env-mat) path on BOTH the
-        compiled and uncompiled sides, so this stays a true compile-correctness
-        check (same computation, compiled vs eager).  The pt_expt eager default
-        for such a descriptor is the carry-all GRAPH forward while the compiled
-        ``forward_lower`` is the sel-capped DENSE forward; those are two
-        *different* force computations whose parameter gradients agree only to
-        fp64 accumulation (~1e-12), which the optimizer then amplifies into a
-        diverging training trajectory.  Making the compiled GRAPH lower (so
-        eager==compiled) is tracked for PR-B; until then this test exercises the
-        dense path it actually compiles.
+        Graph-eligible descriptors (dpa1 ``attn_layer==0``) compile the GRAPH
+        lower (``forward_common_lower_graph``) so the compiled path matches the
+        eager carry-all graph default-flip; non-eligible descriptors
+        (se_e2_a / dpa2 / dpa3) compile the dense ``forward_lower``.
         """
         from deepmd.pt_expt.train.training import (
             _CompiledModel,
@@ -1400,16 +1391,6 @@ def _check_varying_natoms(
                 compiled_model = trainer_c.wrapper.model["Default"]
                 self.assertIsInstance(compiled_model, _CompiledModel)
 
-                if force_legacy_descriptor:
-                    # Pin BOTH sides to the legacy dense (env-mat) path so the
-                    # uncompiled reference matches the dense ``forward_lower``
-                    # that gets compiled (must happen before the first forward,
-                    # i.e. before the lazy compile trace).  See the docstring /
-                    # PR-B note: the graph forward vs dense forward differ in the
-                    # backward at fp64 precision, which the optimizer amplifies.
-                    for _m in (trainer_uc.model, compiled_model.original_model):
-                        _m.get_descriptor().uses_graph_lower = lambda: False
-
                 # Sync weights so predictions can be compared exactly
                 compiled_model.original_model.load_state_dict(
                     trainer_uc.model.state_dict()
@@ -1482,25 +1463,20 @@ def test_compiled_matches_uncompiled_varying_natoms_dpa3(self) -> None:
         self._check_varying_natoms(_DESCRIPTOR_DPA3)
 
     def test_compiled_matches_uncompiled_varying_natoms_dpa1_no_attn(self) -> None:
-        """DPA1 (attn_layer=0): compiled vs uncompiled match (dense path).
+        """DPA1 (attn_layer=0): compiled vs uncompiled match (GRAPH lower).
 
-        ``force_legacy_descriptor=True`` pins both sides to the legacy dense
-        (env-mat) forward -- the path the compiled ``forward_lower`` actually
-        uses.  The pt_expt eager default for dpa1(attn_layer=0) is the carry-all
-        GRAPH forward, a *different* force computation from the compiled dense
-        forward; their backward gradients agree only to fp64 accumulation, which
-        the optimizer amplifies, so comparing graph-vs-dense through training is
-        ill-posed.  Making the compiled path the GRAPH lower (eager==compiled)
-        is tracked for PR-B (graph .pt2/export).
+        The pt_expt eager default for dpa1(attn_layer=0) is the carry-all GRAPH
+        forward, and the compiled path now compiles the matching GRAPH lower
+        (``forward_common_lower_graph``) -- so eager==compiled and the
+        multi-step varying-natoms trajectory (predictions + per-parameter grads
+        + loss) agrees to the strict ``atol=rtol=1e-10`` tolerance.
 
         DPA1 with attention layers is intentionally not covered: the
         compiled se_atten path is hardware-sensitive on multi-threaded
         CPUs (parallel reduction order diverges from eager above the
         1e-10 tolerance).  ``_compile_model`` warns the user instead.
         """
-        self._check_varying_natoms(
-            _DESCRIPTOR_DPA1_NO_ATTN, force_legacy_descriptor=True
-        )
+        self._check_varying_natoms(_DESCRIPTOR_DPA1_NO_ATTN)
 
     def test_compile_warns_dpa1_with_attention(self) -> None:
         """DPA1 (attn_layer>0) under compile must emit a warning.
diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
new file mode 100644
index 0000000000..17aef4d671
--- /dev/null
+++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py
@@ -0,0 +1,109 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Graph-form ``.pt2`` export: ``lower_input_kind`` metadata branch.
+
+Covers both branches of the ``lower_kind`` selector on
+``deserialize_to_file``: ``"graph"`` traces ``forward_lower_graph_exportable``
+over the NeighborGraph schema and records ``lower_input_kind == "graph"`` in
+``metadata.json``; the default (``"nlist"``) traces the dense quartet and
+records ``lower_input_kind == "nlist"``.
+"""
+
+import copy
+import json
+import os
+import tempfile
+import zipfile
+
+import pytest
+
+from deepmd.pt_expt.utils.serialization import (
+    deserialize_to_file,
+)
+
+# dpa1 with attn_layer == 0 — the energy model exercised by the graph path.
+DPA1_CONFIG = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "se_atten",
+        "sel": 30,
+        "rcut_smth": 2.0,
+        "rcut": 6.0,
+        "neuron": [2, 4, 8],
+        "axis_neuron": 4,
+        "attn": 5,
+        "attn_layer": 0,
+        "attn_dotr": True,
+        "attn_mask": False,
+        "activation_function": "tanh",
+        "scaling_factor": 1.0,
+        "normalize": True,
+        "temperature": 1.0,
+        "type_one_side": True,
+        "seed": 1,
+    },
+    "fitting_net": {
+        "neuron": [5, 5, 5],
+        "resnet_dt": True,
+        "seed": 1,
+    },
+}
+
+
+def _build_dpa1_data() -> dict:
+    """Build a serialized dpmodel data dict for a dpa1(attn_layer=0) energy model."""
+    from deepmd.dpmodel.model.model import (
+        get_model,
+    )
+
+    model = get_model(copy.deepcopy(DPA1_CONFIG))
+    return {
+        "model": model.serialize(),
+        "model_def_script": copy.deepcopy(DPA1_CONFIG),
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+
+
+def _read_metadata(pt2_path: str) -> dict:
+    """Read ``model/extra/metadata.json`` from a ``.pt2`` ZIP archive."""
+    with zipfile.ZipFile(pt2_path, "r") as zf:
+        raw = zf.read("model/extra/metadata.json").decode("utf-8")
+    return json.loads(raw)
+
+
+@pytest.fixture(scope="module")
+def dpa1_dpmodel_data() -> dict:
+    return _build_dpa1_data()
+
+
+def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None:
+    """``lower_kind="graph"`` -> metadata ``lower_input_kind == "graph"``."""
+    with tempfile.TemporaryDirectory() as d:
+        p = os.path.join(d, "m_graph.pt2")
+        deserialize_to_file(
+            p,
+            copy.deepcopy(dpa1_dpmodel_data),
+            do_atomic_virial=True,
+            lower_kind="graph",
+        )
+        meta = _read_metadata(p)
+    assert meta["lower_input_kind"] == "graph"
+    # B2.0: the edge axis is DYNAMIC (Dim("nedge", min=2)); there is no static
+    # capacity baked into the AOTI artifact, so no ``edge_capacity`` is persisted.
+    assert "edge_capacity" not in meta
+
+
+def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None:
+    """Default (``lower_kind="nlist"``) -> metadata ``lower_input_kind == "nlist"``."""
+    with tempfile.TemporaryDirectory() as d:
+        p = os.path.join(d, "m_dense.pt2")
+        deserialize_to_file(
+            p,
+            copy.deepcopy(dpa1_dpmodel_data),
+            do_atomic_virial=True,
+        )
+        meta = _read_metadata(p)
+    assert meta["lower_input_kind"] == "nlist"
+    # edge_capacity is a graph-only artifact constant; the dense path omits it.
+    assert "edge_capacity" not in meta