From b87ac5010bba6945a02d7684e83a54c7da552c9c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 18:42:24 +0800 Subject: [PATCH 01/33] feat(pt_expt): forward_common_lower_graph_exportable trace target for graph .pt2 export MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add make_model.forward_common_lower_graph_exportable: make_fx trace of forward_common_lower_graph with edge_vec as the autograd leaf; symbolic tracing enabled via two data-dependent shape fixes. - Add EnergyModel.forward_lower_graph_exportable: wraps the above with a second make_fx pass that translates internal keys to public names (atom_energy, energy, force, virial, atom_virial). - Fix edge_transform_output.py: replace int(n_node.sum()) with next(iter(fit_ret.values())).shape[0] (static shape attr, trace-safe); thread node_capacity through edge_energy_deriv -> edge_force_virial so that segment_sum sizes are static under make_fx symbolic mode. - Add edge_energy_deriv node_capacity param (None = eager fallback). - Test: source/tests/pt_expt/model/test_graph_export.py — TDD RED->GREEN, verifies traced module reproduces eager energy_redu (rtol=1e-10). 
--- deepmd/pt_expt/model/edge_transform_output.py | 17 ++- deepmd/pt_expt/model/ener_model.py | 103 ++++++++++++++++++ deepmd/pt_expt/model/make_model.py | 78 +++++++++++++ .../tests/pt_expt/model/test_graph_export.py | 74 +++++++++++++ 4 files changed, 269 insertions(+), 3 deletions(-) create mode 100644 source/tests/pt_expt/model/test_graph_export.py diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py index 565e155157..1eb3e4363b 100644 --- a/deepmd/pt_expt/model/edge_transform_output.py +++ b/deepmd/pt_expt/model/edge_transform_output.py @@ -31,6 +31,7 @@ def edge_energy_deriv( n_node: torch.Tensor, do_atomic_virial: bool = False, create_graph: bool = False, + node_capacity: int | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]: """Return (force, atom_virial_or_None, virial) from a graph energy. @@ -53,6 +54,10 @@ def edge_energy_deriv( whether to materialize the per-atom virial (else ``None`` is returned). create_graph whether the backward retains a graph (training, for second-order grad). + node_capacity + Static node-axis size ``N``. ``None`` (eager default) falls back to + ``int(n_node.sum())``. Pass a static value (e.g. ``atype.shape[0]``) + to keep this function trace-safe under ``make_fx``/``torch.export``. Returns ------- @@ -70,7 +75,7 @@ def edge_energy_deriv( retain_graph=True, ) force, atom_virial, virial = edge_force_virial( - g_e, edge_vec, edge_index, edge_mask, n_node + g_e, edge_vec, edge_index, edge_mask, n_node, node_capacity=node_capacity ) return force, (atom_virial if do_atomic_virial else None), virial @@ -132,8 +137,13 @@ def fit_output_to_model_output_graph( n_node = graph.n_node redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION nf = int(n_node.shape[0]) - N = int(n_node.sum()) - frame_id = frame_id_from_n_node(n_node) # (N,) int64 frame index per atom + # Derive N from the fitting output's leading shape rather than int(n_node.sum()). 
+ # shape attributes are always static Python ints (or SymInts in symbolic-mode + # tracing) and are trace-safe; reading a tensor VALUE via int() is not. + N = next(iter(fit_ret.values())).shape[0] + frame_id = frame_id_from_n_node( + n_node, n_total=N + ) # (N,) int64 frame index per atom model_ret: dict[str, torch.Tensor] = dict(fit_ret.items()) for kk, vv in fit_ret.items(): vdef = fit_output_def[kk] @@ -174,6 +184,7 @@ def fit_output_to_model_output_graph( n_node, do_atomic_virial=(vdef.c_differentiable and do_atomic_virial), create_graph=create_graph, + node_capacity=N, ) # force (N, 3) -> (N, 1, 3) [flat; caller unravels at I/O boundary] ff_list.append(force.reshape(N, 1, 3)) diff --git a/deepmd/pt_expt/model/ener_model.py b/deepmd/pt_expt/model/ener_model.py index 6347382135..af9806f083 100644 --- a/deepmd/pt_expt/model/ener_model.py +++ b/deepmd/pt_expt/model/ener_model.py @@ -247,3 +247,106 @@ def fn( return make_fx(fn, **make_fx_kwargs)( extended_coord, extended_atype, nlist, mapping, fparam, aparam, charge_spin ) + + def forward_lower_graph_exportable( + self, + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None = None, + aparam: torch.Tensor | None = None, + do_atomic_virial: bool = False, + charge_spin: torch.Tensor | None = None, + **make_fx_kwargs: Any, + ) -> torch.nn.Module: + """Trace ``forward_common_lower_graph`` into an exportable module with + public output keys. + + Delegates to ``forward_common_lower_graph_exportable`` for tracing, + then translates the internal keys to the ``forward_lower`` convention. + + Parameters + ---------- + atype + (N,) flat local atom types, ``N == sum(n_node)``. + n_node + (nf,) per-frame local atom counts. + edge_index + (2, E) ``[src, dst]`` edge endpoints (flat local indices). + edge_vec + (E, 3) neighbor-minus-center edge vectors (sample for tracing). + edge_mask + (E,) valid-edge mask (sample for tracing). 
+ fparam, aparam, do_atomic_virial, charge_spin + As in ``forward_lower``. + **make_fx_kwargs + Extra keyword arguments forwarded to ``make_fx`` + (e.g. ``tracing_mode="symbolic"``). + + Returns + ------- + torch.nn.Module + A traced module whose ``forward`` accepts + ``(atype, n_node, edge_index, edge_vec, edge_mask, + fparam, aparam, charge_spin)`` and returns a dict with the + public keys: ``atom_energy``, ``energy``, ``force``, + ``virial``, ``atom_virial``. + """ + traced = self.forward_common_lower_graph_exportable( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + fparam=fparam, + aparam=aparam, + charge_spin=charge_spin, + do_atomic_virial=do_atomic_virial, + **make_fx_kwargs, + ) + + # Translate internal keys to public convention. + # Capture model config at trace time via closures. + do_grad_r = self.do_grad_r("energy") + do_grad_c = self.do_grad_c("energy") + + def fn( + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + ) -> dict[str, torch.Tensor]: + model_ret = traced( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + fparam, + aparam, + charge_spin, + ) + model_predict: dict[str, torch.Tensor] = {} + model_predict["atom_energy"] = model_ret["energy"] + model_predict["energy"] = model_ret["energy_redu"] + if do_grad_r: + model_predict["force"] = model_ret["energy_derv_r"].squeeze(-2) + if do_grad_c: + model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze( + -2 + ) + if "mask" in model_ret: + model_predict["mask"] = model_ret["mask"] + return model_predict + + return make_fx(fn, **make_fx_kwargs)( + atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin + ) diff --git a/deepmd/pt_expt/model/make_model.py 
b/deepmd/pt_expt/model/make_model.py index 50ede240e4..cb2689c449 100644 --- a/deepmd/pt_expt/model/make_model.py +++ b/deepmd/pt_expt/model/make_model.py @@ -630,6 +630,84 @@ def fn( model.need_sorted_nlist_for_lower = _orig_need_sort return traced + def forward_common_lower_graph_exportable( + self, + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None = None, + aparam: torch.Tensor | None = None, + do_atomic_virial: bool = False, + charge_spin: torch.Tensor | None = None, + **make_fx_kwargs: Any, + ) -> torch.nn.Module: + """make_fx trace of ``forward_common_lower_graph`` with ``edge_vec`` + as the autograd leaf — the export target for graph-form .pt2 archives. + + Parameters + ---------- + atype + (N,) flat local atom types, ``N == sum(n_node)``. + n_node + (nf,) per-frame local atom counts. + edge_index + (2, E) ``[src, dst]`` edge endpoints (flat local indices). + edge_vec + (E, 3) neighbor-minus-center edge vectors (sample for tracing). + edge_mask + (E,) valid-edge mask (sample for tracing). + fparam, aparam, do_atomic_virial, charge_spin + As in ``forward_common_lower_graph``. + **make_fx_kwargs + Extra keyword arguments forwarded to ``make_fx`` + (e.g. ``tracing_mode="symbolic"``). + + Returns + ------- + torch.nn.Module + A traced module whose ``forward`` accepts + ``(atype, n_node, edge_index, edge_vec, edge_mask, + fparam, aparam, charge_spin)`` and returns a dict with the + same internal keys as ``forward_common_lower_graph``. 
+ """ + model = self + + def fn( + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + ) -> dict[str, torch.Tensor]: + ev = edge_vec.detach().requires_grad_(True) + return model.forward_common_lower_graph( + atype, + n_node, + edge_index, + ev, + edge_mask, + do_atomic_virial=do_atomic_virial, + fparam=fparam, + aparam=aparam, + charge_spin=charge_spin, + ) + + return make_fx(fn, **make_fx_kwargs)( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + fparam, + aparam, + charge_spin, + ) + def forward_common_lower_exportable_with_comm( self, extended_coord: torch.Tensor, diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py new file mode 100644 index 0000000000..7738595dae --- /dev/null +++ b/source/tests/pt_expt/model/test_graph_export.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Graph-lower export: forward_common_lower_graph_exportable traces + torch.export.""" + +import torch +from deepmd.pt.utils import env +from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1 +from deepmd.pt_expt.fitting import InvarFitting +from deepmd.pt_expt.model import EnergyModel +from deepmd.dpmodel.utils.neighbor_graph import build_neighbor_graph +from ...seed import GLOBAL_SEED + +_RCUT, _NT = 4.0, 2 + + +def _model(): + ds = DescrptDPA1( + _RCUT, + 0.5, + 20, + _NT, + neuron=[3, 6], + axis_neuron=2, + attn_layer=0, + precision="float64", + seed=GLOBAL_SEED, + ).to(env.DEVICE) + ft = InvarFitting( + "energy", + _NT, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + precision="float64", + seed=GLOBAL_SEED, + ).to(env.DEVICE) + return EnergyModel(ds, ft, type_map=["A", "B"]).to(env.DEVICE) + + +def _graph_inputs(model): + rng = torch.Generator(device=env.DEVICE).manual_seed(GLOBAL_SEED) + nloc = 6 + coord = ( + 
torch.rand(1, nloc, 3, dtype=torch.float64, device=env.DEVICE, generator=rng) + * 3.0 + ) + atype = torch.tensor([[0, 1, 0, 1, 0, 1]], dtype=torch.int64, device=env.DEVICE) + box = torch.eye(3, dtype=torch.float64, device=env.DEVICE).reshape(1, 9) * 20.0 + g = build_neighbor_graph(coord, atype, box, model.get_rcut()) + return (atype.reshape(-1), g.n_node, g.edge_index, g.edge_vec, g.edge_mask) + + +def test_graph_exportable_traces(): + model = _model().eval() + atype, n_node, ei, ev, em = _graph_inputs(model) + gm = model.forward_common_lower_graph_exportable( + atype, + n_node, + ei, + ev, + em, + do_atomic_virial=False, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + assert isinstance(gm, torch.nn.Module) + # the traced module reproduces eager outputs + eager = model.forward_common_lower_graph( + atype, n_node, ei, ev, em, do_atomic_virial=False + ) + # traced module has placeholders for all 8 fn args (fparam/aparam/charge_spin=None) + traced = gm(atype, n_node, ei, ev, em, None, None, None) + # traced returns a tuple/dict; compare energy_redu + te = traced["energy_redu"] if isinstance(traced, dict) else traced[1] + torch.testing.assert_close(te, eager["energy_redu"], rtol=1e-10, atol=1e-10) From ee8db1b0fb9facc39edaa1c1785ca3c74b1a3bb4 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 19:39:09 +0800 Subject: [PATCH 02/33] =?UTF-8?q?fix(pt=5Fexpt):=20B1.1=20review=20?= =?UTF-8?q?=E2=80=94=20test=20forward=5Flower=5Fgraph=5Fexportable=20(both?= =?UTF-8?q?=20do=5Fatomic=5Fvirial=20branches),=20dedup=20key-translation,?= =?UTF-8?q?=20drop=20redundant=20detach?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepmd/pt_expt/model/ener_model.py | 84 ++++++++++++------- deepmd/pt_expt/model/make_model.py | 6 +- .../tests/pt_expt/model/test_graph_export.py | 47 +++++++++++ 3 files changed, 106 insertions(+), 31 deletions(-) diff --git a/deepmd/pt_expt/model/ener_model.py 
b/deepmd/pt_expt/model/ener_model.py index af9806f083..140141dd5e 100644 --- a/deepmd/pt_expt/model/ener_model.py +++ b/deepmd/pt_expt/model/ener_model.py @@ -32,6 +32,41 @@ DPEnergyModel_ = make_model(DPEnergyAtomicModel, T_Bases=(BaseModel,)) +def _translate_energy_keys( + model_ret: dict[str, torch.Tensor], + *, + do_grad_r: bool, + do_grad_c: bool, + do_atomic_virial: bool, + local: bool, +) -> dict[str, torch.Tensor]: + """Map internal fitting keys -> public energy-model keys (shared by the + dense and graph ``forward_lower`` export traces). + + Operates on plain dicts (make_fx-safe). ``local=True`` is the GRAPH path + (per-node ``N == sum(n_node)`` local atoms, no ghost/extended region) and + emits ``force``/``atom_virial``; ``local=False`` is the DENSE extended-region + path and emits ``extended_force``/``extended_virial`` (folded to local by + ``communicate_extended_output`` at inference). + """ + out: dict[str, torch.Tensor] = {} + out["atom_energy"] = model_ret["energy"] + out["energy"] = model_ret["energy_redu"] + if do_grad_r: + out["force" if local else "extended_force"] = model_ret[ + "energy_derv_r" + ].squeeze(-2) + if do_grad_c: + out["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + out["atom_virial" if local else "extended_virial"] = model_ret[ + "energy_derv_c" + ].squeeze(-2) + if "mask" in model_ret: + out["mask"] = model_ret["mask"] + return out + + @BaseModel.register("ener") @BaseModel.register("sezm_ener") @BaseModel.register("dpa4_ener") @@ -229,20 +264,13 @@ def fn( aparam, charge_spin, ) - model_predict: dict[str, torch.Tensor] = {} - model_predict["atom_energy"] = model_ret["energy"] - model_predict["energy"] = model_ret["energy_redu"] - if do_grad_r: - model_predict["extended_force"] = model_ret["energy_derv_r"].squeeze(-2) - if do_grad_c: - model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) - if do_atomic_virial: - model_predict["extended_virial"] = model_ret[ - "energy_derv_c" - 
].squeeze(-2) - if "mask" in model_ret: - model_predict["mask"] = model_ret["mask"] - return model_predict + return _translate_energy_keys( + model_ret, + do_grad_r=do_grad_r, + do_grad_c=do_grad_c, + do_atomic_virial=do_atomic_virial, + local=False, + ) return make_fx(fn, **make_fx_kwargs)( extended_coord, extended_atype, nlist, mapping, fparam, aparam, charge_spin @@ -292,7 +320,12 @@ def forward_lower_graph_exportable( ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin)`` and returns a dict with the public keys: ``atom_energy``, ``energy``, ``force``, - ``virial``, ``atom_virial``. + ``virial``, ``atom_virial`` (the last only when + ``do_atomic_virial``). Unlike the dense + :meth:`forward_lower_exportable` (which emits ``extended_force`` / + ``extended_virial`` over the ghost-padded extended region), the + graph path is LOCAL-only (``N == sum(n_node)`` nodes, no ghosts), + so it emits ``force`` / ``atom_virial`` directly. """ traced = self.forward_common_lower_graph_exportable( atype, @@ -332,20 +365,13 @@ def fn( aparam, charge_spin, ) - model_predict: dict[str, torch.Tensor] = {} - model_predict["atom_energy"] = model_ret["energy"] - model_predict["energy"] = model_ret["energy_redu"] - if do_grad_r: - model_predict["force"] = model_ret["energy_derv_r"].squeeze(-2) - if do_grad_c: - model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) - if do_atomic_virial: - model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze( - -2 - ) - if "mask" in model_ret: - model_predict["mask"] = model_ret["mask"] - return model_predict + return _translate_energy_keys( + model_ret, + do_grad_r=do_grad_r, + do_grad_c=do_grad_c, + do_atomic_virial=do_atomic_virial, + local=True, + ) return make_fx(fn, **make_fx_kwargs)( atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py index cb2689c449..3e14ed2d56 100644 --- 
a/deepmd/pt_expt/model/make_model.py +++ b/deepmd/pt_expt/model/make_model.py @@ -684,12 +684,14 @@ def fn( aparam: torch.Tensor | None, charge_spin: torch.Tensor | None, ) -> dict[str, torch.Tensor]: - ev = edge_vec.detach().requires_grad_(True) + # forward_common_lower_graph creates the autograd leaf from + # edge_vec internally, so no outer detach/requires_grad_ here + # (it would only add spurious ops to the traced graph). return model.forward_common_lower_graph( atype, n_node, edge_index, - ev, + edge_vec, edge_mask, do_atomic_virial=do_atomic_virial, fparam=fparam, diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py index 7738595dae..56e2d6eb7b 100644 --- a/source/tests/pt_expt/model/test_graph_export.py +++ b/source/tests/pt_expt/model/test_graph_export.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Graph-lower export: forward_common_lower_graph_exportable traces + torch.export.""" +import pytest import torch from deepmd.pt.utils import env from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1 @@ -72,3 +73,49 @@ def test_graph_exportable_traces(): # traced returns a tuple/dict; compare energy_redu te = traced["energy_redu"] if isinstance(traced, dict) else traced[1] torch.testing.assert_close(te, eager["energy_redu"], rtol=1e-10, atol=1e-10) + + +@pytest.mark.parametrize("do_atomic_virial", [False, True]) # both branches of the bool +def test_forward_lower_graph_exportable_public_keys(do_atomic_virial): + """EnergyModel.forward_lower_graph_exportable: traces the public-key path and + reproduces eager energy/force; atom_virial present iff do_atomic_virial. 
+ """ + model = _model().eval() + atype, n_node, ei, ev, em = _graph_inputs(model) + gm = model.forward_lower_graph_exportable( + atype, + n_node, + ei, + ev, + em, + do_atomic_virial=do_atomic_virial, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + assert isinstance(gm, torch.nn.Module) + out = gm(atype, n_node, ei, ev, em, None, None, None) + + # public key set (graph path is local-only: force/atom_virial, NOT extended_*) + assert "atom_energy" in out and "energy" in out and "force" in out + assert "virial" in out + assert "extended_force" not in out and "extended_virial" not in out + # atom_virial appears ONLY when do_atomic_virial=True + assert ("atom_virial" in out) == do_atomic_virial + + # values match the eager graph lower + eager = model.forward_common_lower_graph( + atype, n_node, ei, ev, em, do_atomic_virial=do_atomic_virial + ) + torch.testing.assert_close( + out["energy"], eager["energy_redu"], rtol=1e-10, atol=1e-10 + ) + torch.testing.assert_close( + out["force"], eager["energy_derv_r"].squeeze(-2), rtol=1e-10, atol=1e-10 + ) + if do_atomic_virial: + torch.testing.assert_close( + out["atom_virial"], + eager["energy_derv_c"].squeeze(-2), + rtol=1e-10, + atol=1e-10, + ) From 7437d549057d8e5fb5114c51d70e48f8cdc3b17d Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 19:44:53 +0800 Subject: [PATCH 03/33] test(dpmodel): codify static edge_capacity contract for build_neighbor_graph Adds test_graph_static_capacity.py (4 tests): shape (2,64), real-edge count matches dynamic, real prefix identity + masked tail, overflow raises. No source change needed -- builder.py already threads layout.edge_capacity into pad_and_guard_edges. 
--- .../dpmodel/test_graph_static_capacity.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 source/tests/common/dpmodel/test_graph_static_capacity.py diff --git a/source/tests/common/dpmodel/test_graph_static_capacity.py b/source/tests/common/dpmodel/test_graph_static_capacity.py new file mode 100644 index 0000000000..c9ff982d01 --- /dev/null +++ b/source/tests/common/dpmodel/test_graph_static_capacity.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Tests for static edge_capacity masked padding in build_neighbor_graph. + +Codifies the contract: build_neighbor_graph(..., layout=GraphLayout(edge_capacity=E_max)) +returns a NeighborGraph whose edge_index/edge_vec/edge_mask have a STATIC leading +edge dim E_max (real edges in the compact prefix, edge_mask=False tail), so export +sees a fixed E. Edge overflow must raise ValueError. +""" + +import numpy as np +import pytest + +from deepmd.dpmodel.utils.neighbor_graph import ( + GraphLayout, + build_neighbor_graph, +) + + +class TestStaticEdgeCapacity: + """Tests for static edge_capacity masked padding via build_neighbor_graph.""" + + @pytest.fixture() + def small_system(self): + """6-atom periodic system with a 20 Å box (atoms well within rcut=4 range).""" + rng = np.random.default_rng(0) + coord = rng.normal(size=(1, 6, 3)) * 1.5 + atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64) + box = np.eye(3).reshape(1, 9) * 20.0 + return coord, atype, box + + def test_static_edge_capacity_shape(self, small_system): + """Static edge_capacity=64 yields edge_index.shape == (2, 64).""" + coord, atype, box = small_system + cap = build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64) + ) + assert cap.edge_index.shape == (2, 64) + assert cap.edge_vec.shape == (64, 3) + assert cap.edge_mask.shape == (64,) + + def test_static_edge_capacity_matches_dynamic(self, small_system): + """Static graph has same real-edge count as dynamic graph.""" + coord, 
atype, box = small_system + dyn = build_neighbor_graph(coord, atype, box, 4.0) + cap = build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64) + ) + assert cap.edge_index.shape == (2, 64) + assert int(cap.edge_mask.sum()) == int(dyn.edge_mask.sum()) + + def test_static_edge_capacity_real_prefix_matches_dynamic(self, small_system): + """The real-edge prefix of the static graph matches the dynamic graph.""" + coord, atype, box = small_system + dyn = build_neighbor_graph(coord, atype, box, 4.0) + cap = build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64) + ) + n_real = int(dyn.edge_mask.sum()) + # real prefix must match exactly + np.testing.assert_array_equal( + cap.edge_index[:, :n_real], dyn.edge_index[:, :n_real] + ) + np.testing.assert_allclose(cap.edge_vec[:n_real], dyn.edge_vec[:n_real]) + # padding suffix must have edge_mask=False + assert not np.any(cap.edge_mask[n_real:]) + + def test_overflow_raises(self, small_system): + """edge_capacity smaller than real edge count must raise ValueError.""" + coord, atype, box = small_system + # capacity=1 is guaranteed to be smaller than the real edge count + with pytest.raises(ValueError, match="edge overflow"): + build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=1) + ) From 1051a0d9d517b1f2557cfb5b9daf64dcb684db45 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 19:55:36 +0800 Subject: [PATCH 04/33] feat(pt_expt): graph .pt2 export branch + lower_input_kind metadata --- deepmd/pt_expt/utils/serialization.py | 282 +++++++++++++++++- .../pt_expt/utils/test_graph_pt2_metadata.py | 104 +++++++ 2 files changed, 380 insertions(+), 6 deletions(-) create mode 100644 source/tests/pt_expt/utils/test_graph_pt2_metadata.py diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 9c03783574..8c1f9e0d1e 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ 
b/deepmd/pt_expt/utils/serialization.py @@ -310,6 +310,147 @@ def _make_sample_inputs( return ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin +def _make_graph_sample_inputs( + model: torch.nn.Module, + e_max: int, + nframes: int = 2, + nloc: int = 7, +) -> tuple[torch.Tensor | None, ...]: + """Create sample inputs for tracing ``forward_lower_graph``. + + Builds a small random system, runs the carry-all + :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph` with a + STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked + static edge axis), and returns tensors in the positional order expected by + :meth:`forward_lower_graph_exportable`: + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)``. + + Parameters + ---------- + model : torch.nn.Module + The pt_expt energy model (must expose ``get_rcut``/``get_type_map``/...). + e_max : int + Static edge capacity ``E`` to pad the edge axis to. + nframes : int + Number of frames in the sample system. + nloc : int + Number of local atoms per frame (``N == nframes * nloc``). + """ + from deepmd.dpmodel.utils.neighbor_graph import ( + GraphLayout, + build_neighbor_graph, + ) + + import deepmd.pt_expt.utils.env as _env + + rcut = model.get_rcut() + ntypes = len(model.get_type_map()) + dim_fparam = model.get_dim_fparam() + dim_aparam = model.get_dim_aparam() + + # Box large enough to avoid PBC degeneracy; mirrors _make_sample_inputs. 
+ box_size = rcut * 3.0 + box = np.eye(3, dtype=np.float64) * box_size + box_np = box.reshape(1, 9) + + rng = np.random.default_rng(42) + coord_np = rng.random((nframes, nloc, 3), dtype=np.float64) * box_size * 0.5 + coord_np += box_size * 0.25 # center in box + + atype_np = np.zeros((nframes, nloc), dtype=np.int64) + for i in range(nloc): + atype_np[:, i] = i % ntypes + + graph = build_neighbor_graph( + coord_np, + atype_np, + np.tile(box_np, (nframes, 1)), + rcut, + layout=GraphLayout(edge_capacity=e_max), + ) + + atype_t = torch.tensor(atype_np.reshape(-1), dtype=torch.int64, device=_env.DEVICE) + n_node_t = torch.tensor( + np.asarray(graph.n_node), dtype=torch.int64, device=_env.DEVICE + ) + edge_index_t = torch.tensor( + np.asarray(graph.edge_index), dtype=torch.int64, device=_env.DEVICE + ) + edge_vec_t = torch.tensor( + np.asarray(graph.edge_vec), dtype=torch.float64, device=_env.DEVICE + ) + edge_mask_t = torch.tensor( + np.asarray(graph.edge_mask), dtype=torch.bool, device=_env.DEVICE + ) + + if dim_fparam > 0: + fparam = torch.zeros( + nframes, dim_fparam, dtype=torch.float64, device=_env.DEVICE + ) + else: + fparam = None + + if dim_aparam > 0: + aparam = torch.zeros( + nframes, nloc, dim_aparam, dtype=torch.float64, device=_env.DEVICE + ) + else: + aparam = None + + dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0 + if dim_chg_spin > 0: + charge_spin = torch.zeros( + nframes, dim_chg_spin, dtype=torch.float64, device=_env.DEVICE + ) + else: + charge_spin = None + + return ( + atype_t, + n_node_t, + edge_index_t, + edge_vec_t, + edge_mask_t, + fparam, + aparam, + charge_spin, + ) + + +def _build_graph_dynamic_shapes( + *sample_inputs: torch.Tensor | None, +) -> tuple: + """Build dynamic-shape specifications for the graph-form forward_lower export. 
+ + ``nframes`` (the ``n_node`` axis) and ``N`` (the flat node axis) are + dynamic dims; the edge axis ``E`` is STATIC (decision #16: the masked + ``edge_capacity`` path), expressed by leaving the edge dims unmarked + (``None``) so torch.export specialises them to the sample value. + + Parameters + ---------- + *sample_inputs : torch.Tensor | None + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)`` — 8 entries matching ``forward_lower_graph_exportable``. + """ + fparam = sample_inputs[5] + aparam = sample_inputs[6] + charge_spin = sample_inputs[7] + nframes_dim = torch.export.Dim("nframes", min=1) + n_node_total_dim = torch.export.Dim("n_node_total", min=1) + return ( + {0: n_node_total_dim}, # atype: (N,) + {0: nframes_dim}, # n_node: (nf,) + None, # edge_index: (2, E) — E static + None, # edge_vec: (E, 3) — E static + None, # edge_mask: (E,) — E static + {0: nframes_dim} if fparam is not None else None, # fparam: (nf, ndf) + {0: nframes_dim} if aparam is not None else None, # aparam: (nf, nloc, nda) + {0: nframes_dim} if charge_spin is not None else None, # charge_spin + ) + + def _build_dynamic_shapes( *sample_inputs: torch.Tensor | None, has_spin: bool = False, @@ -416,7 +557,9 @@ def _build_dynamic_shapes( return (*base, None, None, None, None, None, None, None, None) -def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: +def _collect_metadata( + model: torch.nn.Module, is_spin: bool = False, lower_kind: str = "nlist" +) -> dict: """Collect metadata from the model for C++ inference. This metadata is stored as ``metadata.json`` in both .pt2 and .pte archives. 
@@ -528,6 +671,12 @@ def _probe_has_message_passing(obj: object) -> bool | None: if result is not None: break meta["has_message_passing"] = result if result is not None else False + + # Which input schema the compiled AOTI forward consumes: + # "nlist" → dense quartet (extended_coord, extended_atype, nlist, mapping) + # "graph" → NeighborGraph (atype, n_node, edge_index, edge_vec, edge_mask) + # The C++ loader branches on this to build the matching inputs. + meta["lower_input_kind"] = "graph" if lower_kind == "graph" else "nlist" return meta @@ -599,6 +748,7 @@ def deserialize_to_file( data: dict, model_json_override: dict | None = None, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> None: """Deserialize a dictionary to a .pte or .pt2 model file. @@ -622,14 +772,22 @@ def deserialize_to_file( do_atomic_virial : bool If True, export with per-atom virial correction (3 extra backward passes, ~2.5x slower). Default False for best performance. + lower_kind : str + Which lower-forward schema the compiled AOTI graph consumes: + ``"nlist"`` (default) traces the dense quartet + (``extended_coord``/``extended_atype``/``nlist``/``mapping``); + ``"graph"`` traces the NeighborGraph schema + (``atype``/``n_node``/``edge_index``/``edge_vec``/``edge_mask``) with a + static edge axis ``E = ceil(1.25 * nloc * nnei)``. The selected schema + is recorded as ``lower_input_kind`` in ``metadata.json``. 
""" if model_file.endswith(".pt2"): _deserialize_to_file_pt2( - model_file, data, model_json_override, do_atomic_virial + model_file, data, model_json_override, do_atomic_virial, lower_kind ) else: _deserialize_to_file_pte( - model_file, data, model_json_override, do_atomic_virial + model_file, data, model_json_override, do_atomic_virial, lower_kind ) @@ -638,6 +796,7 @@ def _trace_and_export( model_json_override: dict | None = None, with_comm_dict: bool = False, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> tuple: """Common logic: build model, trace, export. @@ -663,6 +822,10 @@ def _trace_and_export( If True, the traced graph computes per-atom virial (extra autograd.grad backward passes); off by default to keep .pt2 inference fast. Mirrors PR #5407 in upstream master. + lower_kind + ``"nlist"`` (default) traces the dense quartet forward; ``"graph"`` + traces ``forward_lower_graph_exportable`` over the NeighborGraph schema + with a static edge axis. Recorded as ``lower_input_kind`` in metadata. Returns ------- @@ -700,7 +863,106 @@ def _trace_and_export( model.eval() # 2. Collect metadata - metadata = _collect_metadata(model, is_spin=is_spin) + metadata = _collect_metadata(model, is_spin=is_spin, lower_kind=lower_kind) + + # 2b. Graph-form export branch (NeighborGraph schema). The graph path is + # LOCAL-only (no ghosts), single-rank, energy-model only in PR-A/PR-B; it + # traces ``forward_lower_graph_exportable`` with a STATIC edge axis. The + # dense (nlist) path below is left byte-unchanged. 
+ if lower_kind == "graph": + import math + + if is_spin: + raise NotImplementedError( + "graph-form .pt2 export is not supported for spin models" + ) + if with_comm_dict: + raise NotImplementedError( + "graph-form .pt2 export does not support the with-comm artifact " + "(multi-rank graph message passing is a later PR)" + ) + if not hasattr(model, "forward_lower_graph_exportable"): + raise NotImplementedError( + f"model {type(model).__name__} has no " + "forward_lower_graph_exportable; graph-form .pt2 export " + "requires an energy model" + ) + + # Static export edge capacity E_max = ceil(1.25 * nloc * nnei) + # (decision #12 headroom). nloc is the sample-system local-atom count. + nloc_sample = 7 + nnei = sum(model.get_sel()) + e_max = math.ceil(1.25 * nloc_sample * nnei) + + _orig_device = _env.DEVICE + _env.DEVICE = torch.device("cpu") + try: + sample_inputs = _make_graph_sample_inputs( + model, e_max=e_max, nframes=2, nloc=nloc_sample + ) + finally: + _env.DEVICE = _orig_device + + ( + atype_g, + n_node_g, + edge_index_g, + edge_vec_g, + edge_mask_g, + fparam_g, + aparam_g, + charge_spin_g, + ) = sample_inputs + + # Trace via make_fx on CPU (decomposes autograd.grad into aten ops). 
+ traced = model.forward_lower_graph_exportable( + atype_g, + n_node_g, + edge_index_g, + edge_vec_g, + edge_mask_g, + fparam=fparam_g, + aparam=aparam_g, + do_atomic_virial=do_atomic_virial, + charge_spin=charge_spin_g, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + sample_out = traced( + atype_g, + n_node_g, + edge_index_g, + edge_vec_g, + edge_mask_g, + fparam_g, + aparam_g, + charge_spin_g, + ) + output_keys = list(sample_out.keys()) + + dynamic_shapes = _build_graph_dynamic_shapes(*sample_inputs) + exported = torch.export.export( + traced, + sample_inputs, + dynamic_shapes=dynamic_shapes, + strict=False, + prefer_deferred_runtime_asserts_over_guards=True, + ) + + if target_device.type != "cpu": + from torch.export.passes import ( + move_to_device_pass, + ) + + exported = move_to_device_pass(exported, target_device) + + metadata["do_atomic_virial"] = do_atomic_virial + + json_source = model_json_override if model_json_override is not None else data + data_for_json = deepcopy(json_source) + data_for_json = _numpy_to_json_serializable(data_for_json) + + return exported, metadata, data_for_json, output_keys # 3. 
Create sample inputs on CPU for tracing # torch.export's duck-sizing unifies dimensions with the same sample value, @@ -917,10 +1179,14 @@ def _deserialize_to_file_pte( data: dict, model_json_override: dict | None = None, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> None: """Deserialize a dictionary to a .pte model file.""" exported, metadata, data_for_json, output_keys = _trace_and_export( - data, model_json_override, do_atomic_virial=do_atomic_virial + data, + model_json_override, + do_atomic_virial=do_atomic_virial, + lower_kind=lower_kind, ) model_def_script = data.get("model_def_script") or {} @@ -939,6 +1205,7 @@ def _deserialize_to_file_pt2( data: dict, model_json_override: dict | None = None, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> None: """Deserialize a dictionary to a .pt2 model file (AOTInductor). @@ -976,7 +1243,10 @@ def _deserialize_to_file_pt2( # First artifact: regular (no comm). Always produced. exported, metadata, data_for_json, output_keys = _trace_and_export( - data, model_json_override, do_atomic_virial=do_atomic_virial + data, + model_json_override, + do_atomic_virial=do_atomic_virial, + lower_kind=lower_kind, ) metadata["output_keys"] = output_keys diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py new file mode 100644 index 0000000000..ffbf2ae9e7 --- /dev/null +++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Graph-form ``.pt2`` export: ``lower_input_kind`` metadata branch. + +Covers both branches of the ``lower_kind`` selector on +``deserialize_to_file``: ``"graph"`` traces ``forward_lower_graph_exportable`` +over the NeighborGraph schema and records ``lower_input_kind == "graph"`` in +``metadata.json``; the default (``"nlist"``) traces the dense quartet and +records ``lower_input_kind == "nlist"``. 
+""" + +import copy +import json +import os +import tempfile +import zipfile + +import pytest + +from deepmd.pt_expt.utils.serialization import ( + deserialize_to_file, +) + +# dpa1 with attn_layer == 0 — the energy model exercised by the graph path. +DPA1_CONFIG = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, +} + + +def _build_dpa1_data() -> dict: + """Build a serialized dpmodel data dict for a dpa1(attn_layer=0) energy model.""" + from deepmd.dpmodel.model.model import ( + get_model, + ) + + model = get_model(copy.deepcopy(DPA1_CONFIG)) + return { + "model": model.serialize(), + "model_def_script": copy.deepcopy(DPA1_CONFIG), + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + + +def _read_metadata(pt2_path: str) -> dict: + """Read ``model/extra/metadata.json`` from a ``.pt2`` ZIP archive.""" + with zipfile.ZipFile(pt2_path, "r") as zf: + raw = zf.read("model/extra/metadata.json").decode("utf-8") + return json.loads(raw) + + +@pytest.fixture(scope="module") +def dpa1_dpmodel_data() -> dict: + return _build_dpa1_data() + + +def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None: + """``lower_kind="graph"`` -> metadata ``lower_input_kind == "graph"``.""" + with tempfile.TemporaryDirectory() as d: + p = os.path.join(d, "m_graph.pt2") + deserialize_to_file( + p, + copy.deepcopy(dpa1_dpmodel_data), + do_atomic_virial=True, + lower_kind="graph", + ) + meta = _read_metadata(p) + assert meta["lower_input_kind"] == "graph" + + +def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None: + """Default 
(``lower_kind="nlist"``) -> metadata ``lower_input_kind == "nlist"``.""" + with tempfile.TemporaryDirectory() as d: + p = os.path.join(d, "m_dense.pt2") + deserialize_to_file( + p, + copy.deepcopy(dpa1_dpmodel_data), + do_atomic_virial=True, + ) + meta = _read_metadata(p) + assert meta["lower_input_kind"] == "nlist" From 148fa0e7bdff0aa73a0b3d6a4a3f7c26330e2ce4 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 20:03:18 +0800 Subject: [PATCH 05/33] =?UTF-8?q?fix(pt=5Fexpt):=20B1.3=20review=20?= =?UTF-8?q?=E2=80=94=20persist=20static=20edge=5Fcapacity=20(E=5Fmax)=20in?= =?UTF-8?q?=20graph=20.pt2=20metadata=20for=20the=20C++=20hub=20(B2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepmd/pt_expt/utils/serialization.py | 5 +++++ source/tests/pt_expt/utils/test_graph_pt2_metadata.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 8c1f9e0d1e..dcb8aa58c1 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -957,6 +957,11 @@ def _trace_and_export( exported = move_to_device_pass(exported, target_device) metadata["do_atomic_virial"] = do_atomic_virial + # The edge axis is specialized STATIC: torch.export bakes E to exactly + # e_max, so the AOTI forward only accepts edge tensors of this length. + # Persist it so the C++ conversion hub (PR-B Phase B2) pads/masks runtime + # edges to precisely this value instead of re-deriving the constant. 
+ metadata["edge_capacity"] = e_max json_source = model_json_override if model_json_override is not None else data data_for_json = deepcopy(json_source) diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py index ffbf2ae9e7..5c38b5046d 100644 --- a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py +++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py @@ -89,6 +89,10 @@ def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None: ) meta = _read_metadata(p) assert meta["lower_input_kind"] == "graph" + # the static edge axis is baked into the AOTI artifact; E_max must be + # persisted so the C++ conversion hub (PR-B B2) pads runtime edges to it. + # E_max = ceil(1.25 * nloc_sample(7) * nnei(sum(sel)=30)) = 263. + assert meta["edge_capacity"] == 263 def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None: @@ -102,3 +106,5 @@ def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None: ) meta = _read_metadata(p) assert meta["lower_input_kind"] == "nlist" + # edge_capacity is a graph-only artifact constant; the dense path omits it. + assert "edge_capacity" not in meta From ce2fd12a5728e78f77ae0bdcf2a070e613db8207 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 20:28:35 +0800 Subject: [PATCH 06/33] test(pt_expt): graph .pt2 DeepEval parity vs eager dense dpa1 (pbc+nopbc, 1e-10) Add a graph-form .pt2 DeepEval dispatch and parity test (NeighborGraph PR-B Phase B1.4). DeepEval: route lower_input_kind=="graph" archives through a new _eval_model_graph that builds a carry-all NeighborGraph (padded to the static edge_capacity baked in metadata), feeds the positional schema (atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin) to the AOTI forward, and reshapes the LOCAL public outputs by category (no communicate_extended_output, since the graph path is ghost-free). The dense (nlist) path is untouched. 
Fix dynamic-shape generalization of the graph .pt2 export: under symbolic make_fx/torch.export, int() on a SymInt SPECIALIZES the axis, baking the trace-time sample size (N=14, nf=2) into the autograd-derived force/energy/ virial outputs. Drop int() on node_capacity (edge_force_virial) and on n_node.shape[0] (fit_output_to_model_output_graph), and derive nf-1 in frame_id_from_n_node as a runtime 0-d tensor instead of xp.asarray(shape-1). The exported graph now generalizes across arbitrary N and nf. Test: dpa1(attn_layer=0) energy model exported with lower_kind="graph", evaluated through the pt_expt DeepPot and compared to the eager dense (sel-capped, neighbor_graph_method="legacy") reference at rtol=atol=1e-10 for pbc and nopbc. The fixture is a sparse 8-atom cluster and asserts non-binding sel (max neighbors < 30) so the carry-all/sel-capped neighbor sets coincide (anti-vacuity). --- .../utils/neighbor_graph/derivatives.py | 10 +- deepmd/dpmodel/utils/neighbor_graph/graph.py | 9 +- deepmd/pt_expt/infer/deep_eval.py | 113 +++++++++ deepmd/pt_expt/model/edge_transform_output.py | 6 +- .../pt_expt/infer/test_graph_deepeval.py | 219 ++++++++++++++++++ 5 files changed, 351 insertions(+), 6 deletions(-) create mode 100644 source/tests/pt_expt/infer/test_graph_deepeval.py diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py index 494e97a0c9..2c8c50eaca 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py +++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py @@ -80,9 +80,13 @@ def edge_force_virial( frame via the frame of their ``dst`` node. """ xp = array_api_compat.array_namespace(g_e) - # node-axis size; when a static ``node_capacity`` is supplied (the jax/export - # path) short-circuit so we never call int() on the traced ``sum(n_node)``. 
- n_out = int(node_capacity) if node_capacity is not None else int(xp.sum(n_node)) + # node-axis size; when a ``node_capacity`` is supplied (the jax/export path) + # use it AS-IS so we never call int() on the traced ``sum(n_node)`` -- and, + # crucially, never on ``node_capacity`` itself: under symbolic make_fx / + # torch.export it is a SymInt (``atype.shape[0]``); ``int(SymInt)`` would + # SPECIALIZE the node axis to the trace-time sample size, baking a constant + # ``N`` into the scatter and breaking dynamic-``N`` inference. + n_out = node_capacity if node_capacity is not None else int(xp.sum(n_node)) nf = n_node.shape[0] # zero padding/guard contributions; cast mask to g's dtype (array-API pure, # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict) diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py index e527a84bf0..0ce10efdf6 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/graph.py +++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py @@ -153,13 +153,18 @@ def frame_id_from_n_node(n_node: Array, n_total: int | None = None) -> Array: dev = array_api_compat.device(n_node) if n_total is None: n_total = int(xp.sum(n_node)) - nf = n_node.shape[0] idx = xp.arange(n_total, dtype=n_node.dtype, device=dev) boundaries = xp.cumulative_sum(n_node) # (nf,) upper bounds, exclusive frame_id = xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64) # padding nodes (idx >= sum(n_node)) land at frame ``nf`` (OOB); clamp them to # the last real frame so the per-frame scatter never indexes out of range. 
- return xp.minimum(frame_id, xp.asarray(nf - 1, dtype=xp.int64, device=dev)) + # Derive ``nf - 1`` as a RUNTIME 0-d tensor (sum of ones over the frame axis) + # rather than ``xp.asarray(n_node.shape[0] - 1)``: under symbolic make_fx / + # torch.export, ``shape[0]`` is a SymInt and materializing it into a constant + # tensor SPECIALIZES the frame axis -- baking the trace-time frame count into + # every downstream per-frame reduction and breaking dynamic-``nf`` inference. + last_frame = xp.sum(xp.ones_like(n_node)) - 1 # 0-d int == nf - 1 + return xp.minimum(frame_id, xp.astype(last_frame, xp.int64)) def node_validity_mask(n_node: Array, n_total: int) -> Array: diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py index 97bba3d4a5..47bcbff731 100644 --- a/deepmd/pt_expt/infer/deep_eval.py +++ b/deepmd/pt_expt/infer/deep_eval.py @@ -66,6 +66,20 @@ import ase.neighborlist +# Public output keys emitted by the graph-form AOTI forward +# (``forward_lower_graph_exportable``) keyed by the output-variable category that +# ``request_defs`` carries. The graph path is LOCAL-only (``N == sum(n_node)`` +# nodes, no ghosts), so its outputs are already at local-atom resolution -- no +# ``communicate_extended_output`` fold-back is needed. 
+_GRAPH_CATEGORY_TO_KEY = { + OutputVariableCategory.OUT: "atom_energy", + OutputVariableCategory.REDU: "energy", + OutputVariableCategory.DERV_R: "force", + OutputVariableCategory.DERV_C_REDU: "virial", + OutputVariableCategory.DERV_C: "atom_virial", +} + + def _reshape_charge_spin( charge_spin: np.ndarray, nframes: int, dim_chg_spin: int ) -> np.ndarray: @@ -1423,6 +1437,10 @@ def _eval_model( request_defs: list[OutputVariableDef], charge_spin: np.ndarray | None = None, ) -> tuple[np.ndarray, ...]: + if self.metadata.get("lower_input_kind") == "graph": + return self._eval_model_graph( + coords, cells, atom_types, fparam, aparam, request_defs, charge_spin + ) model_inputs, mapping_t, nframes, natoms = self._prepare_inputs( coords, cells, atom_types, fparam, aparam, charge_spin ) @@ -1621,6 +1639,101 @@ def _eval_model_spin( ) return tuple(results) + def _eval_model_graph( + self, + coords: np.ndarray, + cells: np.ndarray | None, + atom_types: np.ndarray, + fparam: np.ndarray | None, + aparam: np.ndarray | None, + request_defs: list[OutputVariableDef], + charge_spin: np.ndarray | None = None, + ) -> tuple[np.ndarray, ...]: + """Evaluate a graph-form ``.pt2`` (``lower_input_kind == "graph"``). + + Builds a carry-all :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph` + from the eval system, padded to the static ``edge_capacity`` baked into + the AOTI artifact, and feeds the positional schema + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)`` to the exported forward. The forward returns the LOCAL + public keys directly, so results are reshaped without + ``communicate_extended_output``. 
+ """ + from deepmd.dpmodel.utils.neighbor_graph import ( + GraphLayout, + build_neighbor_graph, + ) + from deepmd.pt_expt.utils.env import ( + DEVICE, + ) + + nframes = coords.shape[0] + if len(atom_types.shape) == 1: + natoms = len(atom_types) + atom_types = np.tile(atom_types, nframes).reshape(nframes, -1) + else: + natoms = len(atom_types[0]) + + coord_input = coords.reshape(nframes, natoms, 3) + box_input = cells.reshape(nframes, 9) if cells is not None else None + edge_capacity = int(self.metadata["edge_capacity"]) + graph = build_neighbor_graph( + coord_input, + atom_types, + box_input, + self._rcut, + layout=GraphLayout(edge_capacity=edge_capacity), + ) + + atype_t = torch.tensor( + np.asarray(atom_types).reshape(-1), dtype=torch.int64, device=DEVICE + ) + n_node_t = torch.tensor( + np.asarray(graph.n_node), dtype=torch.int64, device=DEVICE + ) + edge_index_t = torch.tensor( + np.asarray(graph.edge_index), dtype=torch.int64, device=DEVICE + ) + edge_vec_t = torch.tensor( + np.asarray(graph.edge_vec), dtype=torch.float64, device=DEVICE + ) + edge_mask_t = torch.tensor( + np.asarray(graph.edge_mask), dtype=torch.bool, device=DEVICE + ) + + fparam_t, aparam_t = self._prepare_optional_lower_inputs( + fparam, aparam, nframes, natoms, DEVICE + ) + charge_spin_t = self._make_charge_spin_input(nframes, charge_spin) + + model_inputs = ( + atype_t, + n_node_t, + edge_index_t, + edge_vec_t, + edge_mask_t, + fparam_t, + aparam_t, + charge_spin_t, + ) + if self._is_pt2: + model_ret = self._pt2_runner(*model_inputs) + else: + model_ret = self.exported_module(*model_inputs) + + results = [] + for odef in request_defs: + shape = self._get_output_shape(odef, nframes, natoms) + gkey = _GRAPH_CATEGORY_TO_KEY.get(odef.category) + val = model_ret.get(gkey) if gkey is not None else None + if val is not None: + results.append(val.detach().cpu().numpy().reshape(shape)) + else: + results.append( + np.full(np.abs(shape), np.nan, dtype=GLOBAL_NP_FLOAT_PRECISION) + ) + return 
tuple(results) + def _get_output_shape( self, odef: OutputVariableDef, nframes: int, natoms: int ) -> list[int]: diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py index 1eb3e4363b..9202a64d8f 100644 --- a/deepmd/pt_expt/model/edge_transform_output.py +++ b/deepmd/pt_expt/model/edge_transform_output.py @@ -136,7 +136,11 @@ def fit_output_to_model_output_graph( edge_mask = graph.edge_mask n_node = graph.n_node redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION - nf = int(n_node.shape[0]) + # Keep ``nf`` as a (possibly symbolic) shape value: under symbolic make_fx / + # torch.export ``n_node`` dim-0 is the dynamic frame axis, and ``int()`` on a + # SymInt SPECIALIZES it -- baking the trace-time frame count into every + # per-frame reduction (energy_redu / virial) and breaking multi-frame infer. + nf = n_node.shape[0] # Derive N from the fitting output's leading shape rather than int(n_node.sum()). # shape attributes are always static Python ints (or SymInts in symbolic-mode # tracing) and are trace-safe; reading a tensor VALUE via int() is not. diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py new file mode 100644 index 0000000000..e2bdabf04f --- /dev/null +++ b/source/tests/pt_expt/infer/test_graph_deepeval.py @@ -0,0 +1,219 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Graph-form ``.pt2`` DeepEval parity vs the eager dense reference. + +A graph-form ``.pt2`` (exported with ``lower_kind="graph"``) carries the +NeighborGraph schema ``(atype, n_node, edge_index, edge_vec, edge_mask, ...)`` +in its AOTI forward. This test verifies that evaluating such an archive +through the pt_expt :class:`DeepPot` reproduces the eager dpa1 energy / force / +virial to ``rtol=atol=1e-10`` (fp64), for both PBC and non-PBC. 
+ +The graph path is CARRY-ALL (every neighbor within ``rcut``); the eager dense +reference is sel-capped (``sel=30``, forced via +``neighbor_graph_method="legacy"``). They coincide only at NON-BINDING ``sel`` +(max neighbor count ``< sel``), so the test fixture is a small, sparse cluster +and the non-binding condition is asserted explicitly -- otherwise the parity +would vacuously compare two different neighbor sets. +""" + +import copy +import os +import tempfile + +import numpy as np +import pytest +import torch + +from deepmd.infer import ( + DeepPot, +) +from deepmd.pt_expt.utils.env import ( + DEVICE, +) +from deepmd.pt_expt.utils.serialization import ( + deserialize_to_file, +) + +# dpa1 with attn_layer == 0 -- the energy model exercised by the graph path. +DPA1_CONFIG = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, +} + +RCUT = 6.0 +SEL = 30 + + +def _build_system() -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """A small, sparse cluster: 8 atoms inside a 5 A blob, centered in an 18 A box. + + The blob keeps every atom within ``rcut`` of at most 7 others (<< ``sel``), + so the carry-all graph neighbor set equals the sel-capped dense one. 
+ """ + rng = np.random.default_rng(20240626) + natoms = 8 + box_size = 18.0 + blob = rng.random((natoms, 3)) * 5.0 + box_size * 0.5 - 2.5 + coords = blob.reshape(1, natoms, 3) + cells = (np.eye(3) * box_size).reshape(1, 9) + atype = np.array([0, 1, 1, 0, 1, 1, 0, 1], dtype=np.int32) + return coords, cells, atype + + +def _max_neighbors( + coords: np.ndarray, cells: np.ndarray | None, atype: np.ndarray +) -> int: + """Max carry-all neighbor count per center within ``rcut`` (for the non-binding check).""" + from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, + ) + + natoms = atype.shape[0] + graph = build_neighbor_graph( + coords.reshape(1, natoms, 3), + atype.reshape(1, natoms), + cells.reshape(1, 9) if cells is not None else None, + RCUT, + ) + real = np.asarray(graph.edge_mask) + dst = np.asarray(graph.edge_index)[1][real] + counts = np.bincount(dst, minlength=natoms) + return int(counts.max()) + + +def _eager_dense_reference( + model: torch.nn.Module, + coords: np.ndarray, + cells: np.ndarray | None, + atype: np.ndarray, +) -> dict[str, np.ndarray]: + """Reference energy/force/virial from the eager dense (sel-capped) path.""" + natoms = atype.shape[0] + coord_t = torch.tensor( + coords.reshape(1, natoms, 3), dtype=torch.float64, device=DEVICE + ).requires_grad_(True) + atype_t = torch.tensor(atype.reshape(1, natoms), dtype=torch.int64, device=DEVICE) + box_t = ( + torch.tensor(cells.reshape(1, 9), dtype=torch.float64, device=DEVICE) + if cells is not None + else None + ) + ret = model.call_common( + coord_t, + atype_t, + box_t, + do_atomic_virial=True, + neighbor_graph_method="legacy", + ) + out = { + "atom_energy": ret["energy"], + "energy": ret["energy_redu"], + "force": ret["energy_derv_r"].squeeze(-2), + "virial": ret["energy_derv_c_redu"].squeeze(-2), + "atom_virial": ret["energy_derv_c"].squeeze(-2), + } + return {k: v.detach().cpu().numpy() for k, v in out.items()} + + +@pytest.fixture(scope="module") +def graph_pt2(): + """Build a 
dpa1(attn_layer=0) model and export it to a graph-form ``.pt2``. + + The AOTI compile is slow (~90 s), so it is done once per module. The eager + pt_expt model is returned alongside the archive path to serve as the dense + parity reference. + """ + from deepmd.pt_expt.model import ( + get_model, + ) + + model = get_model(copy.deepcopy(DPA1_CONFIG)).to(torch.float64) + model.eval() + data = {"model": model.serialize()} + + tmpdir = tempfile.mkdtemp() + pt2_path = os.path.join(tmpdir, "deeppot_dpa1_graph.pt2") + deserialize_to_file( + pt2_path, + copy.deepcopy(data), + do_atomic_virial=True, + lower_kind="graph", + ) + yield pt2_path, model + os.unlink(pt2_path) + os.rmdir(tmpdir) + + +@pytest.mark.parametrize("pbc", [True, False]) # periodic vs non-periodic +def test_graph_pt2_deepeval_parity(graph_pt2, pbc) -> None: + """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10.""" + pt2_path, model = graph_pt2 + coords, cells, atype = _build_system() + box = cells if pbc else None + + # Anti-vacuity: the carry-all graph and the sel-capped dense reference only + # agree when no center is sel-bound. Assert the system is non-binding. 
+ max_nn = _max_neighbors(coords, box, atype) + assert max_nn < SEL, ( + f"test system is sel-binding (max neighbors {max_nn} >= sel {SEL}); " + "carry-all graph and sel-capped dense reference would diverge" + ) + + dp = DeepPot(pt2_path) + assert dp.deep_eval.metadata["lower_input_kind"] == "graph" + + e, f, v, ae, av = dp.eval(coords, box, atype, atomic=True) + ref = _eager_dense_reference(model, coords, box, atype) + + np.testing.assert_allclose( + e.reshape(-1), + ref["energy"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="energy", + ) + np.testing.assert_allclose( + f.reshape(-1), ref["force"].reshape(-1), rtol=1e-10, atol=1e-10, err_msg="force" + ) + np.testing.assert_allclose( + v.reshape(-1), + ref["virial"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="virial", + ) + np.testing.assert_allclose( + ae.reshape(-1), + ref["atom_energy"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="atom_energy", + ) + np.testing.assert_allclose( + av.reshape(-1), + ref["atom_virial"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="atom_virial", + ) From e35fc389770f39f5c9df6d2aa48f50b8b63359b2 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 22:04:49 +0800 Subject: [PATCH 07/33] fix(pt_expt): compiled training runs the graph lower (eager==compiled); drop force_legacy_descriptor Retarget _CompiledModel to compile forward_common_lower_graph for graph-eligible descriptors (dpa1 attn_layer==0), gated by the same mixed_types()+uses_graph_lower() predicate the eager default-flip uses; se_e2_a/dpa2/dpa3 keep compiling the dense forward_lower. _trace_and_compile_graph builds a synthetic NeighborGraph with prime-distinct nf/N/E axes (no make_fx duck-shape merge) and edge_vec as the autograd leaf; _forward_graph builds the carry-all graph eagerly and unravels flat (N,*) node outputs to (nf,nloc,*). cpp.simdlen=0 for the graph compile avoids an inductor CPU scatter-vectorizer crash on the per-frame virial atomic_add. 
Also fixes an eager autograd bug in dpa1 call_graph: xp.asarray(type_embedding, device=dev) DETACHES under torch, so the type-embedding weights never trained in the graph path (grad None despite a real finite-diff dependency). make_fx traced through it, so compiled != eager and the optimizer diverged after step 0. Use type_embedding directly (mirrors the dense path); the tebd net now trains and eager==compiled to 1e-10 across the varying-natoms trajectory. Drops the force_legacy_descriptor workaround + uses_graph_lower monkeypatch. --- deepmd/dpmodel/descriptor/dpa1.py | 13 +- deepmd/pt_expt/train/training.py | 533 +++++++++++++++++++++++++- source/tests/pt_expt/test_training.py | 48 +-- 3 files changed, 554 insertions(+), 40 deletions(-) diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 27e2d68bfc..3f49b6c04f 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -757,9 +757,13 @@ def call_graph( ) # FLAT node axis (N, ...): no (nf, nloc) reshape -- ragged-native, spec. if self.concat_output_tebd: - tebd = xp.asarray(type_embedding, device=dev) + # Use type_embedding directly (mirrors the dense path's + # ``xp.take(type_embedding, ...)``): ``xp.asarray(..., device=dev)`` + # DETACHES under torch, silently severing the type-embedding weight + # gradient so the tebd net never trains; type_embedding already lives + # on the model device, so the device cast was redundant anyway. atype_local = xp.asarray(atype, device=dev) - atype_embd = xp.take(tebd, atype_local, axis=0) # (N, tebd_dim) + atype_embd = xp.take(type_embedding, atype_local, axis=0) # (N, tebd_dim) grrg = xp.concat([grrg, atype_embd], axis=-1) return grrg, rot_mat @@ -1523,7 +1527,10 @@ def call_graph( ss = rr[:, 0:1] # (E, 1) # neighbor / center type embeddings (concat mode); ghost type == owner type # so gathering by the LOCAL owner (src) reproduces the dense neighbor tebd. 
- tebd = xp.asarray(type_embedding, device=dev) + # NB: do NOT wrap in ``xp.asarray(..., device=dev)`` -- that DETACHES under + # torch and severs the type-embedding weight gradient (the tebd net would + # never train); type_embedding already lives on the model device. + tebd = type_embedding atype_embd_nlist = xp.take(tebd, nei_type, axis=0) # (E, tebd_dim) if not self.type_one_side: atype_embd_nnei = xp.take(tebd, center_type, axis=0) # (E, tebd_dim) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index bd6fdb02a3..ff685cb0c7 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -477,6 +477,25 @@ def fn( *task_buf_vals_trace, ) + return ( + _finalize_compiled_lower(traced_lower, model, was_training, compile_opts), + task_buf_order, + ) + + +def _finalize_compiled_lower( + traced_lower: "torch.fx.GraphModule", + model: torch.nn.Module, + was_training: bool, + compile_opts: dict[str, Any] | None, + extra_options: dict[str, Any] | None = None, +) -> torch.nn.Module: + """Shared post-``make_fx`` tail: strip detach, rebuild, inductor-compile. + + Used by both the dense :func:`_trace_and_compile` and the graph + :func:`_trace_and_compile_graph` so the second-order-gradient handling + (detach removal) and inductor options stay identical on both paths. + """ # make_fx inserts aten.detach.default for saved tensors used in the # decomposed autograd.grad backward ops. These detach nodes break # second-order gradient flow (d(force)/d(params) for force training). @@ -503,6 +522,8 @@ def fn( # pytorch/pytorch#174379, #178080, #179494 under # data-dependent symbolic shapes. 
} + if extra_options: + inductor_options.update(extra_options) if compile_opts: inductor_options.update(compile_opts) @@ -511,7 +532,356 @@ def fn( backend="inductor", dynamic=True, options=inductor_options, - ), task_buf_order + ) + + +def _model_uses_graph_lower(model: torch.nn.Module) -> bool: + """Whether ``model``'s eager default-flip routes through the GRAPH lower. + + Mirrors the predicate in + :meth:`~deepmd.pt_expt.model.make_model.make_model..CM._resolve_graph_method` + for ``neighbor_graph_method is None`` (the training default): a model is + graph-eligible iff it is ``mixed_types`` AND its single descriptor reports + ``uses_graph_lower() == True`` (currently only dpa1 ``attn_layer == 0``). + + When True the compiled lower must be the GRAPH ``forward_common_lower_graph`` + so the compiled path matches eager training (which already default-flips to + the carry-all graph forward); when False the dense ``forward_lower`` is + compiled (se_e2_a / dpa2 / dpa3 / linear / zbl). + """ + if not hasattr(model, "mixed_types"): + return False + try: + if not model.mixed_types(): + return False + except (AttributeError, NotImplementedError): + return False + # Linear / ZBL atomic models have no single ``descriptor`` -> dense. + descriptor = getattr(getattr(model, "atomic_model", None), "descriptor", None) + uses_graph = getattr(descriptor, "uses_graph_lower", None) + if uses_graph is None: + return False + try: + return bool(uses_graph()) + except (AttributeError, NotImplementedError): + return False + + +def _trace_and_compile_graph( + model: torch.nn.Module, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + compile_opts: dict[str, Any] | None = None, + task_buffers: dict[str, torch.Tensor] | None = None, +) -> tuple[torch.nn.Module, tuple[str, ...]]: + """Symbolic-trace ``forward_common_lower_graph`` and inductor-compile it. + + The GRAPH analogue of :func:`_trace_and_compile`. 
Builds a small synthetic + NeighborGraph with prime-controlled ``nf`` / ``N`` / ``E`` axes (so make_fx's + duck-shape unification keeps the three dynamic dims as distinct symbols), + traces ``model.forward_common_lower_graph`` with ``edge_vec`` as the autograd + leaf, and translates the internal fitting keys to the public energy-model + keys (``atom_energy`` / ``energy`` / ``force`` / ``virial``). The compiled + callable accepts the positional graph tensors plus the promoted task buffers + and returns those public keys on the FLAT node axis (``N == sum(n_node)``); + the caller (:meth:`_CompiledModel.forward`) unravels them to ``(nf, nloc, *)``. + + Parameters + ---------- + model + The (uncompiled) graph-eligible energy model. + fparam, aparam, charge_spin + Representative optional inputs (or ``None``) so the traced branch + matches what :meth:`_CompiledModel.forward` passes at run time. + compile_opts + User-supplied inductor options (merged over the built-in defaults). + task_buffers + Per-task buffers promoted to FX placeholders (see + :func:`_detect_task_buffers`). + """ + import math + + from torch._decomp import ( + get_decompositions, + ) + from torch.fx.experimental.proxy_tensor import ( + make_fx, + ) + + from deepmd.pt_expt.model.ener_model import ( + _translate_energy_keys, + ) + + was_training = model.training + # Trace in train mode so create_graph=True is captured inside the graph + # force backward (forward_common_lower_graph passes create_graph=self.training). + model.train() + + task_buf_order: tuple[str, ...] = tuple(task_buffers.keys()) if task_buffers else () + task_buf_vals_trace: tuple[torch.Tensor, ...] 
= ( + tuple(task_buffers[k] for k in task_buf_order) if task_buffers else () + ) + + _fitting: torch.nn.Module | None = None + _atomic_model: torch.nn.Module | None = None + if task_buf_order: + try: + _fitting = model.get_fitting_net() + except AttributeError: + pass + try: + _atomic_model = model.atomic_model + except AttributeError: + pass + + do_grad_r = model.do_grad_r("energy") + do_grad_c = model.do_grad_c("energy") + + # ------------------------------------------------------------------ + # Build the trace-time NeighborGraph with prime-distinct nf / N / E. + # + # make_fx (tracing_mode="symbolic") unifies dimension symbols that share a + # concrete value (duck-shape merging). The three dynamic axes of the graph + # lower must stay distinct symbols, otherwise the per-frame segment_sum + # (N -> nf) and the per-edge scatter (E -> N) bake in a false equality: + # * nf = n_node.shape[0] (per-frame reductions) + # * N = atype.shape[0] (flat node axis = sum(n_node)) + # * E = edge_vec.shape[0] (edge axis) + # They are chosen as collision-free primes vs every parameter/buffer dim. + # ------------------------------------------------------------------ + _forbidden: set[int] = { + int(_d) + for _src in (model.parameters(), model.buffers()) + for _p in _src + for _d in _p.shape + if _d > 1 + } + try: + _dim_fp = model.get_dim_fparam() + if _dim_fp > 1: + _forbidden.add(_dim_fp) + except Exception: + pass + try: + _dim_ap = model.get_dim_aparam() + if _dim_ap > 1: + _forbidden.add(_dim_ap) + except Exception: + pass + if charge_spin is not None and charge_spin.shape[-1] > 1: + _forbidden.add(int(charge_spin.shape[-1])) + for _tbv in task_buf_vals_trace: + for _d in _tbv.shape: + if _d > 1: + _forbidden.add(int(_d)) + + trace_nf = _next_safe_prime(5, _forbidden) + # nloc such that N = trace_nf * nloc is collision-free (and != trace_nf). 
+ nloc_trace = 7 + while (trace_nf * nloc_trace) in (_forbidden | {trace_nf}): + nloc_trace += 1 + trace_N = trace_nf * nloc_trace + # Static edge capacity, prime-padded to stay distinct from nf and N. + nnei = sum(model.get_sel()) + e_max_base = max(math.ceil(1.25 * nloc_trace * nnei), 7) + e_max = _next_safe_prime(e_max_base, _forbidden | {trace_nf, trace_N}) + + sample = _make_graph_trace_inputs( + model, + e_max=e_max, + nframes=trace_nf, + nloc=nloc_trace, + want_fparam=fparam is not None, + want_aparam=aparam is not None, + want_charge_spin=charge_spin is not None, + ) + ( + s_atype, + s_n_node, + s_edge_index, + s_edge_vec, + s_edge_mask, + s_fparam, + s_aparam, + s_charge_spin, + ) = sample + + def fn( + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + *task_buf_vals: torch.Tensor, + ) -> dict[str, torch.Tensor]: + # Patch task-specific buffers with the proxy tensors so make_fx records + # them as FX placeholders (mirrors the dense ``_trace_and_compile``). + originals: dict[str, torch.Tensor | None] = {} + if task_buf_order: + for name, val in zip(task_buf_order, task_buf_vals, strict=True): + if name.startswith(_AM_PREFIX): + actual = name[len(_AM_PREFIX) :] + if _atomic_model is not None: + originals[name] = _atomic_model._buffers.get(actual) + _atomic_model._buffers[actual] = val + else: + if _fitting is not None: + originals[name] = _fitting._buffers.get(name) + _fitting._buffers[name] = val + try: + # forward_common_lower_graph makes edge_vec the autograd leaf + # internally, so no outer detach/requires_grad_ here. 
+ model_ret = model.forward_common_lower_graph( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + do_atomic_virial=False, + fparam=fparam, + aparam=aparam, + charge_spin=charge_spin, + ) + return _translate_energy_keys( + model_ret, + do_grad_r=do_grad_r, + do_grad_c=do_grad_c, + do_atomic_virial=False, + local=True, + ) + finally: + for name, orig in originals.items(): + if name.startswith(_AM_PREFIX): + actual = name[len(_AM_PREFIX) :] + if _atomic_model is not None: + _atomic_model._buffers[actual] = orig + else: + if _fitting is not None: + _fitting._buffers[name] = orig + + decomp_table = get_decompositions([torch.ops.aten.silu_backward.default]) + + traced_lower = make_fx( + fn, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + decomposition_table=decomp_table, + )( + s_atype, + s_n_node, + s_edge_index, + s_edge_vec, + s_edge_mask, + s_fparam, + s_aparam, + s_charge_spin, + *task_buf_vals_trace, + ) + + # The per-frame virial reduction scatters E edges into the (nf, 3, 3) virial + # via an atomic_add; inductor's CPU vectorizer asserts on that scatter's + # scalar index (``index.is_vec``). Disable CPU SIMD for the graph lower so + # the scatter is emitted scalar — numerically this only removes a + # reduction-order source, keeping eager==compiled within fp64 tolerance. + return ( + _finalize_compiled_lower( + traced_lower, + model, + was_training, + compile_opts, + extra_options={"cpp.simdlen": 0}, + ), + task_buf_order, + ) + + +def _make_graph_trace_inputs( + model: torch.nn.Module, + e_max: int, + nframes: int, + nloc: int, + *, + want_fparam: bool, + want_aparam: bool, + want_charge_spin: bool, +) -> tuple[torch.Tensor | None, ...]: + """Build a synthetic carry-all NeighborGraph for the graph-compile trace. + + Returns positional tensors in the order + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)`` matching ``forward_common_lower_graph``. 
The edge axis is + padded to the STATIC ``e_max`` (masked) so its concrete value is a chosen + prime; ``fparam`` / ``aparam`` / ``charge_spin`` are emitted only when the + model+data path actually carries them (``want_*``), so the traced branch + matches the run-time call. + """ + from deepmd.dpmodel.utils.neighbor_graph import ( + GraphLayout, + build_neighbor_graph, + ) + + rcut = model.get_rcut() + ntypes = len(model.get_type_map()) + dim_fparam = model.get_dim_fparam() + dim_aparam = model.get_dim_aparam() + + box_size = rcut * 3.0 + box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9) + rng = np.random.default_rng(42) + coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25 + atype_np = np.zeros((nframes, nloc), dtype=np.int64) + for i in range(nloc): + atype_np[:, i] = i % ntypes + + coord_t = torch.tensor(coord_np, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) + atype_t = torch.tensor(atype_np, dtype=torch.int64, device=DEVICE) + box_t = torch.tensor( + np.tile(box_np, (nframes, 1)), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE + ) + + graph = build_neighbor_graph( + coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max) + ) + + s_atype = atype_t.reshape(-1) + s_n_node = graph.n_node + s_edge_index = graph.edge_index + s_edge_vec = graph.edge_vec + s_edge_mask = graph.edge_mask + + s_fparam = ( + torch.zeros(nframes, dim_fparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) + if (want_fparam and dim_fparam > 0) + else None + ) + s_aparam = ( + torch.zeros( + nframes, nloc, dim_aparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE + ) + if (want_aparam and dim_aparam > 0) + else None + ) + dim_cs = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0 + s_charge_spin = ( + torch.zeros(nframes, dim_cs, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) + if (want_charge_spin and dim_cs > 0) + else None + ) + + return ( + s_atype, + s_n_node, + s_edge_index, + s_edge_vec, + s_edge_mask, + 
s_fparam, + s_aparam, + s_charge_spin, + ) class _CompiledModel(torch.nn.Module): @@ -546,6 +916,9 @@ def __init__( self._compiled_by_structure: dict = ( compiled_by_structure if compiled_by_structure is not None else {} ) + # Resolved on the first forward: whether to compile the GRAPH lower + # (graph-eligible mixed_types descriptors) or the dense forward_lower. + self._graph_eligible: bool | None = None def __getattr__(self, name: str) -> Any: # Delegate unknown lookups to original_model so that callers such as @@ -579,6 +952,18 @@ def forward( nframes, nloc = atype.shape[:2] rcut = self.original_model.get_rcut() + + # Graph-eligible models (dpa1 attn_layer==0) default-flip to the carry-all + # GRAPH forward in eager training; the compiled lower must be the GRAPH + # lower too, otherwise the eager (graph) and compiled (dense) backward + # gradients diverge at fp64 accumulation and the optimizer amplifies it. + if self._graph_eligible is None: + self._graph_eligible = _model_uses_graph_lower(self.original_model) + if self._graph_eligible: + return self._forward_graph( + coord, atype, box, fparam, aparam, charge_spin, nframes, nloc, rcut + ) + sel = self.original_model.get_sel() # coord extension + nlist (data-dependent, run in eager) @@ -751,6 +1136,152 @@ def forward( out["mask"] = result["mask"] return out + def _forward_graph( + self, + coord: torch.Tensor, + atype: torch.Tensor, + box: torch.Tensor | None, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + nframes: int, + nloc: int, + rcut: float, + ) -> dict[str, torch.Tensor]: + """Carry-all GRAPH forward -> compiled ``forward_common_lower_graph``. + + Builds the carry-all NeighborGraph eagerly (the SAME builder the eager + uncompiled default-flip uses, so the graph tensors are bit-identical), + then calls the compiled graph lower. 
The graph force is per-LOCAL-node + ``(N, 3)`` with ``N == nframes * nloc`` for a single-rank carry-all graph, + so no extended->local scatter is needed; only the flat ``(N, *)`` node + keys are unravelled to ``(nf, nloc, *)`` at the I/O boundary. + """ + from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, + ) + + _model = self.original_model + + coord_3d = coord.detach().reshape(nframes, nloc, 3) + box_flat = box.detach().reshape(nframes, 9) if box is not None else None + + # Mirror the optional-input defaulting of the dense path / eager + # call_common: a model configured with fparam / charge_spin substitutes + # its default when the data omits it, so the compiled (frozen) branch + # always sees a tensor. + _dim_fparam = ( + _model.get_dim_fparam() if hasattr(_model, "get_dim_fparam") else 0 + ) + if fparam is None and _dim_fparam > 0: + _default_fparam = _model.get_default_fparam() + if _default_fparam is not None: + fparam = ( + torch.as_tensor( + _default_fparam, dtype=coord_3d.dtype, device=coord_3d.device + ) + .reshape(1, _dim_fparam) + .expand(nframes, -1) + ) + _dim_cs = ( + _model.get_dim_chg_spin() if hasattr(_model, "get_dim_chg_spin") else 0 + ) + if charge_spin is None and _dim_cs > 0: + _default_cs = _model.get_default_chg_spin() + if _default_cs is not None: + charge_spin = ( + torch.as_tensor( + _default_cs, dtype=coord_3d.dtype, device=coord_3d.device + ) + .reshape(1, _dim_cs) + .expand(nframes, -1) + ) + + # Carry-all graph (dynamic E, no edge_capacity) — identical to the eager + # uncompiled ``_call_common_graph`` builder so the two paths match. + ng = build_neighbor_graph(coord_3d, atype, box_flat, rcut) + atype_flat = atype.reshape(nframes * nloc) + + # Lazy compile of the GRAPH lower (cached per structure key). 
+ if self.compiled_forward_lower is None: + if self._structure_key in self._compiled_by_structure: + compiled_lower, buf_order = self._compiled_by_structure[ + self._structure_key + ] + log.info("Reusing compiled graph lower (shared structure, lazy).") + else: + log.info( + "Lazy compile (graph lower): tracing on first forward call " + "(structure_key=%s).", + self._structure_key, + ) + compiled_lower, buf_order = _trace_and_compile_graph( + _model, + fparam, + aparam, + charge_spin, + task_buffers=self._task_buffers, + compile_opts=self._compile_opts, + ) + self._compiled_by_structure[self._structure_key] = ( + compiled_lower, + buf_order, + ) + self.compiled_forward_lower = compiled_lower + self._task_buf_order = buf_order + self._task_buffers = None + + # Feed a detached, grad-enabled edge_vec leaf: the traced graph's internal + # ``edge_vec.detach()`` is stripped by ``_strip_saved_tensor_detach`` (as + # for the dense ext_coord leaf), so the force backward roots at this input. + edge_vec = ng.edge_vec.detach().requires_grad_(True) + + if self._task_buf_order: + try: + _fitting = _model.get_fitting_net() + _am = getattr(_model, "atomic_model", None) + _vals: list[torch.Tensor] = [] + for _name in self._task_buf_order: + if _name.startswith(_AM_PREFIX): + _actual = _name[len(_AM_PREFIX) :] + _vals.append(_am._buffers[_actual]) + else: + _vals.append(getattr(_fitting, _name)) + task_buf_vals: tuple = tuple(_vals) + except AttributeError as exc: + raise RuntimeError( + f"Compiled graph expects task buffers {self._task_buf_order!r} " + "but they could not be retrieved from the model. " + "This is a bug in the compile path." 
+ ) from exc + else: + task_buf_vals = () + + result = self.compiled_forward_lower( + atype_flat, + ng.n_node, + ng.edge_index, + edge_vec, + ng.edge_mask, + fparam, + aparam, + charge_spin, + *task_buf_vals, + ) + + # The compiled graph lower emits PUBLIC keys on the FLAT node axis + # (``atom_energy`` / ``force`` are (N, *); ``energy`` / ``virial`` are + # (nf, *)). Unravel the node-level keys to rectangular (nf, nloc, *) so + # callers receive the same shapes as the dense path. + N = nframes * nloc + out: dict[str, torch.Tensor] = {} + for key, val in result.items(): + if val is not None and val.shape[:1] == torch.Size([N]) and N != nframes: + out[key] = val.reshape(nframes, nloc, *val.shape[1:]) + else: + out[key] = val + return out + # --------------------------------------------------------------------------- # Trainer diff --git a/source/tests/pt_expt/test_training.py b/source/tests/pt_expt/test_training.py index 45061c084a..a9764947c0 100644 --- a/source/tests/pt_expt/test_training.py +++ b/source/tests/pt_expt/test_training.py @@ -1352,9 +1352,7 @@ def _make_varying_config( config = normalize(config) return config - def _check_varying_natoms( - self, descriptor: dict | None = None, force_legacy_descriptor: bool = False - ) -> None: + def _check_varying_natoms(self, descriptor: dict | None = None) -> None: """Per-step compiled-vs-uncompiled comparison for the given descriptor. The loss config has ``start_pref_f=1000`` and ``start_pref_v=1.0``, @@ -1370,17 +1368,10 @@ def _check_varying_natoms( cannot meet that on float64 the descriptor has a real numerical problem (see the DPA1 limitation note where this happened). - ``force_legacy_descriptor`` makes a graph-eligible descriptor (dpa1 - ``attn_layer==0``) take the legacy *dense* (env-mat) path on BOTH the - compiled and uncompiled sides, so this stays a true compile-correctness - check (same computation, compiled vs eager). 
The pt_expt eager default - for such a descriptor is the carry-all GRAPH forward while the compiled - ``forward_lower`` is the sel-capped DENSE forward; those are two - *different* force computations whose parameter gradients agree only to - fp64 accumulation (~1e-12), which the optimizer then amplifies into a - diverging training trajectory. Making the compiled GRAPH lower (so - eager==compiled) is tracked for PR-B; until then this test exercises the - dense path it actually compiles. + Graph-eligible descriptors (dpa1 ``attn_layer==0``) compile the GRAPH + lower (``forward_common_lower_graph``) so the compiled path matches the + eager carry-all graph default-flip; non-eligible descriptors + (se_e2_a / dpa2 / dpa3) compile the dense ``forward_lower``. """ from deepmd.pt_expt.train.training import ( _CompiledModel, @@ -1400,16 +1391,6 @@ def _check_varying_natoms( compiled_model = trainer_c.wrapper.model["Default"] self.assertIsInstance(compiled_model, _CompiledModel) - if force_legacy_descriptor: - # Pin BOTH sides to the legacy dense (env-mat) path so the - # uncompiled reference matches the dense ``forward_lower`` - # that gets compiled (must happen before the first forward, - # i.e. before the lazy compile trace). See the docstring / - # PR-B note: the graph forward vs dense forward differ in the - # backward at fp64 precision, which the optimizer amplifies. - for _m in (trainer_uc.model, compiled_model.original_model): - _m.get_descriptor().uses_graph_lower = lambda: False - # Sync weights so predictions can be compared exactly compiled_model.original_model.load_state_dict( trainer_uc.model.state_dict() @@ -1482,25 +1463,20 @@ def test_compiled_matches_uncompiled_varying_natoms_dpa3(self) -> None: self._check_varying_natoms(_DESCRIPTOR_DPA3) def test_compiled_matches_uncompiled_varying_natoms_dpa1_no_attn(self) -> None: - """DPA1 (attn_layer=0): compiled vs uncompiled match (dense path). + """DPA1 (attn_layer=0): compiled vs uncompiled match (GRAPH lower). 
- ``force_legacy_descriptor=True`` pins both sides to the legacy dense - (env-mat) forward -- the path the compiled ``forward_lower`` actually - uses. The pt_expt eager default for dpa1(attn_layer=0) is the carry-all - GRAPH forward, a *different* force computation from the compiled dense - forward; their backward gradients agree only to fp64 accumulation, which - the optimizer amplifies, so comparing graph-vs-dense through training is - ill-posed. Making the compiled path the GRAPH lower (eager==compiled) - is tracked for PR-B (graph .pt2/export). + The pt_expt eager default for dpa1(attn_layer=0) is the carry-all GRAPH + forward, and the compiled path now compiles the matching GRAPH lower + (``forward_common_lower_graph``) -- so eager==compiled and the + multi-step varying-natoms trajectory (predictions + per-parameter grads + + loss) agrees to the strict ``atol=rtol=1e-10`` tolerance. DPA1 with attention layers is intentionally not covered: the compiled se_atten path is hardware-sensitive on multi-threaded CPUs (parallel reduction order diverges from eager above the 1e-10 tolerance). ``_compile_model`` warns the user instead. """ - self._check_varying_natoms( - _DESCRIPTOR_DPA1_NO_ATTN, force_legacy_descriptor=True - ) + self._check_varying_natoms(_DESCRIPTOR_DPA1_NO_ATTN) def test_compile_warns_dpa1_with_attention(self) -> None: """DPA1 (attn_layer>0) under compile must emit a warning. 
From 47fb700efb0d0225762beacde4d315144d13e28f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 22:46:07 +0800 Subject: [PATCH 08/33] =?UTF-8?q?docs(pt=5Fexpt):=20B1=20final-review=20mi?= =?UTF-8?q?nors=20=E2=80=94=20document=20nloc=3D=3D1=20unravel-skip=20+=20?= =?UTF-8?q?legacy-gate=20assumption=20in=20the=20graph=20compile=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepmd/pt_expt/train/training.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index ff685cb0c7..6393d78e39 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -548,6 +548,11 @@ def _model_uses_graph_lower(model: torch.nn.Module) -> bool: so the compiled path matches eager training (which already default-flips to the carry-all graph forward); when False the dense ``forward_lower`` is compiled (se_e2_a / dpa2 / dpa3 / linear / zbl). + + ASSUMPTION: training uses the default ``neighbor_graph_method`` (None). If a + user-facing ``"legacy"`` opt-out is ever plumbed into the trainer, this gate + must also honor it (else eager would run dense while the compiled path runs + the graph lower, re-introducing the eager!=compiled divergence this fixes). """ if not hasattr(model, "mixed_types"): return False @@ -1276,6 +1281,12 @@ def _forward_graph( N = nframes * nloc out: dict[str, torch.Tensor] = {} for key, val in result.items(): + # ``N != nframes`` distinguishes node-level keys (lead dim N) from + # frame-level keys (lead dim nf) by shape. DEGENERATE: when nloc==1, + # N == nframes, so node-level keys are NOT unravelled and stay + # (nf, *) instead of (nf, 1, *). Harmless for the varying-natoms + # trainer (nloc >> 1); a single-atom-per-frame system would need an + # explicit per-key category check instead of the shape heuristic. 
if val is not None and val.shape[:1] == torch.Size([N]) and N != nframes: out[key] = val.reshape(nframes, nloc, *val.shape[1:]) else: From b046874f03c237c0863d28a44bd83487fefe1537 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jun 2026 14:47:28 +0000 Subject: [PATCH 09/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt_expt/utils/serialization.py | 3 +-- .../tests/pt_expt/model/test_graph_export.py | 26 ++++++++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index dcb8aa58c1..532fbbb4b2 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -337,13 +337,12 @@ def _make_graph_sample_inputs( nloc : int Number of local atoms per frame (``N == nframes * nloc``). """ + import deepmd.pt_expt.utils.env as _env from deepmd.dpmodel.utils.neighbor_graph import ( GraphLayout, build_neighbor_graph, ) - import deepmd.pt_expt.utils.env as _env - rcut = model.get_rcut() ntypes = len(model.get_type_map()) dim_fparam = model.get_dim_fparam() diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py index 56e2d6eb7b..6b735aa3d5 100644 --- a/source/tests/pt_expt/model/test_graph_export.py +++ b/source/tests/pt_expt/model/test_graph_export.py @@ -3,12 +3,26 @@ import pytest import torch -from deepmd.pt.utils import env -from deepmd.pt_expt.descriptor.dpa1 import DescrptDPA1 -from deepmd.pt_expt.fitting import InvarFitting -from deepmd.pt_expt.model import EnergyModel -from deepmd.dpmodel.utils.neighbor_graph import build_neighbor_graph -from ...seed import GLOBAL_SEED + +from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, +) +from deepmd.pt.utils import ( + env, +) +from deepmd.pt_expt.descriptor.dpa1 import ( + DescrptDPA1, 
+) +from deepmd.pt_expt.fitting import ( + InvarFitting, +) +from deepmd.pt_expt.model import ( + EnergyModel, +) + +from ...seed import ( + GLOBAL_SEED, +) _RCUT, _NT = 4.0, 2 From 0d3860e88e91d9e2806adf312e47bb2ef12bf4af Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 23:45:21 +0800 Subject: [PATCH 10/33] feat(pt_expt): graph .pt2 uses a dynamic edge axis (Dim(nedge)); drop static edge_capacity The graph-form .pt2 export now marks the edge axis dynamic (Dim("nedge", min=2)) instead of baking a static E_max=ceil(1.25*nloc*nnei) capacity. The AOTI artifact accepts any system size with no capacity ceiling. - _build_graph_dynamic_shapes: edge_index dim1 / edge_vec dim0 / edge_mask dim0 are now dynamic; mirrors the dense Dim("nnei", min=...) precedent. - _trace_and_export graph branch: drop the metadata["edge_capacity"] write; apply _strip_shape_assertions to neutralise the SIGFPE-prone deferred shape guards on the dynamic E axis (same handling the spin dense path uses). - deep_eval._eval_model_graph: build the carry-all graph at its tight edge count (no edge_capacity padding). - test_graph_deepeval: eval TWO different-size systems (8- and 20-atom, 56 vs 380 real edges) through the SAME exported artifact; both match eager dense dpa1 at 1e-10 pbc+nopbc, both non-binding. The 20-atom system (380 > 263) would have overflowed the B1 static artifact -> the RED. - test_graph_pt2_metadata: graph metadata no longer carries edge_capacity. 
--- deepmd/pt_expt/infer/deep_eval.py | 16 ++--- deepmd/pt_expt/utils/serialization.py | 59 ++++++++++++------- .../pt_expt/infer/test_graph_deepeval.py | 42 +++++++++---- .../pt_expt/utils/test_graph_pt2_metadata.py | 7 +-- 4 files changed, 80 insertions(+), 44 deletions(-) diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py index 47bcbff731..e03f893040 100644 --- a/deepmd/pt_expt/infer/deep_eval.py +++ b/deepmd/pt_expt/infer/deep_eval.py @@ -1652,15 +1652,15 @@ def _eval_model_graph( """Evaluate a graph-form ``.pt2`` (``lower_input_kind == "graph"``). Builds a carry-all :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph` - from the eval system, padded to the static ``edge_capacity`` baked into - the AOTI artifact, and feeds the positional schema + from the eval system at its exact (tight) edge count and feeds the + positional schema ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, - charge_spin)`` to the exported forward. The forward returns the LOCAL - public keys directly, so results are reshaped without - ``communicate_extended_output``. + charge_spin)`` to the exported forward. The AOTI artifact's edge axis + is DYNAMIC (B2.0), so no ``edge_capacity`` padding is needed. The + forward returns the LOCAL public keys directly, so results are reshaped + without ``communicate_extended_output``. """ from deepmd.dpmodel.utils.neighbor_graph import ( - GraphLayout, build_neighbor_graph, ) from deepmd.pt_expt.utils.env import ( @@ -1676,13 +1676,13 @@ def _eval_model_graph( coord_input = coords.reshape(nframes, natoms, 3) box_input = cells.reshape(nframes, 9) if cells is not None else None - edge_capacity = int(self.metadata["edge_capacity"]) + # Dynamic edge axis (B2.0): build the carry-all graph at its exact edge + # count (no static padding); the AOTI artifact accepts any E. 
graph = build_neighbor_graph( coord_input, atom_types, box_input, self._rcut, - layout=GraphLayout(edge_capacity=edge_capacity), ) atype_t = torch.tensor( diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 532fbbb4b2..03c678c0fa 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -422,10 +422,13 @@ def _build_graph_dynamic_shapes( ) -> tuple: """Build dynamic-shape specifications for the graph-form forward_lower export. - ``nframes`` (the ``n_node`` axis) and ``N`` (the flat node axis) are - dynamic dims; the edge axis ``E`` is STATIC (decision #16: the masked - ``edge_capacity`` path), expressed by leaving the edge dims unmarked - (``None``) so torch.export specialises them to the sample value. + ``nframes`` (the ``n_node`` axis), ``N`` (the flat node axis) AND the edge + axis ``E`` are all dynamic dims (B2.0: the dynamic edge axis replaces the + static ``edge_capacity`` of B1). ``E`` is marked ``Dim("nedge", min=2)`` so + the AOTI artifact accepts any system size with no capacity ceiling — the + ``min=2`` lower bound mirrors the dense path's ``Dim("nnei", min=...)`` (a + dynamic, SIGFPE-tamed axis) and matches the carry-all builder's + ``min_edges=2`` guard (every dynamic graph carries >=2 edges). 
Parameters ---------- @@ -438,12 +441,13 @@ def _build_graph_dynamic_shapes( charge_spin = sample_inputs[7] nframes_dim = torch.export.Dim("nframes", min=1) n_node_total_dim = torch.export.Dim("n_node_total", min=1) + nedge_dim = torch.export.Dim("nedge", min=2) return ( {0: n_node_total_dim}, # atype: (N,) {0: nframes_dim}, # n_node: (nf,) - None, # edge_index: (2, E) — E static - None, # edge_vec: (E, 3) — E static - None, # edge_mask: (E,) — E static + {1: nedge_dim}, # edge_index: (2, E) — E dynamic + {0: nedge_dim}, # edge_vec: (E, 3) — E dynamic + {0: nedge_dim}, # edge_mask: (E,) — E dynamic {0: nframes_dim} if fparam is not None else None, # fparam: (nf, ndf) {0: nframes_dim} if aparam is not None else None, # aparam: (nf, nloc, nda) {0: nframes_dim} if charge_spin is not None else None, # charge_spin @@ -777,8 +781,9 @@ def deserialize_to_file( (``extended_coord``/``extended_atype``/``nlist``/``mapping``); ``"graph"`` traces the NeighborGraph schema (``atype``/``n_node``/``edge_index``/``edge_vec``/``edge_mask``) with a - static edge axis ``E = ceil(1.25 * nloc * nnei)``. The selected schema - is recorded as ``lower_input_kind`` in ``metadata.json``. + DYNAMIC edge axis ``E`` (``Dim("nedge", min=2)``), so the artifact + accepts any system size. The selected schema is recorded as + ``lower_input_kind`` in ``metadata.json``. """ if model_file.endswith(".pt2"): _deserialize_to_file_pt2( @@ -824,7 +829,7 @@ def _trace_and_export( lower_kind ``"nlist"`` (default) traces the dense quartet forward; ``"graph"`` traces ``forward_lower_graph_exportable`` over the NeighborGraph schema - with a static edge axis. Recorded as ``lower_input_kind`` in metadata. + with a dynamic edge axis. Recorded as ``lower_input_kind`` in metadata. Returns ------- @@ -866,8 +871,8 @@ def _trace_and_export( # 2b. Graph-form export branch (NeighborGraph schema). 
The graph path is # LOCAL-only (no ghosts), single-rank, energy-model only in PR-A/PR-B; it - # traces ``forward_lower_graph_exportable`` with a STATIC edge axis. The - # dense (nlist) path below is left byte-unchanged. + # traces ``forward_lower_graph_exportable`` with a DYNAMIC edge axis (B2.0). + # The dense (nlist) path below is left byte-unchanged. if lower_kind == "graph": import math @@ -887,17 +892,20 @@ def _trace_and_export( "requires an energy model" ) - # Static export edge capacity E_max = ceil(1.25 * nloc * nnei) - # (decision #12 headroom). nloc is the sample-system local-atom count. + # The edge axis is DYNAMIC (B2.0): the AOTI artifact accepts any edge + # count, so there is no capacity to bake. The trace sample is built at a + # concrete, padded edge size only to keep the trace tensors distinct + # from the other dynamic dims (nframes=2, N=14) under torch.export's + # duck-sizing; the value itself does NOT constrain runtime. nloc_sample = 7 nnei = sum(model.get_sel()) - e_max = math.ceil(1.25 * nloc_sample * nnei) + e_sample = math.ceil(1.25 * nloc_sample * nnei) _orig_device = _env.DEVICE _env.DEVICE = torch.device("cpu") try: sample_inputs = _make_graph_sample_inputs( - model, e_max=e_max, nframes=2, nloc=nloc_sample + model, e_max=e_sample, nframes=2, nloc=nloc_sample ) finally: _env.DEVICE = _orig_device @@ -948,6 +956,16 @@ def _trace_and_export( prefer_deferred_runtime_asserts_over_guards=True, ) + # Neutralise shape-guard assertion nodes on the dynamic edge axis. + # ``prefer_deferred_runtime_asserts_over_guards=True`` converts the + # symbolic-shape guards discovered while tracing into deferred + # ``aten._assert_scalar`` nodes; on the dynamic ``E`` axis these are the + # SIGFPE-prone ``nloc_min``-family checks (CLAUDE.md AOTI pitfalls) that + # the dense spin path already strips. Replacing each condition with + # ``True`` (not erasing the node) keeps the graph well-formed while + # letting the AOTI artifact generalise across edge counts. 
+ _strip_shape_assertions(exported.graph_module) + if target_device.type != "cpu": from torch.export.passes import ( move_to_device_pass, @@ -956,11 +974,10 @@ def _trace_and_export( exported = move_to_device_pass(exported, target_device) metadata["do_atomic_virial"] = do_atomic_virial - # The edge axis is specialized STATIC: torch.export bakes E to exactly - # e_max, so the AOTI forward only accepts edge tensors of this length. - # Persist it so the C++ conversion hub (PR-B Phase B2) pads/masks runtime - # edges to precisely this value instead of re-deriving the constant. - metadata["edge_capacity"] = e_max + # The edge axis is DYNAMIC (B2.0): the AOTI forward accepts any edge + # count, so there is no ``edge_capacity`` to persist. The C++ / Python + # conversion hub builds the carry-all graph at its exact (tight) edge + # count and feeds it straight through. json_source = model_json_override if model_json_override is not None else data data_for_json = deepcopy(json_source) diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py index e2bdabf04f..7fc83b677f 100644 --- a/source/tests/pt_expt/infer/test_graph_deepeval.py +++ b/source/tests/pt_expt/infer/test_graph_deepeval.py @@ -65,22 +65,36 @@ SEL = 30 -def _build_system() -> tuple[np.ndarray, np.ndarray, np.ndarray]: - """A small, sparse cluster: 8 atoms inside a 5 A blob, centered in an 18 A box. - - The blob keeps every atom within ``rcut`` of at most 7 others (<< ``sel``), - so the carry-all graph neighbor set equals the sel-capped dense one. +def _build_system( + natoms: int = 8, seed: int = 20240626 +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """A small, sparse cluster: ``natoms`` inside a 5 A blob, centered in an 18 A box. + + The blob keeps every atom within ``rcut`` of at most ``natoms - 1`` others + (<< ``sel``), so the carry-all graph neighbor set equals the sel-capped + dense one. 
Varying ``natoms`` yields a different edge count, exercising the + DYNAMIC edge axis of the exported ``.pt2`` (B2.0). """ - rng = np.random.default_rng(20240626) - natoms = 8 + rng = np.random.default_rng(seed) box_size = 18.0 blob = rng.random((natoms, 3)) * 5.0 + box_size * 0.5 - 2.5 coords = blob.reshape(1, natoms, 3) cells = (np.eye(3) * box_size).reshape(1, 9) - atype = np.array([0, 1, 1, 0, 1, 1, 0, 1], dtype=np.int32) + # Alternate O/H types; both species present regardless of natoms. + atype = np.array([i % 2 for i in range(natoms)], dtype=np.int32) return coords, cells, atype +# Two DIFFERENT-size systems evaluated through the SAME exported ``.pt2``. +# Both are sparse, non-binding clusters but with different edge counts, so the +# second size FAILS against a static-``E`` artifact (B1) and PASSES only once +# the edge axis is dynamic (B2.0). +_SYSTEMS = { + "small_8": {"natoms": 8, "seed": 20240626}, + "large_20": {"natoms": 20, "seed": 20240701}, +} + + def _max_neighbors( coords: np.ndarray, cells: np.ndarray | None, atype: np.ndarray ) -> int: @@ -165,11 +179,17 @@ def graph_pt2(): os.rmdir(tmpdir) +@pytest.mark.parametrize("system", list(_SYSTEMS)) # two different edge counts @pytest.mark.parametrize("pbc", [True, False]) # periodic vs non-periodic -def test_graph_pt2_deepeval_parity(graph_pt2, pbc) -> None: - """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10.""" +def test_graph_pt2_deepeval_parity(graph_pt2, pbc, system) -> None: + """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10. + + Both ``_SYSTEMS`` are fed through the SAME module-scoped ``.pt2``; the + differing edge counts prove the exported artifact's edge axis is dynamic + (a static-``E`` B1 artifact would reject / mis-shape the larger system). 
+ """ pt2_path, model = graph_pt2 - coords, cells, atype = _build_system() + coords, cells, atype = _build_system(**_SYSTEMS[system]) box = cells if pbc else None # Anti-vacuity: the carry-all graph and the sel-capped dense reference only diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py index 5c38b5046d..17aef4d671 100644 --- a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py +++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py @@ -89,10 +89,9 @@ def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None: ) meta = _read_metadata(p) assert meta["lower_input_kind"] == "graph" - # the static edge axis is baked into the AOTI artifact; E_max must be - # persisted so the C++ conversion hub (PR-B B2) pads runtime edges to it. - # E_max = ceil(1.25 * nloc_sample(7) * nnei(sum(sel)=30)) = 263. - assert meta["edge_capacity"] == 263 + # B2.0: the edge axis is DYNAMIC (Dim("nedge", min=2)); there is no static + # capacity baked into the AOTI artifact, so no ``edge_capacity`` is persisted. 
+ assert "edge_capacity" not in meta def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None: From 40487c413d9331f84bb3c0d74f88c9bb16066f26 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Mon, 29 Jun 2026 23:49:51 +0800 Subject: [PATCH 11/33] =?UTF-8?q?docs(pt=5Fexpt):=20B2.0=20review=20?= =?UTF-8?q?=E2=80=94=20=5Fstrip=5Fshape=5Fassertions=20now=20documents=20t?= =?UTF-8?q?he=20graph=20(dynamic-E)=20caller=20+=20edge-axis=20safety?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deepmd/pt_expt/utils/serialization.py | 47 +++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 03c678c0fa..1bb49a8b5a 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -40,27 +40,32 @@ def _strip_shape_assertions(graph_module: torch.nn.Module) -> None: - """Neutralise shape-guard assertion nodes in a spin model's exported graph. - - ``torch.export`` inserts ``aten._assert_scalar`` nodes for symbolic shape - relationships discovered during tracing. For the spin model, the atom- - doubling logic creates slice patterns that depend on ``(nall - nloc)``, - producing guards like ``Ne(nall, nloc)``. These guards are spurious: the - model computes correct results even when ``nall == nloc`` (NoPBC, no ghost - atoms). - - This function is **only called for spin models** (guarded by ``if is_spin`` - in ``_trace_and_export``). The assertion messages use opaque symbolic - variable names (e.g. ``Ne(s22, s96)``) rather than human-readable names, - so filtering by message content is not reliable. Since - ``prefer_deferred_runtime_asserts_over_guards=True`` converts all shape - guards into these deferred assertions, and the only shape relationships in - the spin model involve nall/nloc, neutralising all of them is safe in this - context. 
- - We replace each assertion's condition with ``True`` rather than erasing the - node; erasing nodes can disturb the FX graph structure and produce NaN - gradients on some Python/torch versions. + """Neutralise deferred shape-guard assertion nodes in an exported graph. + + ``torch.export`` (with ``prefer_deferred_runtime_asserts_over_guards=True``) + inserts ``aten._assert_scalar`` nodes for symbolic-shape relationships + discovered during tracing. The assertion messages use opaque symbolic names + (e.g. ``Ne(s22, s96)``), so filtering by message content is not reliable; we + replace each assertion's condition with ``True`` rather than erasing the node + (erasing can disturb the FX graph and yield NaN gradients on some torch + versions). + + Called from TWO export paths in ``_trace_and_export``: + + * **spin (dense) models** — atom-doubling slice patterns depend on + ``(nall - nloc)``, producing spurious guards like ``Ne(nall, nloc)``; the + model is correct even when ``nall == nloc`` (NoPBC, no ghosts). + * **graph models** — the DYNAMIC edge axis (``Dim("nedge")``) produces guards + of the ``nloc_min``/SIGFPE family on the edge count ``E``. These are the + shape-specialization guards the static-``edge_capacity`` path was designed + to avoid; neutralising them is what makes one artifact eval any edge count. + + **Safety:** in both contexts every input is constructed well-formed by the + builder (spin: valid atom doubling; graph: ``build_neighbor_graph`` / + ``buildGraphTensors`` always emit ``E >= min_edges == 2`` with in-range, + masked edges), so the neutralised guards would never legitimately fire. The + only cost is that a MALFORMED runtime tensor no longer throws cleanly — the + documented AOTI trade-off (CLAUDE.md), accepted identically on both paths. 
""" graph = graph_module.graph for node in list(graph.nodes): From 9a8727b872e14ca7c246fb06c02c2db740a5fd71 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 00:18:15 +0800 Subject: [PATCH 12/33] test(infer): extend gen_dpa1.py with graph-eligible dpa1(attn_layer=0) generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Section B of gen_dpa1.py produces deeppot_dpa1_graph.pt2 (lower_kind="graph", do_atomic_virial=True) and the accompanying deeppot_dpa1_graph.expected sidecar. Reference values come from an independent nlist .pt2 eval (NOT the graph .pt2) so the C++ gtest (B2.5) validates the graph AOTI path against a known-good reference. Sanity check: graph .pt2 vs nlist ref force diff 1.3e-18 (machine precision; sel=30 >> actual neighbors, so both paths see identical neighbor sets). Forces non-degenerate: max |F| ~5.3e-4 (PBC). Config: type_map=[O,H], sel=30, rcut=6.0, attn_layer=0, neuron=[2,4,8], axis_neuron=4, fitting neuron=[5,5,5], resnet_dt=True, seed=1 — mirrors DPA1_CONFIG in source/tests/pt_expt/utils/test_graph_pt2_metadata.py. --- source/tests/infer/gen_dpa1.py | 166 ++++++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 4 deletions(-) diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py index 7eaaae4ae2..5ec523f5f5 100644 --- a/source/tests/infer/gen_dpa1.py +++ b/source/tests/infer/gen_dpa1.py @@ -1,15 +1,25 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: LGPL-3.0-or-later -"""Generate deeppot_dpa1.pth and deeppot_dpa1.pt2 test models. +"""Generate deeppot_dpa1.pth, deeppot_dpa1.pt2, and deeppot_dpa1_graph.pt2 test models. -Creates a DPA1 model from dpmodel config, serializes, and exports to both -.pth (torch.jit) and .pt2 (torch.export) from the same weights. -Also prints reference values for C++ tests (PBC and NoPbc). 
+Creates two DPA1 models from dpmodel configs: + - deeppot_dpa1.pt2 / deeppot_dpa1.pth (attn_layer=2, dense nlist-form export) + - deeppot_dpa1_graph.pt2 (attn_layer=0, graph-form export via + lower_kind="graph"; the graph forward + is eligible only when attn_layer==0) + +Each model is serialized once; all of its artifacts are exported from those same weights. +Reference sidecar files (.expected) consumed by C++ gtests are also written: + - deeppot_dpa1.expected — from the nlist .pt2 eval (existing) + - deeppot_dpa1_graph.expected — from a dense nlist-form .pt2 eval of the same + attn_layer=0 weights (independent of the graph .pt2); the graph .pt2 force is + sanity-checked against this reference at ≤1e-5. """ import copy import os import sys +import tempfile import numpy as np @@ -171,6 +181,154 @@ def main(): print(f"// .pth NoPbc total energy: {e_pth_np[0, 0]:.18e}") # noqa: T201 print(f"// .pth vs .pt2 NoPbc energy diff: {abs(e_np[0, 0] - e_pth_np[0, 0]):.2e}") # noqa: T201 + # ============================================================ + # Section B: graph-eligible DPA1 (attn_layer=0) model + # ============================================================ + # attn_layer=0 disables the attention layers, making the descriptor + # a plain two-body embedding (se_e2_a-like) that is eligible for the + # NeighborGraph forward path (forward_lower_graph_exportable).
+ # Config mirrors DPA1_CONFIG in + # source/tests/pt_expt/utils/test_graph_pt2_metadata.py + graph_config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, + } + + print("\n---- Building graph-eligible DPA1 (attn_layer=0) ----") # noqa: T201 + + # ---- B.1 Build dpmodel, serialize ---- + model_g = get_model(copy.deepcopy(graph_config)) + model_dict_g = model_g.serialize() + + data_g = { + "model": copy.deepcopy(model_dict_g), + "model_def_script": graph_config, + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + + # ---- B.2 Compute reference via nlist .pt2 (independent of graph path) ---- + # The reference for deeppot_dpa1_graph.expected comes from the NLIST .pt2 + # (dense-quartet forward), NOT the graph .pt2. This ensures the C++ gtest + # (B2.5) independently validates the graph AOTI path against a known-good + # nlist evaluation. 
+ print("Exporting reference nlist .pt2 (independent ground truth) ...") # noqa: T201 + with tempfile.TemporaryDirectory() as _tmp: + nlist_ref_pt2 = os.path.join(_tmp, "dpa1_graph_nlist_ref.pt2") + pt_expt_deserialize_to_file( + nlist_ref_pt2, + copy.deepcopy(data_g), + do_atomic_virial=True, + lower_kind="nlist", # independent: dense nlist, NOT graph + ) + dp_nlist_ref = DeepPot(nlist_ref_pt2) + + # PBC reference from nlist path + e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval( + coord, box, atype, atomic=True + ) + # NoPBC reference from nlist path + e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval( + coord, None, atype, atomic=True + ) + + print(f"Nlist ref PBC energy: {e_r1[0, 0]:.18e}") # noqa: T201 + print(f"Nlist ref NoPBC energy: {e_rnp[0, 0]:.18e}") # noqa: T201 + max_ref_force_pbc = float(np.max(np.abs(f_r1))) + max_ref_force_nopbc = float(np.max(np.abs(f_rnp))) + print(f"Nlist ref PBC max |force|: {max_ref_force_pbc:.6e}") # noqa: T201 + print(f"Nlist ref NoPBC max |force|: {max_ref_force_nopbc:.6e}") # noqa: T201 + if max_ref_force_pbc < 1e-10: + raise RuntimeError( + f"Graph model nlist-ref forces are degenerate " + f"(max={max_ref_force_pbc:.2e}); weights may need perturbation." 
+ ) + + # ---- B.3 Write sidecar reference file ---- + graph_ref_path = os.path.join(base_dir, "deeppot_dpa1_graph.expected") + write_expected_ref( + graph_ref_path, + sections={ + "pbc": { + "expected_e": ae_r1[0, :, 0], + "expected_f": f_r1[0], + "expected_v": av_r1[0], + }, + "nopbc": { + "expected_e": ae_rnp[0, :, 0], + "expected_f": f_rnp[0], + "expected_v": av_rnp[0], + }, + }, + source_script="source/tests/infer/gen_dpa1.py", + ) + print(f"Wrote {graph_ref_path}") # noqa: T201 + + # ---- B.4 Export graph-form .pt2 ---- + graph_pt2_path = os.path.join(base_dir, "deeppot_dpa1_graph.pt2") + print(f"Exporting to {graph_pt2_path} (lower_kind='graph') ...") # noqa: T201 + pt_expt_deserialize_to_file( + graph_pt2_path, + copy.deepcopy(data_g), + do_atomic_virial=True, + lower_kind="graph", + ) + print("Graph .pt2 export done.") # noqa: T201 + + # ---- B.5 Sanity-check: graph .pt2 vs nlist reference ---- + # Both use the SAME weights; at non-binding sel the math is equivalent. + # Verifies that forward_lower_graph_exportable + edge_energy_deriv match + # the nlist forward for this concrete system. + dp_graph = DeepPot(graph_pt2_path) + + # PBC sanity check + e_g1, f_g1, v_g1, ae_g1, av_g1 = dp_graph.eval(coord, box, atype, atomic=True) + force_diff_pbc = float(np.max(np.abs(f_g1[0] - f_r1[0]))) + print( # noqa: T201 + f"Graph .pt2 vs nlist ref PBC force max diff: {force_diff_pbc:.2e}" + ) + if force_diff_pbc > 1e-5: + raise RuntimeError( + f"BLOCKED: graph .pt2 PBC force differs from nlist reference by " + f"{force_diff_pbc:.2e} (threshold 1e-5)." 
+ ) + + # NoPBC sanity check + e_gnp, f_gnp, v_gnp, ae_gnp, av_gnp = dp_graph.eval(coord, None, atype, atomic=True) + force_diff_nopbc = float(np.max(np.abs(f_gnp[0] - f_rnp[0]))) + print( # noqa: T201 + f"Graph .pt2 vs nlist ref NoPBC force max diff: {force_diff_nopbc:.2e}" + ) + if force_diff_nopbc > 1e-5: + raise RuntimeError( + f"BLOCKED: graph .pt2 NoPBC force differs from nlist reference by " + f"{force_diff_nopbc:.2e} (threshold 1e-5)." + ) + + print("\nAll graph sanity checks passed.") # noqa: T201 print("\nDone!") # noqa: T201 From f97129c88c147e7992ac7495c4ccd615826fe605 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 00:28:55 +0800 Subject: [PATCH 13/33] feat(api_cc): graph-schema .pt2 ingestion in DeepPotPTExpt (single-rank) B2.2: read lower_input_kind="graph" -> lower_input_is_graph_. B2.3: run_model_graph with NeighborGraph AOTI input order (atype, n_node, edge_index, edge_vec, edge_mask, [fparam], [aparam], [charge_spin]); no coord / edge_scatter_index. B2.4: GraphTensorPack + buildGraphTensors (delegates to createEdgeTensors+compactEdgeTensors for the rcut filter, dynamic edge count and 2 masked dummy edges; drops edge_index_ext, adds n_node=[nloc], node types from atype_ext[0:nloc]) + compute_inner & standalone dispatch branches. Multi-rank graph fails fast (PR-B3). 
--- source/api_cc/include/DeepPotPTExpt.h | 25 ++++++++ source/api_cc/include/commonPT.h | 84 +++++++++++++++++++++++++++ source/api_cc/src/DeepPotPTExpt.cc | 64 ++++++++++++++++++++ 3 files changed, 173 insertions(+) diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index 68a553e29c..b569c7eb16 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -308,6 +308,7 @@ class DeepPotPTExpt : public DeepPotBackend { bool do_atomic_virial; // whether model was exported with atomic virial corr int nnei; // expected nlist nnei dimension (= sum(sel)) bool lower_input_is_edge_ = false; + bool lower_input_is_graph_ = false; NeighborListData nlist_data; at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) @@ -398,6 +399,30 @@ class DeepPotPTExpt : public DeepPotBackend { const torch::Tensor& aparam, const torch::Tensor& charge_spin); + /** + * @brief Run a NeighborGraph-schema ``.pt2`` (lower_input_kind="graph"). + * + * Positional AOTI input order matches the Python export ABI: + * ``(atype, n_node, edge_index, edge_vec, edge_mask, [fparam], [aparam], + * [charge_spin])``. Unlike the edge schema there is no ``coord`` and no + * ``edge_scatter_index`` input; node count is carried by ``n_node`` and the + * geometry is fully described by ``edge_vec``. + * + * @param[in] atype Per-node local types, shape ``(N,)`` int64. + * @param[in] n_node Per-frame node count, shape ``(nf,)`` int64. + * @param[in] edge_index Folded edge graph ``(2, E)`` int64 [src, dst]. + * @param[in] edge_vec Edge vectors ``(E, 3)`` (neighbour - center). + * @param[in] edge_mask Physical-edge mask ``(E,)`` bool. 
+ */ + std::vector run_model_graph(const torch::Tensor& atype, + const torch::Tensor& n_node, + const torch::Tensor& edge_index, + const torch::Tensor& edge_vec, + const torch::Tensor& edge_mask, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const torch::Tensor& charge_spin); + /** * @brief Run the with-comm .pt2 artifact with comm tensors appended. * diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index 643e53974a..30cd8e5e9f 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -364,6 +364,90 @@ inline EdgeTensorPack compactEdgeTensors(const torch::Tensor& edge_index, return pack; } +struct GraphTensorPack { + torch::Tensor atype; + torch::Tensor n_node; + torch::Tensor edge_index; + torch::Tensor edge_vec; + torch::Tensor edge_mask; +}; + +/** + * @brief Build NeighborGraph input tensors from a host neighbor list + * (single-rank, dynamic edge axis). + * + * Mirrors the edge schema but drops ``coord``/``edge_scatter_index`` and adds + * ``n_node``. Edge construction is delegated to the existing + * ``createEdgeTensors``/``compactEdgeTensors`` helpers (same rcut filter, + * variable edge count and two masked dummy edges that keep the dynamic edge + * dimension non-empty); the wrapper then (a) drops the extended scatter index, + * (b) emits ``n_node = [nloc]`` for the single frame, and (c) sets the node + * types from the local slice of ``atype_ext``. + * + * @param nlist Neighbor-list rows (local idx into the extended set). + * @param coord Extended coordinates shaped as nall x 3. + * @param atype_ext Extended atom types, length nall. Node types are taken from + * the extended types (NOT ``atype[mapping]``); for single-rank ghost-free + * this is just ``atype_ext[0:nloc]``, while multi-rank (B3) passes the halo + * types directly. + * @param mapping Extended-to-local atom map, length nall. + * @param nloc Number of local atoms. + * @param nall Number of extended atoms. 
+ * @param rcut Model cutoff (edges with ``rr > rcut**2`` are dropped). + * @param device Target device for the returned tensors. + * @param row_centers Optional center atom index for each neighbor-list row + * (LAMMPS compacts away empty rows); ``nullptr`` means row i is center i. + */ +template +inline GraphTensorPack buildGraphTensors( + const std::vector>& nlist, + const std::vector& coord, + const std::vector& atype_ext, + const std::vector& mapping, + const int nloc, + const int nall, + const double rcut, + const torch::Device& device, + const std::vector* row_centers = nullptr) { + auto int_options = torch::TensorOptions().dtype(torch::kInt64); + + // 1. Cached-style topology only (no geometry): edge_index folds ghost + // neighbours onto their local owners (fold_to_local=true), edge_index_ext + // keeps extended indices for the on-device geometry recompute. + const EdgeTensorPack topo = + createEdgeTensors(nlist, coord, mapping, nloc, nall, device, + /*with_geometry=*/false, row_centers, + /*fold_to_local=*/true); + + // 2. Recompute geometry from the current coords on-device, filter by rcut and + // append the two masked dummy edges. The model is compiled for float64 + // inputs, so build the coord tensor as float64 to match the edge path. + std::vector coord_d(coord.begin(), coord.end()); + at::Tensor coord_tensor = + torch::from_blob(coord_d.data(), + {static_cast(nall), 3}, + torch::TensorOptions().dtype(torch::kFloat64)) + .clone() + .to(device); + const EdgeTensorPack edges = compactEdgeTensors( + topo.edge_index, topo.edge_index_ext, coord_tensor, rcut); + + GraphTensorPack pack; + pack.edge_index = edges.edge_index; // local-folded (2, E) + pack.edge_vec = edges.edge_vec; // (E, 3) neighbour - center + pack.edge_mask = edges.edge_mask; // (E,) bool + pack.n_node = + torch::full({1}, static_cast(nloc), int_options).to(device); + // Node types from the local slice of the extended types. 
+ std::vector atype_loc(atype_ext.begin(), + atype_ext.begin() + nloc); + pack.atype = torch::from_blob(atype_loc.data(), + {static_cast(nloc)}, int_options) + .clone() + .to(device); + return pack; +} + } // namespace deepmd #endif // BUILD_PYTORCH diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 96033fcab4..25d82f623a 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -155,8 +155,10 @@ void DeepPotPTExpt::init(const std::string& model, const std::string lower_input_kind = metadata["lower_input_kind"].as_string(); lower_input_is_edge_ = lower_input_kind == "edge_vec"; + lower_input_is_graph_ = lower_input_kind == "graph"; } else { lower_input_is_edge_ = false; + lower_input_is_graph_ = false; } type_map.clear(); @@ -289,6 +291,31 @@ std::vector DeepPotPTExpt::run_model_edges( return loader->run(inputs); } +std::vector DeepPotPTExpt::run_model_graph( + const torch::Tensor& atype, + const torch::Tensor& n_node, + const torch::Tensor& edge_index, + const torch::Tensor& edge_vec, + const torch::Tensor& edge_mask, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const torch::Tensor& charge_spin) { + // NeighborGraph ABI: (atype, n_node, edge_index, edge_vec, edge_mask, + // [fparam], [aparam], [charge_spin]). No coord, no edge_scatter_index. 
+ std::vector inputs = {atype, n_node, edge_index, edge_vec, + edge_mask}; + if (dfparam > 0) { + inputs.push_back(fparam); + } + if (daparam > 0) { + inputs.push_back(aparam); + } + if (dchgspin > 0) { + inputs.push_back(charge_spin); + } + return loader->run(inputs); +} + std::vector DeepPotPTExpt::run_model_with_comm( const torch::Tensor& coord, const torch::Tensor& atype, @@ -475,6 +502,15 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, bool multi_rank = (lmp_list.nprocs > 1); bool atom_map_present = (lmp_list.mapping != nullptr); bool use_with_comm = has_comm_artifact_ && multi_rank; + // The NeighborGraph schema only has a single-rank artifact so far; the + // multi-rank (with-comm) graph path is PR-B3. Fail fast before building + // any tensors so callers get a clear message instead of a wrong answer. + if (lower_input_is_graph_ && multi_rank) { + throw deepmd::deepmd_exception( + "Multi-rank graph (NeighborGraph) .pt2 inference is not yet " + "supported (PR-B3). Run single-rank, or use a dense/edge .pt2 for " + "multi-rank LAMMPS."); + } // Decision matrix (see PR #5450 description): // non-GNN model (has_message_passing_ == false): regular path is // always safe. @@ -556,6 +592,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, /*fold_to_local=*/!use_with_comm); edge_index_tensor = edge_tensors.edge_index; edge_index_ext_tensor = edge_tensors.edge_index_ext; + } else if (lower_input_is_graph_) { + // Graph schema rebuilds the edge topology on-device every step inside + // buildGraphTensors (from the raw, unpadded nlist_data.jlist + + // nlist_data.ilist centers), so nothing is cached here and the nlist is + // left unpadded (createEdgeTensors handles ragged rows and skips -1). 
} else { nlist_data.padding(); firstneigh_tensor = createNlistTensor(nlist_data.jlist, nnei) @@ -771,6 +812,17 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_tensors.edge_index, edge_tensors.edge_vec, edge_tensors.edge_index_ext, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); + } else if (lower_input_is_graph_) { + // Single-rank NeighborGraph schema: build (atype, n_node, edge_index, + // edge_vec, edge_mask) from the host nlist (node types from the extended + // types, folded local edge graph) and run the graph artifact. + const auto graph_tensors = buildGraphTensors( + nlist_data.jlist, dcoord, datype, mapping, nloc, nall_real, + static_cast(rcut), device, &nlist_data.ilist); + flat_outputs = run_model_graph( + graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index, + graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor, + aparam_tensor, charge_spin_tensor); } else { flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor, fparam_tensor, aparam_tensor, @@ -1015,9 +1067,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, .to(device); at::Tensor nlist_tensor; EdgeTensorPack edge_tensors; + GraphTensorPack graph_tensors; if (lower_input_is_edge_) { edge_tensors = createEdgeTensors(nlist_raw, coord_cpy_d, mapping_64, nloc, nall, device); + } else if (lower_input_is_graph_) { + // Standalone (no nlist) graph schema: build_nlist already cut at rcut and + // keys row i to center i, so no row_centers remapping is needed. 
+ graph_tensors = + buildGraphTensors(nlist_raw, coord_cpy_d, atype_cpy, mapping_64, nloc, + nall, static_cast(rcut), device); } else { nlist_tensor = createNlistTensor(nlist_raw, nnei).to(torch::kInt64).to(device); @@ -1104,6 +1163,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_tensors.edge_index, edge_tensors.edge_vec, edge_tensors.edge_index_ext, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); + } else if (lower_input_is_graph_) { + flat_outputs = run_model_graph( + graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index, + graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor, + aparam_tensor, charge_spin_tensor); } else { flat_outputs = run_model(coord_Tensor, atype_Tensor, nlist_tensor, mapping_tensor, From 074b3ff63bfadd2da298d12255df1ecb723579e8 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 00:37:43 +0800 Subject: [PATCH 14/33] fix(api_cc): cache mapping vector as member to fix OOB on ago>0 graph path Local std::vector mapping was declared in compute_inner and populated only inside if(ago==0). The graph branch called buildGraphTensors with this local vector on every step, causing an OOB heap read on ago>0 (mapping.size()==0). Fix: promote mapping to a member mapping_ (parallel to mapping_tensor) so it persists across steps. Edge-path (createEdgeTensors) and dense-path (mapping_tensor) are unaffected in behavior; only the vector source changes from local to member. 
--- source/api_cc/include/DeepPotPTExpt.h | 5 +++-- source/api_cc/src/DeepPotPTExpt.cc | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index b569c7eb16..d8bd22cbad 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -310,8 +310,9 @@ class DeepPotPTExpt : public DeepPotBackend { bool lower_input_is_edge_ = false; bool lower_input_is_graph_ = false; NeighborListData nlist_data; - at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) - at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) + at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) + std::vector mapping_; // cached mapping vector (LAMMPS path) + at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) at::Tensor edge_index_tensor; // cached local edge graph (LAMMPS path) at::Tensor edge_index_ext_tensor; // cached extended edge graph (LAMMPS path) std::unique_ptr loader; diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 25d82f623a..3b0e976da3 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -541,19 +541,19 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // LAMMPS sets ago=0 on every nlist rebuild (neighbor rebuild, re-partition, // atom exchange between subdomains), so `ago > 0` implies the cached // mapping and nlist tensors are still valid. Rebuild only on ago==0. - std::vector mapping; if (ago == 0) { nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); - // Rebuild mapping tensor + // Rebuild mapping vector and tensor (cached as members; graph branch reads + // mapping_ on every step, not just ago==0, so the vector must persist). 
if (lmp_list.mapping) { - mapping.resize(nall_real); + mapping_.resize(nall_real); for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; + mapping_[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; } mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) + torch::from_blob(mapping_.data(), {1, nall_real}, int_option) .clone() .to(device); } else { @@ -566,12 +566,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // features via border_op and ignores this tensor for ghost // gather — see deepmd/pt_expt/descriptor/ // repflows.py::_exchange_ghosts). - mapping.resize(nall_real); + mapping_.resize(nall_real); for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = ii; + mapping_[ii] = ii; } mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) + torch::from_blob(mapping_.data(), {1, nall_real}, int_option) .clone() .to(device); } @@ -587,7 +587,7 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // their features can be exchanged across ranks via border_op, instead of // being folded onto a local owner that this rank does not own. const auto edge_tensors = createEdgeTensors( - nlist_data.jlist, dcoord, mapping, nloc, nall_real, device, + nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device, /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist, /*fold_to_local=*/!use_with_comm); edge_index_tensor = edge_tensors.edge_index; @@ -817,7 +817,7 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // edge_vec, edge_mask) from the host nlist (node types from the extended // types, folded local edge graph) and run the graph artifact. 
const auto graph_tensors = buildGraphTensors( - nlist_data.jlist, dcoord, datype, mapping, nloc, nall_real, + nlist_data.jlist, dcoord, datype, mapping_, nloc, nall_real, static_cast(rcut), device, &nlist_data.ilist); flat_outputs = run_model_graph( graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index, From 26b2c9d7cbe200bfb42e726242ad2f3bb94db968 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 01:05:37 +0800 Subject: [PATCH 15/33] test(api_cc): dpa1 graph .pt2 single-rank parity + fix graph output extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test_deeppot_dpa1_graph_ptexpt.cc — the first runtime exercise of the NeighborGraph .pt2 C++ ingestion (B2.2-4). Four cases x {double,float}: build-nlist parity vs the committed .expected, a second 12-atom system through the same model (dynamic edge axis), the LAMMPS InputNlist+ago=0/ago=1 path (compute_inner + cached mapping_), and a tiny no-edge system (nedge_min=2 guard). The first run surfaced a B2.2-4 bug: the graph forward emits LOCAL flat-N PUBLIC keys (atom_energy/energy/force/virial/atom_virial) but compute()'s output extraction read the dense INTERNAL keys (energy_redu/energy_derv_r/...), so output_map["energy_redu"].view() threw on an undefined tensor. The graph branch had only ever been compiled, never run. Fix: remap_graph_outputs_to_dense_keys() in commonPT.h, called after extract_outputs in both compute overloads (gated on lower_input_is_graph_). Rewrites the public keys into the dense internal-key layout; per-atom force/atom_virial are local (nloc) and zero-padded up to nall so the existing fold-back is a no-op on ghost rows (local rows already carry the folded ghost contributions). Dense/edge paths untouched. gen_dpa1.py now persists the dense nlist-ref .pt2 (deeppot_dpa1_graph_nlist_ref.pt2) as a live graph-vs-dense oracle for the dynamic-edge cases. 
--- source/api_cc/include/commonPT.h | 64 ++++ source/api_cc/src/DeepPotPTExpt.cc | 17 ++ .../tests/test_deeppot_dpa1_graph_ptexpt.cc | 282 ++++++++++++++++++ source/tests/infer/gen_dpa1.py | 40 +-- 4 files changed, 384 insertions(+), 19 deletions(-) create mode 100644 source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index 30cd8e5e9f..f6f82d89a0 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -448,6 +450,68 @@ inline GraphTensorPack buildGraphTensors( return pack; } +/** + * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense + * internal-key layout the rest of ``compute`` consumes. + * + * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and emits + * flat-N PUBLIC keys: + * - ``atom_energy`` (N, 1) per-atom energy (N == nloc) + * - ``energy`` (nf, 1) reduced total energy + * - ``force`` (N, 3) per-atom force (ghosts already folded onto + * their local owners via ``edge_index``) + * - ``virial`` (nf, 9) reduced total virial + * - ``atom_virial`` (N, 9) per-atom (full-to-src) virial + * + * The downstream extraction in ``DeepPotPTExpt::compute`` was written for the + * dense forward's internal keys with their extra dims: + * ``energy_redu`` (nf,1), ``energy_derv_c_redu`` (nf,1,9), + * ``energy_derv_r`` (nf,nall,1,3), ``energy`` (nf,nloc,1), + * ``energy_derv_c`` (nf,nall,1,9). + * + * This helper rewrites the public keys into those internal keys (single frame, + * nf == 1). The per-atom force / atom-virial are LOCAL (nloc rows); they are + * zero-padded up to the extended length ``nall`` so the existing fold-back + * (``fold_back`` / ``select_map``) is a no-op on the ghost rows — the local + * rows already carry the folded ghost contributions, so zero ghosts avoid + * double counting (and keep LAMMPS reverse-comm correct). 
+ * + * @param[in,out] output_map Output tensor map (public keys in, internal keys + * added). + * @param[in] nloc Number of local atoms (== N, the graph node count). + * @param[in] nall Extended atom count to pad the per-atom outputs up to. + * @param[in] atomic Whether atomic energy / virial were requested. + */ +inline void remap_graph_outputs_to_dense_keys( + std::map& output_map, + const std::int64_t nloc, + const std::int64_t nall, + const bool atomic) { + using torch::indexing::Slice; + const std::int64_t nf = 1; + const auto& energy_pub = output_map.at("energy"); // (nf, 1) + const auto& force_pub = output_map.at("force"); // (N, 3), N == nloc + const auto& virial_pub = output_map.at("virial"); // (nf, 9) + + output_map["energy_redu"] = energy_pub.reshape({nf, 1}); + output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9}); + + // Local force -> (nf, nall, 1, 3) with zero ghost rows. + auto force_full = torch::zeros({nf, nall, 1, 3}, force_pub.options()); + force_full.index_put_({0, Slice(0, nloc), 0}, force_pub); + output_map["energy_derv_r"] = force_full; + + if (atomic) { + const auto& atom_energy_pub = output_map.at("atom_energy"); // (N, 1) + const auto& atom_virial_pub = output_map.at("atom_virial"); // (N, 9) + output_map["energy"] = atom_energy_pub.reshape({nf, nloc, 1}); + auto atom_virial_full = + torch::zeros({nf, nall, 1, 9}, atom_virial_pub.options()); + atom_virial_full.index_put_({0, Slice(0, nloc), 0}, atom_virial_pub); + output_map["energy_derv_c"] = atom_virial_full; + } +} + } // namespace deepmd #endif // BUILD_PYTORCH diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 3b0e976da3..c591ae23e6 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -834,6 +834,15 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, std::map output_map; extract_outputs(output_map, flat_outputs); + if (lower_input_is_graph_) { + // The graph forward emits LOCAL public keys 
(atom_energy/energy/force/ + // virial/atom_virial); rewrite them into the dense internal-key layout the + // downstream extraction/fold-back expects. nloc == N (graph node count); + // pad the per-atom force/virial up to nall_real with zero ghost rows. + deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real, + atomic); + } + if (phantom_n > 0) { // Strip the phantom local prefix and zero the empty rank's energy. The // phantom atoms carry no edges, so their force / per-atom virial are @@ -1178,6 +1187,14 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, std::map output_map; extract_outputs(output_map, flat_outputs); + if (lower_input_is_graph_) { + // The graph forward emits LOCAL public keys; rewrite them into the dense + // internal-key layout used below. nloc == N (graph node count); pad the + // per-atom force/virial up to the extended nall with zero ghost rows so the + // fold-back is a no-op on ghosts. + deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic); + } + // 7. Extract energy torch::Tensor flat_energy_ = output_map["energy_redu"].view({-1}).to(torch::kCPU); diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc new file mode 100644 index 0000000000..bbe751026c --- /dev/null +++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// Test C++ inference for the NeighborGraph (graph-schema) .pt2 path of the +// pt_expt backend. The graph model is a dpa1(attn_layer=0) descriptor exported +// with lower_kind="graph" (gen_dpa1.py section B); this is the FIRST runtime +// exercise of the C++ graph ingestion added in PR-B Phase B2 +// (lower_input_is_graph_ / run_model_graph / buildGraphTensors / the +// compute_inner graph branch). 
+// +// Reference values (deeppot_dpa1_graph.expected) come from an INDEPENDENT +// nlist (dense-quartet) evaluation of the same weights, so a match validates +// the graph AOTI ABI/geometry, not just self-consistency. A second, persisted +// nlist .pt2 of the same weights (deeppot_dpa1_graph_nlist_ref.pt2) is loaded +// alongside the graph model so arbitrary system sizes (dynamic edge axis) can +// be cross-checked graph≈dense live without baking more reference blocks. +#include + +#include +#include +#include + +#include "DeepPot.h" +#include "DeepPotPTExpt.h" +#include "expected_ref.h" +#include "neighbor_list.h" +#include "test_utils.h" + +namespace { +constexpr const char* kGraphModel = "../../tests/infer/deeppot_dpa1_graph.pt2"; +constexpr const char* kNlistRefModel = + "../../tests/infer/deeppot_dpa1_graph_nlist_ref.pt2"; +constexpr const char* kRefPath = "../../tests/infer/deeppot_dpa1_graph.expected"; +} // namespace + +template +class TestInferDpa1GraphPtExpt : public ::testing::Test { + protected: + std::vector coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, 4.27, 3.22, 1.56}; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector box = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + // Per-atom reference (energy/force/virial) loaded from the .expected sidecar. + std::vector expected_e; + std::vector expected_f; + std::vector expected_v; + int natoms; + double expected_tot_e; + std::vector expected_tot_v; + + // Graph-schema model under test. + static deepmd::DeepPot dp; + // Independent nlist (dense) model with identical weights — used as a live + // graph≈dense oracle for arbitrary system sizes. 
+ static deepmd::DeepPot dp_ref; + + static void SetUpTestSuite() { +#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT + dp.init(kGraphModel); + dp_ref.init(kNlistRefModel); +#endif + } + + void SetUp() override { +#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT + GTEST_SKIP() << "Skip because PyTorch support is not enabled."; +#endif + deepmd_test::ExpectedRef ref; + ref.load(kRefPath); + expected_e = ref.get("pbc", "expected_e"); + expected_f = ref.get("pbc", "expected_f"); + expected_v = ref.get("pbc", "expected_v"); + + natoms = expected_e.size(); + EXPECT_EQ(natoms * 3, static_cast(expected_f.size())); + EXPECT_EQ(natoms * 9, static_cast(expected_v.size())); + expected_tot_e = 0.; + expected_tot_v.assign(9, 0.); + for (int ii = 0; ii < natoms; ++ii) { + expected_tot_e += expected_e[ii]; + } + for (int ii = 0; ii < natoms; ++ii) { + for (int dd = 0; dd < 9; ++dd) { + expected_tot_v[dd] += expected_v[ii * 9 + dd]; + } + } + }; + + void TearDown() override {}; + + static void TearDownTestSuite() { + dp = deepmd::DeepPot(); + dp_ref = deepmd::DeepPot(); + } +}; + +template +deepmd::DeepPot TestInferDpa1GraphPtExpt::dp; +template +deepmd::DeepPot TestInferDpa1GraphPtExpt::dp_ref; + +TYPED_TEST_SUITE(TestInferDpa1GraphPtExpt, ValueTypes); + +// Case 1: DeepPot builds its own neighbor list and runs the standalone graph +// branch (lower_input_is_graph_, build_nlist -> buildGraphTensors). Validates +// the graph AOTI ABI/geometry against the independent nlist reference. 
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + double ener; + std::vector force, virial; + dp.compute(ener, force, virial, coord, atype, box); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } +} + +// Case 2: a SECOND, larger system (12 atoms, different edge count) through the +// SAME loaded graph model — proves the dynamic edge axis works in C++. The +// graph result is cross-checked against the dense nlist .pt2 (same weights); +// at non-binding sel they must agree to within EPSILON (fp64 ~1e-10). +TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_sys2_dynamic_edges) { + using VALUETYPE = TypeParam; + deepmd::DeepPot& dp = this->dp; + deepmd::DeepPot& dp_ref = this->dp_ref; + + // 12 atoms: original 6 stacked with a +13 z-shifted copy, box doubled in z. + // Same local density as the 6-atom fixture, so per-atom neighbor counts stay + // far below sel=30 and graph(carry-all) == dense(sel-truncated).
+ std::vector coord2 = { + 12.83, 2.56, 2.18, 12.09, 2.87, 2.74, 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, 3.51, 2.51, 2.60, 4.27, 3.22, 1.56, + 12.83, 2.56, 15.18, 12.09, 2.87, 15.74, 00.25, 3.32, 14.68, + 3.36, 3.00, 14.81, 3.51, 2.51, 15.60, 4.27, 3.22, 14.56}; + std::vector atype2 = {0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1}; + std::vector box2 = {13., 0., 0., 0., 13., 0., 0., 0., 26.}; + int natoms2 = atype2.size(); + + double ener_g, ener_r; + std::vector force_g, virial_g, force_r, virial_r; + dp.compute(ener_g, force_g, virial_g, coord2, atype2, box2); + dp_ref.compute(ener_r, force_r, virial_r, coord2, atype2, box2); + + EXPECT_EQ(force_g.size(), static_cast(natoms2 * 3)); + EXPECT_EQ(virial_g.size(), 9u); + + EXPECT_LT(fabs(ener_g - ener_r), EPSILON); + for (int ii = 0; ii < natoms2 * 3; ++ii) { + EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial_g[ii] - virial_r[ii]), EPSILON); + } +} + +// Case 3 (CRITICAL): exercise the LAMMPS compute_inner graph branch with an +// explicit InputNlist and the `ago` cache. Calling compute twice WITHOUT +// rebuilding the nlist — first ago=0 (rebuild), then ago=1 (reuse) — must give +// identical results. This is the only case that hits compute_inner + the +// member-cached mapping_ vector; the build-nlist cases above never touch it. +// Regression guard for the OOB-on-ago>0 bug fixed by caching mapping_ as a +// member (commit 7c70db47b). 
+TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + + float rc = dp.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector > nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, + atype, box, rc); + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_data); + // The graph branch folds ghost neighbours onto their local owners via the + // LAMMPS atom-map; without it periodic (ghost) edges would be dropped. + inlist.mapping = mapping.data(); + + // ago=0: rebuild the cached nlist/mapping, then run the graph branch. + double ener; + std::vector force_, virial; + dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc, + inlist, 0); + std::vector force; + _fold_back(force, force_, mapping, nloc, nall, 3); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } + + // ago=1: reuse the cached nlist/mapping (NO rebuild). Must match again. + // This is the path that previously read the local mapping vector OOB. 
+ ener = 0.; + std::fill(force_.begin(), force_.end(), 0.0); + std::fill(virial.begin(), virial.end(), 0.0); + dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc, + inlist, 1); + _fold_back(force, force_, mapping, nloc, nall, 3); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } +} + +// Case 4: a tiny system with no in-cutoff neighbors — only the two masked +// dummy edges survive (nedge_min=2 guard / SIGFPE-edge family). The graph +// must run cleanly, produce finite, interaction-free output (zero force/virial) +// and agree with the dense reference. +TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_tiny_no_edges) { + using VALUETYPE = TypeParam; + deepmd::DeepPot& dp = this->dp; + deepmd::DeepPot& dp_ref = this->dp_ref; + + // Two atoms ~33 apart in a 40-box: no neighbor within rcut=6 and no periodic + // image either, so the graph sees zero real edges (only the 2 dummy edges). + std::vector coord_t = {1.0, 1.0, 1.0, 20.0, 20.0, 20.0}; + std::vector atype_t = {0, 1}; + std::vector box_t = {40., 0., 0., 0., 40., 0., 0., 0., 40.}; + int natoms_t = atype_t.size(); + + double ener_g, ener_r; + std::vector force_g, virial_g, force_r, virial_r; + ASSERT_NO_THROW( + dp.compute(ener_g, force_g, virial_g, coord_t, atype_t, box_t)); + dp_ref.compute(ener_r, force_r, virial_r, coord_t, atype_t, box_t); + + EXPECT_EQ(force_g.size(), static_cast(natoms_t * 3)); + EXPECT_EQ(virial_g.size(), 9u); + + EXPECT_TRUE(std::isfinite(ener_g)); + // No interactions: force and virial must vanish. 
+ for (int ii = 0; ii < natoms_t * 3; ++ii) { + EXPECT_TRUE(std::isfinite(force_g[ii])); + EXPECT_LT(fabs(force_g[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_TRUE(std::isfinite(virial_g[ii])); + EXPECT_LT(fabs(virial_g[ii]), EPSILON); + } + // graph == dense for the isolated-atom limit. + EXPECT_LT(fabs(ener_g - ener_r), EPSILON); + for (int ii = 0; ii < natoms_t * 3; ++ii) { + EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON); + } +} diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py index 5ec523f5f5..b45b476520 100644 --- a/source/tests/infer/gen_dpa1.py +++ b/source/tests/infer/gen_dpa1.py @@ -19,7 +19,6 @@ import copy import os import sys -import tempfile import numpy as np @@ -235,25 +234,28 @@ def main(): # (dense-quartet forward), NOT the graph .pt2. This ensures the C++ gtest # (B2.5) independently validates the graph AOTI path against a known-good # nlist evaluation. - print("Exporting reference nlist .pt2 (independent ground truth) ...") # noqa: T201 - with tempfile.TemporaryDirectory() as _tmp: - nlist_ref_pt2 = os.path.join(_tmp, "dpa1_graph_nlist_ref.pt2") - pt_expt_deserialize_to_file( - nlist_ref_pt2, - copy.deepcopy(data_g), - do_atomic_virial=True, - lower_kind="nlist", # independent: dense nlist, NOT graph - ) - dp_nlist_ref = DeepPot(nlist_ref_pt2) + # + # The nlist .pt2 is also PERSISTED (deeppot_dpa1_graph_nlist_ref.pt2): the + # C++ gtest loads it alongside the graph .pt2 to cross-check graph≈dense at + # 1e-9 on arbitrary system sizes (dynamic-edge-axis cases) without baking a + # second reference block into the .expected sidecar. Same weights as the + # graph model, so at non-binding sel the two paths must agree. 
+ nlist_ref_pt2 = os.path.join(base_dir, "deeppot_dpa1_graph_nlist_ref.pt2") + print(f"Exporting reference nlist .pt2 to {nlist_ref_pt2} ...") # noqa: T201 + pt_expt_deserialize_to_file( + nlist_ref_pt2, + copy.deepcopy(data_g), + do_atomic_virial=True, + lower_kind="nlist", # independent: dense nlist, NOT graph + ) + dp_nlist_ref = DeepPot(nlist_ref_pt2) - # PBC reference from nlist path - e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval( - coord, box, atype, atomic=True - ) - # NoPBC reference from nlist path - e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval( - coord, None, atype, atomic=True - ) + # PBC reference from nlist path + e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval(coord, box, atype, atomic=True) + # NoPBC reference from nlist path + e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval( + coord, None, atype, atomic=True + ) print(f"Nlist ref PBC energy: {e_r1[0, 0]:.18e}") # noqa: T201 print(f"Nlist ref NoPBC energy: {e_rnp[0, 0]:.18e}") # noqa: T201 From af92be11b4bbf42da7c668f0c3600968d4c0887c Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 01:20:49 +0800 Subject: [PATCH 16/33] =?UTF-8?q?docs(infer):=20B2=20final-review=20?= =?UTF-8?q?=E2=80=94=20correct=20gen=5Fdpa1=20graph-reference=20docstring?= =?UTF-8?q?=20(nlist=20.pt2,=201e-5)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/tests/infer/gen_dpa1.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py index b45b476520..9c743c9f88 100644 --- a/source/tests/infer/gen_dpa1.py +++ b/source/tests/infer/gen_dpa1.py @@ -11,9 +11,11 @@ Both are serialized and exported to their respective formats from the same weights. 
Reference sidecar files (.expected) consumed by C++ gtests are also written: - deeppot_dpa1.expected — from the nlist .pt2 eval (existing) - - deeppot_dpa1_graph.expected — from a direct dpmodel eval (independent ground - truth, NOT from the graph .pt2); the graph .pt2 is sanity-checked against - this reference at ≤1e-6. + - deeppot_dpa1_graph.expected — from an independent NLIST .pt2 eval (NOT the + graph .pt2; dpmodel se_atten has no analytical force, so the dense nlist + path is the independent ground truth). At non-binding sel the graph and + nlist paths see the same neighbor set, so the graph .pt2 is sanity-checked + against this reference at ≤1e-5. """ import copy From b25fdfc2f258e5e1d88c8f07af88d3ddfe5c9ce6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jun 2026 17:22:43 +0000 Subject: [PATCH 17/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/api_cc/include/DeepPotPTExpt.h | 8 ++++---- source/api_cc/include/commonPT.h | 7 +++---- source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc | 3 ++- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index d8bd22cbad..ddaea35646 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -310,10 +310,10 @@ class DeepPotPTExpt : public DeepPotBackend { bool lower_input_is_edge_ = false; bool lower_input_is_graph_ = false; NeighborListData nlist_data; - at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) - std::vector mapping_; // cached mapping vector (LAMMPS path) - at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) - at::Tensor edge_index_tensor; // cached local edge graph (LAMMPS path) + at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) + std::vector mapping_; // cached mapping vector (LAMMPS path) + 
at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) + at::Tensor edge_index_tensor; // cached local edge graph (LAMMPS path) at::Tensor edge_index_ext_tensor; // cached extended edge graph (LAMMPS path) std::unique_ptr loader; // Optional second AOTInductor artifact for the multi-rank GNN code diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index f6f82d89a0..c56f366bbd 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -426,8 +426,7 @@ inline GraphTensorPack buildGraphTensors( // inputs, so build the coord tensor as float64 to match the edge path. std::vector coord_d(coord.begin(), coord.end()); at::Tensor coord_tensor = - torch::from_blob(coord_d.data(), - {static_cast(nall), 3}, + torch::from_blob(coord_d.data(), {static_cast(nall), 3}, torch::TensorOptions().dtype(torch::kFloat64)) .clone() .to(device); @@ -454,8 +453,8 @@ inline GraphTensorPack buildGraphTensors( * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense * internal-key layout the rest of ``compute`` consumes. 
* - * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and emits - * flat-N PUBLIC keys: + * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and + * emits flat-N PUBLIC keys: * - ``atom_energy`` (N, 1) per-atom energy (N == nloc) * - ``energy`` (nf, 1) reduced total energy * - ``force`` (N, 3) per-atom force (ghosts already folded onto diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc index bbe751026c..95dcf72e74 100644 --- a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc +++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc @@ -28,7 +28,8 @@ namespace { constexpr const char* kGraphModel = "../../tests/infer/deeppot_dpa1_graph.pt2"; constexpr const char* kNlistRefModel = "../../tests/infer/deeppot_dpa1_graph_nlist_ref.pt2"; -constexpr const char* kRefPath = "../../tests/infer/deeppot_dpa1_graph.expected"; +constexpr const char* kRefPath = + "../../tests/infer/deeppot_dpa1_graph.expected"; } // namespace template From 10d82a68baece6168dac87f0e25861762b389144 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 08:11:07 +0800 Subject: [PATCH 18/33] fix(api_cc): guard graph remap single-rank-only + atomic-overload gtest (B2.5 follow-ups) --- source/api_cc/include/commonPT.h | 16 ++++++- source/api_cc/src/DeepPotPTExpt.cc | 9 +++- .../tests/test_deeppot_dpa1_graph_ptexpt.cc | 44 +++++++++++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index c56f366bbd..865f9d3837 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -475,17 +475,31 @@ inline GraphTensorPack buildGraphTensors( * rows already carry the folded ghost contributions, so zero ghosts avoid * double counting (and keep LAMMPS reverse-comm correct). 
* + * **Single-rank only.** Multi-rank inference (B3.2) must NOT call this + * function: ghost/halo forces are real cross-rank contributions that must be + * returned as-is and folded back via reverse-comm rather than being zeroed. + * Calling this function on a multi-rank result would silently zero those forces + * and produce wrong energetics. Pass ``single_rank = false`` to get an + * explicit exception instead of silent corruption. + * * @param[in,out] output_map Output tensor map (public keys in, internal keys * added). * @param[in] nloc Number of local atoms (== N, the graph node count). * @param[in] nall Extended atom count to pad the per-atom outputs up to. * @param[in] atomic Whether atomic energy / virial were requested. + * @param[in] single_rank Must be true; throws deepmd_exception if false. */ inline void remap_graph_outputs_to_dense_keys( std::map& output_map, const std::int64_t nloc, const std::int64_t nall, - const bool atomic) { + const bool atomic, + const bool single_rank = true) { + if (!single_rank) { + throw deepmd::deepmd_exception( + "remap_graph_outputs_to_dense_keys is single-rank-only; multi-rank " + "uses the extended-region reverse-comm fold (PR-B3.2)"); + } using torch::indexing::Slice; const std::int64_t nf = 1; const auto& energy_pub = output_map.at("energy"); // (nf, 1) diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index c591ae23e6..30cffe8c99 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -839,8 +839,10 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // virial/atom_virial); rewrite them into the dense internal-key layout the // downstream extraction/fold-back expects. nloc == N (graph node count); // pad the per-atom force/virial up to nall_real with zero ghost rows. + // single_rank=true: the multi-rank fail-fast at line ~508 guarantees we + // never reach here on a multi-rank graph call. 
deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real, - atomic); + atomic, /*single_rank=*/true); } if (phantom_n > 0) { @@ -1192,7 +1194,10 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // internal-key layout used below. nloc == N (graph node count); pad the // per-atom force/virial up to the extended nall with zero ghost rows so the // fold-back is a no-op on ghosts. - deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic); + // single_rank=true: the standalone (build_nlist) path is always + // single-rank; there is no comm_dict / cross-rank ghost exchange here. + deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic, + /*single_rank=*/true); } // 7. Extract energy diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc index 95dcf72e74..8b347c5e57 100644 --- a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc +++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc @@ -240,6 +240,50 @@ TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) { } } +// Case 5: exercise the DeepPot::compute ATOMIC overload on the graph .pt2. +// This is the first test to reach the ``if (atomic)`` branch inside +// remap_graph_outputs_to_dense_keys (the atom_energy/atom_virial remapping). +// The per-atom reference values are already loaded from deeppot_dpa1_graph.expected +// into this->expected_e and this->expected_v by SetUp(). 
+TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_e = this->expected_e; + std::vector& expected_f = this->expected_f; + std::vector& expected_v = this->expected_v; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + + double ener; + std::vector force, virial, atom_energy, atom_virial; + // Standalone atomic overload: DeepPot builds its own nlist (graph branch), + // then returns per-atom energy + atom-virial alongside total energy/force/virial. + dp.compute(ener, force, virial, atom_energy, atom_virial, coord, atype, box); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + EXPECT_EQ(atom_energy.size(), static_cast(natoms)); + EXPECT_EQ(atom_virial.size(), static_cast(natoms * 9)); + + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } + for (int ii = 0; ii < natoms; ++ii) { + EXPECT_LT(fabs(atom_energy[ii] - expected_e[ii]), EPSILON); + } + for (int ii = 0; ii < natoms * 9; ++ii) { + EXPECT_LT(fabs(atom_virial[ii] - expected_v[ii]), EPSILON); + } +} + // Case 4: a tiny system with no in-cutoff neighbors — only the two masked // dummy edges survive (nedge_min=2 guard / SIGFPE-edge family). 
The graph // must run cleanly, produce finite, interaction-free output (zero force/virial) From 7d37319c7095064dfb69557cef89b34d2f519c49 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 08:24:24 +0800 Subject: [PATCH 19/33] feat(api_cc): non-MP multi-rank graph path (extended region + reverse-comm; no with-comm) --- source/api_cc/include/commonPT.h | 95 +++++++++++++++++++++++++----- source/api_cc/src/DeepPotPTExpt.cc | 56 ++++++++++++------ 2 files changed, 119 insertions(+), 32 deletions(-) diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index 865f9d3837..919df504f0 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -399,6 +399,14 @@ struct GraphTensorPack { * @param device Target device for the returned tensors. * @param row_centers Optional center atom index for each neighbor-list row * (LAMMPS compacts away empty rows); ``nullptr`` means row i is center i. + * @param fold_to_local Whether ghost neighbours are folded onto their local + * owners (single-rank, ``N == nloc``, ``n_node = [nloc]``, node types from + * ``atype_ext[0:nloc]``) or kept as distinct extended nodes (multi-rank, + * ``N == nall``, ``n_node = [nall]``, node types from the full ``atype_ext`` + * including the real halo types — the #5583 invariant). In the multi-rank + * case ``edge_index`` indexes the extended atoms directly, so ghost reaction + * forces land on the ghost rows and are folded to their owners by LAMMPS + * reverse-comm (no with-comm artifact / no border_op — dpa1 is non-MP). */ template inline GraphTensorPack buildGraphTensors( @@ -410,16 +418,18 @@ inline GraphTensorPack buildGraphTensors( const int nall, const double rcut, const torch::Device& device, - const std::vector* row_centers = nullptr) { + const std::vector* row_centers = nullptr, + const bool fold_to_local = true) { auto int_options = torch::TensorOptions().dtype(torch::kInt64); - // 1. 
Cached-style topology only (no geometry): edge_index folds ghost - // neighbours onto their local owners (fold_to_local=true), edge_index_ext - // keeps extended indices for the on-device geometry recompute. + // 1. Cached-style topology only (no geometry): when fold_to_local=true, + // edge_index folds ghost neighbours onto their local owners (single-rank); + // when false, edge_index indexes the extended atoms directly (multi-rank). + // edge_index_ext always keeps extended indices for the on-device geometry + // recompute. const EdgeTensorPack topo = createEdgeTensors(nlist, coord, mapping, nloc, nall, device, - /*with_geometry=*/false, row_centers, - /*fold_to_local=*/true); + /*with_geometry=*/false, row_centers, fold_to_local); // 2. Recompute geometry from the current coords on-device, filter by rcut and // append the two masked dummy edges. The model is compiled for float64 @@ -434,16 +444,19 @@ inline GraphTensorPack buildGraphTensors( topo.edge_index, topo.edge_index_ext, coord_tensor, rcut); GraphTensorPack pack; - pack.edge_index = edges.edge_index; // local-folded (2, E) + pack.edge_index = edges.edge_index; // (2, E): local-folded or extended pack.edge_vec = edges.edge_vec; // (E, 3) neighbour - center pack.edge_mask = edges.edge_mask; // (E,) bool - pack.n_node = - torch::full({1}, static_cast(nloc), int_options).to(device); - // Node types from the local slice of the extended types. - std::vector atype_loc(atype_ext.begin(), - atype_ext.begin() + nloc); - pack.atype = torch::from_blob(atype_loc.data(), - {static_cast(nloc)}, int_options) + // Single-rank: N == nloc (ghosts folded onto owners). Multi-rank: N == nall + // (ghosts are distinct nodes whose features come from their real halo types). + const std::int64_t n_node_count = fold_to_local ? 
nloc : nall; + pack.n_node = torch::full({1}, n_node_count, int_options).to(device); + // Node types from the extended types (NOT atype[mapping]): the local slice + // for single-rank, the full extended set (incl. real halo types) for + // multi-rank. + std::vector atype_nodes(atype_ext.begin(), + atype_ext.begin() + n_node_count); + pack.atype = torch::from_blob(atype_nodes.data(), {n_node_count}, int_options) .clone() .to(device); return pack; @@ -525,6 +538,60 @@ inline void remap_graph_outputs_to_dense_keys( } } +/** + * @brief Remap NeighborGraph public outputs onto the dense internal-key layout + * for the MULTI-RANK (extended-region) non-message-passing path. + * + * Built with ``fold_to_local=false``, the graph has ``N == nall`` nodes: ghost + * (halo) atoms are distinct nodes, so the per-node ``force`` is already the + * EXTENDED force (one row per extended atom). Ghost reaction forces stay on + * their ghost rows and are folded back to their owning rank by LAMMPS + * reverse-comm — exactly as the dense path returns its extended force. No + * zero-padding (unlike the single-rank helper) and no with-comm artifact (dpa1 + * is non-MP). + * + * Key differences from the single-rank helper: + * - ``energy_redu`` = sum of the LOCAL atom energies (``atom_energy[0:nloc]``) + * ONLY. The public ``energy`` key reduces over all ``N == nall`` nodes, + * which would double-count the bias energy of ghost nodes that belong to + * other ranks (ghost nodes have no center edges, so they carry a bias-only + * energy and zero force/virial gradient — harmless for force/virial but + * wrong for the owned energy). + * - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no padding). + * + * @param[in,out] output_map Output tensor map (public keys in, internal keys + * added). + * @param[in] nloc Number of local atoms (owned by this rank). + * @param[in] nall Extended atom count (== N, the graph node count). 
+ * @param[in] atomic Whether atomic energy / virial were requested. + */ +inline void remap_graph_outputs_to_dense_keys_extended( + std::map& output_map, + const std::int64_t nloc, + const std::int64_t nall, + const bool atomic) { + using torch::indexing::Slice; + const std::int64_t nf = 1; + const auto& atom_energy_pub = output_map.at("atom_energy"); // (N==nall, 1) + const auto& force_pub = output_map.at("force"); // (N==nall, 3) extended + const auto& virial_pub = output_map.at("virial"); // (nf, 9) + + // Owned energy = sum over LOCAL atoms only; ghost nodes carry bias-only + // energy belonging to other ranks. + output_map["energy_redu"] = + atom_energy_pub.index({Slice(0, nloc)}).sum().reshape({nf, 1}); + output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9}); + // Extended force: ghost rows stay distinct for LAMMPS reverse-comm fold-back. + output_map["energy_derv_r"] = force_pub.reshape({nf, nall, 1, 3}); + + if (atomic) { + const auto& atom_virial_pub = output_map.at("atom_virial"); // (N==nall, 9) + output_map["energy"] = + atom_energy_pub.index({Slice(0, nloc)}).reshape({nf, nloc, 1}); + output_map["energy_derv_c"] = atom_virial_pub.reshape({nf, nall, 1, 9}); + } +} + } // namespace deepmd #endif // BUILD_PYTORCH diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 30cffe8c99..49e7ed0662 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -502,14 +502,23 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, bool multi_rank = (lmp_list.nprocs > 1); bool atom_map_present = (lmp_list.mapping != nullptr); bool use_with_comm = has_comm_artifact_ && multi_rank; - // The NeighborGraph schema only has a single-rank artifact so far; the - // multi-rank (with-comm) graph path is PR-B3. Fail fast before building - // any tensors so callers get a clear message instead of a wrong answer. 
- if (lower_input_is_graph_ && multi_rank) { + // NeighborGraph multi-rank dispatch: + // - NON-message-passing (dpa1, se_e2_a, ...): the SAME single-rank graph + // .pt2 runs on the EXTENDED region (fold_to_local=false; ghosts are + // distinct nodes whose features come from their real halo types). No + // with-comm artifact / no border_op is needed; ghost reaction forces are + // folded to their owners by LAMMPS reverse-comm. Handled below. + // - message-passing graph (DPA2/DPA3, PR-G): would need a with-comm graph + // artifact for cross-rank ghost-feature exchange — not yet supported. + // Fail fast before building any tensors so callers get a clear message + // instead of a wrong answer. + if (lower_input_is_graph_ && multi_rank && has_message_passing_) { throw deepmd::deepmd_exception( - "Multi-rank graph (NeighborGraph) .pt2 inference is not yet " - "supported (PR-B3). Run single-rank, or use a dense/edge .pt2 for " - "multi-rank LAMMPS."); + "Multi-rank message-passing graph (NeighborGraph) .pt2 inference is " + "not yet supported (PR-G). Non-message-passing graph models (e.g. " + "dpa1) run multi-rank on the extended-region single-rank artifact; " + "for message-passing models run single-rank, or use a dense/edge " + ".pt2 for multi-rank LAMMPS."); } // Decision matrix (see PR #5450 description): // non-GNN model (has_message_passing_ == false): regular path is @@ -813,12 +822,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_tensors.edge_index_ext, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); } else if (lower_input_is_graph_) { - // Single-rank NeighborGraph schema: build (atype, n_node, edge_index, - // edge_vec, edge_mask) from the host nlist (node types from the extended - // types, folded local edge graph) and run the graph artifact. + // NeighborGraph schema: build (atype, n_node, edge_index, edge_vec, + // edge_mask) from the host nlist and run the (single-rank) graph + // artifact. 
Single-rank folds ghosts onto local owners (N == nloc); + // multi-rank (non-MP only — the fail-fast above blocks MP graph + // multi-rank) keeps the extended region (N == nall_real, node types from + // the real halo types) so LAMMPS reverse-comm folds ghost forces back. const auto graph_tensors = buildGraphTensors( nlist_data.jlist, dcoord, datype, mapping_, nloc, nall_real, - static_cast(rcut), device, &nlist_data.ilist); + static_cast(rcut), device, &nlist_data.ilist, + /*fold_to_local=*/!multi_rank); flat_outputs = run_model_graph( graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index, graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor, @@ -835,14 +848,21 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, extract_outputs(output_map, flat_outputs); if (lower_input_is_graph_) { - // The graph forward emits LOCAL public keys (atom_energy/energy/force/ + // The graph forward emits flat-N PUBLIC keys (atom_energy/energy/force/ // virial/atom_virial); rewrite them into the dense internal-key layout the - // downstream extraction/fold-back expects. nloc == N (graph node count); - // pad the per-atom force/virial up to nall_real with zero ghost rows. - // single_rank=true: the multi-rank fail-fast at line ~508 guarantees we - // never reach here on a multi-rank graph call. - deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real, - atomic, /*single_rank=*/true); + // downstream extraction/fold-back expects. + if (multi_rank) { + // Extended region (N == nall_real): force is already per-extended-atom, + // owned energy = sum over local atom energies, no zero-padding. Ghost + // forces fold back via LAMMPS reverse-comm (no with-comm artifact). + deepmd::remap_graph_outputs_to_dense_keys_extended(output_map, nloc, + nall_real, atomic); + } else { + // Single-rank (N == nloc): ghosts folded onto owners; pad the per-atom + // force/virial up to nall_real with zero ghost rows. 
+ deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real, + atomic, /*single_rank=*/true); + } } if (phantom_n > 0) { From 92c35a6b14f46a10fae2deaec84b89bd8355cfd2 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 08:29:49 +0800 Subject: [PATCH 20/33] test(lammps): dpa1 graph .pt2 single + multi-rank (mpirun -n 2, local) --- .../lmp/tests/test_lammps_dpa1_graph_pt2.py | 306 ++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 source/lmp/tests/test_lammps_dpa1_graph_pt2.py diff --git a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py new file mode 100644 index 0000000000..3e6a11428d --- /dev/null +++ b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py @@ -0,0 +1,306 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Test LAMMPS with the NeighborGraph (graph-schema) .pt2 DPA1 model. + +The model ``deeppot_dpa1_graph.pt2`` is a dpa1(attn_layer=0) descriptor +exported with ``lower_kind="graph"`` (gen_dpa1.py section B). dpa1 is +NON-message-passing, so the SAME single-rank graph .pt2 also drives the +multi-rank path: the C++ ``DeepPotPTExpt`` builds an EXTENDED-region graph +(``fold_to_local=False``; ghosts are distinct nodes whose features come from +their real halo types) and returns per-extended-atom forces, which LAMMPS +reverse-comm folds back to their owners. There is NO with-comm artifact and +NO ``border_op`` (that is the message-passing PR-G path) — hence no +``use_loc_mapping=False`` variant. + +Reference values come from ``source/tests/infer/gen_dpa1.py`` (the same +``deeppot_dpa1_graph.expected`` the C++ gtest uses); the multi-rank run must +match the single-rank reference for energy, per-atom force, and per-atom +virial. This is the core multi-rank correctness gate for the non-MP graph +path implemented in B3.1. 
+""" + +import importlib.util +import os +import shutil +import subprocess as sp +import sys +import tempfile +from pathlib import ( + Path, +) + +import constants +import numpy as np +import pytest +from expected_ref import ( + read_expected_ref, +) +from lammps import ( + PyLammps, +) +from write_lmp_data import ( + write_lmp_data, +) + +pb_file = ( + Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa1_graph.pt2" +) +ref_file = ( + Path(__file__).parent.parent.parent + / "tests" + / "infer" + / "deeppot_dpa1_graph.expected" +) +# The MPI runner is backend-agnostic (DATAFILE PB_FILE OUTPUT + flags); reuse +# the DPA3 driver verbatim rather than duplicate it. +mpi_runner = Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py" + +data_file = Path(__file__).parent / "data_dpa1_graph_pt2.lmp" +# Elongated-box variant for the empty-subdomain MPI corner: x extended to +# 30 A while atoms stay in x in [0.25, 12.83]; with ``processors 2 1 1`` the +# split at x = 15 leaves rank 1 with zero local atoms. +data_file_empty_subdomain = ( + Path(__file__).parent / "data_dpa1_graph_pt2_empty_subdomain.lmp" +) + +# Reference values written by source/tests/infer/gen_dpa1.py (PBC case). +# Guarded with try/except because gen_dpa1.py only runs when PyTorch is built. +try: + _ref = read_expected_ref(ref_file)["pbc"] + expected_e = float(np.sum(_ref["expected_e"])) + expected_f = _ref["expected_f"].reshape(6, 3) + # LAMMPS uses the opposite sign convention for virial vs DeepPot. + expected_v = -_ref["expected_v"].reshape(6, 9) +except FileNotFoundError: + expected_e = expected_f = expected_v = None + +box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0]) +coord = np.array( + [ + [12.83, 2.56, 2.18], + [12.09, 2.87, 2.74], + [0.25, 3.32, 1.68], + [3.36, 3.00, 1.81], + [3.51, 2.51, 2.60], + [4.27, 3.22, 1.56], + ] +) +# Model type_map is ["O", "H"]; gtest atype = [0, 1, 1, 0, 1, 1] -> LAMMPS +# types [1, 2, 2, 1, 2, 2] under identity ``pair_coeff * *``. 
+type_OH = np.array([1, 2, 2, 1, 2, 2]) + + +def setup_module() -> None: + if os.environ.get("ENABLE_PYTORCH", "1") != "1": + pytest.skip( + "Skip test because PyTorch support is not enabled.", + ) + write_lmp_data(box, coord, type_OH, data_file) + box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0]) + write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain) + + +def teardown_module() -> None: + for f in [data_file, data_file_empty_subdomain]: + if f.exists(): + os.remove(f) + + +def _lammps(data_file, units="metal", atom_map: str = "yes") -> PyLammps: + lammps = PyLammps() + lammps.units(units) + lammps.boundary("p p p") + lammps.atom_style("atomic") + if atom_map != "no": + lammps.atom_modify(f"map {atom_map}") + lammps.neighbor("2.0 bin") + lammps.neigh_modify("every 10 delay 0 check no") + lammps.read_data(data_file.resolve()) + lammps.mass("1 16") + lammps.mass("2 2") + lammps.timestep(0.0005) + lammps.fix("1 all nve") + return lammps + + +@pytest.fixture +def lammps(): + lmp = _lammps(data_file=data_file) + yield lmp + lmp.close() + + +def test_pair_deepmd(lammps) -> None: + """Single-rank serial run (``atom_modify map yes``): the graph .pt2 + folds ghosts onto local owners (``fold_to_local=True``) and must match + the gen_dpa1.py reference for energy and per-atom force. 
+ """ + lammps.pair_style(f"deepmd {pb_file.resolve()}") + lammps.pair_coeff("* *") + lammps.run(0) + assert lammps.eval("pe") == pytest.approx(expected_e) + for ii in range(6): + assert lammps.atoms[ii].force == pytest.approx( + expected_f[lammps.atoms[ii].id - 1] + ) + lammps.run(1) + + +def test_pair_deepmd_virial(lammps) -> None: + """Single-rank per-atom virial via ``centroid/stress/atom``.""" + lammps.pair_style(f"deepmd {pb_file.resolve()}") + lammps.pair_coeff("* *") + lammps.compute("virial all centroid/stress/atom NULL pair") + for ii in range(9): + jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii] + lammps.variable(f"virial{jj} atom c_virial[{ii + 1}]") + lammps.dump( + "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)]) + ) + lammps.run(0) + assert lammps.eval("pe") == pytest.approx(expected_e) + for ii in range(6): + assert lammps.atoms[ii].force == pytest.approx( + expected_f[lammps.atoms[ii].id - 1] + ) + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 + for ii in range(9): + assert np.array( + lammps.variables[f"virial{ii}"].value + ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii]) + + +# --------------------------------------------------------------------------- +# Multi-rank test (non-MP extended-region graph path; B3.1). +# +# dpa1 is non-message-passing, so multi-rank uses the SAME single-rank graph +# .pt2 on the extended region. The expected energy/force/virial are the +# single-rank reference: each rank evaluates its local atoms over the extended +# graph; ghost reaction forces fold back via LAMMPS reverse-comm. 
+# --------------------------------------------------------------------------- + + +def _run_mpi_subprocess( + extra_args: list[str] | None = None, + nprocs: int = 2, + data_path: Path | None = None, + processors: str | None = None, + runner_args: list[str] | None = None, +) -> dict: + """Invoke the (backend-agnostic) DPA3 MPI runner under + ``mpirun -n <nprocs>`` against the dpa1 graph .pt2 and return + ``{"pe": float, "forces": (n, 3), "virials": (n, 9)}``. + + ``nprocs == 1`` forces ``--processors 1 1 1`` so the C++ side sees + ``nprocs == 1`` and routes to the single-rank graph path — a + same-archive reference for the multi-rank comparison. + """ + if data_path is None: + data_path = data_file + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + argv = [ + "mpirun", + "-n", + str(nprocs), + sys.executable, + str(mpi_runner), + str(data_path.resolve()), + str(pb_file.resolve()), + out_path, + ] + if processors is not None: + argv.extend(["--processors", processors]) + elif nprocs == 1: + argv.extend(["--processors", "1 1 1"]) + if extra_args: + argv.extend(extra_args) + if runner_args: + argv.extend(runner_args) + sp.check_call(argv) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe = float(lines[0]) + rows = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + forces = rows[:, :3] + virials = rows[:, 3:] + return {"pe": pe, "forces": forces, "virials": virials} + finally: + if os.path.exists(out_path): + os.remove(out_path) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa1_graph() -> None: + """Multi-rank LAMMPS run for the dpa1 graph .pt2 must match the + single-rank reference within numerical tolerance for energy, forces, + and per-atom virial. 
+ + This is the core correctness gate for the non-MP extended-region + multi-rank C++ path (B3.1): the extended graph + reverse-comm + fold-back must reproduce the folded single-rank result. + """ + out = _run_mpi_subprocess() + assert out["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8) + for ii in range(6): + np.testing.assert_allclose(out["forces"][ii], expected_f[ii], atol=1e-8, rtol=0) + # ``centroid/stress/atom`` column order [xx, yy, zz, xy, xz, yz, yx, zx, + # zy]; the inverse permutation maps it back to the expected_v columns. + expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2] + np.testing.assert_allclose( + out["virials"][:, expected_v_to_lammps] / constants.nktv2p, + expected_v, + atol=1e-8, + rtol=0, + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa1_graph_matches_single_rank() -> None: + """Multi-rank (``-n 2``) ≡ single-rank (``-n 1``) on the SAME archive + and trajectory — isolates the extended-region multi-rank C++ path from + the .pt2 reference values (a wrong-but-finite divergence would show up + here even if the hardcoded reference drifted). 
+ """ + out_mpi = _run_mpi_subprocess(nprocs=2) + out_ref = _run_mpi_subprocess(nprocs=1) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa1_graph_empty_subdomain() -> None: + """Multi-rank with one rank owning zero local atoms (elongated box, + ``processors 2 1 1``, split at x = 15). The extended-region graph path + must still produce correct forces/virial on the populated rank and a + zero contribution from the empty rank — compared against a same-archive + single-rank reference of the same fixture. + """ + out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain) + out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) From e2e07f16046f6c44f592e342cba71ed83a53aa4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 30 Jun 2026 00:38:02 +0000 Subject: [PATCH 21/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/api_cc/include/commonPT.h | 15 ++++++++------- .../tests/test_deeppot_dpa1_graph_ptexpt.cc | 8 +++++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index 919df504f0..02c25aa047 100644 --- 
a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -551,13 +551,14 @@ inline void remap_graph_outputs_to_dense_keys( * is non-MP). * * Key differences from the single-rank helper: - * - ``energy_redu`` = sum of the LOCAL atom energies (``atom_energy[0:nloc]``) - * ONLY. The public ``energy`` key reduces over all ``N == nall`` nodes, - * which would double-count the bias energy of ghost nodes that belong to - * other ranks (ghost nodes have no center edges, so they carry a bias-only - * energy and zero force/virial gradient — harmless for force/virial but - * wrong for the owned energy). - * - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no padding). + * - ``energy_redu`` = sum of the LOCAL atom energies + * (``atom_energy[0:nloc]``) ONLY. The public ``energy`` key reduces over all + * ``N == nall`` nodes, which would double-count the bias energy of ghost nodes + * that belong to other ranks (ghost nodes have no center edges, so they carry a + * bias-only energy and zero force/virial gradient — harmless for force/virial + * but wrong for the owned energy). + * - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no + * padding). * * @param[in,out] output_map Output tensor map (public keys in, internal keys * added). diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc index 8b347c5e57..c57abaf0a0 100644 --- a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc +++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc @@ -243,8 +243,9 @@ TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) { // Case 5: exercise the DeepPot::compute ATOMIC overload on the graph .pt2. // This is the first test to reach the ``if (atomic)`` branch inside // remap_graph_outputs_to_dense_keys (the atom_energy/atom_virial remapping). 
-// The per-atom reference values are already loaded from deeppot_dpa1_graph.expected -// into this->expected_e and this->expected_v by SetUp(). +// The per-atom reference values are already loaded from +// deeppot_dpa1_graph.expected into this->expected_e and this->expected_v by +// SetUp(). TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) { using VALUETYPE = TypeParam; std::vector& coord = this->coord; @@ -261,7 +262,8 @@ TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) { double ener; std::vector force, virial, atom_energy, atom_virial; // Standalone atomic overload: DeepPot builds its own nlist (graph branch), - // then returns per-atom energy + atom-virial alongside total energy/force/virial. + // then returns per-atom energy + atom-virial alongside total + // energy/force/virial. dp.compute(ener, force, virial, atom_energy, atom_virial, coord, atype, box); EXPECT_EQ(force.size(), static_cast(natoms * 3)); From 942de1f4f4dcd58574709cc4d25955069f4b6cce Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 11:50:14 +0800 Subject: [PATCH 22/33] refactor(pt_expt): group node_capacity with graph-shape args, make edge_energy_deriv flags keyword-only Move node_capacity up next to n_node (it is a shape parameter, not a behavior flag) and put a keyword-only barrier before do_atomic_virial/ create_graph. Reads target -> graph-shape -> behavior, mirroring the edge_force_virial it wraps. The sole caller already passed the flags by keyword, so this is a no-behavior-change reorder. 
--- deepmd/pt_expt/model/edge_transform_output.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py index 9202a64d8f..98a620b524 100644 --- a/deepmd/pt_expt/model/edge_transform_output.py +++ b/deepmd/pt_expt/model/edge_transform_output.py @@ -29,9 +29,10 @@ def edge_energy_deriv( edge_index: torch.Tensor, edge_mask: torch.Tensor, n_node: torch.Tensor, + node_capacity: int | None = None, + *, do_atomic_virial: bool = False, create_graph: bool = False, - node_capacity: int | None = None, ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]: """Return (force, atom_virial_or_None, virial) from a graph energy. @@ -50,14 +51,14 @@ def edge_energy_deriv( (E,) valid-edge mask. n_node (nf,) per-frame node counts. - do_atomic_virial - whether to materialize the per-atom virial (else ``None`` is returned). - create_graph - whether the backward retains a graph (training, for second-order grad). node_capacity Static node-axis size ``N``. ``None`` (eager default) falls back to ``int(n_node.sum())``. Pass a static value (e.g. ``atype.shape[0]``) to keep this function trace-safe under ``make_fx``/``torch.export``. + do_atomic_virial + whether to materialize the per-atom virial (else ``None`` is returned). + create_graph + whether the backward retains a graph (training, for second-order grad). 
Returns ------- @@ -186,9 +187,9 @@ def fit_output_to_model_output_graph( edge_index, edge_mask, n_node, + node_capacity=N, do_atomic_virial=(vdef.c_differentiable and do_atomic_virial), create_graph=create_graph, - node_capacity=N, ) # force (N, 3) -> (N, 1, 3) [flat; caller unravels at I/O boundary] ff_list.append(force.reshape(N, 1, 3)) From 7658091dd008394abbf1936d38e34abb443062c0 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 13:17:46 +0800 Subject: [PATCH 23/33] fix(dpmodel): clamp graph edge-scatter indices in-bounds (CUDA device-assert on graph .pt2 export) The graph forward_common_lower_graph .pt2 AOTI export device-asserted on CUDA ('index out of bounds: 0 <= tmp8 < ks0') in the edge_force_virial scatter. A padding/guard edge's node index reaches the index_add address computation BEFORE edge_mask zeroes its (zero) contribution; CPU does not bounds-check the address (so all dev + CI, which are CPU-only, were green), CUDA device-asserts. Clamp src/dst into [0, n_out) and edge_frame into [0, nf) before the scatters -- padding edges carry w_edge==0, so a clamped out-of-range index scatters zero (numerically harmless). Verified on Tesla T4: graph .pt2 export + DeepEval parity (small_8/large_20) + full graph suite + consistency all pass; parity confirms the clamp does not corrupt (N bound was correct, the stray index was genuine padding). Also bind the per-node scatter to the input atype.shape[0] (node_capacity) rather than the re-derived fit_ret.shape[0] -- hardening. 
--- .../dpmodel/utils/neighbor_graph/derivatives.py | 14 ++++++++++++-- deepmd/pt_expt/model/edge_transform_output.py | 16 +++++++++++++++- deepmd/pt_expt/model/make_model.py | 5 +++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py index 2c8c50eaca..8d6125abfc 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py +++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py @@ -91,8 +91,15 @@ def edge_force_virial( # zero padding/guard contributions; cast mask to g's dtype (array-API pure, # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict) g = g_e * xp.astype(edge_mask[:, None], g_e.dtype) - src = edge_index[0] - dst = edge_index[1] + # Clamp scatter indices into the valid node range ``[0, n_out)``. Padding/guard + # edges (``edge_mask == 0``) carry ``g == 0`` above, so ``w_edge == 0`` and a + # clamped out-of-range index scatters ZERO -- numerically harmless. This keeps + # the scatter address in-bounds for the CUDA-compiled kernel: under dynamic-edge + # ``torch.export`` a padding index can reach the ``index_add`` BEFORE the mask + # zeroes its value, tripping ``tl.device_assert(idx < ks0)`` (a hard device-side + # assert on CUDA; benign on CPU, which does not bounds-check the address). + src = xp.clip(edge_index[0], 0, n_out - 1) + dst = xp.clip(edge_index[1], 0, n_out - 1) # force (output sized to the node axis, incl. any padding tail) force = segment_sum(g, dst, n_out) - segment_sum(g, src, n_out) # per-edge virial w_e[k, j] = -g_e[k] * edge_vec[j] (broadcast, no einsum) @@ -106,5 +113,8 @@ def edge_force_virial( edge_frame = xp.astype( xp.searchsorted(boundaries, dst, side="right"), xp.int64 ) # (E,) in [0, nf) + # searchsorted(side="right") can return ``nf`` for an out-of-range ``dst`` + # (padding/garbage); clamp into ``[0, nf)`` for the same CUDA-bounds reason. 
+ edge_frame = xp.clip(edge_frame, 0, nf - 1) virial = segment_sum(w_edge, edge_frame, nf) # (nf, 3, 3) return force, atom_virial, virial diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py index 98a620b524..653f323404 100644 --- a/deepmd/pt_expt/model/edge_transform_output.py +++ b/deepmd/pt_expt/model/edge_transform_output.py @@ -88,6 +88,7 @@ def fit_output_to_model_output_graph( do_atomic_virial: bool = False, create_graph: bool = True, mask: torch.Tensor | None = None, + node_capacity: int | None = None, ) -> dict[str, torch.Tensor]: """Graph analogue of the dense pt_expt ``fit_output_to_model_output``. @@ -121,6 +122,15 @@ def fit_output_to_model_output_graph( Whether the backward retains a graph (training). mask (N,) flat realness mask; used only for intensive-output reduction. + node_capacity + Authoritative node-axis size ``N`` = the scatter bound for the + per-node force/atom-virial assembly. Pass the INPUT ``atype.shape[0]`` + (the pristine node-axis symbol that ``edge_index`` indexes into by + construction); ``None`` falls back to the descriptor/fitting output's + ``fit_ret.shape[0]`` (value-equal). This makes the scatter bound the + input node axis rather than a re-derived shape -- hardening; the actual + CUDA out-of-bounds device-assert is prevented by the index clamp in + :func:`~deepmd.dpmodel.utils.neighbor_graph.derivatives.edge_force_virial`. Returns ------- @@ -145,7 +155,11 @@ def fit_output_to_model_output_graph( # Derive N from the fitting output's leading shape rather than int(n_node.sum()). # shape attributes are always static Python ints (or SymInts in symbolic-mode # tracing) and are trace-safe; reading a tensor VALUE via int() is not. 
- N = next(iter(fit_ret.values())).shape[0] + N = ( + node_capacity + if node_capacity is not None + else next(iter(fit_ret.values())).shape[0] + ) frame_id = frame_id_from_n_node( n_node, n_total=N ) # (N,) int64 frame index per atom diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py index 3e14ed2d56..5b19cb63f1 100644 --- a/deepmd/pt_expt/model/make_model.py +++ b/deepmd/pt_expt/model/make_model.py @@ -370,6 +370,11 @@ def forward_common_lower_graph( do_atomic_virial=do_atomic_virial, create_graph=self.training, mask=atomic_ret["mask"] if "mask" in atomic_ret else None, + # Bound the per-node scatter by the INPUT node axis (the symbol + # ``edge_index`` indexes into), not the re-derived fitting-output + # shape -- avoids a CUDA out-of-bounds device-assert under + # dynamic-edge torch.export. See fit_output_to_model_output_graph. + node_capacity=atype.shape[0], ) def _resolve_graph_method( From 0f437317296b8175579c1d4952e55fb19d70cc14 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 15:24:13 +0800 Subject: [PATCH 24/33] fix(dpmodel): export-safe modulo clamp for graph edge-scatter indices The xp.clip clamp added in the previous commit breaks under torch.export: the bound n_out is a SymInt and array_api_compat's clip reads .shape on it ('SymInt' object has no attribute 'shape'), failing every graph .pt2 export on CI/local (it only passed on the GPU box's newer array_api_compat). Replace with modulo (src % n_out, dst % n_out, edge_frame % nf) -- pure arithmetic, so torch.export-safe, and a no-op on in-range real indices. Also fixes the case a mask-multiply missed: the out-of-range index occurs on an edge_mask==1 edge (ks0=n_out binds to a smaller symbol than the live node count at AOTI runtime), which only a clamp-every-index (clip or modulo), not a mask-zero, neutralizes. Out-of-range edges carry ~zero w_edge, so wrapping them is numerically harmless. 
Verified on Tesla T4: graph .pt2 export (gen_dpa1) + DeepEval parity (small_8/large_20) + C++ TestInferDpa1GraphPtExpt 10/10 all pass; CPU export suite 12/12 (the SymInt failure the prior commit would have hit in CI). --- .../utils/neighbor_graph/derivatives.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py index 8d6125abfc..affeb57ccc 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py +++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py @@ -91,15 +91,19 @@ def edge_force_virial( # zero padding/guard contributions; cast mask to g's dtype (array-API pure, # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict) g = g_e * xp.astype(edge_mask[:, None], g_e.dtype) - # Clamp scatter indices into the valid node range ``[0, n_out)``. Padding/guard - # edges (``edge_mask == 0``) carry ``g == 0`` above, so ``w_edge == 0`` and a - # clamped out-of-range index scatters ZERO -- numerically harmless. This keeps - # the scatter address in-bounds for the CUDA-compiled kernel: under dynamic-edge - # ``torch.export`` a padding index can reach the ``index_add`` BEFORE the mask - # zeroes its value, tripping ``tl.device_assert(idx < ks0)`` (a hard device-side - # assert on CUDA; benign on CPU, which does not bounds-check the address). - src = xp.clip(edge_index[0], 0, n_out - 1) - dst = xp.clip(edge_index[1], 0, n_out - 1) + # Wrap node indices into ``[0, n_out)`` so every scatter address is in-bounds. + # Real edges already have index < n_out (modulo is a no-op). 
Out-of-range + # indices CAN appear in the CUDA-compiled kernel: under dynamic-edge + # ``torch.export`` the scatter bound ``ks0 == n_out`` binds to a SMALLER + # symbol than the live node count at AOTI runtime, so a valid index trips + # ``tl.device_assert(idx < ks0)`` (a hard device-side assert on CUDA; benign + # on CPU, which does not bounds-check the address). Such edges carry ~zero + # ``w_edge`` (masked ``g`` + tiny ``g_e``), so wrapping them to another node + # is numerically harmless. Modulo is pure arithmetic => torch.export-safe, + # unlike ``xp.clip`` (SymInt bound breaks array_api_compat) and unlike a + # mask-multiply (misses ``edge_mask == 1`` out-of-range indices). + src = edge_index[0] % n_out + dst = edge_index[1] % n_out # force (output sized to the node axis, incl. any padding tail) force = segment_sum(g, dst, n_out) - segment_sum(g, src, n_out) # per-edge virial w_e[k, j] = -g_e[k] * edge_vec[j] (broadcast, no einsum) @@ -112,9 +116,8 @@ def edge_force_virial( boundaries = xp.cumulative_sum(n_node) # (nf,) per-frame node upper bounds edge_frame = xp.astype( xp.searchsorted(boundaries, dst, side="right"), xp.int64 - ) # (E,) in [0, nf) - # searchsorted(side="right") can return ``nf`` for an out-of-range ``dst`` - # (padding/garbage); clamp into ``[0, nf)`` for the same CUDA-bounds reason. 
- edge_frame = xp.clip(edge_frame, 0, nf - 1) + ) # (E,) in [0, nf] + # wrap into [0, nf) for the same CUDA-bounds reason (export-safe modulo) + edge_frame = edge_frame % nf virial = segment_sum(w_edge, edge_frame, nf) # (nf, 3, 3) return force, atom_virial, virial From afda4c7b36812aa0c970a9223643cd98a3b2a3ed Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 15:24:41 +0800 Subject: [PATCH 25/33] fix(pt_expt): address AI review (CodeQL + CodeRabbit) on #5604 - training.py: add explanatory comments to four pass-only except clauses (CodeQL empty-except); replace the nloc==1-fragile 'N != nframes' shape heuristic in the compiled-graph unravel with an explicit node-level key set so single-atom-per-frame outputs reshape to (nf, 1, *) (CodeRabbit). - serialization.py: mark aparam's atom axis dynamic in the graph export dynamic-shape spec ({0: nframes, 1: nloc}), matching the dense path, so a dim_aparam>0 graph export no longer specializes nloc (CodeRabbit). - test_lammps_dpa1_graph_pt2.py: skip the two reference-comparison tests when the gen_dpa1 .expected fixture is absent (clean skip vs TypeError on None); force 'processors 2 1 1' in the empty-subdomain MPI test so the empty-rank branch is genuinely exercised (CodeRabbit). Skipped as invalid: CodeQL 'import DEVICE' (house pattern, benign in test); CodeRabbit do_atomic_virial-in-compiled-graph (the dense compiled path drops it identically; training never requests atom_virial). Validated: CPU export 8/8 + compiled-varying-natoms 5/5 (incl. dpa1_no_attn). 
--- deepmd/pt_expt/train/training.py | 31 +++++++++++-------- deepmd/pt_expt/utils/serialization.py | 6 +++- .../lmp/tests/test_lammps_dpa1_graph_pt2.py | 15 ++++++++- 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 6393d78e39..409fcef48d 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -421,13 +421,13 @@ def fn( if _dim_fp > 1: _forbidden.add(_dim_fp) except Exception: - pass + pass # best-effort: dim_fparam unavailable -> nothing to forbid try: _dim_ap = model.get_dim_aparam() if _dim_ap > 1: _forbidden.add(_dim_ap) except Exception: - pass + pass # best-effort: dim_aparam unavailable -> nothing to forbid if charge_spin is not None: _dim_cs = int(charge_spin.shape[1]) if _dim_cs > 1: @@ -634,11 +634,11 @@ def _trace_and_compile_graph( try: _fitting = model.get_fitting_net() except AttributeError: - pass + pass # optional accessor; a model without a fitting net keeps None try: _atomic_model = model.atomic_model except AttributeError: - pass + pass # optional attribute; a model without an atomic model keeps None do_grad_r = model.do_grad_r("energy") do_grad_c = model.do_grad_c("energy") @@ -667,13 +667,13 @@ def _trace_and_compile_graph( if _dim_fp > 1: _forbidden.add(_dim_fp) except Exception: - pass + pass # best-effort: dim_fparam unavailable -> nothing to forbid try: _dim_ap = model.get_dim_aparam() if _dim_ap > 1: _forbidden.add(_dim_ap) except Exception: - pass + pass # best-effort: dim_aparam unavailable -> nothing to forbid if charge_spin is not None and charge_spin.shape[-1] > 1: _forbidden.add(int(charge_spin.shape[-1])) for _tbv in task_buf_vals_trace: @@ -1279,15 +1279,20 @@ def _forward_graph( # (nf, *)). Unravel the node-level keys to rectangular (nf, nloc, *) so # callers receive the same shapes as the dense path. 
N = nframes * nloc + # Node-level (per-atom, lead dim N) public keys emitted by the graph + # lower; the remaining keys are frame-level (lead dim nf) and must NOT + # be unravelled. Keying on the NAME rather than the ``N != nframes`` + # shape heuristic keeps the single-atom case (nloc == 1, where + # N == nframes) correct -- node-level outputs still reshape to + # (nf, 1, *) instead of staying (nf, *). + node_level_keys = {"atom_energy", "force", "atom_virial", "mask"} out: dict[str, torch.Tensor] = {} for key, val in result.items(): - # ``N != nframes`` distinguishes node-level keys (lead dim N) from - # frame-level keys (lead dim nf) by shape. DEGENERATE: when nloc==1, - # N == nframes, so node-level keys are NOT unravelled and stay - # (nf, *) instead of (nf, 1, *). Harmless for the varying-natoms - # trainer (nloc >> 1); a single-atom-per-frame system would need an - # explicit per-key category check instead of the shape heuristic. - if val is not None and val.shape[:1] == torch.Size([N]) and N != nframes: + if ( + key in node_level_keys + and val is not None + and val.shape[:1] == torch.Size([N]) + ): out[key] = val.reshape(nframes, nloc, *val.shape[1:]) else: out[key] = val diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 1bb49a8b5a..f4b538df75 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -447,6 +447,7 @@ def _build_graph_dynamic_shapes( nframes_dim = torch.export.Dim("nframes", min=1) n_node_total_dim = torch.export.Dim("n_node_total", min=1) nedge_dim = torch.export.Dim("nedge", min=2) + nloc_dim = torch.export.Dim("nloc", min=1) return ( {0: n_node_total_dim}, # atype: (N,) {0: nframes_dim}, # n_node: (nf,) @@ -454,7 +455,10 @@ def _build_graph_dynamic_shapes( {0: nedge_dim}, # edge_vec: (E, 3) — E dynamic {0: nedge_dim}, # edge_mask: (E,) — E dynamic {0: nframes_dim} if fparam is not None else None, # fparam: (nf, ndf) - {0: nframes_dim} if aparam is not 
None else None, # aparam: (nf, nloc, nda) + # aparam: (nf, nloc, nda) — both the frame AND atom axes are dynamic, + # matching the dense ``_build_dynamic_shapes`` (otherwise a dim_aparam>0 + # graph export specializes nloc to the sample size and breaks at runtime). + {0: nframes_dim, 1: nloc_dim} if aparam is not None else None, # aparam {0: nframes_dim} if charge_spin is not None else None, # charge_spin ) diff --git a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py index 3e6a11428d..ab898e15f6 100644 --- a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py +++ b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py @@ -73,6 +73,12 @@ except FileNotFoundError: expected_e = expected_f = expected_v = None +# Gate the reference-comparison tests on the generated ``.expected`` fixture so +# they skip cleanly (rather than failing with a ``TypeError`` on ``None``) when +# gen_dpa1.py has not run (e.g. PyTorch not built). The MPI multi-rank tests +# compare against a single-rank run of the same archive and do not need it. 
+_HAS_REF = expected_e is not None + box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0]) coord = np.array( [ @@ -129,6 +135,7 @@ def lammps(): lmp.close() +@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated") def test_pair_deepmd(lammps) -> None: """Single-rank serial run (``atom_modify map yes``): the graph .pt2 folds ghosts onto local owners (``fold_to_local=True``) and must match @@ -145,6 +152,7 @@ def test_pair_deepmd(lammps) -> None: lammps.run(1) +@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated") def test_pair_deepmd_virial(lammps) -> None: """Single-rank per-atom virial via ``centroid/stress/atom``.""" lammps.pair_style(f"deepmd {pb_file.resolve()}") @@ -297,7 +305,12 @@ def test_pair_deepmd_mpi_dpa1_graph_empty_subdomain() -> None: zero contribution from the empty rank — compared against a same-archive single-rank reference of the same fixture. """ - out_mpi = _run_mpi_subprocess(nprocs=2, data_path=data_file_empty_subdomain) + # Force ``processors 2 1 1`` so the split is along x at 15 and rank 1 is + # genuinely empty -- otherwise LAMMPS may auto-pick a grid where neither + # rank is empty and the branch under test is not exercised. + out_mpi = _run_mpi_subprocess( + nprocs=2, data_path=data_file_empty_subdomain, processors="2 1 1" + ) out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain) np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) np.testing.assert_allclose( From 0ea2c3435567610ae6e00f58aa540ab598fd912f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 15:25:09 +0800 Subject: [PATCH 26/33] feat(pt_expt): dp freeze --lower-kind {nlist,graph} for graph .pt2 export Add a user-facing entry point to the graph C++ inference path. 
Before this, the graph lower was reachable only via the internal API / gen_dpa1.py test fixture, so a user-frozen dpa1 .pt2 always used the dense (nlist) lower and the tested C++ graph path was unreachable from the CLI. - main.py: add --lower-kind {nlist,graph} to the freeze subparser (default nlist; PyTorch-Exportable backend only, same convention as --head/--node-names). - entrypoints/main.py: thread lower_kind into freeze() -> deserialize_to_file. Fail fast (ValueError) when 'graph' is requested for a non-graph-eligible model (reuses _model_uses_graph_lower; currently dpa1 attn_layer==0 only). Enable do_atomic_virial for the graph form -- near-free there (one extra scatter off the shared single backward). - test_dp_freeze.py: graph-eligibility rejection (se_e2_a) + a public-CLI graph freeze of dpa1(attn_layer=0) asserting metadata lower_input_kind=graph. Opt-in by design; auto-selecting graph for eligible models (mirroring training) is deferred until the graph path covers attention/angles/MP. Both tests pass. --- deepmd/main.py | 10 ++++ deepmd/pt_expt/entrypoints/main.py | 39 +++++++++++++++- source/tests/pt_expt/test_dp_freeze.py | 64 ++++++++++++++++++++++++++ 3 files changed, 111 insertions(+), 2 deletions(-) diff --git a/deepmd/main.py b/deepmd/main.py index 43f40dc214..0a4d44137a 100644 --- a/deepmd/main.py +++ b/deepmd/main.py @@ -350,6 +350,16 @@ def main_parser() -> argparse.ArgumentParser: type=str, help="(Supported backend: PyTorch) Task head (alias: model branch) to freeze if in multi-task mode.", ) + parser_frz.add_argument( + "--lower-kind", + default="nlist", + type=str, + choices=["nlist", "graph"], + help="(Supported backend: PyTorch Exportable) Lower-level export form of the " + "frozen .pt2: 'nlist' (default, dense neighbor-list lower) or 'graph' " + "(NeighborGraph edge-list lower; only for graph-eligible models, currently " + "dpa1 with attn_layer=0). 
'graph' selects the C++ graph inference path.", + ) # * test script ******************************************************************** parser_tst = subparsers.add_parser( diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py index da28229bf4..2567756578 100644 --- a/deepmd/pt_expt/entrypoints/main.py +++ b/deepmd/pt_expt/entrypoints/main.py @@ -387,6 +387,7 @@ def freeze( model: str, output: str = "frozen_model.pte", head: str | None = None, + lower_kind: str = "nlist", ) -> None: """Freeze a pt_expt checkpoint into a .pte exported model. @@ -398,6 +399,13 @@ def freeze( Path for the output .pte file. head : str or None Head to freeze in multi-task mode. + lower_kind : str + Lower-level export form: ``"nlist"`` (default, dense neighbor-list lower) + or ``"graph"`` (NeighborGraph edge-list lower). ``"graph"`` is only valid + for graph-eligible models (``mixed_types`` and ``uses_graph_lower``, + currently dpa1 with ``attn_layer == 0``) and selects the C++ graph + inference path; the per-atom virial is enabled for it (near-free in the + graph path: one extra scatter off the shared single backward). """ import torch @@ -458,12 +466,34 @@ def freeze( single_model_params = model_params m.eval() + + # The graph lower is opt-in and only valid for graph-eligible models (dpa1 + # attn_layer==0 today). Fail fast with a clear message rather than emitting a + # broken .pt2. Enable the per-atom virial for the graph form -- it is + # near-free there (one extra scatter off the single shared backward). + do_atomic_virial = False + if lower_kind == "graph": + from deepmd.pt_expt.train.training import ( + _model_uses_graph_lower, + ) + + if not _model_uses_graph_lower(m): + raise ValueError( + "lower_kind='graph' requires a graph-eligible model " + "(mixed_types and a descriptor exposing uses_graph_lower()==True, " + "currently dpa1 with attn_layer==0). Use lower_kind='nlist' for " + "this model." 
+ ) + do_atomic_virial = True + model_dict_serialized = m.serialize() deserialize_to_file( output, {"model": model_dict_serialized, "model_def_script": single_model_params}, + do_atomic_virial=do_atomic_virial, + lower_kind=lower_kind, ) - log.info("Saved frozen model to %s", output) + log.info("Saved frozen model to %s (lower_kind=%s)", output, lower_kind) def change_bias( @@ -703,7 +733,12 @@ def main(args: list[str] | argparse.Namespace | None = None) -> None: FLAGS.model = str(model_path) if not FLAGS.output.endswith((".pte", ".pt2")): FLAGS.output = str(Path(FLAGS.output).with_suffix(".pte")) - freeze(model=FLAGS.model, output=FLAGS.output, head=FLAGS.head) + freeze( + model=FLAGS.model, + output=FLAGS.output, + head=FLAGS.head, + lower_kind=getattr(FLAGS, "lower_kind", "nlist"), + ) elif FLAGS.command == "change-bias": change_bias( input_file=FLAGS.INPUT, diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py index 7c33f0de81..ebfac485ea 100644 --- a/source/tests/pt_expt/test_dp_freeze.py +++ b/source/tests/pt_expt/test_dp_freeze.py @@ -41,6 +41,36 @@ "data_stat_nbatch": 20, } +# dpa1 with attn_layer == 0 — the only graph-eligible model family today +# (mixed_types and uses_graph_lower()==True), used to exercise the +# ``freeze --lower-kind graph`` public-CLI path. 
+model_dpa1_graph = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, +} + class TestDPFreezePtExpt(unittest.TestCase): """Test dp freeze for the pt_expt backend.""" @@ -103,6 +133,40 @@ def test_freeze_default_suffix(self) -> None: expected = os.path.join(self.tmpdir, "frozen_default_suffix.pte") self.assertTrue(os.path.exists(expected)) + def test_freeze_graph_rejects_ineligible(self) -> None: + """``--lower-kind graph`` on a non-graph-eligible model (se_e2_a, + mixed_types=False) fails fast rather than emitting a broken .pt2. + """ + output = os.path.join(self.tmpdir, "frozen_graph_reject.pt2") + with self.assertRaises(ValueError): + freeze(model=self.ckpt_file, output=output, lower_kind="graph") + + def test_freeze_graph_dpa1(self) -> None: + """``freeze --lower-kind graph`` on a graph-eligible dpa1(attn_layer=0) + model produces a .pt2 whose metadata records the graph lower (the + user-facing entry point to the C++ graph inference path). 
+ """ + import json + import zipfile + + model_params = deepcopy(model_dpa1_graph) + model = get_model(model_params) + wrapper = ModelWrapper(model, model_params=model_params) + ckpt = os.path.join(self.tmpdir, "dpa1_graph.pt") + torch.save({"model": wrapper.state_dict()}, ckpt) + + output = os.path.join(self.tmpdir, "frozen_dpa1_graph.pt2") + freeze(model=ckpt, output=output, lower_kind="graph") + self.assertTrue(os.path.exists(output)) + + # the .pt2 is a zip; metadata.json must record the graph lower + with zipfile.ZipFile(output) as zf: + meta_name = next( + n for n in zf.namelist() if n.endswith("extra/metadata.json") + ) + metadata = json.loads(zf.read(meta_name)) + self.assertEqual(metadata["lower_input_kind"], "graph") + def test_freeze_pt2(self) -> None: """Freeze to .pt2 (AOTInductor) and verify the file is loadable.""" self.assertTrue(os.path.exists(self.shared_pt2)) From 7a50e60e2754437827e6e74c4a3aaed3b39e8724 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Tue, 30 Jun 2026 15:52:51 +0800 Subject: [PATCH 27/33] test(api_cc): add deeppot_dpa1_graph.pt2 to universal/variant battery Wire the dpa1 graph .pt2 (attn_layer=0, lower_input_kind=graph) into the parametrized VariantDeepPotTest as case dpa1_graph_ptexpt (Backend::PTExpt, ref deeppot_dpa1_graph.expected, pbc/nopbc, tol 1e-10/1e-4). Flags mirror dpa1_pytorch_pt2 exactly; all 19 enabled subtests pass on remote GPU (FiniteDifference, LmpNlist*, CutoffTwice, TypeSel, NoPbc*). The dedicated test_deeppot_dpa1_graph_ptexpt.cc is retained for graph-unique coverage. 
--- source/api_cc/tests/test_deeppot_universal.cc | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/source/api_cc/tests/test_deeppot_universal.cc b/source/api_cc/tests/test_deeppot_universal.cc index e0ee6fc8f4..1c599d7d33 100644 --- a/source/api_cc/tests/test_deeppot_universal.cc +++ b/source/api_cc/tests/test_deeppot_universal.cc @@ -143,6 +143,28 @@ std::vector variant_deeppot_cases() { /*supports_no_pbc_atomic=*/false, /*supports_no_pbc_lmp_nlist=*/true, /*supports_no_pbc_lmp_nlist_atomic=*/false}, + {"dpa1_graph_ptexpt", + Backend::PTExpt, + "../../tests/infer/deeppot_dpa1_graph.pt2", + /*convert_pbtxt=*/false, + nullptr, + nullptr, + "../../tests/infer/deeppot_dpa1_graph.expected", + "pbc", + "nopbc", + 1e-10, + 1e-4, + /*supports_float=*/true, + /*supports_finite_difference=*/true, + /*supports_lmp_nlist=*/true, + /*supports_lmp_nlist_atomic=*/true, + /*supports_lmp_nlist_cutoff_twice=*/true, + /*supports_lmp_nlist_type_sel=*/true, + /*supports_print_summary=*/true, + /*supports_no_pbc_simple=*/true, + /*supports_no_pbc_atomic=*/false, + /*supports_no_pbc_lmp_nlist=*/true, + /*supports_no_pbc_lmp_nlist_atomic=*/false}, {"dpa2_pytorch_pth", Backend::PyTorch, "../../tests/infer/deeppot_dpa2.pth", From b4c0b49f522997b8fd373ea1c4cefbf752a8e4a7 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 1 Jul 2026 08:43:48 +0800 Subject: [PATCH 28/33] =?UTF-8?q?fix(pt=5Fexpt):=20address=20iProzd=20revi?= =?UTF-8?q?ew=20=E2=80=94=20graph=20freeze=20defaults=20to=20.pt2=20+=20mo?= =?UTF-8?q?dulo-clamp=20invariant=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - entrypoints/main.py: 'dp --pt_expt freeze --lower-kind graph' now defaults a suffix-less output to .pt2 (the AOTI archive the C++ graph path consumes) instead of .pte; explicit .pte/.pt2 suffixes are preserved for both kinds. New test (freeze mocked) covers all four suffix/kind combinations. 
- test_edge_force_virial.py: add an invariant test that the in-bounds index clamp (% n_out) NEVER alters a real (edge_mask==True) edge -- a boundary real edge on node n_out-1 scatters unwrapped, and a masked guard edge with deliberately out-of-range indices contributes nothing; the result equals the real-edges-only reference. Directly answers iProzd's ask to prove the modulo cannot silently remap real edges (only zero-weight guard edges). Both new tests pass. --- deepmd/pt_expt/entrypoints/main.py | 9 ++++- .../common/dpmodel/test_edge_force_virial.py | 32 ++++++++++++++++ source/tests/pt_expt/test_dp_freeze.py | 37 +++++++++++++++++++ 3 files changed, 76 insertions(+), 2 deletions(-) diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py index 2567756578..5b3b74b6bd 100644 --- a/deepmd/pt_expt/entrypoints/main.py +++ b/deepmd/pt_expt/entrypoints/main.py @@ -731,13 +731,18 @@ def main(args: list[str] | argparse.Namespace | None = None) -> None: f"Checkpoint path '{model_path}' does not exist." ) FLAGS.model = str(model_path) + _lower_kind = getattr(FLAGS, "lower_kind", "nlist") if not FLAGS.output.endswith((".pte", ".pt2")): - FLAGS.output = str(Path(FLAGS.output).with_suffix(".pte")) + # Default suffix: .pt2 for the graph export (an AOTI .pt2 archive is + # what the C++ graph path consumes), .pte otherwise. Explicit user + # .pte / .pt2 suffixes are preserved for both. 
+ _default_suffix = ".pt2" if _lower_kind == "graph" else ".pte" + FLAGS.output = str(Path(FLAGS.output).with_suffix(_default_suffix)) freeze( model=FLAGS.model, output=FLAGS.output, head=FLAGS.head, - lower_kind=getattr(FLAGS, "lower_kind", "nlist"), + lower_kind=_lower_kind, ) elif FLAGS.command == "change-bias": change_bias( diff --git a/source/tests/common/dpmodel/test_edge_force_virial.py b/source/tests/common/dpmodel/test_edge_force_virial.py index fa84ef7ba4..722960f57a 100644 --- a/source/tests/common/dpmodel/test_edge_force_virial.py +++ b/source/tests/common/dpmodel/test_edge_force_virial.py @@ -97,6 +97,38 @@ def test_all_edges_masked_gives_zero(self) -> None: np.testing.assert_allclose(av, np.zeros((n, 3, 3))) np.testing.assert_allclose(vir, np.zeros((nf, 3, 3))) + def test_modulo_clamp_leaves_real_edges_unchanged(self) -> None: + # INVARIANT (iProzd review): the in-bounds index clamp (``% n_out``) that + # keeps the CUDA-exported scatter address legal must NEVER alter a real + # (edge_mask == True) edge -- only masked/out-of-range guard edges may be + # remapped, and those carry zero weight so remapping is harmless. Here a + # REAL edge sits on the boundary node ``n_out - 1`` (the largest valid + # index, where a wrong wrap would be visible) and a MASKED guard edge + # carries deliberately OUT-OF-RANGE indices (>= n_out) with nonzero g/vec. + # Correctness requires the result to equal the real-edges-only reference: + # the boundary real edge must land on node n_out-1 (not wrapped), and the + # out-of-range guard must contribute nothing. If real edges were ever + # remapped by the modulo (the shape-binding bug iProzd warned about), the + # boundary node's force/virial would be wrong and this test would fail. 
+ n_node = np.array([5], dtype=np.int64) # 1 frame, nodes 0..4 (n_out = 5) + # e0: real, src on the boundary node 4 -> node 0 ; e1: real, node 0 -> 4 + # e2: MASKED guard with out-of-range indices src=99, dst=77 (>= n_out) + edge_index = np.array([[4, 0, 99], [0, 4, 77]], dtype=np.int64) + edge_vec = np.array([[1.0, 0.0, 0.0], [-1.0, 0.0, 0.0], [9.0, 9.0, 9.0]]) + edge_mask = np.array([True, True, False]) + g = np.array([[0.5, 0.2, 0.0], [0.3, 0.0, 0.1], [7.0, 7.0, 7.0]]) + force, av, vir = edge_force_virial(g, edge_vec, edge_index, edge_mask, n_node) + + # reference: the SAME two real edges only (no guard edge at all) + ref_force, ref_av, ref_vir = edge_force_virial( + g[:2], edge_vec[:2], edge_index[:, :2], edge_mask[:2], n_node + ) + np.testing.assert_allclose(force, ref_force) + np.testing.assert_allclose(av, ref_av) + np.testing.assert_allclose(vir, ref_vir) + # explicit: the boundary real edge scattered its force to node 4 (unwrapped) + self.assertTrue(np.any(force[4] != 0.0)) + def test_ragged_multiframe_with_edge_and_node_padding(self) -> None: # MOST GENERAL case: 2 frames with DIFFERENT node counts (3 and 5) AND # different edge counts (2 and 3), masked guard EDGES, and a padded NODE diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py index ebfac485ea..9090e4978d 100644 --- a/source/tests/pt_expt/test_dp_freeze.py +++ b/source/tests/pt_expt/test_dp_freeze.py @@ -133,6 +133,43 @@ def test_freeze_default_suffix(self) -> None: expected = os.path.join(self.tmpdir, "frozen_default_suffix.pte") self.assertTrue(os.path.exists(expected)) + def test_freeze_output_suffix_by_lower_kind(self) -> None: + """main() defaults a suffix-less output to .pt2 for --lower-kind graph + and .pte for nlist, while preserving an explicit .pte/.pt2 (iProzd + review). freeze() is mocked so the suffix logic is checked without the + AOTInductor compile cost. 
+ """ + from unittest import mock + + cases = [ + ("graph", "out_g", None, ".pt2"), # graph, no suffix -> .pt2 + ("nlist", "out_n", None, ".pte"), # nlist, no suffix -> .pte + ("graph", "out_g_explicit", ".pte", ".pte"), # explicit .pte kept + ("nlist", "out_n_explicit", ".pt2", ".pt2"), # explicit .pt2 kept + ] + for lower_kind, stem, explicit, expected_suffix in cases: + with self.subTest(lower_kind=lower_kind, explicit=explicit): + name = stem + (explicit or "") + captured: dict = {} + + def _fake_freeze(model, output, head=None, lower_kind="nlist", **kw): + captured["output"] = output + captured["lower_kind"] = lower_kind + + flags = argparse.Namespace( + command="freeze", + checkpoint_folder=self.ckpt_file, + output=os.path.join(self.tmpdir, name), + head=None, + lower_kind=lower_kind, + log_level=2, + log_path=None, + ) + with mock.patch("deepmd.pt_expt.entrypoints.main.freeze", _fake_freeze): + main(flags) + self.assertTrue(captured["output"].endswith(expected_suffix)) + self.assertEqual(captured["lower_kind"], lower_kind) + def test_freeze_graph_rejects_ineligible(self) -> None: """``--lower-kind graph`` on a non-graph-eligible model (se_e2_a, mixed_types=False) fails fast rather than emitting a broken .pt2. From 4a76f6df9b89118095f554c824c7747d99f57ad6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 Jul 2026 00:45:58 +0000 Subject: [PATCH 29/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- source/tests/pt_expt/test_dp_freeze.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py index 9090e4978d..cdaa4a02c6 100644 --- a/source/tests/pt_expt/test_dp_freeze.py +++ b/source/tests/pt_expt/test_dp_freeze.py @@ -139,7 +139,9 @@ def test_freeze_output_suffix_by_lower_kind(self) -> None: review). 
freeze() is mocked so the suffix logic is checked without the AOTInductor compile cost. """ - from unittest import mock + from unittest import ( + mock, + ) cases = [ ("graph", "out_g", None, ".pt2"), # graph, no suffix -> .pt2 From c095e12db33c833427e5c55a528557a0986d6f69 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 1 Jul 2026 11:42:16 +0800 Subject: [PATCH 30/33] docs(dpmodel): document the graph edge-scatter modulo as the permanent fix + root cause Reframe the comment on the % n_out index guard: it is the PERMANENT fix, not an interim workaround. Records the GPU-confirmed root cause -- the dynamic-edge graph torch.export path traces the node count as several equal-but-distinct symbols tied only by aten._assert_scalar(Eq(...)) nodes, which _strip_shape_assertions neutralises wholesale (needed for export to trace), dropping those equalities so inductor can no longer prove idx < n_out and emits a device_assert. The upstream alternative (making the shared, spin-critical _strip_shape_assertions selective) is deliberately not taken -- it risks the torch.export bugs that helper bypasses and the spin .pt2 path. Comment-only. --- .../utils/neighbor_graph/derivatives.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py index affeb57ccc..599877d7e2 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py +++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py @@ -91,17 +91,29 @@ def edge_force_virial( # zero padding/guard contributions; cast mask to g's dtype (array-API pure, # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict) g = g_e * xp.astype(edge_mask[:, None], g_e.dtype) - # Wrap node indices into ``[0, n_out)`` so every scatter address is in-bounds. - # Real edges already have index < n_out (modulo is a no-op). 
Out-of-range - # indices CAN appear in the CUDA-compiled kernel: under dynamic-edge - # ``torch.export`` the scatter bound ``ks0 == n_out`` binds to a SMALLER - # symbol than the live node count at AOTI runtime, so a valid index trips - # ``tl.device_assert(idx < ks0)`` (a hard device-side assert on CUDA; benign - # on CPU, which does not bounds-check the address). Such edges carry ~zero - # ``w_edge`` (masked ``g`` + tiny ``g_e``), so wrapping them to another node - # is numerically harmless. Modulo is pure arithmetic => torch.export-safe, - # unlike ``xp.clip`` (SymInt bound breaks array_api_compat) and unlike a - # mask-multiply (misses ``edge_mask == 1`` out-of-range indices). + # Wrap node indices into ``[0, n_out)`` so every scatter address is provably + # in-bounds. For a well-formed graph every real edge already has + # ``index < n_out`` (== ``atype.shape[0]``), so this modulo is the IDENTITY on + # real edges (pinned by test_modulo_clamp_leaves_real_edges_unchanged) -- a + # correctness-preserving guard, not a value fixup. + # + # Why it is needed (root cause, GPU-confirmed): under the dynamic-edge graph + # ``torch.export`` path the node count is traced as several equal-but-distinct + # symbols (``atype.shape[0]``, ``fit_ret.shape[0]``, ...), tied only by + # ``aten._assert_scalar(Eq(...))`` nodes. ``_strip_shape_assertions`` + # (pt_expt/utils/serialization.py) neutralises ALL such asserts so export can + # trace -- which also drops those node-count equalities, so inductor can no + # longer prove the scatter index and its bound ``ks0 == n_out`` share a symbol + # and emits ``tl.device_assert(idx < ks0)`` (fatal on CUDA; unchecked on CPU, + # which is why all CPU dev/CI was green). ``% n_out`` discharges that guard + # unconditionally. 
This is the PERMANENT fix: the upstream alternative -- + # making the SHARED, spin-export-critical ``_strip_shape_assertions`` + # selective -- risks re-triggering the torch.export bugs it exists to bypass + # and the spin ``.pt2`` path, so it is deliberately NOT taken. + # + # Pure arithmetic => torch.export-safe, unlike ``xp.clip`` (SymInt bound + # breaks array_api_compat's clip) and unlike a mask-multiply (which misses the + # ``edge_mask == 1`` indices the stripped guard mis-bounds). src = edge_index[0] % n_out dst = edge_index[1] % n_out # force (output sized to the node axis, incl. any padding tail) From 3348c80e4de2d5c8dd4437d37bd8c1da6da4ec20 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 1 Jul 2026 14:36:45 +0800 Subject: [PATCH 31/33] refactor(pt_expt): consolidate graph trace/sample builders (OutisLi review) Merge the two near-identical synthetic-graph builders _make_graph_sample_inputs (serialization.py) and _make_graph_trace_inputs (training.py) into one build_synthetic_graph_inputs in serialization.py, parameterized by dtype (export=float64 ABI, training=GLOBAL_PT precision) and device (export passes cpu explicitly instead of mutating env.DEVICE) plus the want_fparam/aparam/charge_spin gating. Also factor the duplicated prime-collision _forbidden scan into _forbidden_dims_from_model, shared by the dense and graph _trace_and_compile. Removes the desync risk between the training and export graph traces flagged in review. 
--- deepmd/pt_expt/train/training.py | 188 +++++++------------------- deepmd/pt_expt/utils/serialization.py | 134 +++++++++--------- 2 files changed, 121 insertions(+), 201 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 409fcef48d..5068ac51af 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -274,6 +274,41 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]: # --------------------------------------------------------------------------- +def _forbidden_dims_from_model( + model: torch.nn.Module, + task_buf_vals: tuple[torch.Tensor, ...], +) -> set[int]: + """Prime-collision set for trace-dim selection. + + Collects every ``> 1`` dim of the model's parameters/buffers (so + ``_next_safe_prime`` never aliases an internal dim like ``g2_dim`` / + ``axis_neuron`` / ``attn_head`` without a hardcoded list), plus + ``dim_fparam``/``dim_aparam`` and the task-buffer dims. Shared by the dense + :func:`_trace_and_compile` and the graph :func:`_trace_and_compile_graph`; + each caller adds its path-specific dims (nall/nloc/nsel for dense, + charge_spin for both) on top of this base set. + """ + forbidden: set[int] = { + int(_d) + for _src in (model.parameters(), model.buffers()) + for _p in _src + for _d in _p.shape + if _d > 1 + } + for _getter in (model.get_dim_fparam, model.get_dim_aparam): + try: + _dim = _getter() + if _dim > 1: + forbidden.add(int(_dim)) + except Exception: + pass # best-effort: dim unavailable -> nothing to forbid + for _tbv in task_buf_vals: + for _d in _tbv.shape: + if _d > 1: + forbidden.add(int(_d)) + return forbidden + + def _trace_and_compile( model: torch.nn.Module, ext_coord: torch.Tensor, @@ -397,17 +432,11 @@ def fn( # large to alias with any architecture dim and need no adjustment. 
# # The prime for nf is chosen by enumerating every dimension that appears - # in the model's parameters and buffers, then calling _next_safe_prime to - # find the first prime that doesn't collide with any of them. This - # catches internal dims like g2_dim, axis_neuron, attn_head, etc. - # without requiring a hardcoded list. - _forbidden: set[int] = { - int(_d) - for _src in (model.parameters(), model.buffers()) - for _p in _src - for _d in _p.shape - if _d > 1 - } + # in the model's parameters and buffers (see _forbidden_dims_from_model), + # then calling _next_safe_prime to find the first prime that doesn't collide + # with any of them -- catching internal dims like g2_dim/axis_neuron/ + # attn_head without a hardcoded list. Add the dense-path dims on top. + _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace) # Also add the real nloc and nall so trace_nf never aliases them. _forbidden.add(int(ext_coord.shape[1])) # nall _forbidden.add(int(ext_atype.shape[1])) # nall (same tensor, defensive) @@ -416,26 +445,10 @@ def fn( _nsel = int(nlist.shape[2]) if _nsel > 1: _forbidden.add(_nsel) - try: - _dim_fp = model.get_dim_fparam() - if _dim_fp > 1: - _forbidden.add(_dim_fp) - except Exception: - pass # best-effort: dim_fparam unavailable -> nothing to forbid - try: - _dim_ap = model.get_dim_aparam() - if _dim_ap > 1: - _forbidden.add(_dim_ap) - except Exception: - pass # best-effort: dim_aparam unavailable -> nothing to forbid if charge_spin is not None: _dim_cs = int(charge_spin.shape[1]) if _dim_cs > 1: _forbidden.add(_dim_cs) - for _tbv in task_buf_vals_trace: - for _d in _tbv.shape: - if _d > 1: - _forbidden.add(int(_d)) trace_nf = _next_safe_prime(5, _forbidden) @@ -653,33 +666,12 @@ def _trace_and_compile_graph( # * nf = n_node.shape[0] (per-frame reductions) # * N = atype.shape[0] (flat node axis = sum(n_node)) # * E = edge_vec.shape[0] (edge axis) - # They are chosen as collision-free primes vs every parameter/buffer dim. 
+ # They are chosen as collision-free primes vs every parameter/buffer dim + # (see _forbidden_dims_from_model) plus charge_spin. # ------------------------------------------------------------------ - _forbidden: set[int] = { - int(_d) - for _src in (model.parameters(), model.buffers()) - for _p in _src - for _d in _p.shape - if _d > 1 - } - try: - _dim_fp = model.get_dim_fparam() - if _dim_fp > 1: - _forbidden.add(_dim_fp) - except Exception: - pass # best-effort: dim_fparam unavailable -> nothing to forbid - try: - _dim_ap = model.get_dim_aparam() - if _dim_ap > 1: - _forbidden.add(_dim_ap) - except Exception: - pass # best-effort: dim_aparam unavailable -> nothing to forbid + _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace) if charge_spin is not None and charge_spin.shape[-1] > 1: _forbidden.add(int(charge_spin.shape[-1])) - for _tbv in task_buf_vals_trace: - for _d in _tbv.shape: - if _d > 1: - _forbidden.add(int(_d)) trace_nf = _next_safe_prime(5, _forbidden) # nloc such that N = trace_nf * nloc is collision-free (and != trace_nf). @@ -692,11 +684,18 @@ def _trace_and_compile_graph( e_max_base = max(math.ceil(1.25 * nloc_trace * nnei), 7) e_max = _next_safe_prime(e_max_base, _forbidden | {trace_nf, trace_N}) - sample = _make_graph_trace_inputs( + # Shared with the .pt2 export trace (serialization.py) so the two graph + # traces can never desync on the input schema. Training uses the run-time + # float precision and device; optional tensors match the actual call. 
+ from deepmd.pt_expt.utils.serialization import build_synthetic_graph_inputs + + sample = build_synthetic_graph_inputs( model, e_max=e_max, nframes=trace_nf, nloc=nloc_trace, + dtype=GLOBAL_PT_FLOAT_PRECISION, + device=DEVICE, want_fparam=fparam is not None, want_aparam=aparam is not None, want_charge_spin=charge_spin is not None, @@ -804,91 +803,6 @@ def fn( ) -def _make_graph_trace_inputs( - model: torch.nn.Module, - e_max: int, - nframes: int, - nloc: int, - *, - want_fparam: bool, - want_aparam: bool, - want_charge_spin: bool, -) -> tuple[torch.Tensor | None, ...]: - """Build a synthetic carry-all NeighborGraph for the graph-compile trace. - - Returns positional tensors in the order - ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, - charge_spin)`` matching ``forward_common_lower_graph``. The edge axis is - padded to the STATIC ``e_max`` (masked) so its concrete value is a chosen - prime; ``fparam`` / ``aparam`` / ``charge_spin`` are emitted only when the - model+data path actually carries them (``want_*``), so the traced branch - matches the run-time call. 
- """ - from deepmd.dpmodel.utils.neighbor_graph import ( - GraphLayout, - build_neighbor_graph, - ) - - rcut = model.get_rcut() - ntypes = len(model.get_type_map()) - dim_fparam = model.get_dim_fparam() - dim_aparam = model.get_dim_aparam() - - box_size = rcut * 3.0 - box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9) - rng = np.random.default_rng(42) - coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25 - atype_np = np.zeros((nframes, nloc), dtype=np.int64) - for i in range(nloc): - atype_np[:, i] = i % ntypes - - coord_t = torch.tensor(coord_np, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) - atype_t = torch.tensor(atype_np, dtype=torch.int64, device=DEVICE) - box_t = torch.tensor( - np.tile(box_np, (nframes, 1)), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE - ) - - graph = build_neighbor_graph( - coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max) - ) - - s_atype = atype_t.reshape(-1) - s_n_node = graph.n_node - s_edge_index = graph.edge_index - s_edge_vec = graph.edge_vec - s_edge_mask = graph.edge_mask - - s_fparam = ( - torch.zeros(nframes, dim_fparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) - if (want_fparam and dim_fparam > 0) - else None - ) - s_aparam = ( - torch.zeros( - nframes, nloc, dim_aparam, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE - ) - if (want_aparam and dim_aparam > 0) - else None - ) - dim_cs = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0 - s_charge_spin = ( - torch.zeros(nframes, dim_cs, dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) - if (want_charge_spin and dim_cs > 0) - else None - ) - - return ( - s_atype, - s_n_node, - s_edge_index, - s_edge_vec, - s_edge_mask, - s_fparam, - s_aparam, - s_charge_spin, - ) - - class _CompiledModel(torch.nn.Module): """Coord extension (eager) -> compiled forward_lower (dynamic shapes). 
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index f4b538df75..e0c08a6f84 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -315,32 +315,57 @@ def _make_sample_inputs( return ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin -def _make_graph_sample_inputs( +def build_synthetic_graph_inputs( model: torch.nn.Module, e_max: int, nframes: int = 2, nloc: int = 7, + *, + dtype: torch.dtype, + device: torch.device | None = None, + want_fparam: bool = True, + want_aparam: bool = True, + want_charge_spin: bool = True, ) -> tuple[torch.Tensor | None, ...]: - """Create sample inputs for tracing ``forward_lower_graph``. + """Build a synthetic carry-all ``NeighborGraph`` for graph-lower tracing. - Builds a small random system, runs the carry-all + Single source of the trace-time graph inputs, shared by ``.pt2`` export + (:func:`_trace_and_export`) and compiled training + (:func:`deepmd.pt_expt.train.training._trace_and_compile_graph`), so the two + traces can never desync on the graph input schema. Builds a small random + system, runs the carry-all :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph` with a - STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked - static edge axis), and returns tensors in the positional order expected by - :meth:`forward_lower_graph_exportable`: + STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked static + edge axis), and returns tensors in the positional order expected by + ``forward_(common_)lower_graph``: ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin)``. + The system (``rng(42)``, ``box = rcut*3``, centered coords, ``atype[:, i] = + i % ntypes``) is identical for both callers; the only two former differences + are now parameters. + Parameters ---------- model : torch.nn.Module The pt_expt energy model (must expose ``get_rcut``/``get_type_map``/...). 
e_max : int - Static edge capacity ``E`` to pad the edge axis to. + Static edge capacity ``E`` to pad the (masked) edge axis to. nframes : int Number of frames in the sample system. nloc : int Number of local atoms per frame (``N == nframes * nloc``). + dtype : torch.dtype + Float precision of ``coord``/``edge_vec``/``fparam``/... . The exported + ``.pt2`` is float64-only (C++ ABI); training passes + ``GLOBAL_PT_FLOAT_PRECISION``. + device : torch.device, optional + Target device. Defaults to ``deepmd.pt_expt.utils.env.DEVICE``; the + export path passes ``cpu`` explicitly (make_fx traces on CPU). + want_fparam, want_aparam, want_charge_spin : bool + Whether to emit the optional conditioning tensor when its ``dim > 0``. + Export passes the defaults (``True`` = include if present); training + passes ``x is not None`` so the traced branch matches the run-time call. """ import deepmd.pt_expt.utils.env as _env from deepmd.dpmodel.utils.neighbor_graph import ( @@ -348,74 +373,53 @@ def _make_graph_sample_inputs( build_neighbor_graph, ) + if device is None: + device = _env.DEVICE + rcut = model.get_rcut() ntypes = len(model.get_type_map()) dim_fparam = model.get_dim_fparam() dim_aparam = model.get_dim_aparam() + dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0 - # Box large enough to avoid PBC degeneracy; mirrors _make_sample_inputs. + # Box large enough to avoid PBC degeneracy; centered coords. 
box_size = rcut * 3.0 - box = np.eye(3, dtype=np.float64) * box_size - box_np = box.reshape(1, 9) - + box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9) rng = np.random.default_rng(42) - coord_np = rng.random((nframes, nloc, 3), dtype=np.float64) * box_size * 0.5 - coord_np += box_size * 0.25 # center in box - + coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25 atype_np = np.zeros((nframes, nloc), dtype=np.int64) for i in range(nloc): atype_np[:, i] = i % ntypes + coord_t = torch.tensor(coord_np, dtype=dtype, device=device) + atype_t = torch.tensor(atype_np, dtype=torch.int64, device=device) + box_t = torch.tensor(np.tile(box_np, (nframes, 1)), dtype=dtype, device=device) graph = build_neighbor_graph( - coord_np, - atype_np, - np.tile(box_np, (nframes, 1)), - rcut, - layout=GraphLayout(edge_capacity=e_max), + coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max) ) - atype_t = torch.tensor(atype_np.reshape(-1), dtype=torch.int64, device=_env.DEVICE) - n_node_t = torch.tensor( - np.asarray(graph.n_node), dtype=torch.int64, device=_env.DEVICE - ) - edge_index_t = torch.tensor( - np.asarray(graph.edge_index), dtype=torch.int64, device=_env.DEVICE + fparam = ( + torch.zeros(nframes, dim_fparam, dtype=dtype, device=device) + if (want_fparam and dim_fparam > 0) + else None ) - edge_vec_t = torch.tensor( - np.asarray(graph.edge_vec), dtype=torch.float64, device=_env.DEVICE + aparam = ( + torch.zeros(nframes, nloc, dim_aparam, dtype=dtype, device=device) + if (want_aparam and dim_aparam > 0) + else None ) - edge_mask_t = torch.tensor( - np.asarray(graph.edge_mask), dtype=torch.bool, device=_env.DEVICE + charge_spin = ( + torch.zeros(nframes, dim_chg_spin, dtype=dtype, device=device) + if (want_charge_spin and dim_chg_spin > 0) + else None ) - if dim_fparam > 0: - fparam = torch.zeros( - nframes, dim_fparam, dtype=torch.float64, device=_env.DEVICE - ) - else: - fparam = None - - if dim_aparam > 0: - aparam = 
torch.zeros( - nframes, nloc, dim_aparam, dtype=torch.float64, device=_env.DEVICE - ) - else: - aparam = None - - dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0 - if dim_chg_spin > 0: - charge_spin = torch.zeros( - nframes, dim_chg_spin, dtype=torch.float64, device=_env.DEVICE - ) - else: - charge_spin = None - return ( - atype_t, - n_node_t, - edge_index_t, - edge_vec_t, - edge_mask_t, + atype_t.reshape(-1), + graph.n_node, + graph.edge_index, + graph.edge_vec, + graph.edge_mask, fparam, aparam, charge_spin, @@ -910,14 +914,16 @@ def _trace_and_export( nnei = sum(model.get_sel()) e_sample = math.ceil(1.25 * nloc_sample * nnei) - _orig_device = _env.DEVICE - _env.DEVICE = torch.device("cpu") - try: - sample_inputs = _make_graph_sample_inputs( - model, e_max=e_sample, nframes=2, nloc=nloc_sample - ) - finally: - _env.DEVICE = _orig_device + # make_fx traces on CPU; the .pt2 C++ ABI is float64-only. Pass device + # and dtype explicitly instead of mutating the module-level env.DEVICE. + sample_inputs = build_synthetic_graph_inputs( + model, + e_max=e_sample, + nframes=2, + nloc=nloc_sample, + dtype=torch.float64, + device=torch.device("cpu"), + ) ( atype_g, From 282f6410ebcb2583203ffda8b3536d6c5faf9d4f Mon Sep 17 00:00:00 2001 From: Han Wang Date: Wed, 1 Jul 2026 14:45:55 +0800 Subject: [PATCH 32/33] perf(api_cc): cache graph edge topology across steps + guard empty rank (OutisLi review) Comment 1 (perf): the graph LAMMPS path called buildGraphTensors every timestep, whose createEdgeTensors stage is an O(E) host loop + H2D copy that rebuilds the edge topology from scratch even when LAMMPS has not refreshed the neighbor list. 
Mirror the edge path: at ago==0 cache the skin topology via createEdgeTensors(with_geometry=false, fold_to_local=!multi_rank) into edge_index_tensor / edge_index_ext_tensor; each step run only the on-device compactEdgeTensors (geometry recompute + rcut filter) and assemble the cheap n_node / node-atype tensors. Topology now rebuilds only on a neighbor-list rebuild, consistent with the edge path. Comment 2 (empty-rank nit): a truly-empty rank (nall_real == 0, no local atoms AND no ghosts) would feed N == 0 into the graph, and edge_force_virial's edge_index % node_capacity would SIGFPE (div-by-zero). Early-return zero outputs for that rank. The tested nloc==0 empty-subdomain case has nall_real>0 (ghosts within rcut) and still runs normally. Validated: 29/33 graph gtests (4 pre-existing NoPBC skips) + 5/5 test_lammps_dpa1_graph_pt2 (single-rank multi-step ref, multi-rank, mpirun -n 2 == single-rank, empty-subdomain). --- source/api_cc/src/DeepPotPTExpt.cc | 78 +++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 49e7ed0662..5d9b081a34 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -554,8 +554,13 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); - // Rebuild mapping vector and tensor (cached as members; graph branch reads - // mapping_ on every step, not just ago==0, so the vector must persist). + // Rebuild mapping vector and tensor (cached as members). ``mapping_tensor`` + // is consumed every step by the dense ``run_model`` (ghost-feature gather); + // the ``mapping_`` vector is read only here at ago==0 -- to build that + // tensor and, for the edge/graph paths, to fold ghost neighbours onto their + // local owners inside ``createEdgeTensors``. 
(The graph path used to read + // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now caches + // the topology at ago==0 like the edge/dense paths, so no per-step read.) if (lmp_list.mapping) { mapping_.resize(nall_real); for (int ii = 0; ii < nall_real; ii++) { @@ -602,10 +607,19 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_index_tensor = edge_tensors.edge_index; edge_index_ext_tensor = edge_tensors.edge_index_ext; } else if (lower_input_is_graph_) { - // Graph schema rebuilds the edge topology on-device every step inside - // buildGraphTensors (from the raw, unpadded nlist_data.jlist + - // nlist_data.ilist centers), so nothing is cached here and the nlist is - // left unpadded (createEdgeTensors handles ragged rows and skips -1). + // Cache only the real skin topology, exactly like the edge path: the + // geometry (edge_vec) + rcut filter are recomputed on-device every step + // by compactEdgeTensors, so the O(E) host loop + H2D copy in + // createEdgeTensors runs ONLY on a LAMMPS nlist rebuild (ago==0), not + // every step. Single-rank folds ghosts onto local owners + // (fold_to_local=true); non-MP multi-rank keeps the extended region + // (fold_to_local=false) so ghost forces reverse-comm to their owners. 
+ const auto edge_tensors = createEdgeTensors( + nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device, + /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist, + /*fold_to_local=*/!multi_rank); + edge_index_tensor = edge_tensors.edge_index; + edge_index_ext_tensor = edge_tensors.edge_index_ext; } else { nlist_data.padding(); firstneigh_tensor = createNlistTensor(nlist_data.jlist, nnei) @@ -822,19 +836,47 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_tensors.edge_index_ext, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); } else if (lower_input_is_graph_) { - // NeighborGraph schema: build (atype, n_node, edge_index, edge_vec, - // edge_mask) from the host nlist and run the (single-rank) graph - // artifact. Single-rank folds ghosts onto local owners (N == nloc); - // multi-rank (non-MP only — the fail-fast above blocks MP graph - // multi-rank) keeps the extended region (N == nall_real, node types from - // the real halo types) so LAMMPS reverse-comm folds ghost forces back. - const auto graph_tensors = buildGraphTensors( - nlist_data.jlist, dcoord, datype, mapping_, nloc, nall_real, - static_cast(rcut), device, &nlist_data.ilist, - /*fold_to_local=*/!multi_rank); + if (nall_real == 0) { + // Truly-empty rank (no local atoms AND no ghosts): the graph would emit + // N == 0 nodes, and edge_force_virial's ``edge_index % node_capacity`` + // would divide by zero (SIGFPE) -- it also violates the exported + // ``Dim("n_node_total", min=1)``. Such a rank contributes nothing, so + // fill zero outputs and return instead of running the model. (The + // tested ``nloc == 0`` empty-subdomain case has ``nall_real > 0`` -- + // ghosts within rcut -- so it still runs the model normally.) 
+ ener.assign(nframes, static_cast(0)); + force.assign(static_cast(nframes) * fwd_map.size() * 3, + static_cast(0)); + virial.assign(static_cast(nframes) * 9, + static_cast(0)); + if (atomic) { + atom_energy.assign(static_cast(nframes) * fwd_map.size(), + static_cast(0)); + atom_virial.assign(static_cast(nframes) * fwd_map.size() * 9, + static_cast(0)); + } + return; + } + // NeighborGraph schema: recompute geometry + rcut filter on-device from + // the cached skin topology (edge_index[_ext]_tensor built at ago==0), + // then assemble the cheap node tensors. Mirrors the edge path -- no + // per-step host rebuild / H2D copy. Single-rank folds ghosts onto local + // owners (N == nloc); multi-rank (non-MP only — the fail-fast above blocks + // MP graph multi-rank) keeps the extended region (N == nall_real, node + // types from the real halo types) so LAMMPS reverse-comm folds ghost + // forces back. The node types come from the on-device extended + // atype_Tensor slice (== atype_ext[0:N]); n_node is a 1-element tensor. + const auto edge_tensors = + compactEdgeTensors(edge_index_tensor, edge_index_ext_tensor, + coord_Tensor, static_cast(rcut)); + const std::int64_t n_node_count = multi_rank ? 
nall_real : nloc; + at::Tensor n_node_tensor = + torch::full({1}, n_node_count, int_option).to(device); + at::Tensor node_atype = + atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count}); flat_outputs = run_model_graph( - graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index, - graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor, + node_atype, n_node_tensor, edge_tensors.edge_index, + edge_tensors.edge_vec, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); } else { flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor, From 70b02fef68a0b1a362f4270f24df0b865124b64e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 1 Jul 2026 06:55:25 +0000 Subject: [PATCH 33/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt_expt/train/training.py | 4 +++- source/api_cc/src/DeepPotPTExpt.cc | 21 +++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index 5068ac51af..4ab43fe329 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -687,7 +687,9 @@ def _trace_and_compile_graph( # Shared with the .pt2 export trace (serialization.py) so the two graph # traces can never desync on the input schema. Training uses the run-time # float precision and device; optional tensors match the actual call. 
- from deepmd.pt_expt.utils.serialization import build_synthetic_graph_inputs + from deepmd.pt_expt.utils.serialization import ( + build_synthetic_graph_inputs, + ) sample = build_synthetic_graph_inputs( model, diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 5d9b081a34..315f4cc39b 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -554,13 +554,14 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); - // Rebuild mapping vector and tensor (cached as members). ``mapping_tensor`` + // Rebuild mapping vector and tensor (cached as members). ``mapping_tensor`` // is consumed every step by the dense ``run_model`` (ghost-feature gather); // the ``mapping_`` vector is read only here at ago==0 -- to build that // tensor and, for the edge/graph paths, to fold ghost neighbours onto their // local owners inside ``createEdgeTensors``. (The graph path used to read - // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now caches - // the topology at ago==0 like the edge/dense paths, so no per-step read.) + // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now + // caches the topology at ago==0 like the edge/dense paths, so no per-step + // read.) if (lmp_list.mapping) { mapping_.resize(nall_real); for (int ii = 0; ii < nall_real; ii++) { @@ -861,9 +862,9 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // the cached skin topology (edge_index[_ext]_tensor built at ago==0), // then assemble the cheap node tensors. Mirrors the edge path -- no // per-step host rebuild / H2D copy. 
Single-rank folds ghosts onto local - // owners (N == nloc); multi-rank (non-MP only — the fail-fast above blocks - // MP graph multi-rank) keeps the extended region (N == nall_real, node - // types from the real halo types) so LAMMPS reverse-comm folds ghost + // owners (N == nloc); multi-rank (non-MP only — the fail-fast above + // blocks MP graph multi-rank) keeps the extended region (N == nall_real, + // node types from the real halo types) so LAMMPS reverse-comm folds ghost // forces back. The node types come from the on-device extended // atype_Tensor slice (== atype_ext[0:N]); n_node is a 1-element tensor. const auto edge_tensors = @@ -874,10 +875,10 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, torch::full({1}, n_node_count, int_option).to(device); at::Tensor node_atype = atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count}); - flat_outputs = run_model_graph( - node_atype, n_node_tensor, edge_tensors.edge_index, - edge_tensors.edge_vec, edge_tensors.edge_mask, fparam_tensor, - aparam_tensor, charge_spin_tensor); + flat_outputs = + run_model_graph(node_atype, n_node_tensor, edge_tensors.edge_index, + edge_tensors.edge_vec, edge_tensors.edge_mask, + fparam_tensor, aparam_tensor, charge_spin_tensor); } else { flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor, fparam_tensor, aparam_tensor,