diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 27e2d68bfc..3f49b6c04f 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -757,9 +757,13 @@ def call_graph( ) # FLAT node axis (N, ...): no (nf, nloc) reshape -- ragged-native, spec. if self.concat_output_tebd: - tebd = xp.asarray(type_embedding, device=dev) + # Use type_embedding directly (mirrors the dense path's + # ``xp.take(type_embedding, ...)``): ``xp.asarray(..., device=dev)`` + # DETACHES under torch, silently severing the type-embedding weight + # gradient so the tebd net never trains; type_embedding already lives + # on the model device, so the device cast was redundant anyway. atype_local = xp.asarray(atype, device=dev) - atype_embd = xp.take(tebd, atype_local, axis=0) # (N, tebd_dim) + atype_embd = xp.take(type_embedding, atype_local, axis=0) # (N, tebd_dim) grrg = xp.concat([grrg, atype_embd], axis=-1) return grrg, rot_mat @@ -1523,7 +1527,10 @@ def call_graph( ss = rr[:, 0:1] # (E, 1) # neighbor / center type embeddings (concat mode); ghost type == owner type # so gathering by the LOCAL owner (src) reproduces the dense neighbor tebd. - tebd = xp.asarray(type_embedding, device=dev) + # NB: do NOT wrap in ``xp.asarray(..., device=dev)`` -- that DETACHES under + # torch and severs the type-embedding weight gradient (the tebd net would + # never train); type_embedding already lives on the model device. + tebd = type_embedding atype_embd_nlist = xp.take(tebd, nei_type, axis=0) # (E, tebd_dim) if not self.type_one_side: atype_embd_nnei = xp.take(tebd, center_type, axis=0) # (E, tebd_dim) diff --git a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py index 494e97a0c9..599877d7e2 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/derivatives.py +++ b/deepmd/dpmodel/utils/neighbor_graph/derivatives.py @@ -80,15 +80,42 @@ def edge_force_virial( frame via the frame of their ``dst`` node. """ xp = array_api_compat.array_namespace(g_e) - # node-axis size; when a static ``node_capacity`` is supplied (the jax/export - # path) short-circuit so we never call int() on the traced ``sum(n_node)``. - n_out = int(node_capacity) if node_capacity is not None else int(xp.sum(n_node)) + # node-axis size; when a ``node_capacity`` is supplied (the jax/export path) + # use it AS-IS so we never call int() on the traced ``sum(n_node)`` -- and, + # crucially, never on ``node_capacity`` itself: under symbolic make_fx / + # torch.export it is a SymInt (``atype.shape[0]``); ``int(SymInt)`` would + # SPECIALIZE the node axis to the trace-time sample size, baking a constant + # ``N`` into the scatter and breaking dynamic-``N`` inference. + n_out = node_capacity if node_capacity is not None else int(xp.sum(n_node)) nf = n_node.shape[0] # zero padding/guard contributions; cast mask to g's dtype (array-API pure, # CLAUDE.md mask-multiply guideline — avoids bool*float under array_api_strict) g = g_e * xp.astype(edge_mask[:, None], g_e.dtype) - src = edge_index[0] - dst = edge_index[1] + # Wrap node indices into ``[0, n_out)`` so every scatter address is provably + # in-bounds. For a well-formed graph every real edge already has + # ``index < n_out`` (== ``atype.shape[0]``), so this modulo is the IDENTITY on + # real edges (pinned by test_modulo_clamp_leaves_real_edges_unchanged) -- a + # correctness-preserving guard, not a value fixup. + # + # Why it is needed (root cause, GPU-confirmed): under the dynamic-edge graph + # ``torch.export`` path the node count is traced as several equal-but-distinct + # symbols (``atype.shape[0]``, ``fit_ret.shape[0]``, ...), tied only by + # ``aten._assert_scalar(Eq(...))`` nodes. ``_strip_shape_assertions`` + # (pt_expt/utils/serialization.py) neutralises ALL such asserts so export can + # trace -- which also drops those node-count equalities, so inductor can no + # longer prove the scatter index and its bound ``ks0 == n_out`` share a symbol + # and emits ``tl.device_assert(idx < ks0)`` (fatal on CUDA; unchecked on CPU, + # which is why all CPU dev/CI was green). ``% n_out`` discharges that guard + # unconditionally. This is the PERMANENT fix: the upstream alternative -- + # making the SHARED, spin-export-critical ``_strip_shape_assertions`` + # selective -- risks re-triggering the torch.export bugs it exists to bypass + # and the spin ``.pt2`` path, so it is deliberately NOT taken. + # + # Pure arithmetic => torch.export-safe, unlike ``xp.clip`` (SymInt bound + # breaks array_api_compat's clip) and unlike a mask-multiply (which misses the + # ``edge_mask == 1`` indices the stripped guard mis-bounds). + src = edge_index[0] % n_out + dst = edge_index[1] % n_out # force (output sized to the node axis, incl. any padding tail) force = segment_sum(g, dst, n_out) - segment_sum(g, src, n_out) # per-edge virial w_e[k, j] = -g_e[k] * edge_vec[j] (broadcast, no einsum) @@ -101,6 +128,8 @@ def edge_force_virial( boundaries = xp.cumulative_sum(n_node) # (nf,) per-frame node upper bounds edge_frame = xp.astype( xp.searchsorted(boundaries, dst, side="right"), xp.int64 - ) # (E,) in [0, nf) + ) # (E,) in [0, nf] + # wrap into [0, nf) for the same CUDA-bounds reason (export-safe modulo) + edge_frame = edge_frame % nf virial = segment_sum(w_edge, edge_frame, nf) # (nf, 3, 3) return force, atom_virial, virial diff --git a/deepmd/dpmodel/utils/neighbor_graph/graph.py b/deepmd/dpmodel/utils/neighbor_graph/graph.py index e527a84bf0..0ce10efdf6 100644 --- a/deepmd/dpmodel/utils/neighbor_graph/graph.py +++ b/deepmd/dpmodel/utils/neighbor_graph/graph.py @@ -153,13 +153,18 @@ def frame_id_from_n_node(n_node: Array, n_total: int | None = None) -> Array: dev = array_api_compat.device(n_node) if n_total is None: n_total = int(xp.sum(n_node)) - nf = n_node.shape[0] idx = xp.arange(n_total, dtype=n_node.dtype, device=dev) boundaries = xp.cumulative_sum(n_node) # (nf,) upper bounds, exclusive frame_id = xp.astype(xp.searchsorted(boundaries, idx, side="right"), xp.int64) # padding nodes (idx >= sum(n_node)) land at frame ``nf`` (OOB); clamp them to # the last real frame so the per-frame scatter never indexes out of range. - return xp.minimum(frame_id, xp.asarray(nf - 1, dtype=xp.int64, device=dev)) + # Derive ``nf - 1`` as a RUNTIME 0-d tensor (sum of ones over the frame axis) + # rather than ``xp.asarray(n_node.shape[0] - 1)``: under symbolic make_fx / + # torch.export, ``shape[0]`` is a SymInt and materializing it into a constant + # tensor SPECIALIZES the frame axis -- baking the trace-time frame count into + # every downstream per-frame reduction and breaking dynamic-``nf`` inference. + last_frame = xp.sum(xp.ones_like(n_node)) - 1 # 0-d int == nf - 1 + return xp.minimum(frame_id, xp.astype(last_frame, xp.int64)) def node_validity_mask(n_node: Array, n_total: int) -> Array: diff --git a/deepmd/main.py b/deepmd/main.py index 43f40dc214..0a4d44137a 100644 --- a/deepmd/main.py +++ b/deepmd/main.py @@ -350,6 +350,16 @@ def main_parser() -> argparse.ArgumentParser: type=str, help="(Supported backend: PyTorch) Task head (alias: model branch) to freeze if in multi-task mode.", ) + parser_frz.add_argument( + "--lower-kind", + default="nlist", + type=str, + choices=["nlist", "graph"], + help="(Supported backend: PyTorch Exportable) Lower-level export form of the " + "frozen .pt2: 'nlist' (default, dense neighbor-list lower) or 'graph' " + "(NeighborGraph edge-list lower; only for graph-eligible models, currently " + "dpa1 with attn_layer=0). 'graph' selects the C++ graph inference path.", + ) # * test script ******************************************************************** parser_tst = subparsers.add_parser( diff --git a/deepmd/pt_expt/entrypoints/main.py b/deepmd/pt_expt/entrypoints/main.py index da28229bf4..5b3b74b6bd 100644 --- a/deepmd/pt_expt/entrypoints/main.py +++ b/deepmd/pt_expt/entrypoints/main.py @@ -387,6 +387,7 @@ def freeze( model: str, output: str = "frozen_model.pte", head: str | None = None, + lower_kind: str = "nlist", ) -> None: """Freeze a pt_expt checkpoint into a .pte exported model. @@ -398,6 +399,13 @@ def freeze( Path for the output .pte file. head : str or None Head to freeze in multi-task mode. + lower_kind : str + Lower-level export form: ``"nlist"`` (default, dense neighbor-list lower) + or ``"graph"`` (NeighborGraph edge-list lower). ``"graph"`` is only valid + for graph-eligible models (``mixed_types`` and ``uses_graph_lower``, + currently dpa1 with ``attn_layer == 0``) and selects the C++ graph + inference path; the per-atom virial is enabled for it (near-free in the + graph path: one extra scatter off the shared single backward). """ import torch @@ -458,12 +466,34 @@ def freeze( single_model_params = model_params m.eval() + + # The graph lower is opt-in and only valid for graph-eligible models (dpa1 + # attn_layer==0 today). Fail fast with a clear message rather than emitting a + # broken .pt2. Enable the per-atom virial for the graph form -- it is + # near-free there (one extra scatter off the single shared backward). + do_atomic_virial = False + if lower_kind == "graph": + from deepmd.pt_expt.train.training import ( + _model_uses_graph_lower, + ) + + if not _model_uses_graph_lower(m): + raise ValueError( + "lower_kind='graph' requires a graph-eligible model " + "(mixed_types and a descriptor exposing uses_graph_lower()==True, " + "currently dpa1 with attn_layer==0). Use lower_kind='nlist' for " + "this model." + ) + do_atomic_virial = True + model_dict_serialized = m.serialize() deserialize_to_file( output, {"model": model_dict_serialized, "model_def_script": single_model_params}, + do_atomic_virial=do_atomic_virial, + lower_kind=lower_kind, ) - log.info("Saved frozen model to %s", output) + log.info("Saved frozen model to %s (lower_kind=%s)", output, lower_kind) def change_bias( @@ -701,9 +731,19 @@ def main(args: list[str] | argparse.Namespace | None = None) -> None: f"Checkpoint path '{model_path}' does not exist." ) FLAGS.model = str(model_path) + _lower_kind = getattr(FLAGS, "lower_kind", "nlist") if not FLAGS.output.endswith((".pte", ".pt2")): - FLAGS.output = str(Path(FLAGS.output).with_suffix(".pte")) - freeze(model=FLAGS.model, output=FLAGS.output, head=FLAGS.head) + # Default suffix: .pt2 for the graph export (an AOTI .pt2 archive is + # what the C++ graph path consumes), .pte otherwise. Explicit user + # .pte / .pt2 suffixes are preserved for both. + _default_suffix = ".pt2" if _lower_kind == "graph" else ".pte" + FLAGS.output = str(Path(FLAGS.output).with_suffix(_default_suffix)) + freeze( + model=FLAGS.model, + output=FLAGS.output, + head=FLAGS.head, + lower_kind=_lower_kind, + ) elif FLAGS.command == "change-bias": change_bias( input_file=FLAGS.INPUT, diff --git a/deepmd/pt_expt/infer/deep_eval.py b/deepmd/pt_expt/infer/deep_eval.py index 97bba3d4a5..e03f893040 100644 --- a/deepmd/pt_expt/infer/deep_eval.py +++ b/deepmd/pt_expt/infer/deep_eval.py @@ -66,6 +66,20 @@ import ase.neighborlist +# Public output keys emitted by the graph-form AOTI forward +# (``forward_lower_graph_exportable``) keyed by the output-variable category that +# ``request_defs`` carries. The graph path is LOCAL-only (``N == sum(n_node)`` +# nodes, no ghosts), so its outputs are already at local-atom resolution -- no +# ``communicate_extended_output`` fold-back is needed. +_GRAPH_CATEGORY_TO_KEY = { + OutputVariableCategory.OUT: "atom_energy", + OutputVariableCategory.REDU: "energy", + OutputVariableCategory.DERV_R: "force", + OutputVariableCategory.DERV_C_REDU: "virial", + OutputVariableCategory.DERV_C: "atom_virial", +} + + def _reshape_charge_spin( charge_spin: np.ndarray, nframes: int, dim_chg_spin: int ) -> np.ndarray: @@ -1423,6 +1437,10 @@ def _eval_model( request_defs: list[OutputVariableDef], charge_spin: np.ndarray | None = None, ) -> tuple[np.ndarray, ...]: + if self.metadata.get("lower_input_kind") == "graph": + return self._eval_model_graph( + coords, cells, atom_types, fparam, aparam, request_defs, charge_spin + ) model_inputs, mapping_t, nframes, natoms = self._prepare_inputs( coords, cells, atom_types, fparam, aparam, charge_spin ) @@ -1621,6 +1639,101 @@ def _eval_model_spin( ) return tuple(results) + def _eval_model_graph( + self, + coords: np.ndarray, + cells: np.ndarray | None, + atom_types: np.ndarray, + fparam: np.ndarray | None, + aparam: np.ndarray | None, + request_defs: list[OutputVariableDef], + charge_spin: np.ndarray | None = None, + ) -> tuple[np.ndarray, ...]: + """Evaluate a graph-form ``.pt2`` (``lower_input_kind == "graph"``). + + Builds a carry-all :class:`~deepmd.dpmodel.utils.neighbor_graph.NeighborGraph` + from the eval system at its exact (tight) edge count and feeds the + positional schema + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)`` to the exported forward. The AOTI artifact's edge axis + is DYNAMIC (B2.0), so no ``edge_capacity`` padding is needed. The + forward returns the LOCAL public keys directly, so results are reshaped + without ``communicate_extended_output``. + """ + from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, + ) + from deepmd.pt_expt.utils.env import ( + DEVICE, + ) + + nframes = coords.shape[0] + if len(atom_types.shape) == 1: + natoms = len(atom_types) + atom_types = np.tile(atom_types, nframes).reshape(nframes, -1) + else: + natoms = len(atom_types[0]) + + coord_input = coords.reshape(nframes, natoms, 3) + box_input = cells.reshape(nframes, 9) if cells is not None else None + # Dynamic edge axis (B2.0): build the carry-all graph at its exact edge + # count (no static padding); the AOTI artifact accepts any E. + graph = build_neighbor_graph( + coord_input, + atom_types, + box_input, + self._rcut, + ) + + atype_t = torch.tensor( + np.asarray(atom_types).reshape(-1), dtype=torch.int64, device=DEVICE + ) + n_node_t = torch.tensor( + np.asarray(graph.n_node), dtype=torch.int64, device=DEVICE + ) + edge_index_t = torch.tensor( + np.asarray(graph.edge_index), dtype=torch.int64, device=DEVICE + ) + edge_vec_t = torch.tensor( + np.asarray(graph.edge_vec), dtype=torch.float64, device=DEVICE + ) + edge_mask_t = torch.tensor( + np.asarray(graph.edge_mask), dtype=torch.bool, device=DEVICE + ) + + fparam_t, aparam_t = self._prepare_optional_lower_inputs( + fparam, aparam, nframes, natoms, DEVICE + ) + charge_spin_t = self._make_charge_spin_input(nframes, charge_spin) + + model_inputs = ( + atype_t, + n_node_t, + edge_index_t, + edge_vec_t, + edge_mask_t, + fparam_t, + aparam_t, + charge_spin_t, + ) + if self._is_pt2: + model_ret = self._pt2_runner(*model_inputs) + else: + model_ret = self.exported_module(*model_inputs) + + results = [] + for odef in request_defs: + shape = self._get_output_shape(odef, nframes, natoms) + gkey = _GRAPH_CATEGORY_TO_KEY.get(odef.category) + val = model_ret.get(gkey) if gkey is not None else None + if val is not None: + results.append(val.detach().cpu().numpy().reshape(shape)) + else: + results.append( + np.full(np.abs(shape), np.nan, dtype=GLOBAL_NP_FLOAT_PRECISION) + ) + return tuple(results) + def _get_output_shape( self, odef: OutputVariableDef, nframes: int, natoms: int ) -> list[int]: diff --git a/deepmd/pt_expt/model/edge_transform_output.py b/deepmd/pt_expt/model/edge_transform_output.py index 565e155157..653f323404 100644 --- a/deepmd/pt_expt/model/edge_transform_output.py +++ b/deepmd/pt_expt/model/edge_transform_output.py @@ -29,6 +29,8 @@ def edge_energy_deriv( edge_index: torch.Tensor, edge_mask: torch.Tensor, n_node: torch.Tensor, + node_capacity: int | None = None, + *, do_atomic_virial: bool = False, create_graph: bool = False, ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor]: @@ -49,6 +51,10 @@ def edge_energy_deriv( (E,) valid-edge mask. n_node (nf,) per-frame node counts. + node_capacity + Static node-axis size ``N``. ``None`` (eager default) falls back to + ``int(n_node.sum())``. Pass a static value (e.g. ``atype.shape[0]``) + to keep this function trace-safe under ``make_fx``/``torch.export``. do_atomic_virial whether to materialize the per-atom virial (else ``None`` is returned). create_graph @@ -70,7 +76,7 @@ def edge_energy_deriv( retain_graph=True, ) force, atom_virial, virial = edge_force_virial( - g_e, edge_vec, edge_index, edge_mask, n_node + g_e, edge_vec, edge_index, edge_mask, n_node, node_capacity=node_capacity ) return force, (atom_virial if do_atomic_virial else None), virial @@ -82,6 +88,7 @@ def fit_output_to_model_output_graph( do_atomic_virial: bool = False, create_graph: bool = True, mask: torch.Tensor | None = None, + node_capacity: int | None = None, ) -> dict[str, torch.Tensor]: """Graph analogue of the dense pt_expt ``fit_output_to_model_output``. @@ -115,6 +122,15 @@ def fit_output_to_model_output_graph( Whether the backward retains a graph (training). mask (N,) flat realness mask; used only for intensive-output reduction. + node_capacity + Authoritative node-axis size ``N`` = the scatter bound for the + per-node force/atom-virial assembly. Pass the INPUT ``atype.shape[0]`` + (the pristine node-axis symbol that ``edge_index`` indexes into by + construction); ``None`` falls back to the descriptor/fitting output's + ``fit_ret.shape[0]`` (value-equal). This makes the scatter bound the + input node axis rather than a re-derived shape -- hardening; the actual + CUDA out-of-bounds device-assert is prevented by the index clamp in + :func:`~deepmd.dpmodel.utils.neighbor_graph.derivatives.edge_force_virial`. Returns ------- @@ -131,9 +147,22 @@ def fit_output_to_model_output_graph( edge_mask = graph.edge_mask n_node = graph.n_node redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION - nf = int(n_node.shape[0]) - N = int(n_node.sum()) - frame_id = frame_id_from_n_node(n_node) # (N,) int64 frame index per atom + # Keep ``nf`` as a (possibly symbolic) shape value: under symbolic make_fx / + # torch.export ``n_node`` dim-0 is the dynamic frame axis, and ``int()`` on a + # SymInt SPECIALIZES it -- baking the trace-time frame count into every + # per-frame reduction (energy_redu / virial) and breaking multi-frame infer. + nf = n_node.shape[0] + # Derive N from the fitting output's leading shape rather than int(n_node.sum()). + # shape attributes are always static Python ints (or SymInts in symbolic-mode + # tracing) and are trace-safe; reading a tensor VALUE via int() is not. + N = ( + node_capacity + if node_capacity is not None + else next(iter(fit_ret.values())).shape[0] + ) + frame_id = frame_id_from_n_node( + n_node, n_total=N + ) # (N,) int64 frame index per atom model_ret: dict[str, torch.Tensor] = dict(fit_ret.items()) for kk, vv in fit_ret.items(): vdef = fit_output_def[kk] @@ -172,6 +201,7 @@ def fit_output_to_model_output_graph( edge_index, edge_mask, n_node, + node_capacity=N, do_atomic_virial=(vdef.c_differentiable and do_atomic_virial), create_graph=create_graph, ) diff --git a/deepmd/pt_expt/model/ener_model.py b/deepmd/pt_expt/model/ener_model.py index 6347382135..140141dd5e 100644 --- a/deepmd/pt_expt/model/ener_model.py +++ b/deepmd/pt_expt/model/ener_model.py @@ -32,6 +32,41 @@ DPEnergyModel_ = make_model(DPEnergyAtomicModel, T_Bases=(BaseModel,)) +def _translate_energy_keys( + model_ret: dict[str, torch.Tensor], + *, + do_grad_r: bool, + do_grad_c: bool, + do_atomic_virial: bool, + local: bool, +) -> dict[str, torch.Tensor]: + """Map internal fitting keys -> public energy-model keys (shared by the + dense and graph ``forward_lower`` export traces). + + Operates on plain dicts (make_fx-safe). ``local=True`` is the GRAPH path + (per-node ``N == sum(n_node)`` local atoms, no ghost/extended region) and + emits ``force``/``atom_virial``; ``local=False`` is the DENSE extended-region + path and emits ``extended_force``/``extended_virial`` (folded to local by + ``communicate_extended_output`` at inference). + """ + out: dict[str, torch.Tensor] = {} + out["atom_energy"] = model_ret["energy"] + out["energy"] = model_ret["energy_redu"] + if do_grad_r: + out["force" if local else "extended_force"] = model_ret[ + "energy_derv_r" + ].squeeze(-2) + if do_grad_c: + out["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) + if do_atomic_virial: + out["atom_virial" if local else "extended_virial"] = model_ret[ + "energy_derv_c" + ].squeeze(-2) + if "mask" in model_ret: + out["mask"] = model_ret["mask"] + return out + + @BaseModel.register("ener") @BaseModel.register("sezm_ener") @BaseModel.register("dpa4_ener") @@ -229,21 +264,115 @@ def fn( aparam, charge_spin, ) - model_predict: dict[str, torch.Tensor] = {} - model_predict["atom_energy"] = model_ret["energy"] - model_predict["energy"] = model_ret["energy_redu"] - if do_grad_r: - model_predict["extended_force"] = model_ret["energy_derv_r"].squeeze(-2) - if do_grad_c: - model_predict["virial"] = model_ret["energy_derv_c_redu"].squeeze(-2) - if do_atomic_virial: - model_predict["extended_virial"] = model_ret[ - "energy_derv_c" - ].squeeze(-2) - if "mask" in model_ret: - model_predict["mask"] = model_ret["mask"] - return model_predict + return _translate_energy_keys( + model_ret, + do_grad_r=do_grad_r, + do_grad_c=do_grad_c, + do_atomic_virial=do_atomic_virial, + local=False, + ) return make_fx(fn, **make_fx_kwargs)( extended_coord, extended_atype, nlist, mapping, fparam, aparam, charge_spin ) + + def forward_lower_graph_exportable( + self, + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None = None, + aparam: torch.Tensor | None = None, + do_atomic_virial: bool = False, + charge_spin: torch.Tensor | None = None, + **make_fx_kwargs: Any, + ) -> torch.nn.Module: + """Trace ``forward_common_lower_graph`` into an exportable module with + public output keys. + + Delegates to ``forward_common_lower_graph_exportable`` for tracing, + then translates the internal keys to the ``forward_lower`` convention. + + Parameters + ---------- + atype + (N,) flat local atom types, ``N == sum(n_node)``. + n_node + (nf,) per-frame local atom counts. + edge_index + (2, E) ``[src, dst]`` edge endpoints (flat local indices). + edge_vec + (E, 3) neighbor-minus-center edge vectors (sample for tracing). + edge_mask + (E,) valid-edge mask (sample for tracing). + fparam, aparam, do_atomic_virial, charge_spin + As in ``forward_lower``. + **make_fx_kwargs + Extra keyword arguments forwarded to ``make_fx`` + (e.g. ``tracing_mode="symbolic"``). + + Returns + ------- + torch.nn.Module + A traced module whose ``forward`` accepts + ``(atype, n_node, edge_index, edge_vec, edge_mask, + fparam, aparam, charge_spin)`` and returns a dict with the + public keys: ``atom_energy``, ``energy``, ``force``, + ``virial``, ``atom_virial`` (the last only when + ``do_atomic_virial``). Unlike the dense + :meth:`forward_lower_exportable` (which emits ``extended_force`` / + ``extended_virial`` over the ghost-padded extended region), the + graph path is LOCAL-only (``N == sum(n_node)`` nodes, no ghosts), + so it emits ``force`` / ``atom_virial`` directly. + """ + traced = self.forward_common_lower_graph_exportable( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + fparam=fparam, + aparam=aparam, + charge_spin=charge_spin, + do_atomic_virial=do_atomic_virial, + **make_fx_kwargs, + ) + + # Translate internal keys to public convention. + # Capture model config at trace time via closures. + do_grad_r = self.do_grad_r("energy") + do_grad_c = self.do_grad_c("energy") + + def fn( + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + ) -> dict[str, torch.Tensor]: + model_ret = traced( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + fparam, + aparam, + charge_spin, + ) + return _translate_energy_keys( + model_ret, + do_grad_r=do_grad_r, + do_grad_c=do_grad_c, + do_atomic_virial=do_atomic_virial, + local=True, + ) + + return make_fx(fn, **make_fx_kwargs)( + atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, charge_spin + ) diff --git a/deepmd/pt_expt/model/make_model.py b/deepmd/pt_expt/model/make_model.py index 50ede240e4..5b19cb63f1 100644 --- a/deepmd/pt_expt/model/make_model.py +++ b/deepmd/pt_expt/model/make_model.py @@ -370,6 +370,11 @@ def forward_common_lower_graph( do_atomic_virial=do_atomic_virial, create_graph=self.training, mask=atomic_ret["mask"] if "mask" in atomic_ret else None, + # Bound the per-node scatter by the INPUT node axis (the symbol + # ``edge_index`` indexes into), not the re-derived fitting-output + # shape -- avoids a CUDA out-of-bounds device-assert under + # dynamic-edge torch.export. See fit_output_to_model_output_graph. + node_capacity=atype.shape[0], ) def _resolve_graph_method( @@ -630,6 +635,86 @@ def fn( model.need_sorted_nlist_for_lower = _orig_need_sort return traced + def forward_common_lower_graph_exportable( + self, + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None = None, + aparam: torch.Tensor | None = None, + do_atomic_virial: bool = False, + charge_spin: torch.Tensor | None = None, + **make_fx_kwargs: Any, + ) -> torch.nn.Module: + """make_fx trace of ``forward_common_lower_graph`` with ``edge_vec`` + as the autograd leaf — the export target for graph-form .pt2 archives. + + Parameters + ---------- + atype + (N,) flat local atom types, ``N == sum(n_node)``. + n_node + (nf,) per-frame local atom counts. + edge_index + (2, E) ``[src, dst]`` edge endpoints (flat local indices). + edge_vec + (E, 3) neighbor-minus-center edge vectors (sample for tracing). + edge_mask + (E,) valid-edge mask (sample for tracing). + fparam, aparam, do_atomic_virial, charge_spin + As in ``forward_common_lower_graph``. + **make_fx_kwargs + Extra keyword arguments forwarded to ``make_fx`` + (e.g. ``tracing_mode="symbolic"``). + + Returns + ------- + torch.nn.Module + A traced module whose ``forward`` accepts + ``(atype, n_node, edge_index, edge_vec, edge_mask, + fparam, aparam, charge_spin)`` and returns a dict with the + same internal keys as ``forward_common_lower_graph``. + """ + model = self + + def fn( + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + ) -> dict[str, torch.Tensor]: + # forward_common_lower_graph creates the autograd leaf from + # edge_vec internally, so no outer detach/requires_grad_ here + # (it would only add spurious ops to the traced graph). + return model.forward_common_lower_graph( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + do_atomic_virial=do_atomic_virial, + fparam=fparam, + aparam=aparam, + charge_spin=charge_spin, + ) + + return make_fx(fn, **make_fx_kwargs)( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + fparam, + aparam, + charge_spin, + ) + def forward_common_lower_exportable_with_comm( self, extended_coord: torch.Tensor, diff --git a/deepmd/pt_expt/train/training.py b/deepmd/pt_expt/train/training.py index bd6fdb02a3..4ab43fe329 100644 --- a/deepmd/pt_expt/train/training.py +++ b/deepmd/pt_expt/train/training.py @@ -274,6 +274,41 @@ def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]: # --------------------------------------------------------------------------- +def _forbidden_dims_from_model( + model: torch.nn.Module, + task_buf_vals: tuple[torch.Tensor, ...], +) -> set[int]: + """Prime-collision set for trace-dim selection. + + Collects every ``> 1`` dim of the model's parameters/buffers (so + ``_next_safe_prime`` never aliases an internal dim like ``g2_dim`` / + ``axis_neuron`` / ``attn_head`` without a hardcoded list), plus + ``dim_fparam``/``dim_aparam`` and the task-buffer dims. Shared by the dense + :func:`_trace_and_compile` and the graph :func:`_trace_and_compile_graph`; + each caller adds its path-specific dims (nall/nloc/nsel for dense, + charge_spin for both) on top of this base set. + """ + forbidden: set[int] = { + int(_d) + for _src in (model.parameters(), model.buffers()) + for _p in _src + for _d in _p.shape + if _d > 1 + } + for _getter in (model.get_dim_fparam, model.get_dim_aparam): + try: + _dim = _getter() + if _dim > 1: + forbidden.add(int(_dim)) + except Exception: + pass # best-effort: dim unavailable -> nothing to forbid + for _tbv in task_buf_vals: + for _d in _tbv.shape: + if _d > 1: + forbidden.add(int(_d)) + return forbidden + + def _trace_and_compile( model: torch.nn.Module, ext_coord: torch.Tensor, @@ -397,17 +432,11 @@ def fn( # large to alias with any architecture dim and need no adjustment. # # The prime for nf is chosen by enumerating every dimension that appears - # in the model's parameters and buffers, then calling _next_safe_prime to - # find the first prime that doesn't collide with any of them. This - # catches internal dims like g2_dim, axis_neuron, attn_head, etc. - # without requiring a hardcoded list. - _forbidden: set[int] = { - int(_d) - for _src in (model.parameters(), model.buffers()) - for _p in _src - for _d in _p.shape - if _d > 1 - } + # in the model's parameters and buffers (see _forbidden_dims_from_model), + # then calling _next_safe_prime to find the first prime that doesn't collide + # with any of them -- catching internal dims like g2_dim/axis_neuron/ + # attn_head without a hardcoded list. Add the dense-path dims on top. + _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace) # Also add the real nloc and nall so trace_nf never aliases them. _forbidden.add(int(ext_coord.shape[1])) # nall _forbidden.add(int(ext_atype.shape[1])) # nall (same tensor, defensive) @@ -416,26 +445,10 @@ def fn( _nsel = int(nlist.shape[2]) if _nsel > 1: _forbidden.add(_nsel) - try: - _dim_fp = model.get_dim_fparam() - if _dim_fp > 1: - _forbidden.add(_dim_fp) - except Exception: - pass - try: - _dim_ap = model.get_dim_aparam() - if _dim_ap > 1: - _forbidden.add(_dim_ap) - except Exception: - pass if charge_spin is not None: _dim_cs = int(charge_spin.shape[1]) if _dim_cs > 1: _forbidden.add(_dim_cs) - for _tbv in task_buf_vals_trace: - for _d in _tbv.shape: - if _d > 1: - _forbidden.add(int(_d)) trace_nf = _next_safe_prime(5, _forbidden) @@ -477,6 +490,25 @@ def fn( *task_buf_vals_trace, ) + return ( + _finalize_compiled_lower(traced_lower, model, was_training, compile_opts), + task_buf_order, + ) + + +def _finalize_compiled_lower( + traced_lower: "torch.fx.GraphModule", + model: torch.nn.Module, + was_training: bool, + compile_opts: dict[str, Any] | None, + extra_options: dict[str, Any] | None = None, +) -> torch.nn.Module: + """Shared post-``make_fx`` tail: strip detach, rebuild, inductor-compile. + + Used by both the dense :func:`_trace_and_compile` and the graph + :func:`_trace_and_compile_graph` so the second-order-gradient handling + (detach removal) and inductor options stay identical on both paths. + """ # make_fx inserts aten.detach.default for saved tensors used in the # decomposed autograd.grad backward ops. These detach nodes break # second-order gradient flow (d(force)/d(params) for force training). @@ -503,6 +535,8 @@ def fn( # pytorch/pytorch#174379, #178080, #179494 under # data-dependent symbolic shapes. } + if extra_options: + inductor_options.update(extra_options) if compile_opts: inductor_options.update(compile_opts) @@ -511,7 +545,264 @@ def fn( backend="inductor", dynamic=True, options=inductor_options, - ), task_buf_order + ) + + +def _model_uses_graph_lower(model: torch.nn.Module) -> bool: + """Whether ``model``'s eager default-flip routes through the GRAPH lower. + + Mirrors the predicate in + :meth:`~deepmd.pt_expt.model.make_model.make_model..CM._resolve_graph_method` + for ``neighbor_graph_method is None`` (the training default): a model is + graph-eligible iff it is ``mixed_types`` AND its single descriptor reports + ``uses_graph_lower() == True`` (currently only dpa1 ``attn_layer == 0``). + + When True the compiled lower must be the GRAPH ``forward_common_lower_graph`` + so the compiled path matches eager training (which already default-flips to + the carry-all graph forward); when False the dense ``forward_lower`` is + compiled (se_e2_a / dpa2 / dpa3 / linear / zbl). + + ASSUMPTION: training uses the default ``neighbor_graph_method`` (None). If a + user-facing ``"legacy"`` opt-out is ever plumbed into the trainer, this gate + must also honor it (else eager would run dense while the compiled path runs + the graph lower, re-introducing the eager!=compiled divergence this fixes). + """ + if not hasattr(model, "mixed_types"): + return False + try: + if not model.mixed_types(): + return False + except (AttributeError, NotImplementedError): + return False + # Linear / ZBL atomic models have no single ``descriptor`` -> dense. + descriptor = getattr(getattr(model, "atomic_model", None), "descriptor", None) + uses_graph = getattr(descriptor, "uses_graph_lower", None) + if uses_graph is None: + return False + try: + return bool(uses_graph()) + except (AttributeError, NotImplementedError): + return False + + +def _trace_and_compile_graph( + model: torch.nn.Module, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + compile_opts: dict[str, Any] | None = None, + task_buffers: dict[str, torch.Tensor] | None = None, +) -> tuple[torch.nn.Module, tuple[str, ...]]: + """Symbolic-trace ``forward_common_lower_graph`` and inductor-compile it. + + The GRAPH analogue of :func:`_trace_and_compile`. Builds a small synthetic + NeighborGraph with prime-controlled ``nf`` / ``N`` / ``E`` axes (so make_fx's + duck-shape unification keeps the three dynamic dims as distinct symbols), + traces ``model.forward_common_lower_graph`` with ``edge_vec`` as the autograd + leaf, and translates the internal fitting keys to the public energy-model + keys (``atom_energy`` / ``energy`` / ``force`` / ``virial``). The compiled + callable accepts the positional graph tensors plus the promoted task buffers + and returns those public keys on the FLAT node axis (``N == sum(n_node)``); + the caller (:meth:`_CompiledModel.forward`) unravels them to ``(nf, nloc, *)``. + + Parameters + ---------- + model + The (uncompiled) graph-eligible energy model. + fparam, aparam, charge_spin + Representative optional inputs (or ``None``) so the traced branch + matches what :meth:`_CompiledModel.forward` passes at run time. + compile_opts + User-supplied inductor options (merged over the built-in defaults). + task_buffers + Per-task buffers promoted to FX placeholders (see + :func:`_detect_task_buffers`). + """ + import math + + from torch._decomp import ( + get_decompositions, + ) + from torch.fx.experimental.proxy_tensor import ( + make_fx, + ) + + from deepmd.pt_expt.model.ener_model import ( + _translate_energy_keys, + ) + + was_training = model.training + # Trace in train mode so create_graph=True is captured inside the graph + # force backward (forward_common_lower_graph passes create_graph=self.training). + model.train() + + task_buf_order: tuple[str, ...] = tuple(task_buffers.keys()) if task_buffers else () + task_buf_vals_trace: tuple[torch.Tensor, ...] = ( + tuple(task_buffers[k] for k in task_buf_order) if task_buffers else () + ) + + _fitting: torch.nn.Module | None = None + _atomic_model: torch.nn.Module | None = None + if task_buf_order: + try: + _fitting = model.get_fitting_net() + except AttributeError: + pass # optional accessor; a model without a fitting net keeps None + try: + _atomic_model = model.atomic_model + except AttributeError: + pass # optional attribute; a model without an atomic model keeps None + + do_grad_r = model.do_grad_r("energy") + do_grad_c = model.do_grad_c("energy") + + # ------------------------------------------------------------------ + # Build the trace-time NeighborGraph with prime-distinct nf / N / E. + # + # make_fx (tracing_mode="symbolic") unifies dimension symbols that share a + # concrete value (duck-shape merging). The three dynamic axes of the graph + # lower must stay distinct symbols, otherwise the per-frame segment_sum + # (N -> nf) and the per-edge scatter (E -> N) bake in a false equality: + # * nf = n_node.shape[0] (per-frame reductions) + # * N = atype.shape[0] (flat node axis = sum(n_node)) + # * E = edge_vec.shape[0] (edge axis) + # They are chosen as collision-free primes vs every parameter/buffer dim + # (see _forbidden_dims_from_model) plus charge_spin. + # ------------------------------------------------------------------ + _forbidden = _forbidden_dims_from_model(model, task_buf_vals_trace) + if charge_spin is not None and charge_spin.shape[-1] > 1: + _forbidden.add(int(charge_spin.shape[-1])) + + trace_nf = _next_safe_prime(5, _forbidden) + # nloc such that N = trace_nf * nloc is collision-free (and != trace_nf). + nloc_trace = 7 + while (trace_nf * nloc_trace) in (_forbidden | {trace_nf}): + nloc_trace += 1 + trace_N = trace_nf * nloc_trace + # Static edge capacity, prime-padded to stay distinct from nf and N. + nnei = sum(model.get_sel()) + e_max_base = max(math.ceil(1.25 * nloc_trace * nnei), 7) + e_max = _next_safe_prime(e_max_base, _forbidden | {trace_nf, trace_N}) + + # Shared with the .pt2 export trace (serialization.py) so the two graph + # traces can never desync on the input schema. Training uses the run-time + # float precision and device; optional tensors match the actual call. + from deepmd.pt_expt.utils.serialization import ( + build_synthetic_graph_inputs, + ) + + sample = build_synthetic_graph_inputs( + model, + e_max=e_max, + nframes=trace_nf, + nloc=nloc_trace, + dtype=GLOBAL_PT_FLOAT_PRECISION, + device=DEVICE, + want_fparam=fparam is not None, + want_aparam=aparam is not None, + want_charge_spin=charge_spin is not None, + ) + ( + s_atype, + s_n_node, + s_edge_index, + s_edge_vec, + s_edge_mask, + s_fparam, + s_aparam, + s_charge_spin, + ) = sample + + def fn( + atype: torch.Tensor, + n_node: torch.Tensor, + edge_index: torch.Tensor, + edge_vec: torch.Tensor, + edge_mask: torch.Tensor, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + *task_buf_vals: torch.Tensor, + ) -> dict[str, torch.Tensor]: + # Patch task-specific buffers with the proxy tensors so make_fx records + # them as FX placeholders (mirrors the dense ``_trace_and_compile``). + originals: dict[str, torch.Tensor | None] = {} + if task_buf_order: + for name, val in zip(task_buf_order, task_buf_vals, strict=True): + if name.startswith(_AM_PREFIX): + actual = name[len(_AM_PREFIX) :] + if _atomic_model is not None: + originals[name] = _atomic_model._buffers.get(actual) + _atomic_model._buffers[actual] = val + else: + if _fitting is not None: + originals[name] = _fitting._buffers.get(name) + _fitting._buffers[name] = val + try: + # forward_common_lower_graph makes edge_vec the autograd leaf + # internally, so no outer detach/requires_grad_ here. + model_ret = model.forward_common_lower_graph( + atype, + n_node, + edge_index, + edge_vec, + edge_mask, + do_atomic_virial=False, + fparam=fparam, + aparam=aparam, + charge_spin=charge_spin, + ) + return _translate_energy_keys( + model_ret, + do_grad_r=do_grad_r, + do_grad_c=do_grad_c, + do_atomic_virial=False, + local=True, + ) + finally: + for name, orig in originals.items(): + if name.startswith(_AM_PREFIX): + actual = name[len(_AM_PREFIX) :] + if _atomic_model is not None: + _atomic_model._buffers[actual] = orig + else: + if _fitting is not None: + _fitting._buffers[name] = orig + + decomp_table = get_decompositions([torch.ops.aten.silu_backward.default]) + + traced_lower = make_fx( + fn, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + decomposition_table=decomp_table, + )( + s_atype, + s_n_node, + s_edge_index, + s_edge_vec, + s_edge_mask, + s_fparam, + s_aparam, + s_charge_spin, + *task_buf_vals_trace, + ) + + # The per-frame virial reduction scatters E edges into the (nf, 3, 3) virial + # via an atomic_add; inductor's CPU vectorizer asserts on that scatter's + # scalar index (``index.is_vec``). Disable CPU SIMD for the graph lower so + # the scatter is emitted scalar — numerically this only removes a + # reduction-order source, keeping eager==compiled within fp64 tolerance. + return ( + _finalize_compiled_lower( + traced_lower, + model, + was_training, + compile_opts, + extra_options={"cpp.simdlen": 0}, + ), + task_buf_order, + ) class _CompiledModel(torch.nn.Module): @@ -546,6 +837,9 @@ def __init__( self._compiled_by_structure: dict = ( compiled_by_structure if compiled_by_structure is not None else {} ) + # Resolved on the first forward: whether to compile the GRAPH lower + # (graph-eligible mixed_types descriptors) or the dense forward_lower. + self._graph_eligible: bool | None = None def __getattr__(self, name: str) -> Any: # Delegate unknown lookups to original_model so that callers such as @@ -579,6 +873,18 @@ def forward( nframes, nloc = atype.shape[:2] rcut = self.original_model.get_rcut() + + # Graph-eligible models (dpa1 attn_layer==0) default-flip to the carry-all + # GRAPH forward in eager training; the compiled lower must be the GRAPH + # lower too, otherwise the eager (graph) and compiled (dense) backward + # gradients diverge at fp64 accumulation and the optimizer amplifies it. + if self._graph_eligible is None: + self._graph_eligible = _model_uses_graph_lower(self.original_model) + if self._graph_eligible: + return self._forward_graph( + coord, atype, box, fparam, aparam, charge_spin, nframes, nloc, rcut + ) + sel = self.original_model.get_sel() # coord extension + nlist (data-dependent, run in eager) @@ -751,6 +1057,163 @@ def forward( out["mask"] = result["mask"] return out + def _forward_graph( + self, + coord: torch.Tensor, + atype: torch.Tensor, + box: torch.Tensor | None, + fparam: torch.Tensor | None, + aparam: torch.Tensor | None, + charge_spin: torch.Tensor | None, + nframes: int, + nloc: int, + rcut: float, + ) -> dict[str, torch.Tensor]: + """Carry-all GRAPH forward -> compiled ``forward_common_lower_graph``. + + Builds the carry-all NeighborGraph eagerly (the SAME builder the eager + uncompiled default-flip uses, so the graph tensors are bit-identical), + then calls the compiled graph lower. The graph force is per-LOCAL-node + ``(N, 3)`` with ``N == nframes * nloc`` for a single-rank carry-all graph, + so no extended->local scatter is needed; only the flat ``(N, *)`` node + keys are unravelled to ``(nf, nloc, *)`` at the I/O boundary. + """ + from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, + ) + + _model = self.original_model + + coord_3d = coord.detach().reshape(nframes, nloc, 3) + box_flat = box.detach().reshape(nframes, 9) if box is not None else None + + # Mirror the optional-input defaulting of the dense path / eager + # call_common: a model configured with fparam / charge_spin substitutes + # its default when the data omits it, so the compiled (frozen) branch + # always sees a tensor. + _dim_fparam = ( + _model.get_dim_fparam() if hasattr(_model, "get_dim_fparam") else 0 + ) + if fparam is None and _dim_fparam > 0: + _default_fparam = _model.get_default_fparam() + if _default_fparam is not None: + fparam = ( + torch.as_tensor( + _default_fparam, dtype=coord_3d.dtype, device=coord_3d.device + ) + .reshape(1, _dim_fparam) + .expand(nframes, -1) + ) + _dim_cs = ( + _model.get_dim_chg_spin() if hasattr(_model, "get_dim_chg_spin") else 0 + ) + if charge_spin is None and _dim_cs > 0: + _default_cs = _model.get_default_chg_spin() + if _default_cs is not None: + charge_spin = ( + torch.as_tensor( + _default_cs, dtype=coord_3d.dtype, device=coord_3d.device + ) + .reshape(1, _dim_cs) + .expand(nframes, -1) + ) + + # Carry-all graph (dynamic E, no edge_capacity) — identical to the eager + # uncompiled ``_call_common_graph`` builder so the two paths match. + ng = build_neighbor_graph(coord_3d, atype, box_flat, rcut) + atype_flat = atype.reshape(nframes * nloc) + + # Lazy compile of the GRAPH lower (cached per structure key). + if self.compiled_forward_lower is None: + if self._structure_key in self._compiled_by_structure: + compiled_lower, buf_order = self._compiled_by_structure[ + self._structure_key + ] + log.info("Reusing compiled graph lower (shared structure, lazy).") + else: + log.info( + "Lazy compile (graph lower): tracing on first forward call " + "(structure_key=%s).", + self._structure_key, + ) + compiled_lower, buf_order = _trace_and_compile_graph( + _model, + fparam, + aparam, + charge_spin, + task_buffers=self._task_buffers, + compile_opts=self._compile_opts, + ) + self._compiled_by_structure[self._structure_key] = ( + compiled_lower, + buf_order, + ) + self.compiled_forward_lower = compiled_lower + self._task_buf_order = buf_order + self._task_buffers = None + + # Feed a detached, grad-enabled edge_vec leaf: the traced graph's internal + # ``edge_vec.detach()`` is stripped by ``_strip_saved_tensor_detach`` (as + # for the dense ext_coord leaf), so the force backward roots at this input. + edge_vec = ng.edge_vec.detach().requires_grad_(True) + + if self._task_buf_order: + try: + _fitting = _model.get_fitting_net() + _am = getattr(_model, "atomic_model", None) + _vals: list[torch.Tensor] = [] + for _name in self._task_buf_order: + if _name.startswith(_AM_PREFIX): + _actual = _name[len(_AM_PREFIX) :] + _vals.append(_am._buffers[_actual]) + else: + _vals.append(getattr(_fitting, _name)) + task_buf_vals: tuple = tuple(_vals) + except AttributeError as exc: + raise RuntimeError( + f"Compiled graph expects task buffers {self._task_buf_order!r} " + "but they could not be retrieved from the model. " + "This is a bug in the compile path." + ) from exc + else: + task_buf_vals = () + + result = self.compiled_forward_lower( + atype_flat, + ng.n_node, + ng.edge_index, + edge_vec, + ng.edge_mask, + fparam, + aparam, + charge_spin, + *task_buf_vals, + ) + + # The compiled graph lower emits PUBLIC keys on the FLAT node axis + # (``atom_energy`` / ``force`` are (N, *); ``energy`` / ``virial`` are + # (nf, *)). Unravel the node-level keys to rectangular (nf, nloc, *) so + # callers receive the same shapes as the dense path. + N = nframes * nloc + # Node-level (per-atom, lead dim N) public keys emitted by the graph + # lower; the remaining keys are frame-level (lead dim nf) and must NOT + # be unravelled. Keying on the NAME rather than the ``N != nframes`` + # shape heuristic keeps the single-atom case (nloc == 1, where + # N == nframes) correct -- node-level outputs still reshape to + # (nf, 1, *) instead of staying (nf, *). + node_level_keys = {"atom_energy", "force", "atom_virial", "mask"} + out: dict[str, torch.Tensor] = {} + for key, val in result.items(): + if ( + key in node_level_keys + and val is not None + and val.shape[:1] == torch.Size([N]) + ): + out[key] = val.reshape(nframes, nloc, *val.shape[1:]) + else: + out[key] = val + return out + # --------------------------------------------------------------------------- # Trainer diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py index 9c03783574..e0c08a6f84 100644 --- a/deepmd/pt_expt/utils/serialization.py +++ b/deepmd/pt_expt/utils/serialization.py @@ -40,27 +40,32 @@ def _strip_shape_assertions(graph_module: torch.nn.Module) -> None: - """Neutralise shape-guard assertion nodes in a spin model's exported graph. - - ``torch.export`` inserts ``aten._assert_scalar`` nodes for symbolic shape - relationships discovered during tracing. For the spin model, the atom- - doubling logic creates slice patterns that depend on ``(nall - nloc)``, - producing guards like ``Ne(nall, nloc)``. These guards are spurious: the - model computes correct results even when ``nall == nloc`` (NoPBC, no ghost - atoms). - - This function is **only called for spin models** (guarded by ``if is_spin`` - in ``_trace_and_export``). The assertion messages use opaque symbolic - variable names (e.g. ``Ne(s22, s96)``) rather than human-readable names, - so filtering by message content is not reliable. Since - ``prefer_deferred_runtime_asserts_over_guards=True`` converts all shape - guards into these deferred assertions, and the only shape relationships in - the spin model involve nall/nloc, neutralising all of them is safe in this - context. - - We replace each assertion's condition with ``True`` rather than erasing the - node; erasing nodes can disturb the FX graph structure and produce NaN - gradients on some Python/torch versions. + """Neutralise deferred shape-guard assertion nodes in an exported graph. + + ``torch.export`` (with ``prefer_deferred_runtime_asserts_over_guards=True``) + inserts ``aten._assert_scalar`` nodes for symbolic-shape relationships + discovered during tracing. The assertion messages use opaque symbolic names + (e.g. ``Ne(s22, s96)``), so filtering by message content is not reliable; we + replace each assertion's condition with ``True`` rather than erasing the node + (erasing can disturb the FX graph and yield NaN gradients on some torch + versions). + + Called from TWO export paths in ``_trace_and_export``: + + * **spin (dense) models** — atom-doubling slice patterns depend on + ``(nall - nloc)``, producing spurious guards like ``Ne(nall, nloc)``; the + model is correct even when ``nall == nloc`` (NoPBC, no ghosts). + * **graph models** — the DYNAMIC edge axis (``Dim("nedge")``) produces guards + of the ``nloc_min``/SIGFPE family on the edge count ``E``. These are the + shape-specialization guards the static-``edge_capacity`` path was designed + to avoid; neutralising them is what makes one artifact eval any edge count. + + **Safety:** in both contexts every input is constructed well-formed by the + builder (spin: valid atom doubling; graph: ``build_neighbor_graph`` / + ``buildGraphTensors`` always emit ``E >= min_edges == 2`` with in-range, + masked edges), so the neutralised guards would never legitimately fire. The + only cost is that a MALFORMED runtime tensor no longer throws cleanly — the + documented AOTI trade-off (CLAUDE.md), accepted identically on both paths. """ graph = graph_module.graph for node in list(graph.nodes): @@ -310,6 +315,158 @@ def _make_sample_inputs( return ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin +def build_synthetic_graph_inputs( + model: torch.nn.Module, + e_max: int, + nframes: int = 2, + nloc: int = 7, + *, + dtype: torch.dtype, + device: torch.device | None = None, + want_fparam: bool = True, + want_aparam: bool = True, + want_charge_spin: bool = True, +) -> tuple[torch.Tensor | None, ...]: + """Build a synthetic carry-all ``NeighborGraph`` for graph-lower tracing. + + Single source of the trace-time graph inputs, shared by ``.pt2`` export + (:func:`_trace_and_export`) and compiled training + (:func:`deepmd.pt_expt.train.training._trace_and_compile_graph`), so the two + traces can never desync on the graph input schema. Builds a small random + system, runs the carry-all + :func:`~deepmd.dpmodel.utils.neighbor_graph.build_neighbor_graph` with a + STATIC ``GraphLayout(edge_capacity=e_max)`` (decision #16: the masked static + edge axis), and returns tensors in the positional order expected by + ``forward_(common_)lower_graph``: + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)``. + + The system (``rng(42)``, ``box = rcut*3``, centered coords, ``atype[:, i] = + i % ntypes``) is identical for both callers; the only two former differences + are now parameters. + + Parameters + ---------- + model : torch.nn.Module + The pt_expt energy model (must expose ``get_rcut``/``get_type_map``/...). + e_max : int + Static edge capacity ``E`` to pad the (masked) edge axis to. + nframes : int + Number of frames in the sample system. + nloc : int + Number of local atoms per frame (``N == nframes * nloc``). + dtype : torch.dtype + Float precision of ``coord``/``edge_vec``/``fparam``/... . The exported + ``.pt2`` is float64-only (C++ ABI); training passes + ``GLOBAL_PT_FLOAT_PRECISION``. + device : torch.device, optional + Target device. Defaults to ``deepmd.pt_expt.utils.env.DEVICE``; the + export path passes ``cpu`` explicitly (make_fx traces on CPU). + want_fparam, want_aparam, want_charge_spin : bool + Whether to emit the optional conditioning tensor when its ``dim > 0``. + Export passes the defaults (``True`` = include if present); training + passes ``x is not None`` so the traced branch matches the run-time call. + """ + import deepmd.pt_expt.utils.env as _env + from deepmd.dpmodel.utils.neighbor_graph import ( + GraphLayout, + build_neighbor_graph, + ) + + if device is None: + device = _env.DEVICE + + rcut = model.get_rcut() + ntypes = len(model.get_type_map()) + dim_fparam = model.get_dim_fparam() + dim_aparam = model.get_dim_aparam() + dim_chg_spin = model.get_dim_chg_spin() if hasattr(model, "get_dim_chg_spin") else 0 + + # Box large enough to avoid PBC degeneracy; centered coords. + box_size = rcut * 3.0 + box_np = (np.eye(3, dtype=np.float64) * box_size).reshape(1, 9) + rng = np.random.default_rng(42) + coord_np = rng.random((nframes, nloc, 3)) * box_size * 0.5 + box_size * 0.25 + atype_np = np.zeros((nframes, nloc), dtype=np.int64) + for i in range(nloc): + atype_np[:, i] = i % ntypes + + coord_t = torch.tensor(coord_np, dtype=dtype, device=device) + atype_t = torch.tensor(atype_np, dtype=torch.int64, device=device) + box_t = torch.tensor(np.tile(box_np, (nframes, 1)), dtype=dtype, device=device) + graph = build_neighbor_graph( + coord_t, atype_t, box_t, rcut, layout=GraphLayout(edge_capacity=e_max) + ) + + fparam = ( + torch.zeros(nframes, dim_fparam, dtype=dtype, device=device) + if (want_fparam and dim_fparam > 0) + else None + ) + aparam = ( + torch.zeros(nframes, nloc, dim_aparam, dtype=dtype, device=device) + if (want_aparam and dim_aparam > 0) + else None + ) + charge_spin = ( + torch.zeros(nframes, dim_chg_spin, dtype=dtype, device=device) + if (want_charge_spin and dim_chg_spin > 0) + else None + ) + + return ( + atype_t.reshape(-1), + graph.n_node, + graph.edge_index, + graph.edge_vec, + graph.edge_mask, + fparam, + aparam, + charge_spin, + ) + + +def _build_graph_dynamic_shapes( + *sample_inputs: torch.Tensor | None, +) -> tuple: + """Build dynamic-shape specifications for the graph-form forward_lower export. + + ``nframes`` (the ``n_node`` axis), ``N`` (the flat node axis) AND the edge + axis ``E`` are all dynamic dims (B2.0: the dynamic edge axis replaces the + static ``edge_capacity`` of B1). ``E`` is marked ``Dim("nedge", min=2)`` so + the AOTI artifact accepts any system size with no capacity ceiling — the + ``min=2`` lower bound mirrors the dense path's ``Dim("nnei", min=...)`` (a + dynamic, SIGFPE-tamed axis) and matches the carry-all builder's + ``min_edges=2`` guard (every dynamic graph carries >=2 edges). + + Parameters + ---------- + *sample_inputs : torch.Tensor | None + ``(atype, n_node, edge_index, edge_vec, edge_mask, fparam, aparam, + charge_spin)`` — 8 entries matching ``forward_lower_graph_exportable``. + """ + fparam = sample_inputs[5] + aparam = sample_inputs[6] + charge_spin = sample_inputs[7] + nframes_dim = torch.export.Dim("nframes", min=1) + n_node_total_dim = torch.export.Dim("n_node_total", min=1) + nedge_dim = torch.export.Dim("nedge", min=2) + nloc_dim = torch.export.Dim("nloc", min=1) + return ( + {0: n_node_total_dim}, # atype: (N,) + {0: nframes_dim}, # n_node: (nf,) + {1: nedge_dim}, # edge_index: (2, E) — E dynamic + {0: nedge_dim}, # edge_vec: (E, 3) — E dynamic + {0: nedge_dim}, # edge_mask: (E,) — E dynamic + {0: nframes_dim} if fparam is not None else None, # fparam: (nf, ndf) + # aparam: (nf, nloc, nda) — both the frame AND atom axes are dynamic, + # matching the dense ``_build_dynamic_shapes`` (otherwise a dim_aparam>0 + # graph export specializes nloc to the sample size and breaks at runtime). + {0: nframes_dim, 1: nloc_dim} if aparam is not None else None, # aparam + {0: nframes_dim} if charge_spin is not None else None, # charge_spin + ) + + def _build_dynamic_shapes( *sample_inputs: torch.Tensor | None, has_spin: bool = False, @@ -416,7 +573,9 @@ def _build_dynamic_shapes( return (*base, None, None, None, None, None, None, None, None) -def _collect_metadata(model: torch.nn.Module, is_spin: bool = False) -> dict: +def _collect_metadata( + model: torch.nn.Module, is_spin: bool = False, lower_kind: str = "nlist" +) -> dict: """Collect metadata from the model for C++ inference. This metadata is stored as ``metadata.json`` in both .pt2 and .pte archives. @@ -528,6 +687,12 @@ def _probe_has_message_passing(obj: object) -> bool | None: if result is not None: break meta["has_message_passing"] = result if result is not None else False + + # Which input schema the compiled AOTI forward consumes: + # "nlist" → dense quartet (extended_coord, extended_atype, nlist, mapping) + # "graph" → NeighborGraph (atype, n_node, edge_index, edge_vec, edge_mask) + # The C++ loader branches on this to build the matching inputs. + meta["lower_input_kind"] = "graph" if lower_kind == "graph" else "nlist" return meta @@ -599,6 +764,7 @@ def deserialize_to_file( data: dict, model_json_override: dict | None = None, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> None: """Deserialize a dictionary to a .pte or .pt2 model file. @@ -622,14 +788,23 @@ def deserialize_to_file( do_atomic_virial : bool If True, export with per-atom virial correction (3 extra backward passes, ~2.5x slower). Default False for best performance. + lower_kind : str + Which lower-forward schema the compiled AOTI graph consumes: + ``"nlist"`` (default) traces the dense quartet + (``extended_coord``/``extended_atype``/``nlist``/``mapping``); + ``"graph"`` traces the NeighborGraph schema + (``atype``/``n_node``/``edge_index``/``edge_vec``/``edge_mask``) with a + DYNAMIC edge axis ``E`` (``Dim("nedge", min=2)``), so the artifact + accepts any system size. The selected schema is recorded as + ``lower_input_kind`` in ``metadata.json``. """ if model_file.endswith(".pt2"): _deserialize_to_file_pt2( - model_file, data, model_json_override, do_atomic_virial + model_file, data, model_json_override, do_atomic_virial, lower_kind ) else: _deserialize_to_file_pte( - model_file, data, model_json_override, do_atomic_virial + model_file, data, model_json_override, do_atomic_virial, lower_kind ) @@ -638,6 +813,7 @@ def _trace_and_export( model_json_override: dict | None = None, with_comm_dict: bool = False, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> tuple: """Common logic: build model, trace, export. @@ -663,6 +839,10 @@ def _trace_and_export( If True, the traced graph computes per-atom virial (extra autograd.grad backward passes); off by default to keep .pt2 inference fast. Mirrors PR #5407 in upstream master. + lower_kind + ``"nlist"`` (default) traces the dense quartet forward; ``"graph"`` + traces ``forward_lower_graph_exportable`` over the NeighborGraph schema + with a dynamic edge axis. Recorded as ``lower_input_kind`` in metadata. Returns ------- @@ -700,7 +880,125 @@ def _trace_and_export( model.eval() # 2. Collect metadata - metadata = _collect_metadata(model, is_spin=is_spin) + metadata = _collect_metadata(model, is_spin=is_spin, lower_kind=lower_kind) + + # 2b. Graph-form export branch (NeighborGraph schema). The graph path is + # LOCAL-only (no ghosts), single-rank, energy-model only in PR-A/PR-B; it + # traces ``forward_lower_graph_exportable`` with a DYNAMIC edge axis (B2.0). + # The dense (nlist) path below is left byte-unchanged. + if lower_kind == "graph": + import math + + if is_spin: + raise NotImplementedError( + "graph-form .pt2 export is not supported for spin models" + ) + if with_comm_dict: + raise NotImplementedError( + "graph-form .pt2 export does not support the with-comm artifact " + "(multi-rank graph message passing is a later PR)" + ) + if not hasattr(model, "forward_lower_graph_exportable"): + raise NotImplementedError( + f"model {type(model).__name__} has no " + "forward_lower_graph_exportable; graph-form .pt2 export " + "requires an energy model" + ) + + # The edge axis is DYNAMIC (B2.0): the AOTI artifact accepts any edge + # count, so there is no capacity to bake. The trace sample is built at a + # concrete, padded edge size only to keep the trace tensors distinct + # from the other dynamic dims (nframes=2, N=14) under torch.export's + # duck-sizing; the value itself does NOT constrain runtime. + nloc_sample = 7 + nnei = sum(model.get_sel()) + e_sample = math.ceil(1.25 * nloc_sample * nnei) + + # make_fx traces on CPU; the .pt2 C++ ABI is float64-only. Pass device + # and dtype explicitly instead of mutating the module-level env.DEVICE. + sample_inputs = build_synthetic_graph_inputs( + model, + e_max=e_sample, + nframes=2, + nloc=nloc_sample, + dtype=torch.float64, + device=torch.device("cpu"), + ) + + ( + atype_g, + n_node_g, + edge_index_g, + edge_vec_g, + edge_mask_g, + fparam_g, + aparam_g, + charge_spin_g, + ) = sample_inputs + + # Trace via make_fx on CPU (decomposes autograd.grad into aten ops). + traced = model.forward_lower_graph_exportable( + atype_g, + n_node_g, + edge_index_g, + edge_vec_g, + edge_mask_g, + fparam=fparam_g, + aparam=aparam_g, + do_atomic_virial=do_atomic_virial, + charge_spin=charge_spin_g, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + sample_out = traced( + atype_g, + n_node_g, + edge_index_g, + edge_vec_g, + edge_mask_g, + fparam_g, + aparam_g, + charge_spin_g, + ) + output_keys = list(sample_out.keys()) + + dynamic_shapes = _build_graph_dynamic_shapes(*sample_inputs) + exported = torch.export.export( + traced, + sample_inputs, + dynamic_shapes=dynamic_shapes, + strict=False, + prefer_deferred_runtime_asserts_over_guards=True, + ) + + # Neutralise shape-guard assertion nodes on the dynamic edge axis. + # ``prefer_deferred_runtime_asserts_over_guards=True`` converts the + # symbolic-shape guards discovered while tracing into deferred + # ``aten._assert_scalar`` nodes; on the dynamic ``E`` axis these are the + # SIGFPE-prone ``nloc_min``-family checks (CLAUDE.md AOTI pitfalls) that + # the dense spin path already strips. Replacing each condition with + # ``True`` (not erasing the node) keeps the graph well-formed while + # letting the AOTI artifact generalise across edge counts. + _strip_shape_assertions(exported.graph_module) + + if target_device.type != "cpu": + from torch.export.passes import ( + move_to_device_pass, + ) + + exported = move_to_device_pass(exported, target_device) + + metadata["do_atomic_virial"] = do_atomic_virial + # The edge axis is DYNAMIC (B2.0): the AOTI forward accepts any edge + # count, so there is no ``edge_capacity`` to persist. The C++ / Python + # conversion hub builds the carry-all graph at its exact (tight) edge + # count and feeds it straight through. + + json_source = model_json_override if model_json_override is not None else data + data_for_json = deepcopy(json_source) + data_for_json = _numpy_to_json_serializable(data_for_json) + + return exported, metadata, data_for_json, output_keys # 3. Create sample inputs on CPU for tracing # torch.export's duck-sizing unifies dimensions with the same sample value, @@ -917,10 +1215,14 @@ def _deserialize_to_file_pte( data: dict, model_json_override: dict | None = None, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> None: """Deserialize a dictionary to a .pte model file.""" exported, metadata, data_for_json, output_keys = _trace_and_export( - data, model_json_override, do_atomic_virial=do_atomic_virial + data, + model_json_override, + do_atomic_virial=do_atomic_virial, + lower_kind=lower_kind, ) model_def_script = data.get("model_def_script") or {} @@ -939,6 +1241,7 @@ def _deserialize_to_file_pt2( data: dict, model_json_override: dict | None = None, do_atomic_virial: bool = False, + lower_kind: str = "nlist", ) -> None: """Deserialize a dictionary to a .pt2 model file (AOTInductor). @@ -976,7 +1279,10 @@ def _deserialize_to_file_pt2( # First artifact: regular (no comm). Always produced. exported, metadata, data_for_json, output_keys = _trace_and_export( - data, model_json_override, do_atomic_virial=do_atomic_virial + data, + model_json_override, + do_atomic_virial=do_atomic_virial, + lower_kind=lower_kind, ) metadata["output_keys"] = output_keys diff --git a/source/api_cc/include/DeepPotPTExpt.h b/source/api_cc/include/DeepPotPTExpt.h index 68a553e29c..ddaea35646 100644 --- a/source/api_cc/include/DeepPotPTExpt.h +++ b/source/api_cc/include/DeepPotPTExpt.h @@ -308,10 +308,12 @@ class DeepPotPTExpt : public DeepPotBackend { bool do_atomic_virial; // whether model was exported with atomic virial corr int nnei; // expected nlist nnei dimension (= sum(sel)) bool lower_input_is_edge_ = false; + bool lower_input_is_graph_ = false; NeighborListData nlist_data; - at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) - at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) - at::Tensor edge_index_tensor; // cached local edge graph (LAMMPS path) + at::Tensor mapping_tensor; // cached mapping tensor (LAMMPS path) + std::vector mapping_; // cached mapping vector (LAMMPS path) + at::Tensor firstneigh_tensor; // cached nlist tensor (LAMMPS path) + at::Tensor edge_index_tensor; // cached local edge graph (LAMMPS path) at::Tensor edge_index_ext_tensor; // cached extended edge graph (LAMMPS path) std::unique_ptr loader; // Optional second AOTInductor artifact for the multi-rank GNN code @@ -398,6 +400,30 @@ class DeepPotPTExpt : public DeepPotBackend { const torch::Tensor& aparam, const torch::Tensor& charge_spin); + /** + * @brief Run a NeighborGraph-schema ``.pt2`` (lower_input_kind="graph"). + * + * Positional AOTI input order matches the Python export ABI: + * ``(atype, n_node, edge_index, edge_vec, edge_mask, [fparam], [aparam], + * [charge_spin])``. Unlike the edge schema there is no ``coord`` and no + * ``edge_scatter_index`` input; node count is carried by ``n_node`` and the + * geometry is fully described by ``edge_vec``. + * + * @param[in] atype Per-node local types, shape ``(N,)`` int64. + * @param[in] n_node Per-frame node count, shape ``(nf,)`` int64. + * @param[in] edge_index Folded edge graph ``(2, E)`` int64 [src, dst]. + * @param[in] edge_vec Edge vectors ``(E, 3)`` (neighbour - center). + * @param[in] edge_mask Physical-edge mask ``(E,)`` bool. + */ + std::vector run_model_graph(const torch::Tensor& atype, + const torch::Tensor& n_node, + const torch::Tensor& edge_index, + const torch::Tensor& edge_vec, + const torch::Tensor& edge_mask, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const torch::Tensor& charge_spin); + /** * @brief Run the with-comm .pt2 artifact with comm tensors appended. * diff --git a/source/api_cc/include/commonPT.h b/source/api_cc/include/commonPT.h index 643e53974a..02c25aa047 100644 --- a/source/api_cc/include/commonPT.h +++ b/source/api_cc/include/commonPT.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -364,6 +366,233 @@ inline EdgeTensorPack compactEdgeTensors(const torch::Tensor& edge_index, return pack; } +struct GraphTensorPack { + torch::Tensor atype; + torch::Tensor n_node; + torch::Tensor edge_index; + torch::Tensor edge_vec; + torch::Tensor edge_mask; +}; + +/** + * @brief Build NeighborGraph input tensors from a host neighbor list + * (single-rank, dynamic edge axis). + * + * Mirrors the edge schema but drops ``coord``/``edge_scatter_index`` and adds + * ``n_node``. Edge construction is delegated to the existing + * ``createEdgeTensors``/``compactEdgeTensors`` helpers (same rcut filter, + * variable edge count and two masked dummy edges that keep the dynamic edge + * dimension non-empty); the wrapper then (a) drops the extended scatter index, + * (b) emits ``n_node = [nloc]`` for the single frame, and (c) sets the node + * types from the local slice of ``atype_ext``. + * + * @param nlist Neighbor-list rows (local idx into the extended set). + * @param coord Extended coordinates shaped as nall x 3. + * @param atype_ext Extended atom types, length nall. Node types are taken from + * the extended types (NOT ``atype[mapping]``); for single-rank ghost-free + * this is just ``atype_ext[0:nloc]``, while multi-rank (B3) passes the halo + * types directly. + * @param mapping Extended-to-local atom map, length nall. + * @param nloc Number of local atoms. + * @param nall Number of extended atoms. + * @param rcut Model cutoff (edges with ``rr > rcut**2`` are dropped). + * @param device Target device for the returned tensors. + * @param row_centers Optional center atom index for each neighbor-list row + * (LAMMPS compacts away empty rows); ``nullptr`` means row i is center i. + * @param fold_to_local Whether ghost neighbours are folded onto their local + * owners (single-rank, ``N == nloc``, ``n_node = [nloc]``, node types from + * ``atype_ext[0:nloc]``) or kept as distinct extended nodes (multi-rank, + * ``N == nall``, ``n_node = [nall]``, node types from the full ``atype_ext`` + * including the real halo types — the #5583 invariant). In the multi-rank + * case ``edge_index`` indexes the extended atoms directly, so ghost reaction + * forces land on the ghost rows and are folded to their owners by LAMMPS + * reverse-comm (no with-comm artifact / no border_op — dpa1 is non-MP). + */ +template +inline GraphTensorPack buildGraphTensors( + const std::vector>& nlist, + const std::vector& coord, + const std::vector& atype_ext, + const std::vector& mapping, + const int nloc, + const int nall, + const double rcut, + const torch::Device& device, + const std::vector* row_centers = nullptr, + const bool fold_to_local = true) { + auto int_options = torch::TensorOptions().dtype(torch::kInt64); + + // 1. Cached-style topology only (no geometry): when fold_to_local=true, + // edge_index folds ghost neighbours onto their local owners (single-rank); + // when false, edge_index indexes the extended atoms directly (multi-rank). + // edge_index_ext always keeps extended indices for the on-device geometry + // recompute. + const EdgeTensorPack topo = + createEdgeTensors(nlist, coord, mapping, nloc, nall, device, + /*with_geometry=*/false, row_centers, fold_to_local); + + // 2. Recompute geometry from the current coords on-device, filter by rcut and + // append the two masked dummy edges. The model is compiled for float64 + // inputs, so build the coord tensor as float64 to match the edge path. + std::vector coord_d(coord.begin(), coord.end()); + at::Tensor coord_tensor = + torch::from_blob(coord_d.data(), {static_cast(nall), 3}, + torch::TensorOptions().dtype(torch::kFloat64)) + .clone() + .to(device); + const EdgeTensorPack edges = compactEdgeTensors( + topo.edge_index, topo.edge_index_ext, coord_tensor, rcut); + + GraphTensorPack pack; + pack.edge_index = edges.edge_index; // (2, E): local-folded or extended + pack.edge_vec = edges.edge_vec; // (E, 3) neighbour - center + pack.edge_mask = edges.edge_mask; // (E,) bool + // Single-rank: N == nloc (ghosts folded onto owners). Multi-rank: N == nall + // (ghosts are distinct nodes whose features come from their real halo types). + const std::int64_t n_node_count = fold_to_local ? nloc : nall; + pack.n_node = torch::full({1}, n_node_count, int_options).to(device); + // Node types from the extended types (NOT atype[mapping]): the local slice + // for single-rank, the full extended set (incl. real halo types) for + // multi-rank. + std::vector atype_nodes(atype_ext.begin(), + atype_ext.begin() + n_node_count); + pack.atype = torch::from_blob(atype_nodes.data(), {n_node_count}, int_options) + .clone() + .to(device); + return pack; +} + +/** + * @brief Remap NeighborGraph (graph-schema) public outputs onto the dense + * internal-key layout the rest of ``compute`` consumes. + * + * The graph forward (``forward_lower_graph_exportable``) is LOCAL-only and + * emits flat-N PUBLIC keys: + * - ``atom_energy`` (N, 1) per-atom energy (N == nloc) + * - ``energy`` (nf, 1) reduced total energy + * - ``force`` (N, 3) per-atom force (ghosts already folded onto + * their local owners via ``edge_index``) + * - ``virial`` (nf, 9) reduced total virial + * - ``atom_virial`` (N, 9) per-atom (full-to-src) virial + * + * The downstream extraction in ``DeepPotPTExpt::compute`` was written for the + * dense forward's internal keys with their extra dims: + * ``energy_redu`` (nf,1), ``energy_derv_c_redu`` (nf,1,9), + * ``energy_derv_r`` (nf,nall,1,3), ``energy`` (nf,nloc,1), + * ``energy_derv_c`` (nf,nall,1,9). + * + * This helper rewrites the public keys into those internal keys (single frame, + * nf == 1). The per-atom force / atom-virial are LOCAL (nloc rows); they are + * zero-padded up to the extended length ``nall`` so the existing fold-back + * (``fold_back`` / ``select_map``) is a no-op on the ghost rows — the local + * rows already carry the folded ghost contributions, so zero ghosts avoid + * double counting (and keep LAMMPS reverse-comm correct). + * + * **Single-rank only.** Multi-rank inference (B3.2) must NOT call this + * function: ghost/halo forces are real cross-rank contributions that must be + * returned as-is and folded back via reverse-comm rather than being zeroed. + * Calling this function on a multi-rank result would silently zero those forces + * and produce wrong energetics. Pass ``single_rank = false`` to get an + * explicit exception instead of silent corruption. + * + * @param[in,out] output_map Output tensor map (public keys in, internal keys + * added). + * @param[in] nloc Number of local atoms (== N, the graph node count). + * @param[in] nall Extended atom count to pad the per-atom outputs up to. + * @param[in] atomic Whether atomic energy / virial were requested. + * @param[in] single_rank Must be true; throws deepmd_exception if false. + */ +inline void remap_graph_outputs_to_dense_keys( + std::map& output_map, + const std::int64_t nloc, + const std::int64_t nall, + const bool atomic, + const bool single_rank = true) { + if (!single_rank) { + throw deepmd::deepmd_exception( + "remap_graph_outputs_to_dense_keys is single-rank-only; multi-rank " + "uses the extended-region reverse-comm fold (PR-B3.2)"); + } + using torch::indexing::Slice; + const std::int64_t nf = 1; + const auto& energy_pub = output_map.at("energy"); // (nf, 1) + const auto& force_pub = output_map.at("force"); // (N, 3), N == nloc + const auto& virial_pub = output_map.at("virial"); // (nf, 9) + + output_map["energy_redu"] = energy_pub.reshape({nf, 1}); + output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9}); + + // Local force -> (nf, nall, 1, 3) with zero ghost rows. + auto force_full = torch::zeros({nf, nall, 1, 3}, force_pub.options()); + force_full.index_put_({0, Slice(0, nloc), 0}, force_pub); + output_map["energy_derv_r"] = force_full; + + if (atomic) { + const auto& atom_energy_pub = output_map.at("atom_energy"); // (N, 1) + const auto& atom_virial_pub = output_map.at("atom_virial"); // (N, 9) + output_map["energy"] = atom_energy_pub.reshape({nf, nloc, 1}); + auto atom_virial_full = + torch::zeros({nf, nall, 1, 9}, atom_virial_pub.options()); + atom_virial_full.index_put_({0, Slice(0, nloc), 0}, atom_virial_pub); + output_map["energy_derv_c"] = atom_virial_full; + } +} + +/** + * @brief Remap NeighborGraph public outputs onto the dense internal-key layout + * for the MULTI-RANK (extended-region) non-message-passing path. + * + * Built with ``fold_to_local=false``, the graph has ``N == nall`` nodes: ghost + * (halo) atoms are distinct nodes, so the per-node ``force`` is already the + * EXTENDED force (one row per extended atom). Ghost reaction forces stay on + * their ghost rows and are folded back to their owning rank by LAMMPS + * reverse-comm — exactly as the dense path returns its extended force. No + * zero-padding (unlike the single-rank helper) and no with-comm artifact (dpa1 + * is non-MP). + * + * Key differences from the single-rank helper: + * - ``energy_redu`` = sum of the LOCAL atom energies + * (``atom_energy[0:nloc]``) ONLY. The public ``energy`` key reduces over all + * ``N == nall`` nodes, which would double-count the bias energy of ghost nodes + * that belong to other ranks (ghost nodes have no center edges, so they carry a + * bias-only energy and zero force/virial gradient — harmless for force/virial + * but wrong for the owned energy). + * - ``energy_derv_r`` / ``energy_derv_c`` keep all ``nall`` rows (no + * padding). + * + * @param[in,out] output_map Output tensor map (public keys in, internal keys + * added). + * @param[in] nloc Number of local atoms (owned by this rank). + * @param[in] nall Extended atom count (== N, the graph node count). + * @param[in] atomic Whether atomic energy / virial were requested. + */ +inline void remap_graph_outputs_to_dense_keys_extended( + std::map& output_map, + const std::int64_t nloc, + const std::int64_t nall, + const bool atomic) { + using torch::indexing::Slice; + const std::int64_t nf = 1; + const auto& atom_energy_pub = output_map.at("atom_energy"); // (N==nall, 1) + const auto& force_pub = output_map.at("force"); // (N==nall, 3) extended + const auto& virial_pub = output_map.at("virial"); // (nf, 9) + + // Owned energy = sum over LOCAL atoms only; ghost nodes carry bias-only + // energy belonging to other ranks. + output_map["energy_redu"] = + atom_energy_pub.index({Slice(0, nloc)}).sum().reshape({nf, 1}); + output_map["energy_derv_c_redu"] = virial_pub.reshape({nf, 1, 9}); + // Extended force: ghost rows stay distinct for LAMMPS reverse-comm fold-back. + output_map["energy_derv_r"] = force_pub.reshape({nf, nall, 1, 3}); + + if (atomic) { + const auto& atom_virial_pub = output_map.at("atom_virial"); // (N==nall, 9) + output_map["energy"] = + atom_energy_pub.index({Slice(0, nloc)}).reshape({nf, nloc, 1}); + output_map["energy_derv_c"] = atom_virial_pub.reshape({nf, nall, 1, 9}); + } +} + } // namespace deepmd #endif // BUILD_PYTORCH diff --git a/source/api_cc/src/DeepPotPTExpt.cc b/source/api_cc/src/DeepPotPTExpt.cc index 96033fcab4..315f4cc39b 100644 --- a/source/api_cc/src/DeepPotPTExpt.cc +++ b/source/api_cc/src/DeepPotPTExpt.cc @@ -155,8 +155,10 @@ void DeepPotPTExpt::init(const std::string& model, const std::string lower_input_kind = metadata["lower_input_kind"].as_string(); lower_input_is_edge_ = lower_input_kind == "edge_vec"; + lower_input_is_graph_ = lower_input_kind == "graph"; } else { lower_input_is_edge_ = false; + lower_input_is_graph_ = false; } type_map.clear(); @@ -289,6 +291,31 @@ std::vector DeepPotPTExpt::run_model_edges( return loader->run(inputs); } +std::vector DeepPotPTExpt::run_model_graph( + const torch::Tensor& atype, + const torch::Tensor& n_node, + const torch::Tensor& edge_index, + const torch::Tensor& edge_vec, + const torch::Tensor& edge_mask, + const torch::Tensor& fparam, + const torch::Tensor& aparam, + const torch::Tensor& charge_spin) { + // NeighborGraph ABI: (atype, n_node, edge_index, edge_vec, edge_mask, + // [fparam], [aparam], [charge_spin]). No coord, no edge_scatter_index. + std::vector inputs = {atype, n_node, edge_index, edge_vec, + edge_mask}; + if (dfparam > 0) { + inputs.push_back(fparam); + } + if (daparam > 0) { + inputs.push_back(aparam); + } + if (dchgspin > 0) { + inputs.push_back(charge_spin); + } + return loader->run(inputs); +} + std::vector DeepPotPTExpt::run_model_with_comm( const torch::Tensor& coord, const torch::Tensor& atype, @@ -475,6 +502,24 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, bool multi_rank = (lmp_list.nprocs > 1); bool atom_map_present = (lmp_list.mapping != nullptr); bool use_with_comm = has_comm_artifact_ && multi_rank; + // NeighborGraph multi-rank dispatch: + // - NON-message-passing (dpa1, se_e2_a, ...): the SAME single-rank graph + // .pt2 runs on the EXTENDED region (fold_to_local=false; ghosts are + // distinct nodes whose features come from their real halo types). No + // with-comm artifact / no border_op is needed; ghost reaction forces are + // folded to their owners by LAMMPS reverse-comm. Handled below. + // - message-passing graph (DPA2/DPA3, PR-G): would need a with-comm graph + // artifact for cross-rank ghost-feature exchange — not yet supported. + // Fail fast before building any tensors so callers get a clear message + // instead of a wrong answer. + if (lower_input_is_graph_ && multi_rank && has_message_passing_) { + throw deepmd::deepmd_exception( + "Multi-rank message-passing graph (NeighborGraph) .pt2 inference is " + "not yet supported (PR-G). Non-message-passing graph models (e.g. " + "dpa1) run multi-rank on the extended-region single-rank artifact; " + "for message-passing models run single-rank, or use a dense/edge " + ".pt2 for multi-rank LAMMPS."); + } // Decision matrix (see PR #5450 description): // non-GNN model (has_message_passing_ == false): regular path is // always safe. @@ -505,19 +550,25 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // LAMMPS sets ago=0 on every nlist rebuild (neighbor rebuild, re-partition, // atom exchange between subdomains), so `ago > 0` implies the cached // mapping and nlist tensors are still valid. Rebuild only on ago==0. - std::vector mapping; if (ago == 0) { nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); - // Rebuild mapping tensor + // Rebuild mapping vector and tensor (cached as members). ``mapping_tensor`` + // is consumed every step by the dense ``run_model`` (ghost-feature gather); + // the ``mapping_`` vector is read only here at ago==0 -- to build that + // tensor and, for the edge/graph paths, to fold ghost neighbours onto their + // local owners inside ``createEdgeTensors``. (The graph path used to read + // ``mapping_`` every step via a per-step ``buildGraphTensors``; it now + // caches the topology at ago==0 like the edge/dense paths, so no per-step + // read.) if (lmp_list.mapping) { - mapping.resize(nall_real); + mapping_.resize(nall_real); for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; + mapping_[ii] = fwd_map[lmp_list.mapping[bkw_map[ii]]]; } mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) + torch::from_blob(mapping_.data(), {1, nall_real}, int_option) .clone() .to(device); } else { @@ -530,12 +581,12 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // features via border_op and ignores this tensor for ghost // gather — see deepmd/pt_expt/descriptor/ // repflows.py::_exchange_ghosts). - mapping.resize(nall_real); + mapping_.resize(nall_real); for (int ii = 0; ii < nall_real; ii++) { - mapping[ii] = ii; + mapping_[ii] = ii; } mapping_tensor = - torch::from_blob(mapping.data(), {1, nall_real}, int_option) + torch::from_blob(mapping_.data(), {1, nall_real}, int_option) .clone() .to(device); } @@ -551,11 +602,25 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, // their features can be exchanged across ranks via border_op, instead of // being folded onto a local owner that this rank does not own. const auto edge_tensors = createEdgeTensors( - nlist_data.jlist, dcoord, mapping, nloc, nall_real, device, + nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device, /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist, /*fold_to_local=*/!use_with_comm); edge_index_tensor = edge_tensors.edge_index; edge_index_ext_tensor = edge_tensors.edge_index_ext; + } else if (lower_input_is_graph_) { + // Cache only the real skin topology, exactly like the edge path: the + // geometry (edge_vec) + rcut filter are recomputed on-device every step + // by compactEdgeTensors, so the O(E) host loop + H2D copy in + // createEdgeTensors runs ONLY on a LAMMPS nlist rebuild (ago==0), not + // every step. Single-rank folds ghosts onto local owners + // (fold_to_local=true); non-MP multi-rank keeps the extended region + // (fold_to_local=false) so ghost forces reverse-comm to their owners. + const auto edge_tensors = createEdgeTensors( + nlist_data.jlist, dcoord, mapping_, nloc, nall_real, device, + /*with_geometry=*/false, /*row_centers=*/&nlist_data.ilist, + /*fold_to_local=*/!multi_rank); + edge_index_tensor = edge_tensors.edge_index; + edge_index_ext_tensor = edge_tensors.edge_index_ext; } else { nlist_data.padding(); firstneigh_tensor = createNlistTensor(nlist_data.jlist, nnei) @@ -771,6 +836,49 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_tensors.edge_index, edge_tensors.edge_vec, edge_tensors.edge_index_ext, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); + } else if (lower_input_is_graph_) { + if (nall_real == 0) { + // Truly-empty rank (no local atoms AND no ghosts): the graph would emit + // N == 0 nodes, and edge_force_virial's ``edge_index % node_capacity`` + // would divide by zero (SIGFPE) -- it also violates the exported + // ``Dim("n_node_total", min=1)``. Such a rank contributes nothing, so + // fill zero outputs and return instead of running the model. (The + // tested ``nloc == 0`` empty-subdomain case has ``nall_real > 0`` -- + // ghosts within rcut -- so it still runs the model normally.) + ener.assign(nframes, static_cast(0)); + force.assign(static_cast(nframes) * fwd_map.size() * 3, + static_cast(0)); + virial.assign(static_cast(nframes) * 9, + static_cast(0)); + if (atomic) { + atom_energy.assign(static_cast(nframes) * fwd_map.size(), + static_cast(0)); + atom_virial.assign(static_cast(nframes) * fwd_map.size() * 9, + static_cast(0)); + } + return; + } + // NeighborGraph schema: recompute geometry + rcut filter on-device from + // the cached skin topology (edge_index[_ext]_tensor built at ago==0), + // then assemble the cheap node tensors. Mirrors the edge path -- no + // per-step host rebuild / H2D copy. Single-rank folds ghosts onto local + // owners (N == nloc); multi-rank (non-MP only — the fail-fast above + // blocks MP graph multi-rank) keeps the extended region (N == nall_real, + // node types from the real halo types) so LAMMPS reverse-comm folds ghost + // forces back. The node types come from the on-device extended + // atype_Tensor slice (== atype_ext[0:N]); n_node is a 1-element tensor. + const auto edge_tensors = + compactEdgeTensors(edge_index_tensor, edge_index_ext_tensor, + coord_Tensor, static_cast(rcut)); + const std::int64_t n_node_count = multi_rank ? nall_real : nloc; + at::Tensor n_node_tensor = + torch::full({1}, n_node_count, int_option).to(device); + at::Tensor node_atype = + atype_Tensor.slice(1, 0, n_node_count).reshape({n_node_count}); + flat_outputs = + run_model_graph(node_atype, n_node_tensor, edge_tensors.edge_index, + edge_tensors.edge_vec, edge_tensors.edge_mask, + fparam_tensor, aparam_tensor, charge_spin_tensor); } else { flat_outputs = run_model(coord_Tensor, atype_Tensor, firstneigh_tensor, mapping_tensor, fparam_tensor, aparam_tensor, @@ -782,6 +890,24 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, std::map output_map; extract_outputs(output_map, flat_outputs); + if (lower_input_is_graph_) { + // The graph forward emits flat-N PUBLIC keys (atom_energy/energy/force/ + // virial/atom_virial); rewrite them into the dense internal-key layout the + // downstream extraction/fold-back expects. + if (multi_rank) { + // Extended region (N == nall_real): force is already per-extended-atom, + // owned energy = sum over local atom energies, no zero-padding. Ghost + // forces fold back via LAMMPS reverse-comm (no with-comm artifact). + deepmd::remap_graph_outputs_to_dense_keys_extended(output_map, nloc, + nall_real, atomic); + } else { + // Single-rank (N == nloc): ghosts folded onto owners; pad the per-atom + // force/virial up to nall_real with zero ghost rows. + deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall_real, + atomic, /*single_rank=*/true); + } + } + if (phantom_n > 0) { // Strip the phantom local prefix and zero the empty rank's energy. The // phantom atoms carry no edges, so their force / per-atom virial are @@ -1015,9 +1141,16 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, .to(device); at::Tensor nlist_tensor; EdgeTensorPack edge_tensors; + GraphTensorPack graph_tensors; if (lower_input_is_edge_) { edge_tensors = createEdgeTensors(nlist_raw, coord_cpy_d, mapping_64, nloc, nall, device); + } else if (lower_input_is_graph_) { + // Standalone (no nlist) graph schema: build_nlist already cut at rcut and + // keys row i to center i, so no row_centers remapping is needed. + graph_tensors = + buildGraphTensors(nlist_raw, coord_cpy_d, atype_cpy, mapping_64, nloc, + nall, static_cast(rcut), device); } else { nlist_tensor = createNlistTensor(nlist_raw, nnei).to(torch::kInt64).to(device); @@ -1104,6 +1237,11 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, edge_tensors.edge_index, edge_tensors.edge_vec, edge_tensors.edge_index_ext, edge_tensors.edge_mask, fparam_tensor, aparam_tensor, charge_spin_tensor); + } else if (lower_input_is_graph_) { + flat_outputs = run_model_graph( + graph_tensors.atype, graph_tensors.n_node, graph_tensors.edge_index, + graph_tensors.edge_vec, graph_tensors.edge_mask, fparam_tensor, + aparam_tensor, charge_spin_tensor); } else { flat_outputs = run_model(coord_Tensor, atype_Tensor, nlist_tensor, mapping_tensor, @@ -1114,6 +1252,17 @@ void DeepPotPTExpt::compute(ENERGYVTYPE& ener, std::map output_map; extract_outputs(output_map, flat_outputs); + if (lower_input_is_graph_) { + // The graph forward emits LOCAL public keys; rewrite them into the dense + // internal-key layout used below. nloc == N (graph node count); pad the + // per-atom force/virial up to the extended nall with zero ghost rows so the + // fold-back is a no-op on ghosts. + // single_rank=true: the standalone (build_nlist) path is always + // single-rank; there is no comm_dict / cross-rank ghost exchange here. + deepmd::remap_graph_outputs_to_dense_keys(output_map, nloc, nall, atomic, + /*single_rank=*/true); + } + // 7. Extract energy torch::Tensor flat_energy_ = output_map["energy_redu"].view({-1}).to(torch::kCPU); diff --git a/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc new file mode 100644 index 0000000000..c57abaf0a0 --- /dev/null +++ b/source/api_cc/tests/test_deeppot_dpa1_graph_ptexpt.cc @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// Test C++ inference for the NeighborGraph (graph-schema) .pt2 path of the +// pt_expt backend. The graph model is a dpa1(attn_layer=0) descriptor exported +// with lower_kind="graph" (gen_dpa1.py section B); this is the FIRST runtime +// exercise of the C++ graph ingestion added in PR-B Phase B2 +// (lower_input_is_graph_ / run_model_graph / buildGraphTensors / the +// compute_inner graph branch). +// +// Reference values (deeppot_dpa1_graph.expected) come from an INDEPENDENT +// nlist (dense-quartet) evaluation of the same weights, so a match validates +// the graph AOTI ABI/geometry, not just self-consistency. A second, persisted +// nlist .pt2 of the same weights (deeppot_dpa1_graph_nlist_ref.pt2) is loaded +// alongside the graph model so arbitrary system sizes (dynamic edge axis) can +// be cross-checked graph≈dense live without baking more reference blocks. +#include + +#include +#include +#include + +#include "DeepPot.h" +#include "DeepPotPTExpt.h" +#include "expected_ref.h" +#include "neighbor_list.h" +#include "test_utils.h" + +namespace { +constexpr const char* kGraphModel = "../../tests/infer/deeppot_dpa1_graph.pt2"; +constexpr const char* kNlistRefModel = + "../../tests/infer/deeppot_dpa1_graph_nlist_ref.pt2"; +constexpr const char* kRefPath = + "../../tests/infer/deeppot_dpa1_graph.expected"; +} // namespace + +template +class TestInferDpa1GraphPtExpt : public ::testing::Test { + protected: + std::vector coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, 4.27, 3.22, 1.56}; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector box = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + // Per-atom reference (energy/force/virial) loaded from the .expected sidecar. + std::vector expected_e; + std::vector expected_f; + std::vector expected_v; + int natoms; + double expected_tot_e; + std::vector expected_tot_v; + + // Graph-schema model under test. + static deepmd::DeepPot dp; + // Independent nlist (dense) model with identical weights — used as a live + // graph≈dense oracle for arbitrary system sizes. + static deepmd::DeepPot dp_ref; + + static void SetUpTestSuite() { +#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT + dp.init(kGraphModel); + dp_ref.init(kNlistRefModel); +#endif + } + + void SetUp() override { +#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT + GTEST_SKIP() << "Skip because PyTorch support is not enabled."; +#endif + deepmd_test::ExpectedRef ref; + ref.load(kRefPath); + expected_e = ref.get("pbc", "expected_e"); + expected_f = ref.get("pbc", "expected_f"); + expected_v = ref.get("pbc", "expected_v"); + + natoms = expected_e.size(); + EXPECT_EQ(natoms * 3, static_cast(expected_f.size())); + EXPECT_EQ(natoms * 9, static_cast(expected_v.size())); + expected_tot_e = 0.; + expected_tot_v.assign(9, 0.); + for (int ii = 0; ii < natoms; ++ii) { + expected_tot_e += expected_e[ii]; + } + for (int ii = 0; ii < natoms; ++ii) { + for (int dd = 0; dd < 9; ++dd) { + expected_tot_v[dd] += expected_v[ii * 9 + dd]; + } + } + }; + + void TearDown() override {}; + + static void TearDownTestSuite() { + dp = deepmd::DeepPot(); + dp_ref = deepmd::DeepPot(); + } +}; + +template +deepmd::DeepPot TestInferDpa1GraphPtExpt::dp; +template +deepmd::DeepPot TestInferDpa1GraphPtExpt::dp_ref; + +TYPED_TEST_SUITE(TestInferDpa1GraphPtExpt, ValueTypes); + +// Case 1: DeepPot builds its own neighbor list and runs the standalone graph +// branch (lower_input_is_graph_, build_nlist -> buildGraphTensors). Validates +// the graph AOTI ABI/geometry against the independent nlist reference. +TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + double ener; + std::vector force, virial; + dp.compute(ener, force, virial, coord, atype, box); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } +} + +// Case 2: a SECOND, larger system (12 atoms, different edge count) through the +// SAME loaded graph model — proves the dynamic edge axis works in C++. The +// graph result is cross-checked against the dense nlist .pt2 (same weights); +// at non-binding sel they must agree bit-for-bit (fp64 ~1e-10). +TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_sys2_dynamic_edges) { + using VALUETYPE = TypeParam; + deepmd::DeepPot& dp = this->dp; + deepmd::DeepPot& dp_ref = this->dp_ref; + + // 12 atoms: original 6 stacked with a +13 z-shifted copy, box doubled in z. + // Same local density as the 6-atom fixture, so per-atom neighbor counts stay + // far below sel=30 and graph(carry-all) == dense(sel-truncated). + std::vector coord2 = { + 12.83, 2.56, 2.18, 12.09, 2.87, 2.74, 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, 3.51, 2.51, 2.60, 4.27, 3.22, 1.56, + 12.83, 2.56, 15.18, 12.09, 2.87, 15.74, 00.25, 3.32, 14.68, + 3.36, 3.00, 14.81, 3.51, 2.51, 15.60, 4.27, 3.22, 14.56}; + std::vector atype2 = {0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1}; + std::vector box2 = {13., 0., 0., 0., 13., 0., 0., 0., 26.}; + int natoms2 = atype2.size(); + + double ener_g, ener_r; + std::vector force_g, virial_g, force_r, virial_r; + dp.compute(ener_g, force_g, virial_g, coord2, atype2, box2); + dp_ref.compute(ener_r, force_r, virial_r, coord2, atype2, box2); + + EXPECT_EQ(force_g.size(), static_cast(natoms2 * 3)); + EXPECT_EQ(virial_g.size(), 9u); + + EXPECT_LT(fabs(ener_g - ener_r), EPSILON); + for (int ii = 0; ii < natoms2 * 3; ++ii) { + EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial_g[ii] - virial_r[ii]), EPSILON); + } +} + +// Case 3 (CRITICAL): exercise the LAMMPS compute_inner graph branch with an +// explicit InputNlist and the `ago` cache. Calling compute twice WITHOUT +// rebuilding the nlist — first ago=0 (rebuild), then ago=1 (reuse) — must give +// identical results. This is the only case that hits compute_inner + the +// member-cached mapping_ vector; the build-nlist cases above never touch it. +// Regression guard for the OOB-on-ago>0 bug fixed by caching mapping_ as a +// member (commit 7c70db47b). +TYPED_TEST(TestInferDpa1GraphPtExpt, lammps_nlist_ago) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_f = this->expected_f; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + + float rc = dp.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector > nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, + atype, box, rc); + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_data); + // The graph branch folds ghost neighbours onto their local owners via the + // LAMMPS atom-map; without it periodic (ghost) edges would be dropped. + inlist.mapping = mapping.data(); + + // ago=0: rebuild the cached nlist/mapping, then run the graph branch. + double ener; + std::vector force_, virial; + dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc, + inlist, 0); + std::vector force; + _fold_back(force, force_, mapping, nloc, nall, 3); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } + + // ago=1: reuse the cached nlist/mapping (NO rebuild). Must match again. + // This is the path that previously read the local mapping vector OOB. + ener = 0.; + std::fill(force_.begin(), force_.end(), 0.0); + std::fill(virial.begin(), virial.end(), 0.0); + dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc, + inlist, 1); + _fold_back(force, force_, mapping, nloc, nall, 3); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } +} + +// Case 5: exercise the DeepPot::compute ATOMIC overload on the graph .pt2. +// This is the first test to reach the ``if (atomic)`` branch inside +// remap_graph_outputs_to_dense_keys (the atom_energy/atom_virial remapping). +// The per-atom reference values are already loaded from +// deeppot_dpa1_graph.expected into this->expected_e and this->expected_v by +// SetUp(). +TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_atomic) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_e = this->expected_e; + std::vector& expected_f = this->expected_f; + std::vector& expected_v = this->expected_v; + int& natoms = this->natoms; + double& expected_tot_e = this->expected_tot_e; + std::vector& expected_tot_v = this->expected_tot_v; + deepmd::DeepPot& dp = this->dp; + + double ener; + std::vector force, virial, atom_energy, atom_virial; + // Standalone atomic overload: DeepPot builds its own nlist (graph branch), + // then returns per-atom energy + atom-virial alongside total + // energy/force/virial. + dp.compute(ener, force, virial, atom_energy, atom_virial, coord, atype, box); + + EXPECT_EQ(force.size(), static_cast(natoms * 3)); + EXPECT_EQ(virial.size(), 9u); + EXPECT_EQ(atom_energy.size(), static_cast(natoms)); + EXPECT_EQ(atom_virial.size(), static_cast(natoms * 9)); + + EXPECT_LT(fabs(ener - expected_tot_e), EPSILON); + for (int ii = 0; ii < natoms * 3; ++ii) { + EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON); + } + for (int ii = 0; ii < natoms; ++ii) { + EXPECT_LT(fabs(atom_energy[ii] - expected_e[ii]), EPSILON); + } + for (int ii = 0; ii < natoms * 9; ++ii) { + EXPECT_LT(fabs(atom_virial[ii] - expected_v[ii]), EPSILON); + } +} + +// Case 4: a tiny system with no in-cutoff neighbors — only the two masked +// dummy edges survive (nedge_min=2 guard / SIGFPE-edge family). The graph +// must run cleanly, produce finite, interaction-free output (zero force/virial) +// and agree with the dense reference. +TYPED_TEST(TestInferDpa1GraphPtExpt, cpu_build_nlist_tiny_no_edges) { + using VALUETYPE = TypeParam; + deepmd::DeepPot& dp = this->dp; + deepmd::DeepPot& dp_ref = this->dp_ref; + + // Two atoms ~33 apart in a 40-box: no neighbor within rcut=6 and no periodic + // image either, so the graph sees zero real edges (only the 2 dummy edges). + std::vector coord_t = {1.0, 1.0, 1.0, 20.0, 20.0, 20.0}; + std::vector atype_t = {0, 1}; + std::vector box_t = {40., 0., 0., 0., 40., 0., 0., 0., 40.}; + int natoms_t = atype_t.size(); + + double ener_g, ener_r; + std::vector force_g, virial_g, force_r, virial_r; + ASSERT_NO_THROW( + dp.compute(ener_g, force_g, virial_g, coord_t, atype_t, box_t)); + dp_ref.compute(ener_r, force_r, virial_r, coord_t, atype_t, box_t); + + EXPECT_EQ(force_g.size(), static_cast(natoms_t * 3)); + EXPECT_EQ(virial_g.size(), 9u); + + EXPECT_TRUE(std::isfinite(ener_g)); + // No interactions: force and virial must vanish. + for (int ii = 0; ii < natoms_t * 3; ++ii) { + EXPECT_TRUE(std::isfinite(force_g[ii])); + EXPECT_LT(fabs(force_g[ii]), EPSILON); + } + for (int ii = 0; ii < 9; ++ii) { + EXPECT_TRUE(std::isfinite(virial_g[ii])); + EXPECT_LT(fabs(virial_g[ii]), EPSILON); + } + // graph == dense for the isolated-atom limit. + EXPECT_LT(fabs(ener_g - ener_r), EPSILON); + for (int ii = 0; ii < natoms_t * 3; ++ii) { + EXPECT_LT(fabs(force_g[ii] - force_r[ii]), EPSILON); + } +} diff --git a/source/api_cc/tests/test_deeppot_universal.cc b/source/api_cc/tests/test_deeppot_universal.cc index e0ee6fc8f4..1c599d7d33 100644 --- a/source/api_cc/tests/test_deeppot_universal.cc +++ b/source/api_cc/tests/test_deeppot_universal.cc @@ -143,6 +143,28 @@ std::vector variant_deeppot_cases() { /*supports_no_pbc_atomic=*/false, /*supports_no_pbc_lmp_nlist=*/true, /*supports_no_pbc_lmp_nlist_atomic=*/false}, + {"dpa1_graph_ptexpt", + Backend::PTExpt, + "../../tests/infer/deeppot_dpa1_graph.pt2", + /*convert_pbtxt=*/false, + nullptr, + nullptr, + "../../tests/infer/deeppot_dpa1_graph.expected", + "pbc", + "nopbc", + 1e-10, + 1e-4, + /*supports_float=*/true, + /*supports_finite_difference=*/true, + /*supports_lmp_nlist=*/true, + /*supports_lmp_nlist_atomic=*/true, + /*supports_lmp_nlist_cutoff_twice=*/true, + /*supports_lmp_nlist_type_sel=*/true, + /*supports_print_summary=*/true, + /*supports_no_pbc_simple=*/true, + /*supports_no_pbc_atomic=*/false, + /*supports_no_pbc_lmp_nlist=*/true, + /*supports_no_pbc_lmp_nlist_atomic=*/false}, {"dpa2_pytorch_pth", Backend::PyTorch, "../../tests/infer/deeppot_dpa2.pth", diff --git a/source/lmp/tests/test_lammps_dpa1_graph_pt2.py b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py new file mode 100644 index 0000000000..ab898e15f6 --- /dev/null +++ b/source/lmp/tests/test_lammps_dpa1_graph_pt2.py @@ -0,0 +1,319 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Test LAMMPS with the NeighborGraph (graph-schema) .pt2 DPA1 model. + +The model ``deeppot_dpa1_graph.pt2`` is a dpa1(attn_layer=0) descriptor +exported with ``lower_kind="graph"`` (gen_dpa1.py section B). dpa1 is +NON-message-passing, so the SAME single-rank graph .pt2 also drives the +multi-rank path: the C++ ``DeepPotPTExpt`` builds an EXTENDED-region graph +(``fold_to_local=False``; ghosts are distinct nodes whose features come from +their real halo types) and returns per-extended-atom forces, which LAMMPS +reverse-comm folds back to their owners. There is NO with-comm artifact and +NO ``border_op`` (that is the message-passing PR-G path) — hence no +``use_loc_mapping=False`` variant. + +Reference values come from ``source/tests/infer/gen_dpa1.py`` (the same +``deeppot_dpa1_graph.expected`` the C++ gtest uses); the multi-rank run must +match the single-rank reference for energy, per-atom force, and per-atom +virial. This is the core multi-rank correctness gate for the non-MP graph +path implemented in B3.1. +""" + +import importlib.util +import os +import shutil +import subprocess as sp +import sys +import tempfile +from pathlib import ( + Path, +) + +import constants +import numpy as np +import pytest +from expected_ref import ( + read_expected_ref, +) +from lammps import ( + PyLammps, +) +from write_lmp_data import ( + write_lmp_data, +) + +pb_file = ( + Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa1_graph.pt2" +) +ref_file = ( + Path(__file__).parent.parent.parent + / "tests" + / "infer" + / "deeppot_dpa1_graph.expected" +) +# The MPI runner is backend-agnostic (DATAFILE PB_FILE OUTPUT + flags); reuse +# the DPA3 driver verbatim rather than duplicate it. +mpi_runner = Path(__file__).parent / "run_mpi_pair_deepmd_dpa3_pt2.py" + +data_file = Path(__file__).parent / "data_dpa1_graph_pt2.lmp" +# Elongated-box variant for the empty-subdomain MPI corner: x extended to +# 30 A while atoms stay in x in [0.25, 12.83]; with ``processors 2 1 1`` the +# split at x = 15 leaves rank 1 with zero local atoms. +data_file_empty_subdomain = ( + Path(__file__).parent / "data_dpa1_graph_pt2_empty_subdomain.lmp" +) + +# Reference values written by source/tests/infer/gen_dpa1.py (PBC case). +# Guarded with try/except because gen_dpa1.py only runs when PyTorch is built. +try: + _ref = read_expected_ref(ref_file)["pbc"] + expected_e = float(np.sum(_ref["expected_e"])) + expected_f = _ref["expected_f"].reshape(6, 3) + # LAMMPS uses the opposite sign convention for virial vs DeepPot. + expected_v = -_ref["expected_v"].reshape(6, 9) +except FileNotFoundError: + expected_e = expected_f = expected_v = None + +# Gate the reference-comparison tests on the generated ``.expected`` fixture so +# they skip cleanly (rather than failing with a ``TypeError`` on ``None``) when +# gen_dpa1.py has not run (e.g. PyTorch not built). The MPI multi-rank tests +# compare against a single-rank run of the same archive and do not need it. +_HAS_REF = expected_e is not None + +box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0]) +coord = np.array( + [ + [12.83, 2.56, 2.18], + [12.09, 2.87, 2.74], + [0.25, 3.32, 1.68], + [3.36, 3.00, 1.81], + [3.51, 2.51, 2.60], + [4.27, 3.22, 1.56], + ] +) +# Model type_map is ["O", "H"]; gtest atype = [0, 1, 1, 0, 1, 1] -> LAMMPS +# types [1, 2, 2, 1, 2, 2] under identity ``pair_coeff * *``. +type_OH = np.array([1, 2, 2, 1, 2, 2]) + + +def setup_module() -> None: + if os.environ.get("ENABLE_PYTORCH", "1") != "1": + pytest.skip( + "Skip test because PyTorch support is not enabled.", + ) + write_lmp_data(box, coord, type_OH, data_file) + box_empty_subdomain = np.array([0, 30, 0, 13, 0, 13, 0, 0, 0]) + write_lmp_data(box_empty_subdomain, coord, type_OH, data_file_empty_subdomain) + + +def teardown_module() -> None: + for f in [data_file, data_file_empty_subdomain]: + if f.exists(): + os.remove(f) + + +def _lammps(data_file, units="metal", atom_map: str = "yes") -> PyLammps: + lammps = PyLammps() + lammps.units(units) + lammps.boundary("p p p") + lammps.atom_style("atomic") + if atom_map != "no": + lammps.atom_modify(f"map {atom_map}") + lammps.neighbor("2.0 bin") + lammps.neigh_modify("every 10 delay 0 check no") + lammps.read_data(data_file.resolve()) + lammps.mass("1 16") + lammps.mass("2 2") + lammps.timestep(0.0005) + lammps.fix("1 all nve") + return lammps + + +@pytest.fixture +def lammps(): + lmp = _lammps(data_file=data_file) + yield lmp + lmp.close() + + +@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated") +def test_pair_deepmd(lammps) -> None: + """Single-rank serial run (``atom_modify map yes``): the graph .pt2 + folds ghosts onto local owners (``fold_to_local=True``) and must match + the gen_dpa1.py reference for energy and per-atom force. + """ + lammps.pair_style(f"deepmd {pb_file.resolve()}") + lammps.pair_coeff("* *") + lammps.run(0) + assert lammps.eval("pe") == pytest.approx(expected_e) + for ii in range(6): + assert lammps.atoms[ii].force == pytest.approx( + expected_f[lammps.atoms[ii].id - 1] + ) + lammps.run(1) + + +@pytest.mark.skipif(not _HAS_REF, reason="gen_dpa1.py .expected fixture not generated") +def test_pair_deepmd_virial(lammps) -> None: + """Single-rank per-atom virial via ``centroid/stress/atom``.""" + lammps.pair_style(f"deepmd {pb_file.resolve()}") + lammps.pair_coeff("* *") + lammps.compute("virial all centroid/stress/atom NULL pair") + for ii in range(9): + jj = [0, 4, 8, 3, 6, 7, 1, 2, 5][ii] + lammps.variable(f"virial{jj} atom c_virial[{ii + 1}]") + lammps.dump( + "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)]) + ) + lammps.run(0) + assert lammps.eval("pe") == pytest.approx(expected_e) + for ii in range(6): + assert lammps.atoms[ii].force == pytest.approx( + expected_f[lammps.atoms[ii].id - 1] + ) + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 + for ii in range(9): + assert np.array( + lammps.variables[f"virial{ii}"].value + ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii]) + + +# --------------------------------------------------------------------------- +# Multi-rank test (non-MP extended-region graph path; B3.1). +# +# dpa1 is non-message-passing, so multi-rank uses the SAME single-rank graph +# .pt2 on the extended region. The expected energy/force/virial are the +# single-rank reference: each rank evaluates its local atoms over the extended +# graph; ghost reaction forces fold back via LAMMPS reverse-comm. +# --------------------------------------------------------------------------- + + +def _run_mpi_subprocess( + extra_args: list[str] | None = None, + nprocs: int = 2, + data_path: Path | None = None, + processors: str | None = None, + runner_args: list[str] | None = None, +) -> dict: + """Invoke the (backend-agnostic) DPA3 MPI runner under + ``mpirun -n `` against the dpa1 graph .pt2 and return + ``{"pe": float, "forces": (n, 3), "virials": (n, 9)}``. + + ``nprocs == 1`` forces ``--processors 1 1 1`` so the C++ side sees + ``nprocs == 1`` and routes to the single-rank graph path — a + same-archive reference for the multi-rank comparison. + """ + if data_path is None: + data_path = data_file + with tempfile.NamedTemporaryFile(mode="r", suffix=".out", delete=False) as f: + out_path = f.name + try: + argv = [ + "mpirun", + "-n", + str(nprocs), + sys.executable, + str(mpi_runner), + str(data_path.resolve()), + str(pb_file.resolve()), + out_path, + ] + if processors is not None: + argv.extend(["--processors", processors]) + elif nprocs == 1: + argv.extend(["--processors", "1 1 1"]) + if extra_args: + argv.extend(extra_args) + if runner_args: + argv.extend(runner_args) + sp.check_call(argv) + with open(out_path) as fh: + lines = fh.read().strip().splitlines() + pe = float(lines[0]) + rows = np.array( + [list(map(float, line.split())) for line in lines[1:]], + dtype=np.float64, + ) + forces = rows[:, :3] + virials = rows[:, 3:] + return {"pe": pe, "forces": forces, "virials": virials} + finally: + if os.path.exists(out_path): + os.remove(out_path) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa1_graph() -> None: + """Multi-rank LAMMPS run for the dpa1 graph .pt2 must match the + single-rank reference within numerical tolerance for energy, forces, + and per-atom virial. + + This is the core correctness gate for the non-MP extended-region + multi-rank C++ path (B3.1): the extended graph + reverse-comm + fold-back must reproduce the folded single-rank result. + """ + out = _run_mpi_subprocess() + assert out["pe"] == pytest.approx(expected_e, rel=0, abs=1e-8) + for ii in range(6): + np.testing.assert_allclose(out["forces"][ii], expected_f[ii], atol=1e-8, rtol=0) + # ``centroid/stress/atom`` column order [xx, yy, zz, xy, xz, yz, yx, zx, + # zy]; the inverse permutation maps it back to the expected_v columns. + expected_v_to_lammps = [0, 6, 7, 3, 1, 8, 4, 5, 2] + np.testing.assert_allclose( + out["virials"][:, expected_v_to_lammps] / constants.nktv2p, + expected_v, + atol=1e-8, + rtol=0, + ) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa1_graph_matches_single_rank() -> None: + """Multi-rank (``-n 2``) ≡ single-rank (``-n 1``) on the SAME archive + and trajectory — isolates the extended-region multi-rank C++ path from + the .pt2 reference values (a wrong-but-finite divergence would show up + here even if the hardcoded reference drifted). + """ + out_mpi = _run_mpi_subprocess(nprocs=2) + out_ref = _run_mpi_subprocess(nprocs=1) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) + + +@pytest.mark.skipif( + shutil.which("mpirun") is None, reason="MPI is not installed on this system" +) +@pytest.mark.skipif( + importlib.util.find_spec("mpi4py") is None, reason="mpi4py is not installed" +) +def test_pair_deepmd_mpi_dpa1_graph_empty_subdomain() -> None: + """Multi-rank with one rank owning zero local atoms (elongated box, + ``processors 2 1 1``, split at x = 15). The extended-region graph path + must still produce correct forces/virial on the populated rank and a + zero contribution from the empty rank — compared against a same-archive + single-rank reference of the same fixture. + """ + # Force ``processors 2 1 1`` so the split is along x at 15 and rank 1 is + # genuinely empty -- otherwise LAMMPS may auto-pick a grid where neither + # rank is empty and the branch under test is not exercised. + out_mpi = _run_mpi_subprocess( + nprocs=2, data_path=data_file_empty_subdomain, processors="2 1 1" + ) + out_ref = _run_mpi_subprocess(nprocs=1, data_path=data_file_empty_subdomain) + np.testing.assert_allclose(out_mpi["forces"], out_ref["forces"], atol=1e-8, rtol=0) + np.testing.assert_allclose( + out_mpi["virials"], out_ref["virials"], atol=1e-8, rtol=0 + ) + assert out_mpi["pe"] == pytest.approx(out_ref["pe"], rel=1e-8, abs=1e-10) diff --git a/source/tests/common/dpmodel/test_edge_force_virial.py b/source/tests/common/dpmodel/test_edge_force_virial.py index fa84ef7ba4..722960f57a 100644 --- a/source/tests/common/dpmodel/test_edge_force_virial.py +++ b/source/tests/common/dpmodel/test_edge_force_virial.py @@ -97,6 +97,38 @@ def test_all_edges_masked_gives_zero(self) -> None: np.testing.assert_allclose(av, np.zeros((n, 3, 3))) np.testing.assert_allclose(vir, np.zeros((nf, 3, 3))) + def test_modulo_clamp_leaves_real_edges_unchanged(self) -> None: + # INVARIANT (iProzd review): the in-bounds index clamp (``% n_out``) that + # keeps the CUDA-exported scatter address legal must NEVER alter a real + # (edge_mask == True) edge -- only masked/out-of-range guard edges may be + # remapped, and those carry zero weight so remapping is harmless. Here a + # REAL edge sits on the boundary node ``n_out - 1`` (the largest valid + # index, where a wrong wrap would be visible) and a MASKED guard edge + # carries deliberately OUT-OF-RANGE indices (>= n_out) with nonzero g/vec. + # Correctness requires the result to equal the real-edges-only reference: + # the boundary real edge must land on node n_out-1 (not wrapped), and the + # out-of-range guard must contribute nothing. If real edges were ever + # remapped by the modulo (the shape-binding bug iProzd warned about), the + # boundary node's force/virial would be wrong and this test would fail. + n_node = np.array([5], dtype=np.int64) # 1 frame, nodes 0..4 (n_out = 5) + # e0: real, src on the boundary node 4 -> node 0 ; e1: real, node 0 -> 4 + # e2: MASKED guard with out-of-range indices src=99, dst=77 (>= n_out) + edge_index = np.array([[4, 0, 99], [0, 4, 77]], dtype=np.int64) + edge_vec = np.array([[1.0, 0.0, 0.0], [-1.0, 0.0, 0.0], [9.0, 9.0, 9.0]]) + edge_mask = np.array([True, True, False]) + g = np.array([[0.5, 0.2, 0.0], [0.3, 0.0, 0.1], [7.0, 7.0, 7.0]]) + force, av, vir = edge_force_virial(g, edge_vec, edge_index, edge_mask, n_node) + + # reference: the SAME two real edges only (no guard edge at all) + ref_force, ref_av, ref_vir = edge_force_virial( + g[:2], edge_vec[:2], edge_index[:, :2], edge_mask[:2], n_node + ) + np.testing.assert_allclose(force, ref_force) + np.testing.assert_allclose(av, ref_av) + np.testing.assert_allclose(vir, ref_vir) + # explicit: the boundary real edge scattered its force to node 4 (unwrapped) + self.assertTrue(np.any(force[4] != 0.0)) + def test_ragged_multiframe_with_edge_and_node_padding(self) -> None: # MOST GENERAL case: 2 frames with DIFFERENT node counts (3 and 5) AND # different edge counts (2 and 3), masked guard EDGES, and a padded NODE diff --git a/source/tests/common/dpmodel/test_graph_static_capacity.py b/source/tests/common/dpmodel/test_graph_static_capacity.py new file mode 100644 index 0000000000..c9ff982d01 --- /dev/null +++ b/source/tests/common/dpmodel/test_graph_static_capacity.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Tests for static edge_capacity masked padding in build_neighbor_graph. + +Codifies the contract: build_neighbor_graph(..., layout=GraphLayout(edge_capacity=E_max)) +returns a NeighborGraph whose edge_index/edge_vec/edge_mask have a STATIC leading +edge dim E_max (real edges in the compact prefix, edge_mask=False tail), so export +sees a fixed E. Edge overflow must raise ValueError. +""" + +import numpy as np +import pytest + +from deepmd.dpmodel.utils.neighbor_graph import ( + GraphLayout, + build_neighbor_graph, +) + + +class TestStaticEdgeCapacity: + """Tests for static edge_capacity masked padding via build_neighbor_graph.""" + + @pytest.fixture() + def small_system(self): + """6-atom periodic system with a 20 Å box (atoms well within rcut=4 range).""" + rng = np.random.default_rng(0) + coord = rng.normal(size=(1, 6, 3)) * 1.5 + atype = np.array([[0, 1, 0, 1, 0, 1]], dtype=np.int64) + box = np.eye(3).reshape(1, 9) * 20.0 + return coord, atype, box + + def test_static_edge_capacity_shape(self, small_system): + """Static edge_capacity=64 yields edge_index.shape == (2, 64).""" + coord, atype, box = small_system + cap = build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64) + ) + assert cap.edge_index.shape == (2, 64) + assert cap.edge_vec.shape == (64, 3) + assert cap.edge_mask.shape == (64,) + + def test_static_edge_capacity_matches_dynamic(self, small_system): + """Static graph has same real-edge count as dynamic graph.""" + coord, atype, box = small_system + dyn = build_neighbor_graph(coord, atype, box, 4.0) + cap = build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64) + ) + assert cap.edge_index.shape == (2, 64) + assert int(cap.edge_mask.sum()) == int(dyn.edge_mask.sum()) + + def test_static_edge_capacity_real_prefix_matches_dynamic(self, small_system): + """The real-edge prefix of the static graph matches the dynamic graph.""" + coord, atype, box = small_system + dyn = build_neighbor_graph(coord, atype, box, 4.0) + cap = build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=64) + ) + n_real = int(dyn.edge_mask.sum()) + # real prefix must match exactly + np.testing.assert_array_equal( + cap.edge_index[:, :n_real], dyn.edge_index[:, :n_real] + ) + np.testing.assert_allclose(cap.edge_vec[:n_real], dyn.edge_vec[:n_real]) + # padding suffix must have edge_mask=False + assert not np.any(cap.edge_mask[n_real:]) + + def test_overflow_raises(self, small_system): + """edge_capacity smaller than real edge count must raise ValueError.""" + coord, atype, box = small_system + # capacity=1 is guaranteed to be smaller than the real edge count + with pytest.raises(ValueError, match="edge overflow"): + build_neighbor_graph( + coord, atype, box, 4.0, layout=GraphLayout(edge_capacity=1) + ) diff --git a/source/tests/infer/gen_dpa1.py b/source/tests/infer/gen_dpa1.py index 7eaaae4ae2..9c743c9f88 100644 --- a/source/tests/infer/gen_dpa1.py +++ b/source/tests/infer/gen_dpa1.py @@ -1,10 +1,21 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: LGPL-3.0-or-later -"""Generate deeppot_dpa1.pth and deeppot_dpa1.pt2 test models. +"""Generate deeppot_dpa1.pth, deeppot_dpa1.pt2, and deeppot_dpa1_graph.pt2 test models. -Creates a DPA1 model from dpmodel config, serializes, and exports to both -.pth (torch.jit) and .pt2 (torch.export) from the same weights. -Also prints reference values for C++ tests (PBC and NoPbc). +Creates two DPA1 models from dpmodel configs: + - deeppot_dpa1.pt2 / deeppot_dpa1.pth (attn_layer=2, dense nlist-form export) + - deeppot_dpa1_graph.pt2 (attn_layer=0, graph-form export via + lower_kind="graph"; the graph forward + is eligible only when attn_layer==0) + +Both are serialized and exported to their respective formats from the same weights. +Reference sidecar files (.expected) consumed by C++ gtests are also written: + - deeppot_dpa1.expected — from the nlist .pt2 eval (existing) + - deeppot_dpa1_graph.expected — from an independent NLIST .pt2 eval (NOT the + graph .pt2; dpmodel se_atten has no analytical force, so the dense nlist + path is the independent ground truth). At non-binding sel the graph and + nlist paths see the same neighbor set, so the graph .pt2 is sanity-checked + against this reference at ≤1e-5. """ import copy @@ -171,6 +182,157 @@ def main(): print(f"// .pth NoPbc total energy: {e_pth_np[0, 0]:.18e}") # noqa: T201 print(f"// .pth vs .pt2 NoPbc energy diff: {abs(e_np[0, 0] - e_pth_np[0, 0]):.2e}") # noqa: T201 + # ============================================================ + # Section B: graph-eligible DPA1 (attn_layer=0) model + # ============================================================ + # attn_layer=0 disables the attention layers, making the descriptor + # a plain two-body embedding (se_e2_a-like) that is eligible for the + # NeighborGraph forward path (forward_lower_graph_exportable). + # Config mirrors DPA1_CONFIG in + # source/tests/pt_expt/utils/test_graph_pt2_metadata.py + graph_config = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, + } + + print("\n---- Building graph-eligible DPA1 (attn_layer=0) ----") # noqa: T201 + + # ---- B.1 Build dpmodel, serialize ---- + model_g = get_model(copy.deepcopy(graph_config)) + model_dict_g = model_g.serialize() + + data_g = { + "model": copy.deepcopy(model_dict_g), + "model_def_script": graph_config, + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + + # ---- B.2 Compute reference via nlist .pt2 (independent of graph path) ---- + # The reference for deeppot_dpa1_graph.expected comes from the NLIST .pt2 + # (dense-quartet forward), NOT the graph .pt2. This ensures the C++ gtest + # (B2.5) independently validates the graph AOTI path against a known-good + # nlist evaluation. + # + # The nlist .pt2 is also PERSISTED (deeppot_dpa1_graph_nlist_ref.pt2): the + # C++ gtest loads it alongside the graph .pt2 to cross-check graph≈dense at + # 1e-9 on arbitrary system sizes (dynamic-edge-axis cases) without baking a + # second reference block into the .expected sidecar. Same weights as the + # graph model, so at non-binding sel the two paths must agree. + nlist_ref_pt2 = os.path.join(base_dir, "deeppot_dpa1_graph_nlist_ref.pt2") + print(f"Exporting reference nlist .pt2 to {nlist_ref_pt2} ...") # noqa: T201 + pt_expt_deserialize_to_file( + nlist_ref_pt2, + copy.deepcopy(data_g), + do_atomic_virial=True, + lower_kind="nlist", # independent: dense nlist, NOT graph + ) + dp_nlist_ref = DeepPot(nlist_ref_pt2) + + # PBC reference from nlist path + e_r1, f_r1, v_r1, ae_r1, av_r1 = dp_nlist_ref.eval(coord, box, atype, atomic=True) + # NoPBC reference from nlist path + e_rnp, f_rnp, v_rnp, ae_rnp, av_rnp = dp_nlist_ref.eval( + coord, None, atype, atomic=True + ) + + print(f"Nlist ref PBC energy: {e_r1[0, 0]:.18e}") # noqa: T201 + print(f"Nlist ref NoPBC energy: {e_rnp[0, 0]:.18e}") # noqa: T201 + max_ref_force_pbc = float(np.max(np.abs(f_r1))) + max_ref_force_nopbc = float(np.max(np.abs(f_rnp))) + print(f"Nlist ref PBC max |force|: {max_ref_force_pbc:.6e}") # noqa: T201 + print(f"Nlist ref NoPBC max |force|: {max_ref_force_nopbc:.6e}") # noqa: T201 + if max_ref_force_pbc < 1e-10: + raise RuntimeError( + f"Graph model nlist-ref forces are degenerate " + f"(max={max_ref_force_pbc:.2e}); weights may need perturbation." + ) + + # ---- B.3 Write sidecar reference file ---- + graph_ref_path = os.path.join(base_dir, "deeppot_dpa1_graph.expected") + write_expected_ref( + graph_ref_path, + sections={ + "pbc": { + "expected_e": ae_r1[0, :, 0], + "expected_f": f_r1[0], + "expected_v": av_r1[0], + }, + "nopbc": { + "expected_e": ae_rnp[0, :, 0], + "expected_f": f_rnp[0], + "expected_v": av_rnp[0], + }, + }, + source_script="source/tests/infer/gen_dpa1.py", + ) + print(f"Wrote {graph_ref_path}") # noqa: T201 + + # ---- B.4 Export graph-form .pt2 ---- + graph_pt2_path = os.path.join(base_dir, "deeppot_dpa1_graph.pt2") + print(f"Exporting to {graph_pt2_path} (lower_kind='graph') ...") # noqa: T201 + pt_expt_deserialize_to_file( + graph_pt2_path, + copy.deepcopy(data_g), + do_atomic_virial=True, + lower_kind="graph", + ) + print("Graph .pt2 export done.") # noqa: T201 + + # ---- B.5 Sanity-check: graph .pt2 vs nlist reference ---- + # Both use the SAME weights; at non-binding sel the math is equivalent. + # Verifies that forward_lower_graph_exportable + edge_energy_deriv match + # the nlist forward for this concrete system. + dp_graph = DeepPot(graph_pt2_path) + + # PBC sanity check + e_g1, f_g1, v_g1, ae_g1, av_g1 = dp_graph.eval(coord, box, atype, atomic=True) + force_diff_pbc = float(np.max(np.abs(f_g1[0] - f_r1[0]))) + print( # noqa: T201 + f"Graph .pt2 vs nlist ref PBC force max diff: {force_diff_pbc:.2e}" + ) + if force_diff_pbc > 1e-5: + raise RuntimeError( + f"BLOCKED: graph .pt2 PBC force differs from nlist reference by " + f"{force_diff_pbc:.2e} (threshold 1e-5)." + ) + + # NoPBC sanity check + e_gnp, f_gnp, v_gnp, ae_gnp, av_gnp = dp_graph.eval(coord, None, atype, atomic=True) + force_diff_nopbc = float(np.max(np.abs(f_gnp[0] - f_rnp[0]))) + print( # noqa: T201 + f"Graph .pt2 vs nlist ref NoPBC force max diff: {force_diff_nopbc:.2e}" + ) + if force_diff_nopbc > 1e-5: + raise RuntimeError( + f"BLOCKED: graph .pt2 NoPBC force differs from nlist reference by " + f"{force_diff_nopbc:.2e} (threshold 1e-5)." + ) + + print("\nAll graph sanity checks passed.") # noqa: T201 print("\nDone!") # noqa: T201 diff --git a/source/tests/pt_expt/infer/test_graph_deepeval.py b/source/tests/pt_expt/infer/test_graph_deepeval.py new file mode 100644 index 0000000000..7fc83b677f --- /dev/null +++ b/source/tests/pt_expt/infer/test_graph_deepeval.py @@ -0,0 +1,239 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Graph-form ``.pt2`` DeepEval parity vs the eager dense reference. + +A graph-form ``.pt2`` (exported with ``lower_kind="graph"``) carries the +NeighborGraph schema ``(atype, n_node, edge_index, edge_vec, edge_mask, ...)`` +in its AOTI forward. This test verifies that evaluating such an archive +through the pt_expt :class:`DeepPot` reproduces the eager dpa1 energy / force / +virial to ``rtol=atol=1e-10`` (fp64), for both PBC and non-PBC. + +The graph path is CARRY-ALL (every neighbor within ``rcut``); the eager dense +reference is sel-capped (``sel=30``, forced via +``neighbor_graph_method="legacy"``). They coincide only at NON-BINDING ``sel`` +(max neighbor count ``< sel``), so the test fixture is a small, sparse cluster +and the non-binding condition is asserted explicitly -- otherwise the parity +would vacuously compare two different neighbor sets. +""" + +import copy +import os +import tempfile + +import numpy as np +import pytest +import torch + +from deepmd.infer import ( + DeepPot, +) +from deepmd.pt_expt.utils.env import ( + DEVICE, +) +from deepmd.pt_expt.utils.serialization import ( + deserialize_to_file, +) + +# dpa1 with attn_layer == 0 -- the energy model exercised by the graph path. +DPA1_CONFIG = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, +} + +RCUT = 6.0 +SEL = 30 + + +def _build_system( + natoms: int = 8, seed: int = 20240626 +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """A small, sparse cluster: ``natoms`` inside a 5 A blob, centered in an 18 A box. + + The blob keeps every atom within ``rcut`` of at most ``natoms - 1`` others + (<< ``sel``), so the carry-all graph neighbor set equals the sel-capped + dense one. Varying ``natoms`` yields a different edge count, exercising the + DYNAMIC edge axis of the exported ``.pt2`` (B2.0). + """ + rng = np.random.default_rng(seed) + box_size = 18.0 + blob = rng.random((natoms, 3)) * 5.0 + box_size * 0.5 - 2.5 + coords = blob.reshape(1, natoms, 3) + cells = (np.eye(3) * box_size).reshape(1, 9) + # Alternate O/H types; both species present regardless of natoms. + atype = np.array([i % 2 for i in range(natoms)], dtype=np.int32) + return coords, cells, atype + + +# Two DIFFERENT-size systems evaluated through the SAME exported ``.pt2``. +# Both are sparse, non-binding clusters but with different edge counts, so the +# second size FAILS against a static-``E`` artifact (B1) and PASSES only once +# the edge axis is dynamic (B2.0). +_SYSTEMS = { + "small_8": {"natoms": 8, "seed": 20240626}, + "large_20": {"natoms": 20, "seed": 20240701}, +} + + +def _max_neighbors( + coords: np.ndarray, cells: np.ndarray | None, atype: np.ndarray +) -> int: + """Max carry-all neighbor count per center within ``rcut`` (for the non-binding check).""" + from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, + ) + + natoms = atype.shape[0] + graph = build_neighbor_graph( + coords.reshape(1, natoms, 3), + atype.reshape(1, natoms), + cells.reshape(1, 9) if cells is not None else None, + RCUT, + ) + real = np.asarray(graph.edge_mask) + dst = np.asarray(graph.edge_index)[1][real] + counts = np.bincount(dst, minlength=natoms) + return int(counts.max()) + + +def _eager_dense_reference( + model: torch.nn.Module, + coords: np.ndarray, + cells: np.ndarray | None, + atype: np.ndarray, +) -> dict[str, np.ndarray]: + """Reference energy/force/virial from the eager dense (sel-capped) path.""" + natoms = atype.shape[0] + coord_t = torch.tensor( + coords.reshape(1, natoms, 3), dtype=torch.float64, device=DEVICE + ).requires_grad_(True) + atype_t = torch.tensor(atype.reshape(1, natoms), dtype=torch.int64, device=DEVICE) + box_t = ( + torch.tensor(cells.reshape(1, 9), dtype=torch.float64, device=DEVICE) + if cells is not None + else None + ) + ret = model.call_common( + coord_t, + atype_t, + box_t, + do_atomic_virial=True, + neighbor_graph_method="legacy", + ) + out = { + "atom_energy": ret["energy"], + "energy": ret["energy_redu"], + "force": ret["energy_derv_r"].squeeze(-2), + "virial": ret["energy_derv_c_redu"].squeeze(-2), + "atom_virial": ret["energy_derv_c"].squeeze(-2), + } + return {k: v.detach().cpu().numpy() for k, v in out.items()} + + +@pytest.fixture(scope="module") +def graph_pt2(): + """Build a dpa1(attn_layer=0) model and export it to a graph-form ``.pt2``. + + The AOTI compile is slow (~90 s), so it is done once per module. The eager + pt_expt model is returned alongside the archive path to serve as the dense + parity reference. + """ + from deepmd.pt_expt.model import ( + get_model, + ) + + model = get_model(copy.deepcopy(DPA1_CONFIG)).to(torch.float64) + model.eval() + data = {"model": model.serialize()} + + tmpdir = tempfile.mkdtemp() + pt2_path = os.path.join(tmpdir, "deeppot_dpa1_graph.pt2") + deserialize_to_file( + pt2_path, + copy.deepcopy(data), + do_atomic_virial=True, + lower_kind="graph", + ) + yield pt2_path, model + os.unlink(pt2_path) + os.rmdir(tmpdir) + + +@pytest.mark.parametrize("system", list(_SYSTEMS)) # two different edge counts +@pytest.mark.parametrize("pbc", [True, False]) # periodic vs non-periodic +def test_graph_pt2_deepeval_parity(graph_pt2, pbc, system) -> None: + """Graph ``.pt2`` DeepEval == eager dense dpa1 (energy/force/virial), 1e-10. + + Both ``_SYSTEMS`` are fed through the SAME module-scoped ``.pt2``; the + differing edge counts prove the exported artifact's edge axis is dynamic + (a static-``E`` B1 artifact would reject / mis-shape the larger system). + """ + pt2_path, model = graph_pt2 + coords, cells, atype = _build_system(**_SYSTEMS[system]) + box = cells if pbc else None + + # Anti-vacuity: the carry-all graph and the sel-capped dense reference only + # agree when no center is sel-bound. Assert the system is non-binding. + max_nn = _max_neighbors(coords, box, atype) + assert max_nn < SEL, ( + f"test system is sel-binding (max neighbors {max_nn} >= sel {SEL}); " + "carry-all graph and sel-capped dense reference would diverge" + ) + + dp = DeepPot(pt2_path) + assert dp.deep_eval.metadata["lower_input_kind"] == "graph" + + e, f, v, ae, av = dp.eval(coords, box, atype, atomic=True) + ref = _eager_dense_reference(model, coords, box, atype) + + np.testing.assert_allclose( + e.reshape(-1), + ref["energy"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="energy", + ) + np.testing.assert_allclose( + f.reshape(-1), ref["force"].reshape(-1), rtol=1e-10, atol=1e-10, err_msg="force" + ) + np.testing.assert_allclose( + v.reshape(-1), + ref["virial"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="virial", + ) + np.testing.assert_allclose( + ae.reshape(-1), + ref["atom_energy"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="atom_energy", + ) + np.testing.assert_allclose( + av.reshape(-1), + ref["atom_virial"].reshape(-1), + rtol=1e-10, + atol=1e-10, + err_msg="atom_virial", + ) diff --git a/source/tests/pt_expt/model/test_graph_export.py b/source/tests/pt_expt/model/test_graph_export.py new file mode 100644 index 0000000000..6b735aa3d5 --- /dev/null +++ b/source/tests/pt_expt/model/test_graph_export.py @@ -0,0 +1,135 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Graph-lower export: forward_common_lower_graph_exportable traces + torch.export.""" + +import pytest +import torch + +from deepmd.dpmodel.utils.neighbor_graph import ( + build_neighbor_graph, +) +from deepmd.pt.utils import ( + env, +) +from deepmd.pt_expt.descriptor.dpa1 import ( + DescrptDPA1, +) +from deepmd.pt_expt.fitting import ( + InvarFitting, +) +from deepmd.pt_expt.model import ( + EnergyModel, +) + +from ...seed import ( + GLOBAL_SEED, +) + +_RCUT, _NT = 4.0, 2 + + +def _model(): + ds = DescrptDPA1( + _RCUT, + 0.5, + 20, + _NT, + neuron=[3, 6], + axis_neuron=2, + attn_layer=0, + precision="float64", + seed=GLOBAL_SEED, + ).to(env.DEVICE) + ft = InvarFitting( + "energy", + _NT, + ds.get_dim_out(), + 1, + mixed_types=ds.mixed_types(), + precision="float64", + seed=GLOBAL_SEED, + ).to(env.DEVICE) + return EnergyModel(ds, ft, type_map=["A", "B"]).to(env.DEVICE) + + +def _graph_inputs(model): + rng = torch.Generator(device=env.DEVICE).manual_seed(GLOBAL_SEED) + nloc = 6 + coord = ( + torch.rand(1, nloc, 3, dtype=torch.float64, device=env.DEVICE, generator=rng) + * 3.0 + ) + atype = torch.tensor([[0, 1, 0, 1, 0, 1]], dtype=torch.int64, device=env.DEVICE) + box = torch.eye(3, dtype=torch.float64, device=env.DEVICE).reshape(1, 9) * 20.0 + g = build_neighbor_graph(coord, atype, box, model.get_rcut()) + return (atype.reshape(-1), g.n_node, g.edge_index, g.edge_vec, g.edge_mask) + + +def test_graph_exportable_traces(): + model = _model().eval() + atype, n_node, ei, ev, em = _graph_inputs(model) + gm = model.forward_common_lower_graph_exportable( + atype, + n_node, + ei, + ev, + em, + do_atomic_virial=False, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + assert isinstance(gm, torch.nn.Module) + # the traced module reproduces eager outputs + eager = model.forward_common_lower_graph( + atype, n_node, ei, ev, em, do_atomic_virial=False + ) + # traced module has placeholders for all 8 fn args (fparam/aparam/charge_spin=None) + traced = gm(atype, n_node, ei, ev, em, None, None, None) + # traced returns a tuple/dict; compare energy_redu + te = traced["energy_redu"] if isinstance(traced, dict) else traced[1] + torch.testing.assert_close(te, eager["energy_redu"], rtol=1e-10, atol=1e-10) + + +@pytest.mark.parametrize("do_atomic_virial", [False, True]) # both branches of the bool +def test_forward_lower_graph_exportable_public_keys(do_atomic_virial): + """EnergyModel.forward_lower_graph_exportable: traces the public-key path and + reproduces eager energy/force; atom_virial present iff do_atomic_virial. + """ + model = _model().eval() + atype, n_node, ei, ev, em = _graph_inputs(model) + gm = model.forward_lower_graph_exportable( + atype, + n_node, + ei, + ev, + em, + do_atomic_virial=do_atomic_virial, + tracing_mode="symbolic", + _allow_non_fake_inputs=True, + ) + assert isinstance(gm, torch.nn.Module) + out = gm(atype, n_node, ei, ev, em, None, None, None) + + # public key set (graph path is local-only: force/atom_virial, NOT extended_*) + assert "atom_energy" in out and "energy" in out and "force" in out + assert "virial" in out + assert "extended_force" not in out and "extended_virial" not in out + # atom_virial appears ONLY when do_atomic_virial=True + assert ("atom_virial" in out) == do_atomic_virial + + # values match the eager graph lower + eager = model.forward_common_lower_graph( + atype, n_node, ei, ev, em, do_atomic_virial=do_atomic_virial + ) + torch.testing.assert_close( + out["energy"], eager["energy_redu"], rtol=1e-10, atol=1e-10 + ) + torch.testing.assert_close( + out["force"], eager["energy_derv_r"].squeeze(-2), rtol=1e-10, atol=1e-10 + ) + if do_atomic_virial: + torch.testing.assert_close( + out["atom_virial"], + eager["energy_derv_c"].squeeze(-2), + rtol=1e-10, + atol=1e-10, + ) diff --git a/source/tests/pt_expt/test_dp_freeze.py b/source/tests/pt_expt/test_dp_freeze.py index 7c33f0de81..cdaa4a02c6 100644 --- a/source/tests/pt_expt/test_dp_freeze.py +++ b/source/tests/pt_expt/test_dp_freeze.py @@ -41,6 +41,36 @@ "data_stat_nbatch": 20, } +# dpa1 with attn_layer == 0 — the only graph-eligible model family today +# (mixed_types and uses_graph_lower()==True), used to exercise the +# ``freeze --lower-kind graph`` public-CLI path. +model_dpa1_graph = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, +} + class TestDPFreezePtExpt(unittest.TestCase): """Test dp freeze for the pt_expt backend.""" @@ -103,6 +133,79 @@ def test_freeze_default_suffix(self) -> None: expected = os.path.join(self.tmpdir, "frozen_default_suffix.pte") self.assertTrue(os.path.exists(expected)) + def test_freeze_output_suffix_by_lower_kind(self) -> None: + """main() defaults a suffix-less output to .pt2 for --lower-kind graph + and .pte for nlist, while preserving an explicit .pte/.pt2 (iProzd + review). freeze() is mocked so the suffix logic is checked without the + AOTInductor compile cost. + """ + from unittest import ( + mock, + ) + + cases = [ + ("graph", "out_g", None, ".pt2"), # graph, no suffix -> .pt2 + ("nlist", "out_n", None, ".pte"), # nlist, no suffix -> .pte + ("graph", "out_g_explicit", ".pte", ".pte"), # explicit .pte kept + ("nlist", "out_n_explicit", ".pt2", ".pt2"), # explicit .pt2 kept + ] + for lower_kind, stem, explicit, expected_suffix in cases: + with self.subTest(lower_kind=lower_kind, explicit=explicit): + name = stem + (explicit or "") + captured: dict = {} + + def _fake_freeze(model, output, head=None, lower_kind="nlist", **kw): + captured["output"] = output + captured["lower_kind"] = lower_kind + + flags = argparse.Namespace( + command="freeze", + checkpoint_folder=self.ckpt_file, + output=os.path.join(self.tmpdir, name), + head=None, + lower_kind=lower_kind, + log_level=2, + log_path=None, + ) + with mock.patch("deepmd.pt_expt.entrypoints.main.freeze", _fake_freeze): + main(flags) + self.assertTrue(captured["output"].endswith(expected_suffix)) + self.assertEqual(captured["lower_kind"], lower_kind) + + def test_freeze_graph_rejects_ineligible(self) -> None: + """``--lower-kind graph`` on a non-graph-eligible model (se_e2_a, + mixed_types=False) fails fast rather than emitting a broken .pt2. + """ + output = os.path.join(self.tmpdir, "frozen_graph_reject.pt2") + with self.assertRaises(ValueError): + freeze(model=self.ckpt_file, output=output, lower_kind="graph") + + def test_freeze_graph_dpa1(self) -> None: + """``freeze --lower-kind graph`` on a graph-eligible dpa1(attn_layer=0) + model produces a .pt2 whose metadata records the graph lower (the + user-facing entry point to the C++ graph inference path). + """ + import json + import zipfile + + model_params = deepcopy(model_dpa1_graph) + model = get_model(model_params) + wrapper = ModelWrapper(model, model_params=model_params) + ckpt = os.path.join(self.tmpdir, "dpa1_graph.pt") + torch.save({"model": wrapper.state_dict()}, ckpt) + + output = os.path.join(self.tmpdir, "frozen_dpa1_graph.pt2") + freeze(model=ckpt, output=output, lower_kind="graph") + self.assertTrue(os.path.exists(output)) + + # the .pt2 is a zip; metadata.json must record the graph lower + with zipfile.ZipFile(output) as zf: + meta_name = next( + n for n in zf.namelist() if n.endswith("extra/metadata.json") + ) + metadata = json.loads(zf.read(meta_name)) + self.assertEqual(metadata["lower_input_kind"], "graph") + def test_freeze_pt2(self) -> None: """Freeze to .pt2 (AOTInductor) and verify the file is loadable.""" self.assertTrue(os.path.exists(self.shared_pt2)) diff --git a/source/tests/pt_expt/test_training.py b/source/tests/pt_expt/test_training.py index 45061c084a..a9764947c0 100644 --- a/source/tests/pt_expt/test_training.py +++ b/source/tests/pt_expt/test_training.py @@ -1352,9 +1352,7 @@ def _make_varying_config( config = normalize(config) return config - def _check_varying_natoms( - self, descriptor: dict | None = None, force_legacy_descriptor: bool = False - ) -> None: + def _check_varying_natoms(self, descriptor: dict | None = None) -> None: """Per-step compiled-vs-uncompiled comparison for the given descriptor. The loss config has ``start_pref_f=1000`` and ``start_pref_v=1.0``, @@ -1370,17 +1368,10 @@ def _check_varying_natoms( cannot meet that on float64 the descriptor has a real numerical problem (see the DPA1 limitation note where this happened). - ``force_legacy_descriptor`` makes a graph-eligible descriptor (dpa1 - ``attn_layer==0``) take the legacy *dense* (env-mat) path on BOTH the - compiled and uncompiled sides, so this stays a true compile-correctness - check (same computation, compiled vs eager). The pt_expt eager default - for such a descriptor is the carry-all GRAPH forward while the compiled - ``forward_lower`` is the sel-capped DENSE forward; those are two - *different* force computations whose parameter gradients agree only to - fp64 accumulation (~1e-12), which the optimizer then amplifies into a - diverging training trajectory. Making the compiled GRAPH lower (so - eager==compiled) is tracked for PR-B; until then this test exercises the - dense path it actually compiles. + Graph-eligible descriptors (dpa1 ``attn_layer==0``) compile the GRAPH + lower (``forward_common_lower_graph``) so the compiled path matches the + eager carry-all graph default-flip; non-eligible descriptors + (se_e2_a / dpa2 / dpa3) compile the dense ``forward_lower``. """ from deepmd.pt_expt.train.training import ( _CompiledModel, @@ -1400,16 +1391,6 @@ def _check_varying_natoms( compiled_model = trainer_c.wrapper.model["Default"] self.assertIsInstance(compiled_model, _CompiledModel) - if force_legacy_descriptor: - # Pin BOTH sides to the legacy dense (env-mat) path so the - # uncompiled reference matches the dense ``forward_lower`` - # that gets compiled (must happen before the first forward, - # i.e. before the lazy compile trace). See the docstring / - # PR-B note: the graph forward vs dense forward differ in the - # backward at fp64 precision, which the optimizer amplifies. - for _m in (trainer_uc.model, compiled_model.original_model): - _m.get_descriptor().uses_graph_lower = lambda: False - # Sync weights so predictions can be compared exactly compiled_model.original_model.load_state_dict( trainer_uc.model.state_dict() @@ -1482,25 +1463,20 @@ def test_compiled_matches_uncompiled_varying_natoms_dpa3(self) -> None: self._check_varying_natoms(_DESCRIPTOR_DPA3) def test_compiled_matches_uncompiled_varying_natoms_dpa1_no_attn(self) -> None: - """DPA1 (attn_layer=0): compiled vs uncompiled match (dense path). + """DPA1 (attn_layer=0): compiled vs uncompiled match (GRAPH lower). - ``force_legacy_descriptor=True`` pins both sides to the legacy dense - (env-mat) forward -- the path the compiled ``forward_lower`` actually - uses. The pt_expt eager default for dpa1(attn_layer=0) is the carry-all - GRAPH forward, a *different* force computation from the compiled dense - forward; their backward gradients agree only to fp64 accumulation, which - the optimizer amplifies, so comparing graph-vs-dense through training is - ill-posed. Making the compiled path the GRAPH lower (eager==compiled) - is tracked for PR-B (graph .pt2/export). + The pt_expt eager default for dpa1(attn_layer=0) is the carry-all GRAPH + forward, and the compiled path now compiles the matching GRAPH lower + (``forward_common_lower_graph``) -- so eager==compiled and the + multi-step varying-natoms trajectory (predictions + per-parameter grads + + loss) agrees to the strict ``atol=rtol=1e-10`` tolerance. DPA1 with attention layers is intentionally not covered: the compiled se_atten path is hardware-sensitive on multi-threaded CPUs (parallel reduction order diverges from eager above the 1e-10 tolerance). ``_compile_model`` warns the user instead. """ - self._check_varying_natoms( - _DESCRIPTOR_DPA1_NO_ATTN, force_legacy_descriptor=True - ) + self._check_varying_natoms(_DESCRIPTOR_DPA1_NO_ATTN) def test_compile_warns_dpa1_with_attention(self) -> None: """DPA1 (attn_layer>0) under compile must emit a warning. diff --git a/source/tests/pt_expt/utils/test_graph_pt2_metadata.py b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py new file mode 100644 index 0000000000..17aef4d671 --- /dev/null +++ b/source/tests/pt_expt/utils/test_graph_pt2_metadata.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Graph-form ``.pt2`` export: ``lower_input_kind`` metadata branch. + +Covers both branches of the ``lower_kind`` selector on +``deserialize_to_file``: ``"graph"`` traces ``forward_lower_graph_exportable`` +over the NeighborGraph schema and records ``lower_input_kind == "graph"`` in +``metadata.json``; the default (``"nlist"``) traces the dense quartet and +records ``lower_input_kind == "nlist"``. +""" + +import copy +import json +import os +import tempfile +import zipfile + +import pytest + +from deepmd.pt_expt.utils.serialization import ( + deserialize_to_file, +) + +# dpa1 with attn_layer == 0 — the energy model exercised by the graph path. +DPA1_CONFIG = { + "type_map": ["O", "H"], + "descriptor": { + "type": "se_atten", + "sel": 30, + "rcut_smth": 2.0, + "rcut": 6.0, + "neuron": [2, 4, 8], + "axis_neuron": 4, + "attn": 5, + "attn_layer": 0, + "attn_dotr": True, + "attn_mask": False, + "activation_function": "tanh", + "scaling_factor": 1.0, + "normalize": True, + "temperature": 1.0, + "type_one_side": True, + "seed": 1, + }, + "fitting_net": { + "neuron": [5, 5, 5], + "resnet_dt": True, + "seed": 1, + }, +} + + +def _build_dpa1_data() -> dict: + """Build a serialized dpmodel data dict for a dpa1(attn_layer=0) energy model.""" + from deepmd.dpmodel.model.model import ( + get_model, + ) + + model = get_model(copy.deepcopy(DPA1_CONFIG)) + return { + "model": model.serialize(), + "model_def_script": copy.deepcopy(DPA1_CONFIG), + "backend": "dpmodel", + "software": "deepmd-kit", + "version": "3.0.0", + } + + +def _read_metadata(pt2_path: str) -> dict: + """Read ``model/extra/metadata.json`` from a ``.pt2`` ZIP archive.""" + with zipfile.ZipFile(pt2_path, "r") as zf: + raw = zf.read("model/extra/metadata.json").decode("utf-8") + return json.loads(raw) + + +@pytest.fixture(scope="module") +def dpa1_dpmodel_data() -> dict: + return _build_dpa1_data() + + +def test_graph_pt2_has_lower_input_kind_graph(dpa1_dpmodel_data) -> None: + """``lower_kind="graph"`` -> metadata ``lower_input_kind == "graph"``.""" + with tempfile.TemporaryDirectory() as d: + p = os.path.join(d, "m_graph.pt2") + deserialize_to_file( + p, + copy.deepcopy(dpa1_dpmodel_data), + do_atomic_virial=True, + lower_kind="graph", + ) + meta = _read_metadata(p) + assert meta["lower_input_kind"] == "graph" + # B2.0: the edge axis is DYNAMIC (Dim("nedge", min=2)); there is no static + # capacity baked into the AOTI artifact, so no ``edge_capacity`` is persisted. + assert "edge_capacity" not in meta + + +def test_dense_pt2_has_lower_input_kind_nlist(dpa1_dpmodel_data) -> None: + """Default (``lower_kind="nlist"``) -> metadata ``lower_input_kind == "nlist"``.""" + with tempfile.TemporaryDirectory() as d: + p = os.path.join(d, "m_dense.pt2") + deserialize_to_file( + p, + copy.deepcopy(dpa1_dpmodel_data), + do_atomic_virial=True, + ) + meta = _read_metadata(p) + assert meta["lower_input_kind"] == "nlist" + # edge_capacity is a graph-only artifact constant; the dense path omits it. + assert "edge_capacity" not in meta