diff --git a/deepmd/dpmodel/array_api.py b/deepmd/dpmodel/array_api.py
index c5547af79a..970528ccea 100644
--- a/deepmd/dpmodel/array_api.py
+++ b/deepmd/dpmodel/array_api.py
@@ -180,6 +180,55 @@ def xp_add_at(x: Array, indices: Array, values: Array) -> Array:
         return x
 
 
+def xp_maximum_at(x: Array, indices: Array, values: Array) -> Array:
+    """Segment max-assign of values into x at the specified indices.
+
+    Element-wise analogue of :func:`xp_add_at` that takes the maximum instead
+    of the sum: for every ``k`` it assigns ``x[indices[k]] = maximum(
+    x[indices[k]], values[k])``. Repeated indices reduce to the per-segment
+    maximum, which is order-independent.
+
+    Parameters
+    ----------
+    x : Array
+        Destination array indexed along axis 0; typically pre-filled with
+        ``-inf`` so empty segments stay neutral.
+    indices : Array
+        Integer destination indices with shape (K,).
+    values : Array
+        Source values with shape (K, *x.shape[1:]).
+
+    Returns
+    -------
+    Array
+        The updated array (modified in place and returned for NumPy; a new
+        array for JAX/PyTorch).
+    """
+    xp = array_api_compat.array_namespace(x, indices, values)
+    if array_api_compat.is_numpy_array(x):
+        # NumPy: in-place ufunc reduction at the given indices.
+        xp.maximum.at(x, indices, values)
+        return x
+
+    elif array_api_compat.is_jax_array(x):
+        # JAX: functional indexed-max update, not in-place.
+        return x.at[indices].max(values)
+    elif array_api_compat.is_torch_array(x):
+        import torch
+
+        index = indices.reshape([-1] + [1] * (values.ndim - 1)).expand_as(values)
+        return torch.scatter_reduce(
+            x, 0, index, values, reduce="amax", include_self=True
+        )
+    else:
+        # Fallback for array_api_strict: basic indexing only.
+        n = indices.shape[0]
+        for i in range(n):
+            idx = int(indices[i])
+            x[idx, ...] = xp.maximum(x[idx, ...], values[i, ...])
+        return x
+
+
 def xp_sigmoid(x: Array) -> Array:
     """Compute the sigmoid function.
 
diff --git a/deepmd/dpmodel/descriptor/dpa4.py b/deepmd/dpmodel/descriptor/dpa4.py
index de96ee8c0c..013164cc51 100644
--- a/deepmd/dpmodel/descriptor/dpa4.py
+++ b/deepmd/dpmodel/descriptor/dpa4.py
@@ -1,47 +1,49 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-DPA4 (SeZM) descriptor: dpmodel (array-API) backend.
-
-This is the dpmodel port of ``deepmd.pt.model.descriptor.sezm.DescrptSeZM``.
-It orchestrates the dpa4_nn building blocks on the padded, frame-explicit
-edge layout (``E = nf * nloc * nnei``; no ``torch.nonzero``-style sparse
-edge extraction anywhere; see ``dpa4_nn.edge_cache``).
-
-Scope notes (vs pt):
-
-- Only the standard DeePMD ``call(coord_ext, atype_ext, nlist, mapping)``
-  path is ported. The pt-only paths (sparse ``edge_index`` inputs,
-  ``forward_with_edges``, zone bridging / InnerClamp, charge/spin condition
-  embedding, AMP autocast) are out of core scope; out-of-core construction
-  flags raise ``NotImplementedError`` at ``__init__`` (either here or in the
-  owning submodule).
-- ``random_gamma`` is a training-only augmentation in pt
-  (``random_gamma and self.training``); dpmodel evaluates in inference mode,
-  so the roll is never applied (the config value is still serialized).
-- ``use_amp`` is accepted and ignored: it is a pt-runtime (CUDA autocast)
-  switch with no dpmodel counterpart.
+DPA4/SeZM descriptor: Smooth Equivariant Zone-bridging Model.
+
+dpmodel (array-API) backend
+
+This implementation is designed around two goals:
+
+1) Conservative forces: the descriptor is computed from differentiable energy.
+2) Efficient inference: edge geometry and Wigner-D rotation blocks are computed
+   exactly once per `call()` and reused by all interaction blocks.
+
+Shared descriptor building blocks live in the `dpa4_nn` subpackage.
+
+Runtime flow at a glance:
+1) Build edge cache and radial features once.
+2) Run interaction blocks with shared geometric caches.
+3) Return scalar (`l=0`) descriptor channels for fitting.
+
+Layout notes
+------------
+- Node-level backbone features use contiguous `(N, D_node, 1, C)` where
+  `D_node=(l_schedule[i]+extra_node_l+1)^2` and `C=channels`.
+- The singleton focus axis is kept only to reuse the existing equivariant
+  operators; real multi-focus structure lives strictly inside `SO2Convolution`.
+- Edge-level SO(2) internal operators keep m-major reduced layout
+  `(E, F, D_m_trunc, Cf)` with `F=n_focus` and `Cf=focus_dim` inside the
+  SO(2) branch only.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm``.
 """
 
 from __future__ import (
     annotations,
 )
 
-import logging
 import math
 from typing import (
     TYPE_CHECKING,
     Any,
-    NoReturn,
 )
 
 import array_api_compat
 import numpy as np
 
-log = logging.getLogger(__name__)
-
-# Warn at most once per process for backend-ignored switches (keyed by name).
-_WARNED_ONCE: set[str] = set()
-
 from deepmd.dpmodel import (
     NativeOP,
 )
@@ -73,14 +75,20 @@
 from .base_descriptor import (
     BaseDescriptor,
 )
+from .dpa4_nn.attn_res import (
+    DepthAttnRes,
+)
 from .dpa4_nn.block import (
     SeZMInteractionBlock,
 )
 from .dpa4_nn.edge_cache import (
     EdgeCache,
     build_edge_cache,
+    build_edge_cache_from_edges,
+    edge_cache_to_dtype,
 )
 from .dpa4_nn.embedding import (
+    ChargeSpinEmbedding,
     EnvironmentInitialEmbedding,
     GeometricInitialEmbedding,
     SeZMTypeEmbedding,
@@ -95,18 +103,27 @@
     ScalarRMSNorm,
 )
 from .dpa4_nn.radial import (
+    BridgingSwitch,
     C3CutoffEnvelope,
+    InnerClamp,
     RadialBasis,
     RadialMLP,
 )
 from .dpa4_nn.utils import (
+    ATTN_RES_MODES,
     get_promoted_dtype,
+    safe_norm,
 )
 from .dpa4_nn.wignerd import (
     WignerDCalculator,
+    build_edge_quaternion,
 )
 
 if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+    )
+
     from deepmd.dpmodel.array_api import (
         Array,
     )
@@ -117,8 +134,6 @@
         DPPath,
     )
 
-ATTN_RES_MODES = ("none", "independent", "dependent")
-
 
 @BaseDescriptor.register("SeZM")
 @BaseDescriptor.register("sezm")
@@ -126,27 +141,296 @@
 @BaseDescriptor.register("dpa4")
 class DescrptDPA4(NativeOP, BaseDescriptor):
     """
-    DPA4 (SeZM) descriptor, dpmodel backend.
-
-    See the pt ``DescrptSeZM`` docstring
-    (``deepmd/pt/model/descriptor/sezm.py``) for the full per-parameter
-    description; the constructor mirrors the pt signature and defaults
-    exactly. Parameters whose machinery is not ported to dpmodel raise
-    ``NotImplementedError`` at construction (some directly here, the rest
-    delegated to the owning submodule, e.g. ``layer_scale`` and the
-    ``*_attn_res`` / SO(2) attention projection flags).
-
-    Execution outline (pt ``forward`` standard path):
-
-    1. Type embedding and pair-exclusion keep mask.
-    2. ``build_edge_cache`` once (geometry, envelope, RBF, Wigner-D) on the
-       padded edge layout.
-    3. Radial features once; optional environment FiLM seeding and geometric
-       initial embedding.
-    4. ``SeZMInteractionBlock`` stack with the per-block l/m schedules.
-    5. Final scalar (l=0) FFN readout to ``(nf, nloc, channels)``.
+    SeZM descriptor.
+
+    Execution outline
+    -----------------
+    1. Build a per-forward `EdgeFeatureCache` (geometry, envelope, Wigner-D).
+    2. Build radial/type edge features once and reuse across blocks.
+    3. Run `SeZMInteractionBlock` stack with optional l/m schedules.
+    4. Extract scalar channels and apply the final scalar FFN.
+
+    Parameters
+    ----------
+    ntypes
+        Number of element types.
+    sel
+        Maximum number of neighbors per type within `rcut`.
+        - int: broadcast to all types, e.g. sel=100 with ntypes=2 → [100, 100]
+        - list[int]: sel[i] is the maximum number of type i atoms within `rcut`
+    rcut
+        Cutoff radius in Å.
+    env_exp
+        C^3 cutoff envelope exponents `[rbf_env_exp, edge_env_exp]`.
+        - `rbf_env_exp`: Controls radial basis function envelope decay.
+        - `edge_env_exp`: Controls message passing edge weight envelope decay.
+        Larger values give weaker suppression (values stay near 1.0 longer).
+    channels
+        Total channels per (l,m) coefficient.
+    basis_type
+        Radial basis type. Supported values are ``"bessel"`` and ``"gaussian"``.
+    n_radial
+        Number of radial basis functions.
+    radial_mlp
+        Hidden layer sizes for radial networks. An output layer of size
+        `(l_schedule[0]+extra_node_l+1)*channels` will be automatically appended.
+    use_env_seed
+        If True, seed the initial node state with local-environment information:
+        apply environment matrix FiLM conditioning on l=0 features using 4D
+        `[s, s*r_hat]` representation, and enable the non-scalar geometric
+        initial embedding when `l_schedule[0] + extra_node_l > 0`. If False, the initial state
+        contains only atom-local scalar features before message passing. FiLM
+        deltas are normalized and scaled with learnable strengths initialized
+        to small values. Internal dimensions are derived from `channels`:
+        `embed_dim=min(channels, 128)`,
+        `axis_dim=min(4 if embed_dim < 64 else 8, embed_dim-1)`,
+        `type_dim=clamp(channels//4, 8, 32)`,
+        `rbf_out_dim=max(32, embed_dim-2*type_dim)`,
+        `hidden_dim=min(256, max(2*embed_dim, rbf_out_dim+2*type_dim))`.
+    random_gamma
+        If True, apply a random roll about the edge-aligned local ``+Z`` axis
+        before building the Wigner-D blocks. The roll is sampled independently
+        per edge and per forward call.
+    edge_cartesian
+        If True, every block whose message-passing degree is ``1`` or ``2``
+        replaces its per-edge SO(2) rotation-frame tensor product with the
+        equivalent global-frame Cartesian rank-2 tensor product, removing the two
+        per-edge Wigner-D rotations. Blocks with degree ``0`` or ``>= 3`` keep
+        the SO(2) path. When every block takes the Cartesian path the full
+        Wigner-D construction is skipped automatically, and the geometric initial
+        embedding falls back to the zonal coupling.
+    node_cartesian
+        Per-node global-frame Cartesian rank-2 tensor product on the aggregated
+        message, applied in every block whose message-passing degree is ``1`` or
+        ``2``. Configured by a ``"<mode>:<layers>"`` string where ``mode`` is
+        ``"default"`` (one-sided product) or ``"parity"`` (symmetrized product)
+        and ``layers`` is the stack depth; a bare integer ``N`` is shorthand for
+        ``"default:N"``, and ``"none"`` disables it. Orthogonal to
+        ``edge_cartesian``: either, both, or neither may be set. Unlike
+        ``edge_cartesian`` it does not affect the Wigner-D construction, since the
+        per-edge message path is left unchanged.
+    lmax
+        Maximum degree, only used when `l_schedule` is None.
+    l_schedule
+        Pyramid schedule of lmax per block, e.g. [3, 3, 2]. Must be non-increasing.
+        If set, lmax and n_blocks will be ignored.
+    mmax
+        Maximum SO(2) order (|m|), only used when `m_schedule` is None.
+        If None, defaults to the per-block `lmax` (i.e. `m_schedule = l_schedule`).
+    kmax
+        Maximum Wigner-D frame order (|k|) used by SO(3) grid nets. The frame set
+        is built as ``[0, -1, 1, ..., -kmax, kmax]``. ``kmax=0`` recovers the
+        S2-like k=0 slice, while ``kmax=1`` is the default low-cost setting that
+        opens odd/antisymmetric coupling paths.
+    m_schedule
+        Schedule of mmax per block, e.g. [2, 2, 1, 0]. Must satisfy
+        `m_schedule[i] <= l_schedule[i]` for every block. A non-increasing schedule is
+        recommended but not required. If set, `mmax` will be ignored.
+    extra_node_l
+        Extra node representation degree above each message-passing degree.
+        The node degree of block `i` is `l_schedule[i] + extra_node_l`, while
+        SO(2) message passing still uses `l_schedule[i]`.
+    n_blocks
+        Number of blocks (only used when `l_schedule` is None).
+    so2_norm
+        If True, apply intermediate ReducedEquivariantRMSNorm between SO(2) mixing layers.
+        When False (default), no normalization is applied between layers.
+    mixing_layers
+        Number of learnable mixing layers in the per-edge message core of each
+        block (legacy alias: ``so2_layers``). ``0`` applies only the
+        edge-condition modulation: the rotation-free per-degree radial scaling on
+        the SO(2) path, or a single ``x @ T_e`` when ``edge_cartesian`` applies.
+        The per-node ``node_cartesian`` stack carries its own independent depth.
+    so2_attn_res
+        SO(2)-internal depth-wise attention residual mode inside each interaction
+        block. Must be one of ``"none"``, ``"independent"``, or ``"dependent"``.
+    radial_so2_mode
+        Dynamic radial degree mixer mode inside SO(2) convolution. ``"none"``
+        applies elementwise radial modulation, ``"degree"`` uses a
+        channel-shared edge-conditioned cross-degree kernel, and
+        ``"degree_channel"`` uses a per-channel cross-degree kernel. Has no
+        effect on blocks taking the Cartesian path (``edge_cartesian`` with
+        degree 1 or 2), where the dynamic radial degree mixer is bypassed.
+    radial_so2_rank
+        Low-rank channel factorization rank for
+        ``radial_so2_mode="degree_channel"``. ``0`` uses the full
+        per-channel dynamic degree kernel.
+    n_focus
+        Number of parallel focus streams used only inside the SO(2) convolution.
+        Node-level backbone tensors still keep a singleton focus axis.
+    focus_dim
+        Hidden width per focus stream inside the SO(2) convolution.
+        ``focus_dim=0`` means using ``channels``.
+    n_atten_head
+        Number of attention heads when aggregating messages in SO(2) convolution.
+        0 applies a plain envelope-weighted scatter-sum; >0 enables
+        envelope-gated grouped softmax attention with output-side head gate.
+        Attention uses ``w**2 * exp(logit)`` in the numerator and
+        ``zeta + sum(w**2 * exp(logit))`` in the denominator.
+    atten_f_mix
+        If True, merge all SO(2) focus streams into one attention stream after
+        rotate-back. Attention heads split ``n_focus * focus_dim`` instead of
+        each focus stream independently.
+    atten_v_proj
+        If True, apply an explicit degree-aware value projection inside SO(2)
+        attention.
+    atten_o_proj
+        If True, apply an explicit degree-aware output projection inside SO(2)
+        attention.
+    ffn_neurons
+        Hidden width for block FFNs and the final scalar output FFN.
+        If ``>0``, both paths use this width.
+        If ``=0``, each path resolves its own width from ``channels`` and its
+        effective GLU setting: ``4 * channels`` without GLU, ``(8 / 3) * channels``
+        with GLU, then round up to a multiple of 32.
+    grid_mlp
+        Either one boolean applied to every grid path, or three booleans
+        ``[node_wise, message_node, ffn]`` selecting the polynomial point-wise
+        grid MLP operation per grid path. On any path whose ``grid_branch``
+        entry is positive it is overridden by branch mixing, and it has no
+        effect on the final ``l=0`` output head.
+    grid_branch
+        Either one non-negative integer applied to every grid path, or three
+        integers ``[node_wise, message_node, ffn]`` setting the number of
+        scalar-routed polynomial product branches per grid path. ``0`` disables
+        branch mixing on that path; positive values select branch mixing and
+        take precedence over ``grid_mlp``. Branch weights are computed from
+        ``l=0`` scalar features only, while each branch is a quadratic product
+        of channel-mixed grid fields. The ``node_wise`` and ``message_node``
+        entries control the SO(2) convolution cross-grid paths, and the ``ffn``
+        entry controls the block-internal FFN grid path.
+    ffn_blocks
+        Number of FFN subblocks per interaction block.
+    sandwich_norm
+        Pre/post-norm switches for [SO(2), FFN] residual branches in order:
+        [so2_pre, so2_post, ffn_pre, ffn_post], shared across all blocks.
+    mlp_bias
+        Whether to use bias in equivariant layers. When False, removes bias from:
+        - SO3Linear: l=0 bias
+        - SO2Linear: l=0 bias
+        - GatedActivation: gate linear bias
+        - DepthAttnRes: input-dependent query projection
+        - EnvironmentInitialEmbedding:
+          rbf_proj_layer1/2 and g_layer1/2
+        Attention logit and output-gate parameters in SO(2) convolution are
+        always bias-free.
+    layer_scale
+        If True, apply learnable LayerScale (init 1e-3) on residual branches:
+        - SO(2) branch: per-focus-channel scales `(n_focus, focus_dim)`
+          on each SO(2) mixing layer.
+        - FFN branch: per-channel scales `(channels,)` on each FFN subblock.
+    full_attn_res
+        Descriptor-level full attention residual mode over the unit history
+        `[x0, so2_0, ffn_0_0, ffn_0_1, ..., so2_1, ffn_1_0, ffn_1_1, ...]`,
+        where each FFN subblock contributes its own completed unit
+        representation. `independent` uses learned query vectors, while
+        `dependent` derives queries from the current SeZM state before the
+        SO(2) unit, before each FFN unit, and before the final aggregation.
+        Must be one of ``"none"``, ``"independent"``, or ``"dependent"``.
+    block_attn_res
+        Descriptor-level block attention residual mode over the block history
+        `[x0, b1, b2, ...]`, where each `b_i` is the sum of all unit outputs
+        inside one `SeZMInteractionBlock`. `independent` uses learned query
+        vectors, while `dependent` derives queries from the current SeZM state
+        before the SO(2) unit, before each FFN unit, and before the final block
+        aggregation. Must be one of ``"none"``, ``"independent"``, or
+        ``"dependent"``. Cannot be enabled together with `full_attn_res`.
+    s2_activation
+        Two booleans ``[so2_enabled, ffn_enabled]``.
+        ``so2_enabled=True`` makes the SO(2) gated activation path use
+        ``activation_function="silu"``.
+        ``ffn_enabled=True`` makes the block-internal FFN path use
+        ``activation_function="silu"`` and ``glu_activation=True``.
+        S2-grid resolutions are resolved automatically per block. The
+        tensor-product grid uses ``[2 * mmax + 4, ceil_even(3 * lmax + 2)]``
+        in the SO(2) branch, and the FFN branch lifts it to a square
+        ``[max(R_phi, R_theta), max(R_phi, R_theta)]`` grid. Lebedev branches
+        use the smallest packaged rule with precision at least ``3 * lmax``.
+        The final ``l=0`` output FFN is unchanged.
+    ffn_so3_grid
+        If True, use the SO(3) Wigner-D grid in the block-internal FFN. This
+        option takes precedence over the FFN grid path and ignores
+        ``s2_activation[1]``. The final ``l=0`` output FFN is unchanged.
+    node_wise_s2
+        If True, add an edge-local S2 product branch between source and
+        destination node features inside the SO(2) convolution.
+    node_wise_so3
+        If True, use the corresponding edge-local SO(3) Wigner-D grid-net branch.
+        The source side is the query and the destination side is the context.
+    message_node_s2
+        If True, add a post-aggregation S2 product branch between hidden messages
+        and destination node features before the SO(2) output projection.
+    message_node_so3
+        If True, use the corresponding post-aggregation SO(3) Wigner-D grid-net
+        branch. The message is the query and the node state is the context.
+    so3_readout
+        Read-out FFN mode for the final ``l=0`` descriptor. ``"none"`` applies a
+        degree-0 scalar FFN to the ``l=0`` slice only; ``l>0`` coefficients are
+        discarded before the read-out. ``"glu"`` and ``"mlp"`` apply a full
+        equivariant FFN whose degree equals the node degree of the last
+        interaction block, driven by the SO(3) Wigner-D grid, so ``l>0`` geometry
+        is folded into ``l=0`` before the scalar is extracted. The value selects
+        the quadratic grid product (``"glu"``) or the polynomial point-wise grid
+        MLP (``"mlp"``). The Wigner-D frame order follows ``kmax``. The residual
+        stays on the ``l=0`` channel.
+    lebedev_quadrature
+        Either one boolean applied to both S2 branches, or two booleans
+        ``[so2_enabled, ffn_enabled]`` aligned with ``s2_activation``. If
+        enabled for a branch, that branch uses packaged Lebedev quadrature
+        instead of the tensor-product sphere grid in its S2 projector.
+    activation_function
+        Base activation function for helper MLPs, the SO(2) gated activation
+        path, and the final ``l=0`` output FFN.
+        It is overridden to ``"silu"`` only on paths whose ``s2_activation``
+        switch is enabled.
+    glu_activation
+        Base GLU switch for FFN. The block-internal FFN path overrides it to
+        ``True`` only when ``s2_activation[1]=True``. The final ``l=0`` output
+        FFN always keeps this user-provided value.
+    use_amp
+        If True, use automatic mixed precision (AMP) with bfloat16 on CUDA
+        during training. This can improve speed and reduce memory usage.
+        Enabling this option is recommended on GPUs with native bfloat16 support.
+        Disable it on GPUs without native bfloat16 support to avoid runtime
+        errors or additional conversion overhead.
+    exclude_types
+        List of excluded type pairs.
+    precision
+        Precision for neural network parameters and computations. Geometry computations
+        (edge distances, Wigner-D matrices, rotations, and enabled env seeds) always
+        run in fp32+ to provide accurate geometric information for better convergence.
+        Only the interaction blocks use this precision.
+    eps
+        Small epsilon for numerical stability in division and normalization.
+    trainable
+        Whether parameters are trainable.
+    seed
+        Random seed(s).
+    type_map
+        Type names.
+    inner_clamp_r_inner
+        Inner radius for distance saturation in Å. If both inner and outer radii
+        are set, the descriptor freezes short-range descriptor geometry inside
+        the zone-bridging window.
+    inner_clamp_r_outer
+        Outer radius for distance saturation in Å.
+    add_chg_spin_ebd
+        If True, add frame-level charge/spin condition embedding to scalar type
+        features before edge features are built.
+    default_chg_spin
+        Default frame-level charge/spin condition `[charge, spin]`. This value is
+        used when `add_chg_spin_ebd=True` and no explicit `charge_spin` tensor is
+        provided at the descriptor or SeZM model boundary.
+
+    Notes
+    -----
+    SeZM does not use the traditional environment matrix (r, a_x, a_y, a_z).
+    Instead, it uses radial basis functions and spherical harmonics directly.
+    The mean/stddev statistics are kept for interface compatibility but are not
+    actively used in the forward pass.
     """
 
+    _ENV_DIM: int = 1  # Use se_r style (radial only) for EnvMatStatSe compatibility
     LATEST_VERSION: float = 1.1
 
     def __init__(
@@ -161,6 +445,8 @@ def __init__(
         radial_mlp: list[int] | None = None,
         use_env_seed: bool = True,
         random_gamma: bool = True,
+        edge_cartesian: bool = False,
+        node_cartesian: str | int = "none",
         lmax: int = 3,
         l_schedule: list[int] | None = None,
         mmax: int | None = 1,
@@ -169,7 +455,8 @@ def __init__(
         extra_node_l: int = 0,
         n_blocks: int = 3,
         so2_norm: bool = False,
-        so2_layers: int = 4,
+        mixing_layers: int = 4,
+        so2_layers: int | None = None,
         so2_attn_res: str = "none",
         radial_so2_mode: str = "degree_channel",
         radial_so2_rank: int = 1,
@@ -221,8 +508,11 @@ def __init__(
             )
         self.env_exp = [int(x) for x in env_exp]
         self.eps = float(eps)
-        # version >= 1.1 O(1) floor for the envelope-squared degree
-        # normalization (see pt sezm.py).
+        # Floor for the envelope-squared degree normalization (GIE / env_seed).
+        # version < 1.1 keeps the tiny ``eps`` floor (legacy path, untouched);
+        # version >= 1.1 swaps in this O(1) value so sparse-neighborhood (e.g.
+        # dimer) features vanish smoothly at rcut instead of saturating and
+        # kinking just inside the cutoff.
         self.deg_norm_floor = 0.25
 
         if isinstance(sel, int):
@@ -231,6 +521,7 @@ def __init__(
         self.sel = [int(x) for x in sel]
         self.type_map = type_map
         self.nnei = int(sum(self.sel))
+        self.ndescrpt = int(self.nnei * self._ENV_DIM)
 
         self.channels = int(channels)
         self.n_focus = int(n_focus)
@@ -248,27 +539,22 @@ def __init__(
             sandwich_norm = [False, True, True, False]
         if not isinstance(sandwich_norm, (list, tuple)) or len(sandwich_norm) != 4:
             raise ValueError(
-                "sandwich_norm must be a list[bool] of length 4: "
-                "[so2_pre, so2_post, ffn_pre, ffn_post]"
+                "sandwich_norm must be a list[bool] of length 4: [so2_pre, so2_post, ffn_pre, ffn_post]"
             )
         self.sandwich_norm = [bool(x) for x in sandwich_norm]
-        (
-            self.so2_pre_norm,
-            self.so2_post_norm,
-            self.ffn_pre_norm,
-            self.ffn_post_norm,
-        ) = self.sandwich_norm
+        self.so2_pre_norm = self.sandwich_norm[0]
+        self.so2_post_norm = self.sandwich_norm[1]
+        self.ffn_pre_norm = self.sandwich_norm[2]
+        self.ffn_post_norm = self.sandwich_norm[3]
         if s2_activation is None:
             s2_activation = [False, True]
         if not isinstance(s2_activation, list) or len(s2_activation) != 2:
             raise ValueError(
-                "`s2_activation` must be a list[bool] of length 2: "
-                "[so2_activation, ffn_activation]"
+                "`s2_activation` must be a list[bool] of length 2: [so2_activation, ffn_activation]"
             )
         if any(not isinstance(flag, bool) for flag in s2_activation):
             raise ValueError(
-                "`s2_activation` must be a list[bool] of length 2: "
-                "[so2_activation, ffn_activation]"
+                "`s2_activation` must be a list[bool] of length 2: [so2_activation, ffn_activation]"
             )
         self.s2_activation = list(s2_activation)
         self.ffn_so3_grid = bool(ffn_so3_grid)
@@ -285,27 +571,17 @@ def __init__(
             lebedev_quadrature = [lebedev_quadrature, lebedev_quadrature]
         if not isinstance(lebedev_quadrature, list) or len(lebedev_quadrature) != 2:
             raise ValueError(
-                "`lebedev_quadrature` must be a bool or a list[bool] of length 2: "
-                "[so2_quadrature, ffn_quadrature]"
+                "`lebedev_quadrature` must be a bool or a list[bool] of length 2: [so2_quadrature, ffn_quadrature]"
             )
         if any(not isinstance(flag, bool) for flag in lebedev_quadrature):
             raise ValueError(
-                "`lebedev_quadrature` must be a bool or a list[bool] of length 2: "
-                "[so2_quadrature, ffn_quadrature]"
+                "`lebedev_quadrature` must be a bool or a list[bool] of length 2: [so2_quadrature, ffn_quadrature]"
             )
         self.lebedev_quadrature = list(lebedev_quadrature)
-        # The tensor-product (e3nn-style) sphere grid is not ported to
-        # dpmodel; only the packaged Lebedev quadrature path exists
-        # (see dpa4_nn.projection).
-        if not all(self.lebedev_quadrature):
-            raise NotImplementedError(
-                "lebedev_quadrature entries with False (tensor-product S2 "
-                "grid) are not ported to dpmodel"
-            )
         self.activation_function = str(activation_function)
         self.glu_activation = bool(glu_activation)
 
-        # === Split effective activation config by branch (pt sezm.py) ===
+        # === Split effective activation config by branch ===
         self.so2_s2_activation = self.s2_activation[0]
         self.ffn_s2_activation = False if self.ffn_so3_grid else self.s2_activation[1]
         self.so2_lebedev_quadrature = self.lebedev_quadrature[0]
@@ -324,50 +600,56 @@ def __init__(
         self.out_activation_function = self.activation_function
         self.out_glu_activation = self.glu_activation
         self.precision = str(precision)
-        # Geometry / seeding paths run in promoted ("fp32+") precision (pt
-        # uses compute_dtype = get_promoted_dtype(dtype) there).
         self.compute_precision = str(
             np.dtype(get_promoted_dtype(PRECISION_DICT[self.precision])).name
         )
         self.mlp_bias = bool(mlp_bias)
         self.layer_scale = bool(layer_scale)
-        # pt-runtime-only switch (CUDA bfloat16 autocast during training);
-        # accepted for config compatibility and ignored by dpmodel.
-        self.use_amp = bool(use_amp)
-        if self.use_amp and "use_amp" not in _WARNED_ONCE:
-            log.warning(
-                "`use_amp` has no effect on the dpmodel/pt_expt backend "
-                "(it is a pt-runtime CUDA autocast switch); ignoring it."
-            )
-            _WARNED_ONCE.add("use_amp")
+        self.use_amp = bool(use_amp)  # and self.training
         self.trainable = bool(trainable)
         self.seed = seed
         self.random_gamma = bool(random_gamma)
+        self.edge_cartesian = bool(edge_cartesian)
+        self.node_cartesian = str(node_cartesian)
         self.add_chg_spin_ebd = bool(add_chg_spin_ebd)
-        if self.add_chg_spin_ebd:
-            raise NotImplementedError(
-                "add_chg_spin_ebd=True (ChargeSpinEmbedding) is not ported to dpmodel"
-            )
         if default_chg_spin is not None and len(default_chg_spin) != 2:
             raise ValueError("`default_chg_spin` must contain [charge, spin].")
         self.default_chg_spin = (
             None if default_chg_spin is None else [float(x) for x in default_chg_spin]
         )
 
-        # === Zone bridging (InnerClamp + BridgingSwitch): not ported ===
+        # === Zone bridging: InnerClamp + Source Freeze Propagation Gate ===
+        # Both the geometry clamp (``InnerClamp``) and the message-passing
+        # switch (``BridgingSwitch``) are activated together on the same
+        # ``[r_inner, r_outer]`` window. The clamp freezes scalar distance
+        # on every ``(j, k)`` edge with ``r_{jk} < r_inner``; the switch
+        # feeds a per-edge C3 amplitude into ``compute_edge_src_gate`` so
+        # that any node with a frozen neighbor cannot propagate
+        # information through the GNN, closing the direction / multi-hop
+        # leakage channels that a pure ``InnerClamp`` cannot reach. Both
+        # modules are parameter-free, so enabling bridging does not add
+        # any keys to the descriptor's state dict.
         self.inner_clamp_r_inner = (
             float(inner_clamp_r_inner) if inner_clamp_r_inner is not None else None
         )
         self.inner_clamp_r_outer = (
             float(inner_clamp_r_outer) if inner_clamp_r_outer is not None else None
         )
-        if self.inner_clamp_r_inner is not None or self.inner_clamp_r_outer is not None:
-            raise NotImplementedError(
-                "inner_clamp_r_inner/inner_clamp_r_outer (zone bridging) are "
-                "not ported to dpmodel"
+        if (
+            self.inner_clamp_r_inner is not None
+            and self.inner_clamp_r_outer is not None
+        ):
+            self.inner_clamp: InnerClamp | None = InnerClamp(
+                self.inner_clamp_r_inner, self.inner_clamp_r_outer
             )
+            self.bridging_switch: BridgingSwitch | None = BridgingSwitch(
+                self.inner_clamp_r_inner, self.inner_clamp_r_outer
+            )
+        else:
+            self.inner_clamp = None
+            self.bridging_switch = None
 
-        # === Env seed derived dimensions (pt sezm.py) ===
+        # === Env seed parameters ===
         self.use_env_seed = bool(use_env_seed)
         self.env_seed_embed_dim = min(self.channels, 128)
         self.env_seed_type_dim = min(32, max(8, self.channels // 4))
@@ -377,12 +659,15 @@ def __init__(
         g_in_dim = rbf_out_dim + 2 * self.env_seed_type_dim
         self.env_seed_hidden_dim = min(256, max(2 * self.env_seed_embed_dim, g_in_dim))
 
-        # === Deterministic seed split (same indices as pt) ===
+        # === Split deterministic seeds at the descriptor top-level ===
         seed_type_embedding = child_seed(self.seed, 0)
         seed_blocks = child_seed(self.seed, 1)
         seed_out = child_seed(self.seed, 2)
         seed_radial_embedding = child_seed(self.seed, 3)
         seed_env_seed = child_seed(self.seed, 4)
+        seed_full_attn = child_seed(self.seed, 5)
+        seed_block_attn = child_seed(self.seed, 6)
+        seed_charge_spin = child_seed(self.seed, 7)
 
         # === L/M schedules ===
         self._init_lm_schedules(lmax, n_blocks, l_schedule, mmax, m_schedule)
@@ -396,7 +681,9 @@ def __init__(
         self.rad_sizes_per_block = [l + 1 for l in self.l_schedule]
 
         self.so2_norm = bool(so2_norm)
-        self.so2_layers = int(so2_layers)
+        # ``so2_layers`` is the legacy alias for ``mixing_layers``; when supplied
+        # it takes precedence so existing configs keep working.
+        self.mixing_layers = int(mixing_layers if so2_layers is None else so2_layers)
         self.so2_attn_res_mode = str(so2_attn_res).lower()
         if self.so2_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
@@ -412,16 +699,23 @@ def __init__(
             raise ValueError("`radial_so2_rank` must be non-negative")
         self.ffn_neurons = int(ffn_neurons)
         self.block_ffn_neurons = self._resolve_ffn_neurons(
-            self.ffn_neurons, glu_activation=self.ffn_glu_activation
+            self.ffn_neurons,
+            glu_activation=self.ffn_glu_activation,
         )
         self.out_ffn_neurons = self._resolve_ffn_neurons(
-            self.ffn_neurons, glu_activation=self.out_glu_activation
+            self.ffn_neurons,
+            glu_activation=self.out_glu_activation,
         )
         self.grid_mlp = self._broadcast_grid_setting(
-            grid_mlp, name="grid_mlp", cast=bool
+            grid_mlp,
+            name="grid_mlp",
+            cast=bool,
         )
         self.grid_branch = self._broadcast_grid_setting(
-            grid_branch, name="grid_branch", cast=int, non_negative=True
+            grid_branch,
+            name="grid_branch",
+            cast=int,
+            non_negative=True,
         )
         (
             self.node_wise_grid_mlp,
@@ -469,17 +763,28 @@ def __init__(
         # === Excluded type pairs ===
         self.reinit_exclude(exclude_types)
 
-        # === Type embedding (fp32+) ===
+        # === Type embedding ===
         self.type_embedding = SeZMTypeEmbedding(
             ntypes=self.ntypes,
             embed_dim=self.channels,
-            precision=self.compute_precision,
+            precision=self.compute_precision,  # force fp32+
             seed=seed_type_embedding,
             trainable=self.trainable,
         )
+        if self.add_chg_spin_ebd:
+            self.charge_spin_embedding: ChargeSpinEmbedding | None = (
+                ChargeSpinEmbedding(
+                    embed_dim=self.channels,
+                    activation_function=self.activation_function,
+                    precision=self.compute_precision,
+                    seed=seed_charge_spin,
+                    trainable=self.trainable,
+                )
+            )
+        else:
+            self.charge_spin_embedding = None
 
-        # === Env FiLM embedding (optional, fp32+) ===
-        compute_np_prec = PRECISION_DICT[self.compute_precision]
+        # === Env FiLM embedding (optional) ===
         if self.use_env_seed:
             self.env_seed_embedding: EnvironmentInitialEmbedding | None = (
                 EnvironmentInitialEmbedding(
@@ -493,19 +798,19 @@ def __init__(
                     mlp_bias=self.mlp_bias,
                     activation_function=self.activation_function,
                     eps=self.eps,
-                    precision=self.compute_precision,
+                    precision=self.compute_precision,  # force fp32+
                     trainable=self.trainable,
                     seed=seed_env_seed,
                 )
             )
-            self.film_scale_norm: ScalarRMSNorm | None = ScalarRMSNorm(
+            self.film_scale_norm = ScalarRMSNorm(
                 channels=self.channels,
                 n_focus=1,
                 eps=self.eps,
                 precision=self.compute_precision,
                 trainable=self.trainable,
             )
-            self.film_shift_norm: ScalarRMSNorm | None = ScalarRMSNorm(
+            self.film_shift_norm = ScalarRMSNorm(
                 channels=self.channels,
                 n_focus=1,
                 eps=self.eps,
@@ -513,11 +818,16 @@ def __init__(
                 trainable=self.trainable,
             )
             film_strength_init = 0.01
-            self.film_scale_strength_log: np.ndarray | None = np.full(
-                (1,), math.log(film_strength_init), dtype=compute_np_prec
+            # Use 1D tensor (not scalar) for FSDP2 compatibility
+            self.film_scale_strength_log = np.full(
+                (1,),
+                math.log(film_strength_init),
+                dtype=PRECISION_DICT[self.compute_precision],
             )
-            self.film_shift_strength_log: np.ndarray | None = np.full(
-                (1,), math.log(film_strength_init), dtype=compute_np_prec
+            self.film_shift_strength_log = np.full(
+                (1,),
+                math.log(film_strength_init),
+                dtype=PRECISION_DICT[self.compute_precision],
             )
         else:
             self.env_seed_embedding = None
@@ -530,43 +840,56 @@ def __init__(
             rcut=self.rcut,
             basis_type=self.basis_type,
             n_radial=self.n_radial,
-            precision=self.compute_precision,
+            precision=self.compute_precision,  # force fp32+
             exponent=self.env_exp[0],
         )
 
-        # === Shared radial embedding: RBF -> per-l radial features (fp32+) ===
+        # === Shared radial embedding: RBF -> per-l radial features ===
+        # Output dimension follows the first node degree, directly usable by
+        # GIE and truncated for each SO2Conv block.
+        # radial_mlp specifies hidden layer sizes; input/output layers are prepended/appended.
+        # Use fp32+ precision (same as RBF output) for numerical stability.
         radial_out_dim = (self.node_l_schedule[0] + 1) * self.channels
         radial_mlp_layers = [self.n_radial, *self.radial_mlp, radial_out_dim]
         self.radial_embedding = RadialMLP(
             radial_mlp_layers,
             activation_function=self.activation_function,
-            precision=self.compute_precision,
+            precision=self.compute_precision,  # force fp32+
             trainable=self.trainable,
             seed=seed_radial_embedding,
         )
 
         # === C^3 cutoff envelope for edge weight ===
-        self.edge_envelope = C3CutoffEnvelope(
-            self.rcut, self.env_exp[1], precision=self.compute_precision
-        )
-
-        wigner_lmax = self.l_schedule[0]
+        self.edge_envelope = C3CutoffEnvelope(rcut=self.rcut, exponent=self.env_exp[1])
+
+        # === Edge-aligned Wigner-D calculator ===
+        # Cartesian blocks (degree 1 or 2) skip the SO(2) rotations, so the full
+        # per-edge Wigner-D blocks are built only when a block keeps the SO(2)
+        # path (tracked by ``_need_full_wigner``).
+        block_edge_cartesian = [
+            self.edge_cartesian and l_b in (1, 2) for l_b in self.l_schedule
+        ]
+        block_node_cartesian = [
+            self.node_cartesian if l_b in (1, 2) else "none" for l_b in self.l_schedule
+        ]
+        self._need_full_wigner = not all(block_edge_cartesian)
         self.wigner_calc = WignerDCalculator(
-            wigner_lmax, eps=self.eps, precision=self.compute_precision
+            lmax=self.l_schedule[0],
+            eps=self.eps,
+            precision=self.compute_precision,  # force fp32+
         )
 
-        # === Geometric initial embedding (optional, fp32+) ===
         self.use_gie = self.use_env_seed and self.node_l_schedule[0] > 0
         if self.use_gie:
-            self.gie: GeometricInitialEmbedding | None = GeometricInitialEmbedding(
+            self.gie = GeometricInitialEmbedding(
                 lmax=self.node_l_schedule[0],
                 channels=self.channels,
-                precision=self.compute_precision,
+                precision=self.compute_precision,  # force fp32+
             )
             if self.extra_node_l > 0:
                 self.gie_zonal_wigner_calc: WignerDCalculator | None = (
                     WignerDCalculator(
-                        self.node_l_schedule[0],
+                        lmax=self.node_l_schedule[0],
                         eps=self.eps,
                         precision=self.compute_precision,
                     )
@@ -577,7 +900,6 @@ def __init__(
             self.gie = None
             self.gie_zonal_wigner_calc = None
 
-        # === Interaction blocks ===
         blocks: list[SeZMInteractionBlock] = []
         for block_idx, (l_b, node_l_b, m_b) in enumerate(
             zip(
@@ -597,10 +919,12 @@ def __init__(
                     n_focus=self.n_focus,
                     focus_dim=self.focus_dim,
                     so2_norm=self.so2_norm,
-                    so2_layers=self.so2_layers,
+                    mixing_layers=self.mixing_layers,
                     so2_attn_res=self.so2_attn_res_mode,
                     radial_so2_mode=self.radial_so2_mode,
                     radial_so2_rank=self.radial_so2_rank,
+                    edge_cartesian=block_edge_cartesian[block_idx],
+                    node_cartesian=block_node_cartesian[block_idx],
                     ffn_neurons=self.block_ffn_neurons,
                     node_wise_grid_mlp=self.node_wise_grid_mlp,
                     node_wise_grid_branch=self.node_wise_grid_branch,
@@ -642,7 +966,32 @@ def __init__(
             )
         self.blocks = blocks
 
-        # === Final FFN for l=0 output mixing (fp32+) ===
+        # === Optional descriptor-level attention residuals ===
+        self.final_block_attn_res = None
+        if self.use_full_attn_res:
+            self.final_full_attn_res: DepthAttnRes | None = DepthAttnRes(
+                channels=self.channels,
+                input_dependent=self.full_attn_res_mode == "dependent",
+                eps=self.eps,
+                bias=self.mlp_bias,
+                precision=self.compute_precision,
+                trainable=self.trainable,
+                seed=child_seed(seed_full_attn, 2000),
+            )
+        else:
+            self.final_full_attn_res = None
+        if self.use_block_attn_res:
+            self.final_block_attn_res: DepthAttnRes | None = DepthAttnRes(
+                channels=self.channels,
+                input_dependent=self.block_attn_res_mode == "dependent",
+                eps=self.eps,
+                bias=self.mlp_bias,
+                precision=self.compute_precision,
+                trainable=self.trainable,
+                seed=child_seed(seed_block_attn, 2000),
+            )
+
+        # === Final FFN for l=0 output mixing ===
         # ``so3_readout="none"`` runs a degree-0 scalar FFN on the l=0 slice.
         # ``"glu"``/``"mlp"`` run a full FFN at the last block's node degree whose
         # SO(3) Wigner-D grid folds l>0 geometry into l=0; the value selects the
@@ -655,24 +1004,786 @@ def __init__(
             kmax=min(self.kmax, readout_lmax),
             grid_mlp=self.so3_readout == "mlp",
             grid_branch=0,
+            precision=self.compute_precision,
             s2_activation=False,
             ffn_so3_grid=self.so3_readout != "none",
             activation_function=self.out_activation_function,
             glu_activation=self.out_glu_activation,
             mlp_bias=self.mlp_bias,
-            precision=self.compute_precision,
             trainable=self.trainable,
             seed=seed_out,
         )
 
-        # === Statistics buffers (interface compatibility, unused in call) ===
-        model_np_prec = PRECISION_DICT[self.precision]
-        self.mean = np.zeros((0,), dtype=model_np_prec)
-        self.stddev = np.ones((0,), dtype=model_np_prec)
+        # === Statistics buffers (interface compatibility) ===
+        self.stats: dict[str, Any] | None = None
+        self.mean = np.zeros(0, dtype=PRECISION_DICT[self.precision])
+        self.stddev = np.ones(0, dtype=PRECISION_DICT[self.precision])
 
-    # =========================================================================
-    # Construction helpers (mirroring pt)
-    # =========================================================================
+    def call(
+        self,
+        coord_ext: Array,
+        atype_ext: Array,
+        nlist: Array,
+        mapping: Array | None = None,
+        edge_index: Array | None = None,
+        edge_vec: Array | None = None,
+        edge_mask: Array | None = None,
+        comm_dict: dict[str, Array] | None = None,
+        fparam: Array | None = None,
+        force_embedding: Array | None = None,
+        charge_spin: Array | None = None,
+    ) -> tuple[
+        Array,
+        Array | None,
+        Array | None,
+        Array | None,
+        Array | None,
+    ]:
+        """
+        Compute the descriptor.
+
+        Parameters
+        ----------
+        coord_ext
+            Extended coordinates of atoms with shape (nf, nall*3) or (nf, nall, 3) in Å.
+        atype_ext
+            Extended atom types with shape (nf, nall).
+        nlist
+            Neighbor list with shape (nf, nloc, nnei).
+        mapping
+            Extended-to-local mapping with shape (nf, nall), or None.
+        edge_index
+            Fixed-shape edge indices with shape (2, E). If provided, the descriptor
+            uses the edge-list path and ignores `nlist` and `mapping`.
+        edge_vec
+            Fixed-shape edge vectors with shape (E, 3) in Å. Required when
+            `edge_index` is provided.
+        edge_mask
+            Fixed-shape edge mask with shape (E,). Required when `edge_index`
+            is provided.
+        comm_dict
+            Communication dictionary for parallel inference (unused).
+        fparam
+            Frame parameters with shape (nf, nfp). Not used by SeZM, kept for
+            interface compatibility.
+        force_embedding
+            Optional precomputed equivariant force embedding with shape
+            ``(nf * nloc, D, 1, channels)``, where
+            ``D = (node_l_schedule[0] + 1) ** 2``. This tensor is added to the
+            initial SO(3) backbone state before the interaction blocks.
+        charge_spin
+            Frame-level charge and spin conditions with shape (nf, 2).
+
+        Returns
+        -------
+        descriptor
+            Descriptor with shape (nf, nloc, channels). Only l=0 is returned.
+        rot_mat
+            None (not used).
+        g2
+            None (not used).
+        h2
+            None (not used).
+        sw
+            None (not used).
+        """
+        xp = array_api_compat.array_namespace(coord_ext, atype_ext)
+        device = array_api_compat.device(coord_ext)
+        if coord_ext.ndim == 2:
+            coord_ext = xp.reshape(coord_ext, (coord_ext.shape[0], -1, 3))
+        elif coord_ext.ndim != 3:
+            raise ValueError("coord_ext must have shape (nf, nall*3) or (nf, nall, 3)")
+
+        if edge_index is not None:
+            nf_edge = atype_ext.shape[0]
+            charge_spin = self._canonicalize_charge_spin(
+                charge_spin,
+                nf=nf_edge,
+                dtype=coord_ext.dtype,
+                device=device,
+            )
+            descriptor, _ = self.call_with_edges(
+                coord_ext=coord_ext,
+                atype_ext=atype_ext,
+                edge_index=edge_index,
+                edge_vec=edge_vec,
+                edge_mask=edge_mask,
+                force_embedding=force_embedding,
+                charge_spin=charge_spin,
+            )
+            return (
+                descriptor,
+                None,
+                None,
+                None,
+                None,
+            )
+
+        # === Step 1. Setup dimensions ===
+        coord_ext = xp.astype(coord_ext, get_xp_precision(xp, self.compute_precision))
+        nf, nloc, nnei = nlist.shape
+        nall = coord_ext.shape[1]
+        n_nodes = nf * nloc
+        charge_spin = self._canonicalize_charge_spin(
+            charge_spin,
+            nf=nf,
+            dtype=coord_ext.dtype,
+            device=device,
+        )
+
+        # === Step 2. Excluded type pairs ===
+        if self.exclude_types:
+            # (nf, nloc, nnei), True means keep.
+            pair_keep_mask = xp.astype(
+                self.emask.build_type_exclude_mask(nlist, atype_ext), xp.bool
+            )
+        else:
+            pair_keep_mask = xp.ones_like(nlist, dtype=xp.bool)
+
+        # === Step 3. Type embedding (l=0) ===
+        atype_loc = xp_take_first_n(atype_ext, 1, nloc)  # (nf, nloc)
+        type_ebed = xp.reshape(
+            self.type_embedding(atype_loc), (n_nodes, self.channels)
+        )  # (N, C)
+        if self.charge_spin_embedding is not None:
+            type_ebed = self._apply_charge_spin_embedding(
+                type_ebed,
+                charge_spin,
+                nf=nf,
+                nloc=nloc,
+            )
+
+        # === Step 4. Build edge cache once (geometry + RBF + Wigner-D) ===
+        # Zone bridging (InnerClamp + SFPG + ZBL) is not routed through the
+        # standard DeePMD path: bridging only makes physical sense when
+        # paired with the ZBL energy that ``SeZMModel`` injects on the
+        # sparse-edge path, so ``forward`` keeps the original
+        # bridging-free aggregation semantics.
+        edge_cache = build_edge_cache(
+            type_ebed=type_ebed,
+            extended_coord=coord_ext,
+            nlist=nlist,
+            mapping=mapping,
+            pair_keep_mask=pair_keep_mask,
+            eps=self.eps,
+            deg_norm_floor=(self.deg_norm_floor if self.version >= 1.1 else self.eps),
+            edge_envelope=self.edge_envelope,
+            radial_basis=self.radial_basis,
+            n_radial=self.radial_basis.n_radial,
+            # Random local-Z roll is a training-only augmentation;
+            # the model is roll-equivariant, so inference fixes gamma.
+            random_gamma=False,
+            wigner_calc=self.wigner_calc,
+            build_wigner=self._need_full_wigner,
+        )
+
+        ebed_dim_0 = self.node_ebed_dims[0]  # (node_lmax+1)^2
+        x0 = type_ebed  # (N, C)
+        x0_out = x0  # (N, C)
+
+        # === Step 5. Compute radial features once (fp32+) ===
+        # Shape: (E, (node_lmax+1)*C) -> (E, node_lmax+1, C)
+        radial_feat = xp.reshape(
+            self.radial_embedding(edge_cache.edge_rbf),
+            (-1, self.node_l_schedule[0] + 1, self.channels),
+        )  # (E, lmax+1, C)
+        if self.version >= 1.1:
+            radial_feat = radial_feat * xp.reshape(edge_cache.edge_env, (-1, 1, 1))
+
+        # === Step 6. Env FiLM conditioning (optional, fp32+) ===
+        if self.use_env_seed:
+            atype_flat = xp.reshape(atype_loc, (-1,))  # (N,)
+            film = self.env_seed_embedding(
+                edge_cache=edge_cache,
+                atype_flat=atype_flat,
+                n_nodes=n_nodes,
+            )  # (N, 2*C)
+            scale_logits = film[:, : self.channels]  # (N, C)
+            shift_logits = film[:, self.channels :]  # (N, C)
+            scale_hat = self.film_scale_norm(scale_logits)  # (N, C)
+            shift_hat = self.film_shift_norm(shift_logits)  # (N, C)
+            scale_strength = xp.exp(
+                xp_asarray_nodetach(
+                    xp, self.film_scale_strength_log[...], device=device
+                )
+            )
+            shift_strength = xp.exp(
+                xp_asarray_nodetach(
+                    xp, self.film_shift_strength_log[...], device=device
+                )
+            )
+            scale = 1.0 + scale_strength * xp.tanh(scale_hat)  # (N, C)
+            shift = shift_strength * xp.tanh(shift_hat)  # (N, C)
+            x0_out = x0 * scale + shift
+
+        # === Step 7. Build backbone l=0 features ===
+        x = xp.concat(
+            [
+                xp.reshape(x0_out, (n_nodes, 1, 1, self.channels)),
+                xp.zeros(
+                    (n_nodes, ebed_dim_0 - 1, 1, self.channels),
+                    dtype=type_ebed.dtype,
+                    device=device,
+                ),
+            ],
+            axis=1,
+        )  # (N, D, 1, C)
+
+        # === Step 8. Geometric Initial Embedding (fp32+) ===
+        if self.use_gie:
+            # GIE only needs l>=1, slice radial_feat[:, 1:, :]
+            zonal_coupling = self._build_gie_zonal_coupling(edge_cache)
+            x = (
+                x
+                + self.gie(
+                    n_nodes=n_nodes,
+                    edge_cache=edge_cache,
+                    radial_feat=radial_feat[:, 1:, :],
+                    zonal_coupling=zonal_coupling,
+                )[:, :, None, :]
+            )
+
+        # === Step 9. Fuse edge type features into radial features (fp32+) ===
+        radial_feat = radial_feat + xp.reshape(
+            edge_cache.edge_type_feat, (-1, 1, self.channels)
+        )
+        radial_feat = xp.astype(radial_feat, get_xp_precision(xp, self.precision))
+        rad_feat_per_block = [
+            radial_feat[:, :rad_len, :] for rad_len in self.rad_sizes_per_block
+        ]  # list of (E, lmax+1, C)
+
+        # === Step 10. Convert to self.dtype and run blocks ===
+        x = xp.astype(x, get_xp_precision(xp, self.precision))  # (N, D, 1, C)
+        if force_embedding is not None:
+            x = x + xp.astype(force_embedding, get_xp_precision(xp, self.precision))
+        edge_cache = edge_cache_to_dtype(
+            edge_cache, get_xp_precision(xp, self.precision)
+        )
+        x = self._forward_blocks(x, edge_cache, rad_feat_per_block)
+
+        # === Step 11. Final l=0 output mixing ===
+        # ``none`` feeds the l=0 slice only; ``glu``/``mlp`` feed the full
+        # (N, D, 1, C) node tensor so the SO(3) grid folds l>0 into l=0. The
+        # residual is added on the full coefficient tensor before extracting
+        # l=0: slicing the summed tensor rather than the FFN output keeps the
+        # saved degree-axis stride static under torch.compile dynamic shapes.
+        ffn_in = (
+            xp.astype(
+                xp.reshape(x[:, 0:1, :, :], (n_nodes, 1, 1, self.channels)),
+                get_xp_precision(xp, self.compute_precision),
+            )
+            if self.so3_readout == "none"
+            # truncate to the final node degree: the empty-edge path
+            # skips the blocks, leaving x at node_ebed_dims[0]; output_ffn
+            # is built for node_ebed_dims[-1]. No-op when blocks ran.
+            else xp.astype(
+                x[:, : self.node_ebed_dims[-1], :, :],
+                get_xp_precision(xp, self.compute_precision),
+            )
+        )
+        x_scalar = (ffn_in + self.output_ffn(ffn_in))[:, 0:1, :, :]
+
+        # === Step 12. Reshape to (nf, nloc, channels) and return ===
+        descriptor = xp.reshape(x_scalar, (nf, nloc, self.channels))  # (nf, nloc, C)
+        return (
+            xp.astype(descriptor, get_xp_precision(xp, "global")),
+            None,
+            None,
+            None,
+            None,
+        )
+
+    def call_with_edges(
+        self,
+        *,
+        coord_ext: Array,
+        atype_ext: Array,
+        edge_index: Array,
+        edge_vec: Array,
+        edge_mask: Array,
+        force_embedding: Array | None = None,
+        charge_spin: Array | None = None,
+        comm_dict: dict[str, Array] | None = None,
+        nloc: int | None = None,
+    ) -> tuple[Array, Array]:
+        """
+        Compute the descriptor from a sparse edge list.
+
+        Two node-set conventions share this path. In the single-domain path
+        (``comm_dict`` is ``None``) the nodes are exactly the local atoms and
+        ``edge_index`` source/destination both index ``[0, nf*nloc)``. In the
+        parallel (LAMMPS multi-rank) path the nodes span the extended region
+        (local owners followed by ghosts), ``edge_index`` indexes the extended
+        atoms directly, and each interaction block refreshes ghost-node features
+        from their owner ranks at the SO(2) convolution input (see
+        :func:`~deepmd.pt.model.descriptor.sezm_nn.block.exchange_ghost_features`).
+
+        Parameters
+        ----------
+        coord_ext
+            Coordinates with shape (nf, n*3) or (nf, n, 3) in Å, where ``n`` is
+            ``nloc`` in the single-domain path and ``nall`` in the parallel path.
+        atype_ext
+            Atom types with shape (nf, n). In the parallel path this spans the
+            extended region so ghost type embeddings are available for the
+            edge-type and environment-seed features.
+        edge_index
+            Edge indices with shape (2, E).
+        edge_vec
+            Edge vectors with shape (E, 3) in Å.
+        edge_mask
+            Edge mask with shape (E,).
+        force_embedding
+            Optional precomputed equivariant force embedding with shape
+            ``(nf * nloc, D, 1, channels)``, where
+            ``D = (node_l_schedule[0] + 1) ** 2``. This tensor is added to the
+            initial SO(3) backbone state before the interaction blocks.
+        charge_spin
+            Frame-level charge and spin conditions with shape (nf, 2).
+        comm_dict
+            Border-exchange tensors for parallel inference. When provided, the
+            node set spans the extended region and ghost features are exchanged
+            via ``deepmd_export::border_op`` between interaction blocks.
+        nloc
+            Number of owned (local) atoms per frame. Required when ``comm_dict``
+            is provided; the final scalar read-out is restricted to these atoms.
+
+        Returns
+        -------
+        tuple[Array, Array]
+            The scalar descriptor with shape ``(nf, nloc, channels)`` and the
+            final equivariant latent with shape ``(nf * nloc, D_final, 1, channels)``.
+        """
+        xp = array_api_compat.array_namespace(coord_ext, atype_ext, edge_vec)
+        device = array_api_compat.device(coord_ext)
+        # === Step 1. Setup dimensions ===
+        # ``n_per_frame`` is the per-frame node count: ``nloc`` in the
+        # single-domain path and ``nall`` in the parallel path. ``out_nloc`` is
+        # the owned-atom count used for the final local read-out.
+        coord_ext = xp.astype(coord_ext, get_xp_precision(xp, self.compute_precision))
+        nf, n_per_frame = atype_ext.shape[:2]
+        parallel = comm_dict is not None
+        if parallel:
+            # Multi-rank parallel inference requires a custom border-exchange
+            # communication op that is not available in the dpmodel backend.
+            raise NotImplementedError(
+                "multi-rank comm_dict inference is not supported in the dpmodel backend"
+            )
+        out_nloc = nloc if parallel else n_per_frame
+        atype_flat = xp.reshape(atype_ext, (-1,))  # (N,)
+
+        # === Step 2. Type embedding (l=0) ===
+        type_ebed = xp.reshape(
+            self.type_embedding(atype_ext), (-1, self.channels)
+        )  # (N, C)
+        if self.charge_spin_embedding is not None:
+            type_ebed = self._apply_charge_spin_embedding(
+                type_ebed,
+                charge_spin,
+                nf=nf,
+                nloc=n_per_frame,
+            )
+        n_nodes = type_ebed.shape[0]
+
+        # === Step 3. Build edge cache once (sparse edges) ===
+        edge_cache = build_edge_cache_from_edges(
+            type_ebed=type_ebed,
+            atype_flat=atype_flat,
+            edge_index=edge_index,
+            edge_vec=edge_vec,
+            edge_mask=edge_mask,
+            compute_dtype=get_xp_precision(xp, self.compute_precision),
+            eps=self.eps,
+            deg_norm_floor=(self.deg_norm_floor if self.version >= 1.1 else self.eps),
+            inner_clamp=self.inner_clamp,
+            bridging_switch=self.bridging_switch,
+            edge_envelope=self.edge_envelope,
+            radial_basis=self.radial_basis,
+            has_exclude_types=bool(self.exclude_types),
+            edge_type_keep_mask=self._edge_type_keep_mask,
+            # Random local-Z roll is a training-only augmentation;
+            # the model is roll-equivariant, so inference fixes gamma.
+            random_gamma=False,
+            wigner_calc=self.wigner_calc,
+            build_wigner=self._need_full_wigner,
+        )
+
+        ebed_dim_0 = self.node_ebed_dims[0]  # (node_lmax+1)^2
+        x0 = type_ebed  # (N, C)
+        x0_out = x0  # (N, C)
+
+        # === Step 4. Compute radial features once (fp32+) ===
+        radial_feat_flat = self.radial_embedding(edge_cache.edge_rbf)
+        radial_feat = xp.reshape(
+            radial_feat_flat,
+            (
+                radial_feat_flat.shape[0],
+                self.node_l_schedule[0] + 1,
+                self.channels,
+            ),
+        )  # (E, lmax+1, C)
+        if self.version >= 1.1:
+            radial_feat = radial_feat * xp.reshape(edge_cache.edge_env, (-1, 1, 1))
+
+        # === Step 5. Env FiLM conditioning (optional, fp32+) ===
+        if self.use_env_seed:
+            film = self.env_seed_embedding(
+                edge_cache=edge_cache,
+                atype_flat=atype_flat,
+                n_nodes=n_nodes,
+            )  # (N, 2*C)
+            scale_logits = film[:, : self.channels]  # (N, C)
+            shift_logits = film[:, self.channels :]  # (N, C)
+            scale_hat = self.film_scale_norm(scale_logits)  # (N, C)
+            shift_hat = self.film_shift_norm(shift_logits)  # (N, C)
+            scale_strength = xp.exp(
+                xp_asarray_nodetach(
+                    xp, self.film_scale_strength_log[...], device=device
+                )
+            )
+            shift_strength = xp.exp(
+                xp_asarray_nodetach(
+                    xp, self.film_shift_strength_log[...], device=device
+                )
+            )
+            scale = 1.0 + scale_strength * xp.tanh(scale_hat)  # (N, C)
+            shift = shift_strength * xp.tanh(shift_hat)  # (N, C)
+            x0_out = x0 * scale + shift
+
+        # === Step 6. Build backbone l=0 features ===
+        x = xp.concat(
+            [
+                xp.reshape(x0_out, (n_nodes, 1, 1, self.channels)),
+                xp.zeros(
+                    (n_nodes, ebed_dim_0 - 1, 1, self.channels),
+                    dtype=type_ebed.dtype,
+                    device=device,
+                ),
+            ],
+            axis=1,
+        )  # (N, D, 1, C)
+
+        # === Step 7. Geometric Initial Embedding (fp32+) ===
+        if self.use_gie:
+            zonal_coupling = self._build_gie_zonal_coupling(edge_cache)
+            x = (
+                x
+                + self.gie(
+                    n_nodes=n_nodes,
+                    edge_cache=edge_cache,
+                    radial_feat=radial_feat[:, 1:, :],
+                    zonal_coupling=zonal_coupling,
+                )[:, :, None, :]
+            )
+
+        # === Step 8. Fuse edge type features into radial features (fp32+) ===
+        radial_feat = xp.astype(radial_feat, get_xp_precision(xp, self.precision))
+        radial_feat = radial_feat + xp.reshape(
+            xp.astype(edge_cache.edge_type_feat, get_xp_precision(xp, self.precision)),
+            (-1, 1, self.channels),
+        )
+        rad_feat_per_block = [
+            radial_feat[:, :rad_len, :] for rad_len in self.rad_sizes_per_block
+        ]
+
+        # === Step 9. Convert to self.dtype and run blocks ===
+        x = xp.astype(x, get_xp_precision(xp, self.precision))  # (N, D, 1, C)
+        if force_embedding is not None:
+            x = x + xp.astype(force_embedding, get_xp_precision(xp, self.precision))
+        edge_cache = edge_cache_to_dtype(
+            edge_cache, get_xp_precision(xp, self.precision)
+        )
+        x = self._forward_blocks(x, edge_cache, rad_feat_per_block, comm_dict=comm_dict)
+
+        # === Step 10. Keep the owned-atom rows for the read-out ===
+        # ``n_out_nodes`` is the owned-node count in the flattened layout
+        # (``nf * nloc``). Single-domain: ``out_nloc == n_per_frame``, so this
+        # equals the whole node set and the slice is a no-op. Parallel
+        # (single-frame): it drops the trailing ghost rows that only fed message
+        # passing -- LAMMPS orders owned atoms before ghosts, so they lead.
+        n_out_nodes = nf * out_nloc
+        x = x[:n_out_nodes]
+
+        # === Step 11. Final l=0 output mixing ===
+        # ``none`` feeds the l=0 slice only; ``glu``/``mlp`` feed the full
+        # (N, D, 1, C) node tensor so the SO(3) grid folds l>0 into l=0. The
+        # residual is added on the full coefficient tensor before extracting
+        # l=0: slicing the summed tensor rather than the FFN output keeps the
+        # saved degree-axis stride static under torch.compile dynamic shapes.
+        ffn_in = (
+            xp.astype(
+                xp.reshape(x[:, 0:1, :, :], (n_out_nodes, 1, 1, self.channels)),
+                get_xp_precision(xp, self.compute_precision),
+            )
+            if self.so3_readout == "none"
+            # truncate to the final node degree: the empty-edge path
+            # skips the blocks, leaving x at node_ebed_dims[0]; output_ffn
+            # is built for node_ebed_dims[-1]. No-op when blocks ran.
+            else xp.astype(
+                x[:, : self.node_ebed_dims[-1], :, :],
+                get_xp_precision(xp, self.compute_precision),
+            )
+        )
+        x_scalar = (ffn_in + self.output_ffn(ffn_in))[:, 0:1, :, :]
+
+        # === Step 12. Reshape to (nf, nloc, channels) and return ===
+        descriptor = xp.reshape(
+            x_scalar, (nf, out_nloc, self.channels)
+        )  # (nf, nloc, C)
+        return xp.astype(descriptor, get_xp_precision(xp, "global")), x
+
+    def _forward_blocks(
+        self,
+        x: Array,
+        edge_cache: EdgeCache,
+        radial_feat_per_block: list[Array],
+        comm_dict: dict[str, Array] | None = None,
+    ) -> Array:
+        """
+        Run the interaction blocks with optional depth attention.
+
+        Parameters
+        ----------
+        x
+            Initial node features with shape (N, D, 1, C).
+        edge_cache
+            Per-edge cache.
+        radial_feat_per_block
+            List of per-block radial features already truncated to l_schedule[i]+1.
+        comm_dict
+            Border-exchange tensors for parallel inference, forwarded to each
+            block. The block refreshes ghost rows at the SO(2) convolution
+            input — the descriptor's only cross-node operation — so message
+            passing always reads up-to-date neighbours regardless of the
+            (per-node) attention-residual history.
+
+        Returns
+        -------
+        Array
+            Output features with shape (N, D, 1, C).
+        """
+        if not self.use_full_attn_res and not self.use_block_attn_res:
+            # === Fast path without descriptor-level attention residuals ===
+            for i, block in enumerate(self.blocks):
+                x = x[:, : self.node_ebed_dims[i], :, :]
+                blk_radial = radial_feat_per_block[i]
+                x, _, _, _ = block(
+                    x,
+                    edge_cache,
+                    blk_radial,
+                    comm_dict=self._block_comm(i, comm_dict),
+                )
+            return x
+
+        n_node = x.shape[0]
+        xp = array_api_compat.array_namespace(x)
+
+        def node_l0_extractor(v: Array) -> Array:
+            """Extract scalar features from global SO(3) layout."""
+            return xp.reshape(v[:, 0, :, :], (n_node, self.channels))
+
+        if self.use_full_attn_res:
+            # === Step 1. Maintain descriptor-level unit history ===
+            unit_history = [x]
+
+            # === Step 2. Run each block with selective unit-history aggregation ===
+            for i, block in enumerate(self.blocks):
+                current_dim = self.node_ebed_dims[i]
+                current_x = x[:, :current_dim, :, :]
+                truncated_unit_history = [
+                    source[:, :current_dim, :, :] for source in unit_history
+                ]
+                blk_radial = radial_feat_per_block[i]
+                block_output, _, so2_unit_output, ffn_unit_outputs = block(
+                    current_x,
+                    edge_cache,
+                    blk_radial,
+                    unit_history=truncated_unit_history,
+                    comm_dict=self._block_comm(i, comm_dict),
+                )
+                unit_history.append(so2_unit_output)
+                unit_history.extend(ffn_unit_outputs)
+                x = block_output
+
+            # === Step 3. Final aggregation over all completed unit representations ===
+            final_dim = self.node_ebed_dims[-1]
+            final_sources = [source[:, :final_dim, :, :] for source in unit_history]
+            x = xp.astype(
+                self.final_full_attn_res(
+                    sources=final_sources,
+                    scalar_extractor=node_l0_extractor,
+                    current_x=x,
+                ),
+                get_xp_precision(xp, self.precision),
+            )
+            return x
+
+        # === Step 1. Maintain descriptor-level block history ===
+        block_history = [x]
+
+        # === Step 2. Run each block with selective block-history aggregation ===
+        for i, block in enumerate(self.blocks):
+            current_dim = self.node_ebed_dims[i]
+            current_x = x[:, :current_dim, :, :]
+            truncated_block_history = [
+                source[:, :current_dim, :, :] for source in block_history
+            ]
+            blk_radial = radial_feat_per_block[i]
+            block_output, block_summary, _, _ = block(
+                current_x,
+                edge_cache,
+                blk_radial,
+                unit_history=truncated_block_history,
+                comm_dict=self._block_comm(i, comm_dict),
+            )
+            block_history.append(block_summary)
+            x = block_output
+
+        # === Step 3. Final aggregation over all completed block summaries ===
+        final_dim = self.node_ebed_dims[-1]
+        final_sources = [source[:, :final_dim, :, :] for source in block_history]
+        x = xp.astype(
+            self.final_block_attn_res(
+                sources=final_sources,
+                scalar_extractor=node_l0_extractor,
+                current_x=x,
+            ),
+            get_xp_precision(xp, self.precision),
+        )
+        return x
+
+    def _edge_quaternion(self, edge_cache: EdgeCache) -> Array:
+        """
+        Return the cached global->local edge quaternion, rebuilding if absent.
+
+        Parameters
+        ----------
+        edge_cache : EdgeFeatureCache
+            Per-edge cache. ``edge_quat`` is populated by the cache builder; the
+            fallback covers caches produced without it.
+
+        Returns
+        -------
+        Array
+            Unit quaternions with shape (E, 4).
+        """
+        edge_quat = edge_cache.edge_quat
+        if edge_quat is None:
+            edge_len = safe_norm(edge_cache.edge_vec, self.eps)
+            edge_quat = build_edge_quaternion(
+                edge_cache.edge_vec,
+                edge_len=edge_len,
+                eps=self.eps,
+            )
+        return edge_quat
+
+    def _build_gie_zonal_coupling(
+        self,
+        edge_cache: EdgeCache,
+    ) -> Array | None:
+        """
+        Build node-level zonal coupling for GIE when node degrees exceed MP degrees.
+
+        Returns
+        -------
+        Array or None
+            Coupling with shape ``(E, D_node - 1)``. ``None`` is returned only
+            when the full Wigner-D blocks are present and ``extra_node_l == 0``,
+            in which case GIE gathers the coupling from the cache directly. When
+            the blocks are skipped (all-Cartesian model) the full coupling is
+            reconstructed from the edge quaternion via the m=0-only path.
+        """
+        if edge_cache.Dt_full is None:
+            calc = self.gie_zonal_wigner_calc or self.wigner_calc
+            return calc.forward_zonal(self._edge_quaternion(edge_cache), lmin=1)
+        if self.gie_zonal_wigner_calc is None:
+            return None
+        xp = array_api_compat.array_namespace(edge_cache.Dt_full)
+        device = array_api_compat.device(edge_cache.Dt_full)
+        mp_row_count = self.ebed_dims[0] - 1
+        mp_row_index = self.gie.non_scalar_row_index[:mp_row_count]
+        mp_m0_col_index = self.gie.zonal_m0_col_index_for_row[:mp_row_count]
+        dim_full = edge_cache.Dt_full.shape[-1]
+        mp_coupling = xp.take(
+            xp.reshape(edge_cache.Dt_full, (-1, dim_full * dim_full)),
+            xp_asarray_nodetach(
+                xp, mp_row_index * dim_full + mp_m0_col_index, device=device
+            ),
+            axis=1,
+        )
+        extra_coupling = self.gie_zonal_wigner_calc.forward_zonal(
+            self._edge_quaternion(edge_cache),
+            lmin=self.lmax + 1,
+        )
+        return xp.concat([mp_coupling, extra_coupling], axis=1)
+
+    def _apply_charge_spin_embedding(
+        self,
+        type_ebed: Array,
+        charge_spin: Array,
+        *,
+        nf: int,
+        nloc: int,
+    ) -> Array:
+        """
+        Add frame-level charge and spin conditions to scalar type features.
+
+        Parameters
+        ----------
+        type_ebed
+            Flattened type embeddings with shape (nf * nloc, channels).
+        charge_spin
+            Frame-level charge and spin conditions with shape (nf, 2).
+        nf
+            Number of frames.
+        nloc
+            Number of local atoms.
+
+        Returns
+        -------
+        Array
+            Conditioned type embeddings with shape (nf * nloc, channels).
+        """
+        xp = array_api_compat.array_namespace(type_ebed, charge_spin)
+        condition = self.charge_spin_embedding(xp.astype(charge_spin, type_ebed.dtype))
+        condition = xp.broadcast_to(condition[:, None, :], (nf, nloc, self.channels))
+        return type_ebed + xp.reshape(condition, type_ebed.shape)
+
+    def _edge_type_keep_mask(
+        self,
+        atype_flat: Array,
+        src: Array,
+        dst: Array,
+    ) -> Array:
+        """
+        Build keep mask for edge pairs based on excluded type pairs.
+
+        Parameters
+        ----------
+        atype_flat
+            Flattened local atom types with shape (N,).
+        src
+            Source indices with shape (E,).
+        dst
+            Destination indices with shape (E,).
+
+        Returns
+        -------
+        Array
+            Boolean mask with shape (E,), True means keep.
+        """
+        xp = array_api_compat.array_namespace(atype_flat, src, dst)
+        if len(self.emask.exclude_types) == 0:
+            return xp.ones_like(src, dtype=xp.bool)
+        device = array_api_compat.device(atype_flat)
+        type_i = xp.take(atype_flat, dst, axis=0)
+        type_j = xp.take(atype_flat, src, axis=0)
+        type_i = xp.where(type_i >= 0, type_i, self.ntypes)
+        type_j = xp.where(type_j >= 0, type_j, self.ntypes)
+        type_ij = type_i * (self.ntypes + 1) + type_j
+        type_mask = xp_asarray_nodetach(xp, self.emask.type_mask[...], device=device)
+        keep = xp.take(type_mask, xp.astype(type_ij, xp.int64), axis=0)
+        return xp.astype(keep, xp.bool)
 
     @staticmethod
     def _broadcast_grid_setting(
@@ -682,7 +1793,12 @@ def _broadcast_grid_setting(
         cast: type,
         non_negative: bool = False,
     ) -> list:
-        """Normalize a grid-path setting to ``[node_wise, message_node, ffn]``."""
+        """Normalize a grid-path setting to ``[node_wise, message_node, ffn]``.
+
+        A scalar is broadcast to all three grid paths, while a length-three
+        list is validated element-wise. When ``non_negative`` is set, every
+        entry must be ``>= 0``.
+        """
         entries = list(value) if isinstance(value, list) else [value, value, value]
         if len(entries) != 3:
             raise ValueError(
@@ -694,7 +1810,12 @@ def _broadcast_grid_setting(
             raise ValueError(f"`{name}` entries must be non-negative")
         return normalized
 
-    def _resolve_ffn_neurons(self, ffn_neurons: int, *, glu_activation: bool) -> int:
+    def _resolve_ffn_neurons(
+        self,
+        ffn_neurons: int,
+        *,
+        glu_activation: bool,
+    ) -> int:
         """Resolve one FFN hidden width from the descriptor config."""
         resolved = int(ffn_neurons)
         if resolved < 0:
@@ -716,7 +1837,8 @@ def _init_lm_schedules(
         mmax: int | None,
         m_schedule: list[int] | None,
     ) -> None:
-        """Parse and validate L/M schedules (pt ``_init_lm_schedules``)."""
+        """Parse and validate L/M schedules, setting self.l_schedule/m_schedule/lmax/mmax."""
+        # === L schedule ===
         if l_schedule is None:
             self.l_schedule = [int(lmax)] * int(n_blocks)
         else:
@@ -734,6 +1856,7 @@ def _init_lm_schedules(
         self.lmax = int(self.l_schedule[0])
         self.n_blocks = len(self.l_schedule)
 
+        # === M schedule ===
         if m_schedule is None:
             if mmax is None:
                 self.m_schedule = [int(l) for l in self.l_schedule]
@@ -754,6 +1877,7 @@ def _init_lm_schedules(
             raise ValueError(
                 "`m_schedule` entries must satisfy `m_schedule[i] <= l_schedule[i]`"
             )
+
         self.mmax = int(self.m_schedule[0])
 
     def _init_node_l_schedules(self, extra_node_l: int) -> None:
@@ -770,234 +1894,85 @@ def _init_node_l_schedules(self, extra_node_l: int) -> None:
         self.node_lmax = int(self.node_l_schedule[0])
         self.node_ebed_dim = int(self.node_ebed_dims[0])
 
-    def reinit_exclude(
-        self, exclude_types: list[tuple[int, int]] | None = None
-    ) -> None:
-        if exclude_types is None:
-            exclude_types = []
-        self.exclude_types = exclude_types
-        self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types)
-
-    # =========================================================================
-    # Forward
-    # =========================================================================
-
-    def call(
+    def _canonicalize_charge_spin(
         self,
-        coord_ext: Array,
-        atype_ext: Array,
-        nlist: Array,
-        mapping: Array | None = None,
-        fparam: Array | None = None,
-        comm_dict: dict | None = None,
-        charge_spin: Array | None = None,
-    ) -> tuple[Array, Any, Any, Any, Any]:
-        """Compute the DPA4 descriptor.
+        charge_spin: Array | None,
+        *,
+        nf: int,
+        dtype: Any,
+        device: Any,
+    ) -> Array | None:
+        """
+        Canonicalize charge/spin conditions for the public descriptor path.
 
         Parameters
         ----------
-        coord_ext
-            Extended coordinates with shape (nf, nall*3) or (nf, nall, 3).
-        atype_ext
-            Extended atom types with shape (nf, nall).
-        nlist
-            Neighbor list with shape (nf, nloc, nnei); -1 marks padding.
-        mapping
-            Extended-to-local mapping with shape (nf, nall), or None when the
-            neighbor indices are already local.
-        fparam
-            Frame parameters; not used by DPA4 (interface compatibility).
-        comm_dict
-            MPI communication metadata; not used (interface compatibility).
         charge_spin
-            Charge/spin embedding input; must be None since
-            ``add_chg_spin_ebd=True`` is rejected at construction
-            (interface compatibility with ``DPAtomicModel``).
+            Optional frame-level charge and spin conditions.
+        nf
+            Number of frames.
+        dtype
+            Target floating-point dtype.
+        device
+            Target device.
 
         Returns
         -------
-        descriptor
-            Scalar descriptor with shape (nf, nloc, channels).
-        rot_mat, g2, h2, sw
-            ``None`` placeholders (pt returns empty tensors for these).
+        Array or None
+            Tensor with shape (nf, 2) when condition embedding is enabled.
         """
-        xp = array_api_compat.array_namespace(coord_ext, atype_ext, nlist)
-        nf, nloc, nnei = nlist.shape
-        nall = xp.reshape(coord_ext, (nf, -1)).shape[1] // 3
-        extended_coord = xp.reshape(coord_ext, (nf, nall, 3))
-        extended_coord = xp.astype(
-            extended_coord, get_xp_precision(xp, self.compute_precision)
-        )
-        n_nodes = nf * nloc
-
-        # === Step 1. Excluded type pairs (keep mask, True means keep) ===
-        # The dpmodel PairExcludeMask returns an int mask; build_edge_cache
-        # expects a boolean keep mask.
-        pair_keep_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) != 0
-
-        # === Step 2. Type embedding (l=0) ===
-        # Use ``xp_take_first_n`` (torch.index_select) rather than a plain
-        # ``[:, :nloc]`` slice: the slice makes torch.export emit a spurious
-        # ``Ne(nall, nloc)`` contiguity guard that breaks the ``nall == nloc``
-        # (NoPBC, no ghost atoms) case in the compiled .pt2 artifact.
-        atype_loc = xp_take_first_n(atype_ext, 1, nloc)
-        type_ebed = xp.reshape(
-            self.type_embedding(atype_loc), (n_nodes, self.channels)
-        )  # (N, C)
-
-        # === Step 3. Build edge cache once (geometry + RBF + Wigner-D) ===
-        # Random local-Z roll is a training-only augmentation in pt; the
-        # dpmodel descriptor evaluates in inference mode, so gamma is fixed.
-        edge_cache = build_edge_cache(
-            type_ebed=type_ebed,
-            extended_coord=extended_coord,
-            nlist=nlist,
-            mapping=mapping,
-            pair_keep_mask=pair_keep_mask,
-            eps=self.eps,
-            deg_norm_floor=(self.deg_norm_floor if self.version >= 1.1 else self.eps),
-            edge_envelope=self.edge_envelope,
-            radial_basis=self.radial_basis,
-            n_radial=self.n_radial,
-            random_gamma=False,
-            wigner_calc=self.wigner_calc,
-        )
-
-        # === Step 4. Compute radial features once (fp32+) ===
-        # Padded layout: E = nf * nloc * nnei is shape-determined, so there is
-        # no pt-style empty-edge special case.
-        radial_feat = xp.reshape(
-            self.radial_embedding(edge_cache.edge_rbf),
-            (-1, self.node_l_schedule[0] + 1, self.channels),
-        )  # (E, node_lmax+1, C)
-        if self.version >= 1.1:
-            radial_feat = radial_feat * xp.reshape(edge_cache.edge_env, (-1, 1, 1))
-
-        # === Step 5. Env FiLM conditioning (optional, fp32+) ===
-        x0_out = type_ebed  # (N, C)
-        if self.use_env_seed:
-            atype_flat = xp.reshape(atype_loc, (-1,))
-            film = self.env_seed_embedding(
-                edge_cache=edge_cache,
-                atype_flat=atype_flat,
-                n_nodes=n_nodes,
-            )  # (N, 2*C)
-            scale_logits = film[:, : self.channels]
-            shift_logits = film[:, self.channels :]
-            scale_hat = self.film_scale_norm(scale_logits)
-            shift_hat = self.film_shift_norm(shift_logits)
-            device = array_api_compat.device(scale_hat)
-            scale_strength = xp.exp(
-                xp_asarray_nodetach(xp, self.film_scale_strength_log, device=device)
-            )
-            shift_strength = xp.exp(
-                xp_asarray_nodetach(xp, self.film_shift_strength_log, device=device)
-            )
-            scale = 1.0 + scale_strength * xp.tanh(scale_hat)
-            shift = shift_strength * xp.tanh(shift_hat)
-            x0_out = type_ebed * scale + shift
-
-        # === Step 6. Build backbone l=0 features ===
-        # pt scatters x0_out into x[:, 0, 0, :] of a zeros tensor; here this
-        # is a concat with zero rows for l >= 1 (no fancy __setitem__).
-        ebed_dim_0 = self.node_ebed_dims[0]
-        x = xp.concat(
-            [
-                x0_out[:, None, :],
-                xp.zeros(
-                    (n_nodes, ebed_dim_0 - 1, self.channels),
-                    dtype=x0_out.dtype,
-                    device=array_api_compat.device(x0_out),
-                ),
-            ]
-            if ebed_dim_0 > 1
-            else [x0_out[:, None, :]],
-            axis=1,
-        )  # (N, D, C)
-
-        # === Step 7. Geometric initial embedding (fp32+) ===
-        if self.use_gie:
-            zonal_coupling = self._build_gie_zonal_coupling(edge_cache)
-            x = x + self.gie(
-                n_nodes=n_nodes,
-                edge_cache=edge_cache,
-                radial_feat=radial_feat[:, 1:, :],
-                zonal_coupling=zonal_coupling,
+        if self.charge_spin_embedding is None:
+            return None
+        if charge_spin is None:
+            if self.default_chg_spin is None:
+                raise ValueError("`charge_spin` is required for this SeZM descriptor.")
+            default_chg_spin = np.asarray(self.default_chg_spin)
+            xp = array_api_compat.array_namespace(default_chg_spin)
+            charge_spin = xp.reshape(
+                xp_asarray_nodetach(xp, default_chg_spin, dtype=dtype, device=device),
+                (1, 2),
             )
-        x = x[:, :, None, :]  # (N, D, 1, C)
-
-        # === Step 8. Fuse edge type features into radial features ===
-        radial_feat = radial_feat + edge_cache.edge_type_feat[:, None, :]
-        rad_feat_per_block = [
-            radial_feat[:, :rad_len, :] for rad_len in self.rad_sizes_per_block
-        ]
-
-        # === Step 9. Run interaction blocks (residual baseline path) ===
-        for i, block in enumerate(self.blocks):
-            x = x[:, : self.node_ebed_dims[i], :, :]
-            x = block(x, edge_cache, rad_feat_per_block[i])[0]
-
-        # === Step 10. Final l=0 output mixing ===
-        # ``none`` feeds the l=0 slice only; ``glu``/``mlp`` feed the full
-        # (N, D, 1, C) node tensor so the SO(3) grid folds l>0 into l=0. The
-        # residual is added on the full coefficient tensor before extracting
-        # l=0 to mirror pt.
-        compute_prec = get_xp_precision(xp, self.compute_precision)
-        if self.so3_readout == "none":
-            ffn_in = xp.astype(
-                xp.reshape(x[:, 0:1, :, :], (n_nodes, 1, 1, self.channels)),
-                compute_prec,
-            )  # (N, 1, 1, C)
         else:
-            # truncate to the final node degree (what output_ffn is built for);
-            # no-op in the normal path (blocks already shrank x), defensive vs
-            # any path that leaves x at the initial degree. Mirrors pt.
-            ffn_in = xp.astype(
-                x[:, : self.node_ebed_dims[-1], :, :], compute_prec
-            )  # (N, D, 1, C)
-        x_scalar = (ffn_in + self.output_ffn(ffn_in))[:, 0:1, :, :]
-
-        # === Step 11. Reshape and return ===
-        descriptor = xp.reshape(x_scalar, (nf, nloc, self.channels))
-        descriptor = xp.astype(descriptor, get_xp_precision(xp, "global"))
-        return descriptor, None, None, None, None
-
-    def _build_gie_zonal_coupling(self, edge_cache: EdgeCache) -> Any:
-        """
-        Build node-level zonal coupling for GIE when node degrees exceed MP
-        degrees (pt ``_build_gie_zonal_coupling``).
-
-        Returns ``None`` when ``extra_node_l == 0``, letting GIE gather from
-        the MP Wigner-D cache.
+            xp = array_api_compat.array_namespace(charge_spin)
+            charge_spin = xp.astype(charge_spin, dtype)
+
+        if charge_spin.ndim == 1:
+            if math.prod(charge_spin.shape) != 2:
+                raise ValueError("`charge_spin` must contain [charge, spin].")
+            charge_spin = xp.reshape(charge_spin, (1, 2))
+        elif charge_spin.ndim != 2 or charge_spin.shape[-1] != 2:
+            raise ValueError("`charge_spin` must have shape (nf, 2).")
+
+        if charge_spin.shape[0] == 1 and nf != 1:
+            charge_spin = xp.broadcast_to(charge_spin, (nf, charge_spin.shape[-1]))
+        elif charge_spin.shape[0] != nf:
+            raise ValueError("`charge_spin` first dimension must match nframes.")
+        return charge_spin
+
+    def _block_comm(
+        self,
+        block_idx: int,
+        comm_dict: dict[str, Array] | None,
+    ) -> dict[str, Array] | None:
+        """Return the border-exchange tensors block ``block_idx`` actually needs.
+
+        Only the SO(2) convolution reads neighbour features, so a block needs the
+        ghost exchange exactly when its neighbour rows cannot be rebuilt locally.
+        Block 0 reads the initial node state: a rank reproduces its ghost rows
+        from ``extended_atype`` (type embedding) unless env-seed / GIE folds
+        neighbour-environment information into them. Every later block reads a
+        previous block's output, which a rank cannot reproduce for ghosts (they
+        receive no messages locally). Returning ``None`` skips the exchange, so a
+        purely local model (``use_env_seed=False`` with a single block) runs
+        multi-rank with no communication at all.
         """
-        if self.gie_zonal_wigner_calc is None:
+        if comm_dict is None:
             return None
-        xp = array_api_compat.array_namespace(edge_cache.edge_quat)
-        device = array_api_compat.device(edge_cache.edge_quat)
-        n_edge = edge_cache.dst.shape[0]
-        mp_row_count = self.ebed_dims[0] - 1
-        mp_rows = self.gie.non_scalar_row_index[:mp_row_count]
-        mp_cols = self.gie.zonal_m0_col_index_for_row[:mp_row_count]
-        Dt_full = edge_cache.Dt_full
-        dim_full = Dt_full.shape[-1]
-        flat_index = xp_asarray_nodetach(
-            xp, mp_rows * dim_full + mp_cols, device=device
-        )
-        mp_coupling = xp.take(
-            xp.reshape(Dt_full, (n_edge, dim_full * dim_full)),
-            flat_index,
-            axis=1,
-        )  # (E, D_mp - 1)
-        extra_coupling = self.gie_zonal_wigner_calc.forward_zonal(
-            edge_cache.edge_quat,
-            lmin=self.lmax + 1,
-        )
-        return xp.concat([mp_coupling, extra_coupling], axis=1)
-
-    # =========================================================================
-    # DeePMD descriptor interface
-    # =========================================================================
+        if block_idx == 0 and not self.use_env_seed:
+            return None
+        return comm_dict
 
+    # === DeePMD descriptor interface ===
     def get_rcut(self) -> float:
         return self.rcut
 
@@ -1016,6 +1991,18 @@ def get_ntypes(self) -> int:
     def get_type_map(self) -> list[str]:
         return self.type_map if self.type_map is not None else []
 
+    def get_dim_chg_spin(self) -> int:
+        """Return the charge/spin condition width."""
+        return 2 if self.add_chg_spin_ebd else 0
+
+    def has_default_chg_spin(self) -> bool:
+        """Return whether default charge/spin conditions are configured."""
+        return self.default_chg_spin is not None
+
+    def get_default_chg_spin(self) -> list[float] | None:
+        """Return default charge/spin conditions."""
+        return self.default_chg_spin
+
     def get_dim_out(self) -> int:
         return self.channels
 
@@ -1023,14 +2010,37 @@ def get_dim_emb(self) -> int:
         return self.get_dim_out()
 
     def mixed_types(self) -> bool:
-        """DPA4 uses SeZMTypeEmbedding, no type-distinguished nlist needed."""
+        """
+        If true, the descriptor
+        1. assumes total number of atoms aligned across frames;
+        2. requires a neighbor list that does not distinguish different atomic types.
+
+        If false, the descriptor
+        1. assumes total number of atoms of each atom type aligned across frames;
+        2. requires a neighbor list that distinguishes different atomic types.
+
+        SeZM uses SeZMTypeEmbedding for type handling, so it does not require
+        a type-distinguished neighbor list.
+        """
         return True
 
     def has_message_passing(self) -> bool:
-        return bool(len(self.blocks) > 0 and self.lmax > 0)
+        # SeZM resolves ghost neighbours through the atom-map fold (single
+        # domain) or border_op exchange (parallel) instead of reading them
+        # directly, so its lower path always needs message-passing handling.
+        return True
 
     def has_message_passing_across_ranks(self) -> bool:
-        return self.has_message_passing()
+        """Whether multi-rank inference needs cross-rank ghost-feature exchange.
+
+        SeZM reads ghost-neighbour features at every interaction block, so a
+        domain-decomposed run must exchange them through ``border_op``. Source
+        Freeze Propagation bridging is excluded: its per-node gate folds a
+        node's entire outgoing-edge set, which a single rank cannot observe for
+        ghost owners, so the edge-based with-comm artifact is not exported for
+        bridging models and multi-rank inference fails fast instead.
+        """
+        return self.bridging_switch is None
 
     def need_sorted_nlist_for_lower(self) -> bool:
         return False
@@ -1048,14 +2058,38 @@ def dim_emb(self) -> int:
 
     def share_params(
         self, base_class: Any, shared_level: int, resume: bool = False
-    ) -> NoReturn:
-        """Parameter sharing is a pt-backend training feature."""
-        raise NotImplementedError
+    ) -> None:
+        """
+        Share the parameters of self to the base_class with shared_level during multitask training.
 
-    def change_type_map(
-        self, type_map: list[str], model_with_new_type_stat: Any = None
-    ) -> NoReturn:
-        raise NotImplementedError("change_type_map is not supported for SeZM")
+        SeZM does not rely on running mean/stddev statistics in ``forward``
+        (``EquivariantRMSNorm`` is used instead), so only submodules and
+        the optional FiLM strength parameters need to be linked.
+
+        Parameters
+        ----------
+        base_class
+            The base class to share parameters with. Must be the same class as self.
+
+        shared_level
+            The level of sharing.
+
+            - ``0``: share every learnable submodule and FiLM strength parameter
+              (type_embedding, env_seed_embedding, film_*_norm,
+              film_*_strength_log, radial_basis, radial_embedding,
+              edge_envelope, wigner_calc, gie, blocks, final_*_attn_res,
+              output_ffn).
+            - ``1``: share ``type_embedding`` and optional condition embedding.
+
+        resume
+            Unused for SeZM; kept for interface compatibility.
+
+        Raises
+        ------
+        NotImplementedError
+            If ``shared_level`` is not ``0`` or ``1``.
+        """
+        raise NotImplementedError("share_params is not yet implemented for DescrptDPA4")
 
     def enable_compression(
         self,
@@ -1064,151 +2098,212 @@ def enable_compression(
         table_stride_1: float = 0.01,
         table_stride_2: float = 0.1,
         check_frequency: int = -1,
-    ) -> NoReturn:
+    ) -> None:
+        """Receive the statistics (distance, max_nbor_size and env_mat_range) of the training data.
+
+        Parameters
+        ----------
+        min_nbor_dist
+            The nearest distance between atoms
+        table_extrapolate
+            The scale of model extrapolation
+        table_stride_1
+            The uniform stride of the first table
+        table_stride_2
+            The uniform stride of the second table
+        check_frequency
+            The overflow check frequency
+        """
         raise NotImplementedError("Compression is unsupported for SeZM.")
 
-    # === Statistics interface (interface compatibility only) ===
-    # SeZM normalizes with learnable RMS norms; mean/stddev are kept only for
-    # interface and checkpoint-format compatibility (see pt sezm.py).
+    def change_type_map(
+        self, type_map: list[str], model_with_new_type_stat: Any | None = None
+    ) -> None:
+        raise NotImplementedError("change_type_map is not supported for SeZM")
 
-    def compute_input_stats(
-        self, merged: list[dict], path: DPPath | None = None
+    def reinit_exclude(
+        self, exclude_types: list[tuple[int, int]] | None = None
     ) -> None:
-        """No-op: statistics are not used by the DPA4 forward pass."""
+        if exclude_types is None:
+            exclude_types = []
+        self.exclude_types = exclude_types
+        self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types)
+
+    # =========================================================================
+    # Statistics interface (interface compatibility only)
+    # -------------------------------------------------------------------------
+    # SeZM uses EquivariantRMSNorm inside blocks for feature normalization,
+    # so mean/stddev are NOT used in forward(). These methods are kept for:
+    #   1. Interface compatibility with BaseDescriptor
+    #   2. Consistent serialization format (davg/dstd in checkpoint)
+    # =========================================================================
 
     def set_stat_mean_and_stddev(self, mean: Array, stddev: Array) -> None:
-        """Set mean and stddev (interface compatibility, unused in call)."""
+        """Set mean and stddev (interface compatibility, not used in forward)."""
         self.mean = mean
         self.stddev = stddev
 
     def get_stat_mean_and_stddev(self) -> tuple[Array, Array]:
-        """Get mean and stddev (interface compatibility, unused in call)."""
+        """Get mean and stddev (interface compatibility, not used in forward)."""
         return self.mean, self.stddev
 
-    # =========================================================================
-    # Serialization (pt state_dict-key compatible)
-    # =========================================================================
-
-    def _variables(self) -> dict[str, np.ndarray]:
-        """Variables keyed exactly by the pt ``state_dict()`` key names."""
-        model_np_prec = PRECISION_DICT[self.precision]
-        variables: dict[str, np.ndarray] = {
-            # pt interface-compatibility buffers
-            "version_tensor": np.asarray(self.version, dtype=np.float64),
-            "_empty_tensor": np.zeros((0,), dtype=np.float64),
-            "mean": to_numpy_array(self.mean).astype(model_np_prec),
-            "stddev": to_numpy_array(self.stddev).astype(model_np_prec),
-        }
+    def compute_input_stats(
+        self,
+        merged: Callable[[], list[dict]] | list[dict],
+        path: DPPath | None = None,
+    ) -> None:
+        """
+        Compute statistics (interface compatibility, not used in forward).
 
-        def add(prefix: str, sub_vars: dict[str, Any]) -> None:
-            for key, value in sub_vars.items():
-                variables[f"{prefix}{key}"] = to_numpy_array(value)
+        SeZM uses learnable EquivariantRMSNorm for normalization, so these
+        statistics do not affect the forward pass. This is a no-op that keeps
+        mean/stddev at their initialized values (zero/one) for interface consistency.
+        """
+        # No-op: mean and stddev are already initialized to zero/one in __init__
+        # and are not used in forward() due to EquivariantRMSNorm.
 
-        add("type_embedding.", self.type_embedding.serialize()["@variables"])
+    def _variables(self) -> dict[str, np.ndarray]:
+        """Variables keyed by the pt ``state_dict`` key names."""
+        variables: dict[str, np.ndarray] = {}
+        # === Descriptor-level version and statistics buffers ===
+        variables["version_tensor"] = np.asarray(self.version, dtype=np.float64)
+        variables["mean"] = to_numpy_array(self.mean)
+        variables["stddev"] = to_numpy_array(self.stddev)
+        # === Type embedding (always present) ===
+        for key, value in self.type_embedding.serialize()["@variables"].items():
+            variables[f"type_embedding.{key}"] = value
+        # === Frame charge/spin embedding (optional) ===
+        if self.add_chg_spin_ebd:
+            for key, value in self.charge_spin_embedding.serialize()[
+                "@variables"
+            ].items():
+                variables[f"charge_spin_embedding.{key}"] = value
+        # === Environment FiLM stack (optional) ===
         if self.use_env_seed:
-            add(
-                "env_seed_embedding.",
-                self.env_seed_embedding.serialize()["@variables"],
-            )
-            add("film_scale_norm.", self.film_scale_norm.serialize()["@variables"])
-            add("film_shift_norm.", self.film_shift_norm.serialize()["@variables"])
+            for key, value in self.env_seed_embedding.serialize()["@variables"].items():
+                variables[f"env_seed_embedding.{key}"] = value
+            for key, value in self.film_scale_norm.serialize()["@variables"].items():
+                variables[f"film_scale_norm.{key}"] = value
+            for key, value in self.film_shift_norm.serialize()["@variables"].items():
+                variables[f"film_shift_norm.{key}"] = value
             variables["film_scale_strength_log"] = to_numpy_array(
                 self.film_scale_strength_log
             )
             variables["film_shift_strength_log"] = to_numpy_array(
                 self.film_shift_strength_log
             )
-        add("radial_basis.", self.radial_basis.serialize()["@variables"])
-        add("radial_embedding.net.", self.radial_embedding.serialize()["@variables"])
-
-        # Static pt WignerDCalculator buffers (rebuilt at construction here;
-        # emitted so pt's strict load_state_dict finds every key).
-        def wigner_buffers(calc: WignerDCalculator) -> dict[str, np.ndarray]:
-            return {
-                "l1_perm": np.asarray([1, 2, 0], dtype=np.int64),
-                "l1_sign_outer": to_numpy_array(calc.l1_sign_outer).astype(np.float64),
-            }
-
-        add("wigner_calc.", wigner_buffers(self.wigner_calc))
+        # === Radial basis and shared radial embedding ===
+        for key, value in self.radial_basis.serialize()["@variables"].items():
+            variables[f"radial_basis.{key}"] = value
+        for key, value in self.radial_embedding.serialize()["@variables"].items():
+            variables[f"radial_embedding.net.{key}"] = value
+        # === Wigner-D static buffers ===
+        # The Wigner-D index/sign tables are derived constants with no trainable
+        # parameters. They are emitted to keep the ``state_dict`` key set complete
+        # and are rebuilt at construction on load (see ``_load_variables``).
+        variables["wigner_calc.l1_perm"] = to_numpy_array(self.wigner_calc.l1_perm)
+        variables["wigner_calc.l1_sign_outer"] = to_numpy_array(
+            self.wigner_calc.l1_sign_outer
+        )
+        if self.gie_zonal_wigner_calc is not None:
+            variables["gie_zonal_wigner_calc.l1_perm"] = to_numpy_array(
+                self.gie_zonal_wigner_calc.l1_perm
+            )
+            variables["gie_zonal_wigner_calc.l1_sign_outer"] = to_numpy_array(
+                self.gie_zonal_wigner_calc.l1_sign_outer
+            )
+        # === Geometric initial embedding index buffers (optional) ===
         if self.use_gie:
-            add(
-                "gie.",
-                {
-                    "non_scalar_row_index": self.gie.non_scalar_row_index,
-                    "zonal_m0_col_index_for_row": self.gie.zonal_m0_col_index_for_row,
-                    "radial_slot_index_for_row": self.gie.radial_slot_index_for_row,
-                },
+            variables["gie.non_scalar_row_index"] = to_numpy_array(
+                self.gie.non_scalar_row_index
             )
-            if self.gie_zonal_wigner_calc is not None:
-                add(
-                    "gie_zonal_wigner_calc.",
-                    wigner_buffers(self.gie_zonal_wigner_calc),
-                )
+            variables["gie.zonal_m0_col_index_for_row"] = to_numpy_array(
+                self.gie.zonal_m0_col_index_for_row
+            )
+            variables["gie.radial_slot_index_for_row"] = to_numpy_array(
+                self.gie.radial_slot_index_for_row
+            )
+        # === Interaction blocks ===
         for i, block in enumerate(self.blocks):
-            add(f"blocks.{i}.", block._variables())
-        add("output_ffn.", self.output_ffn._variables())
+            for key, value in block._variables().items():
+                variables[f"blocks.{i}.{key}"] = value
+        # === Descriptor-level attention residuals (optional, mutually exclusive) ===
+        if self.use_full_attn_res:
+            for key, value in self.final_full_attn_res.serialize()[
+                "@variables"
+            ].items():
+                variables[f"final_full_attn_res.{key}"] = value
+        if self.use_block_attn_res:
+            for key, value in self.final_block_attn_res.serialize()[
+                "@variables"
+            ].items():
+                variables[f"final_block_attn_res.{key}"] = value
+        # === Output FFN ===
+        for key, value in self.output_ffn._variables().items():
+            variables[f"output_ffn.{key}"] = value
         return variables
 
-    def _load_variables(self, variables: dict[str, Any]) -> None:
-        """Load variables keyed by the pt ``state_dict()`` key names."""
-        variables = dict(variables)
+    def _load_variables(self, variables: dict[str, np.ndarray]) -> None:
+        """Load variables keyed by the pt ``state_dict`` key names."""
 
-        def take_prefix(prefix: str) -> dict[str, Any]:
-            sub = {
+        def take_prefix(prefix: str) -> dict[str, np.ndarray]:
+            return {
                 key[len(prefix) :]: value
                 for key, value in variables.items()
                 if key.startswith(prefix)
             }
-            for key in list(variables):
-                if key.startswith(prefix):
-                    del variables[key]
-            return sub
-
-        # Transient / static pt buffers rebuilt at construction.
-        for key in ("version_tensor", "_empty_tensor"):
-            variables.pop(key, None)
-        take_prefix("wigner_calc.")
-        take_prefix("gie.")
-        take_prefix("gie_zonal_wigner_calc.")
-
-        model_np_prec = PRECISION_DICT[self.precision]
-        compute_np_prec = PRECISION_DICT[self.compute_precision]
-        if "mean" in variables:
-            self.mean = np.asarray(variables.pop("mean"), dtype=model_np_prec)
-        if "stddev" in variables:
-            self.stddev = np.asarray(variables.pop("stddev"), dtype=model_np_prec)
-
-        def load_via_serialize(attr: str, prefix: str) -> None:
-            sub = getattr(self, attr)
-            sv = take_prefix(prefix)
-            if sub is None:
-                if sv:
-                    raise KeyError(f"Unexpected variables with prefix: {prefix}")
-                return
-            if not sv:
-                raise KeyError(f"Missing variables with prefix: {prefix}")
-            data = sub.serialize()
-            data["@variables"] = sv
-            setattr(self, attr, type(sub).deserialize(data))
-
-        load_via_serialize("type_embedding", "type_embedding.")
-        load_via_serialize("env_seed_embedding", "env_seed_embedding.")
-        load_via_serialize("film_scale_norm", "film_scale_norm.")
-        load_via_serialize("film_shift_norm", "film_shift_norm.")
-        load_via_serialize("radial_basis", "radial_basis.")
-        load_via_serialize("radial_embedding", "radial_embedding.net.")
+
+        def load(module: Any, prefix: str) -> Any:
+            data = module.serialize()
+            data["@variables"] = take_prefix(prefix)
+            return type(module).deserialize(data)
+
+        prec = PRECISION_DICT[self.precision]
+        compute_prec = PRECISION_DICT[self.compute_precision]
+        # === Descriptor-level statistics buffers ===
+        # ``version_tensor`` and the derived Wigner-D / GIE index tables are
+        # transient: they are rebuilt at construction, so they are ignored here.
+        self.mean = np.asarray(variables["mean"], dtype=prec)
+        self.stddev = np.asarray(variables["stddev"], dtype=prec)
+        # === Type embedding (always present) ===
+        self.type_embedding = load(self.type_embedding, "type_embedding.")
+        # === Frame charge/spin embedding (optional) ===
+        if self.add_chg_spin_ebd:
+            self.charge_spin_embedding = load(
+                self.charge_spin_embedding, "charge_spin_embedding."
+            )
+        # === Environment FiLM stack (optional) ===
         if self.use_env_seed:
-            for name in ("film_scale_strength_log", "film_shift_strength_log"):
-                value = np.asarray(variables.pop(name), dtype=compute_np_prec)
-                setattr(self, name, value.reshape((1,)))
+            self.env_seed_embedding = load(
+                self.env_seed_embedding, "env_seed_embedding."
+            )
+            self.film_scale_norm = load(self.film_scale_norm, "film_scale_norm.")
+            self.film_shift_norm = load(self.film_shift_norm, "film_shift_norm.")
+            self.film_scale_strength_log = np.asarray(
+                variables["film_scale_strength_log"], dtype=compute_prec
+            )
+            self.film_shift_strength_log = np.asarray(
+                variables["film_shift_strength_log"], dtype=compute_prec
+            )
+        # === Radial basis and shared radial embedding ===
+        self.radial_basis = load(self.radial_basis, "radial_basis.")
+        self.radial_embedding = load(self.radial_embedding, "radial_embedding.net.")
+        # === Interaction blocks ===
         for i, block in enumerate(self.blocks):
             block._load_variables(take_prefix(f"blocks.{i}."))
+        # === Descriptor-level attention residuals (optional, mutually exclusive) ===
+        if self.use_full_attn_res:
+            self.final_full_attn_res = load(
+                self.final_full_attn_res, "final_full_attn_res."
+            )
+        if self.use_block_attn_res:
+            self.final_block_attn_res = load(
+                self.final_block_attn_res, "final_block_attn_res."
+            )
+        # === Output FFN ===
         self.output_ffn._load_variables(take_prefix("output_ffn."))
-        if variables:
-            raise KeyError(f"Unknown variables: {sorted(variables)}")
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the descriptor (pt ``DescrptSeZM.serialize`` format)."""
         return {
             "@class": "Descriptor",
             "type": "SeZM",
@@ -1232,8 +2327,10 @@ def serialize(self) -> dict[str, Any]:
                 "radial_mlp": self.radial_mlp,
                 "use_env_seed": self.use_env_seed,
                 "random_gamma": self.random_gamma,
+                "edge_cartesian": self.edge_cartesian,
+                "node_cartesian": self.node_cartesian,
                 "so2_norm": self.so2_norm,
-                "so2_layers": self.so2_layers,
+                "mixing_layers": self.mixing_layers,
                 "so2_attn_res": self.so2_attn_res_mode,
                 "radial_so2_mode": self.radial_so2_mode,
                 "radial_so2_rank": self.radial_so2_rank,
@@ -1278,7 +2375,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> DescrptDPA4:
-        """Deserialize from a dict (accepts the pt ``serialize()`` output)."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "Descriptor":
@@ -1288,7 +2384,7 @@ def deserialize(cls, data: dict[str, Any]) -> DescrptDPA4:
             raise ValueError(f"Invalid type for DescrptDPA4: {type_val}")
         version = float(data.pop("@version"))
         check_version_compatibility(version, cls.LATEST_VERSION, 1)
-        config = dict(data.pop("config"))
+        config = data.pop("config")
         variables = data.pop("@variables")
         data.pop("env_mat", None)
         config.pop("s2_grid_resolution", None)
@@ -1304,7 +2400,25 @@ def update_sel(
         type_map: list[str] | None,
         local_jdata: dict,
     ) -> tuple[dict, float | None]:
-        """Update the selection and perform neighbor statistics."""
+        """
+        Update the selection and perform neighbor statistics.
+
+        Parameters
+        ----------
+        train_data : DeepmdDataSystem
+            Data used to do neighbor statistics.
+        type_map : list[str] | None
+            The name of each type of atoms.
+        local_jdata : dict
+            The local data refer to the current class.
+
+        Returns
+        -------
+        dict
+            The updated local data.
+        float | None
+            The minimum distance between two atoms.
+        """
         local_jdata_cpy = local_jdata.copy()
         min_nbor_dist, sel = UpdateSel().update_one_sel(
             train_data,
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/__init__.py b/deepmd/dpmodel/descriptor/dpa4_nn/__init__.py
index 2a0f5a8616..d2847b6965 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/__init__.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/__init__.py
@@ -1,5 +1,198 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Backend-agnostic (array-API) building blocks for the DPA4/SeZM descriptor.
+"""
+Public building blocks for the DPA4/SeZM descriptor.
+
+This package re-exports the helper functions, embeddings, equivariant layers,
+and quaternion-based Wigner-D utilities used by the DPA4/SeZM descriptor and model.
 
-This package is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn``.
+This package is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn``.
 """
+
+from .activation import (
+    GatedActivation,
+    SwiGLU,
+)
+from .attention import (
+    segment_envelope_gated_softmax,
+)
+from .attn_res import (
+    DepthAttnRes,
+)
+from .block import (
+    SeZMInteractionBlock,
+)
+from .cartesian import (
+    EdgeCartesianTensorProduct,
+    NodeCartesianTensorProduct,
+    build_cartesian_basis,
+    build_edge_cartesian_tensors,
+)
+from .edge_cache import (
+    EdgeCache,
+    build_edge_cache,
+    build_edge_cache_from_edges,
+    build_edge_type_feat,
+    compute_edge_src_gate,
+    edge_cache_to_dtype,
+)
+from .embedding import (
+    ChargeSpinEmbedding,
+    EnvironmentInitialEmbedding,
+    GeometricInitialEmbedding,
+    SeZMTypeEmbedding,
+)
+from .ffn import (
+    EquivariantFFN,
+)
+from .grid_net import (
+    BaseGridNet,
+    GridBranch,
+    GridMLP,
+    S2GridNet,
+    SO3GridNet,
+)
+from .indexing import (
+    build_gie_zonal_index,
+    build_l_major_index,
+    build_m_major_index,
+    build_m_major_l_index,
+    build_rotate_inv_rescale,
+    get_so3_dim_of_lmax,
+    map_degree_idx,
+    project_D_to_m,
+    project_Dt_from_m,
+    so3_packed_index,
+)
+from .lora import (
+    LoRASO2,
+    LoRASO3,
+    apply_lora_to_sezm,
+    build_merged_state_dict,
+    fold_lora_state_dict_keys,
+    has_lora,
+    merge_lora_into_base,
+    strip_lora_from_extra_state,
+)
+from .norm import (
+    EquivariantRMSNorm,
+    ReducedEquivariantRMSNorm,
+    RMSNorm,
+    ScalarRMSNorm,
+)
+from .projection import (
+    BaseGridProjector,
+    S2GridProjector,
+    SO3GridProjector,
+    resolve_s2_grid_resolution,
+    resolve_so3_grid,
+)
+from .radial import (
+    BridgingSwitch,
+    C3CutoffEnvelope,
+    InnerClamp,
+    RadialBasis,
+    RadialMLP,
+)
+from .so2 import (
+    DynamicRadialDegreeMixer,
+    SO2Convolution,
+    SO2Linear,
+)
+from .so3 import (
+    ChannelLinear,
+    FocusLinear,
+    SO3Linear,
+)
+from .utils import (
+    ATTN_RES_MODES,
+    get_promoted_dtype,
+    init_trunc_normal_fan_in_out,
+    safe_norm,
+)
+from .wignerd import (
+    WignerDCalculator,
+    build_edge_quaternion,
+    quaternion_multiply,
+    quaternion_nlerp,
+    quaternion_normalize,
+    quaternion_to_rotation_matrix,
+    quaternion_z_rotation,
+)
+
+__all__ = [
+    "ATTN_RES_MODES",
+    "BaseGridNet",
+    "BaseGridProjector",
+    "BridgingSwitch",
+    "C3CutoffEnvelope",
+    "ChannelLinear",
+    "ChargeSpinEmbedding",
+    "DepthAttnRes",
+    "DynamicRadialDegreeMixer",
+    "EdgeCache",
+    "EdgeCartesianTensorProduct",
+    "EnvironmentInitialEmbedding",
+    "EquivariantFFN",
+    "EquivariantRMSNorm",
+    "FocusLinear",
+    "GatedActivation",
+    "GeometricInitialEmbedding",
+    "GridBranch",
+    "GridMLP",
+    "InnerClamp",
+    "LoRASO2",
+    "LoRASO3",
+    "NodeCartesianTensorProduct",
+    "RMSNorm",
+    "RadialBasis",
+    "RadialMLP",
+    "ReducedEquivariantRMSNorm",
+    "S2GridNet",
+    "S2GridProjector",
+    "SO2Convolution",
+    "SO2Linear",
+    "SO3GridNet",
+    "SO3GridProjector",
+    "SO3Linear",
+    "ScalarRMSNorm",
+    "SeZMInteractionBlock",
+    "SeZMTypeEmbedding",
+    "SwiGLU",
+    "WignerDCalculator",
+    "apply_lora_to_sezm",
+    "build_cartesian_basis",
+    "build_edge_cache",
+    "build_edge_cache_from_edges",
+    "build_edge_cartesian_tensors",
+    "build_edge_quaternion",
+    "build_edge_type_feat",
+    "build_gie_zonal_index",
+    "build_l_major_index",
+    "build_m_major_index",
+    "build_m_major_l_index",
+    "build_merged_state_dict",
+    "build_rotate_inv_rescale",
+    "compute_edge_src_gate",
+    "edge_cache_to_dtype",
+    "fold_lora_state_dict_keys",
+    "get_promoted_dtype",
+    "get_so3_dim_of_lmax",
+    "has_lora",
+    "init_trunc_normal_fan_in_out",
+    "map_degree_idx",
+    "merge_lora_into_base",
+    "project_D_to_m",
+    "project_Dt_from_m",
+    "quaternion_multiply",
+    "quaternion_nlerp",
+    "quaternion_normalize",
+    "quaternion_to_rotation_matrix",
+    "quaternion_z_rotation",
+    "resolve_s2_grid_resolution",
+    "resolve_so3_grid",
+    "safe_norm",
+    "segment_envelope_gated_softmax",
+    "so3_packed_index",
+    "strip_lora_from_extra_state",
+]
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/activation.py b/deepmd/dpmodel/descriptor/dpa4_nn/activation.py
index 8b9897ff07..85f947b1fc 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/activation.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/activation.py
@@ -2,18 +2,12 @@
 """
 Activation helper modules for DPA4/SeZM.
 
-This module is the dpmodel port of
-``deepmd.pt.model.descriptor.sezm_nn.activation``. It contains the
-coefficient-space nonlinear operators. Both pt classes are ported:
-``GatedActivation`` (used by ``so2``, ``ffn``) and ``SwiGLU`` (used by
-``grid_net``, which is consumed by ``ffn``).
+This module contains coefficient-space nonlinear operators, including
+GatedActivation and point-wise SwiGLU. Grid projectors and grid nets live in
+dedicated modules so coefficient-space and function-space logic remain separate.
 
-Serialization contract: ``GatedActivation`` mirrors the pt ``serialize()``
-format exactly (same config and ``@variables`` keys, including the nested
-``gate_linear.weight``/``gate_linear.bias`` state-dict names), so pt
-``serialize()`` output deserializes directly. ``SwiGLU`` is parameter-free in
-pt (no ``serialize()``, no state-dict entries), so no serialization is
-implemented for it.
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.activation``.
 """
 
 from __future__ import (
@@ -37,9 +31,11 @@
     xp_sigmoid,
 )
 from deepmd.dpmodel.common import (
+    get_xp_precision,
     to_numpy_array,
 )
 from deepmd.dpmodel.utils.network import (
+    Identity,
     get_activation_fn,
 )
 from deepmd.dpmodel.utils.seed import (
@@ -64,45 +60,42 @@ class GatedActivation(NativeOP):
 
     Standard mode (gate=None in call):
         - l=0: Uses the specified activation function
-        - l>0: Each degree l has an independent gate derived from the l=0 scalar
-          features. The gate for each l is expanded to all m components within
-          that l-block.
+        - l>0: Each degree l has an independent gate derived from the l=0 scalar features.
+               The gate for each l is expanded to all m components within that l-block.
 
     GLU mode (gate provided in call, e.g., from split linear output):
         - l=0: x0 * act(g0) (SwiGLU-style when act=silu, GeGLU when act=gelu, etc.)
-        - l>0: Uses gate's scalar (g0) to generate sigmoid gates for x's vector
-          components. This preserves SO(3) equivariance (scalar gates vector,
-          not vector gates vector).
+        - l>0: Uses gate's scalar (g0) to generate sigmoid gates for x's vector components.
+               This preserves SO(3) equivariance (scalar gates vector, not vector gates vector).
 
-    This module also supports the m-major reduced layout used inside SO(2)
-    blocks. If `mmax` is provided, the coefficient axis is assumed to follow
-    the truncated m-major order built by `build_m_major_index(lmax, mmax)`;
-    otherwise, it is assumed to be the full packed (l, m) layout with
-    D=(lmax+1)^2.
+    This module also supports the m-major reduced layout used inside SO(2) blocks.
+    If `mmax` is provided, the coefficient axis is assumed to follow the truncated
+    m-major order built by `build_m_major_index(lmax, mmax)`; otherwise, it is assumed
+    to be the full packed (l, m) layout with D=(lmax+1)^2.
 
     Parameters
     ----------
-    lmax : int
+    lmax
         Maximum spherical harmonic degree.
-    mmax : int | None
-        Maximum order (|m|) for the m-major reduced layout. If None, use the
-        full packed layout with D=(lmax+1)^2.
-    channels : int
+    mmax
+        Maximum order (|m|) for the m-major reduced layout. If None, use the full
+        packed layout with D=(lmax+1)^2.
+    channels
         Number of channels per focus stream.
-    n_focus : int
+    n_focus
         Number of focus streams.
-    precision : str
+    precision
         Internal compute precision used by the gate projection and sigmoid path.
-    activation_function : str
+    activation_function
         Activation function for l=0 components (e.g., "silu", "tanh", "gelu").
-    mlp_bias : bool
+    mlp_bias
         Whether to use bias in the gate linear layer.
-    layout : str
+    layout
         Tensor layout convention. ``"nfdc"`` means input shape (N, F, D, C);
         ``"ndfc"`` means input shape (N, D, F, C).
-    trainable : bool
+    trainable
         Whether parameters are trainable.
-    seed : int | list[int] | None
+    seed
         Random seed for weight initialization.
     """
 
@@ -134,9 +127,9 @@ def __init__(
         self.layout = str(layout).lower()
         if self.layout not in {"nfdc", "ndfc"}:
             raise ValueError("`layout` must be either 'nfdc' or 'ndfc'")
-        self.trainable = bool(trainable)
+
         self.activation_function = str(activation_function)
-        prec = PRECISION_DICT[self.precision.lower()]
+        self.scalar_act = get_activation_fn(activation_function)
 
         # === Build expand_index for mapping per-l gates to all m components ===
         if self.lmax > 0:
@@ -145,45 +138,45 @@ def __init__(
             else:
                 degree_index = build_m_major_l_index(self.lmax, self.mmax)
                 expand_index = degree_index[1:] - 1
-            self.gate_linear: FocusLinear | None = FocusLinear(
+            self.gate_linear: NativeOP = FocusLinear(
                 in_channels=self.channels,
                 out_channels=self.lmax * self.channels,
                 n_focus=self.n_focus,
                 precision=self.precision,
                 bias=self.mlp_bias,
                 seed=seed,
-                trainable=self.trainable,
+                trainable=trainable,
             )
-            # pt re-initializes the gate weight with normal(0, 0.01) seeded
-            # by child_seed(seed, 1) and zeroes the bias (bias is already
-            # zero-initialized here).
+
+            prec = PRECISION_DICT[self.precision.lower()]
             rng = np.random.default_rng(child_seed(seed, 1))
             self.gate_linear.weight = rng.normal(
                 0.0, 0.01, size=self.gate_linear.weight.shape
             ).astype(prec)
+            if self.gate_linear.bias is not None:
+                self.gate_linear.bias = np.zeros(
+                    self.gate_linear.bias.shape, dtype=prec
+                )
         else:
-            # pt uses nn.Identity() here (parameter-free, no state-dict keys);
-            # the dpmodel equivalent is no gate module at all.
-            expand_index = np.zeros((0,), dtype=np.int64)
-            self.gate_linear = None
+            expand_index = np.zeros(0, dtype=np.int64)
+            self.gate_linear = Identity()
         self.expand_index = expand_index
 
+        self.trainable = bool(trainable)
+
     def call(self, x: Any, gate: Any = None) -> Any:
         """
-        Apply the gated activation.
-
         Parameters
         ----------
-        x : Array
+        x
             Value features. Shape is (N, F, D, C) when ``layout='nfdc'``,
             or (N, D, F, C) when ``layout='ndfc'``.
-        gate : Array | None
+        gate
             Optional gate features with the same layout as ``x``.
             When provided, enables GLU mode:
             - l=0: x0 * act(g0) (e.g., SwiGLU when act=silu)
             - l>0: sigmoid(Linear(g0)) gates x's vector components
-            When None (default), uses standard mode where gates are derived
-            from x itself.
+            When None (default), uses standard mode where gates are derived from x itself.
 
         Returns
         -------
@@ -193,34 +186,39 @@ def call(self, x: Any, gate: Any = None) -> Any:
         xp = array_api_compat.array_namespace(x)
         degree_axis = 1 if self.layout == "ndfc" else 2
 
-        gate_source = x if gate is None else gate
-        if degree_axis == 1:
-            gate_scalar_source = gate_source[:, 0, :, :]  # (N, F, C)
-            g0 = gate_source[:, :1, :, :]
-            x0_in = x[:, :1, :, :]
+        scalar_idx = tuple(
+            0 if ax == degree_axis else slice(None) for ax in range(x.ndim)
+        )
+        l0_idx = tuple(
+            slice(0, 1) if ax == degree_axis else slice(None) for ax in range(x.ndim)
+        )
+        rest_idx = tuple(
+            slice(1, x.shape[degree_axis]) if ax == degree_axis else slice(None)
+            for ax in range(x.ndim)
+        )
+
+        if gate is not None:
+            gate_scalar_source = gate[scalar_idx]
         else:
-            gate_scalar_source = gate_source[:, :, 0, :]  # (N, F, C)
-            g0 = gate_source[:, :, :1, :]
-            x0_in = x[:, :, :1, :]
+            gate_scalar_source = x[scalar_idx]
 
-        scalar_act = get_activation_fn(self.activation_function)
         if gate is not None:
-            x0 = x0_in * scalar_act(g0)
+            x0 = x[l0_idx] * self.scalar_act(gate[l0_idx])
         else:
-            x0 = scalar_act(x0_in)
+            x0 = self.scalar_act(x[l0_idx])
 
         if self.lmax == 0:
             return x0
 
-        gate_weight = xp_asarray_nodetach(
-            xp, self.gate_linear.weight[...], device=array_api_compat.device(x)
-        )
         input_dtype = gate_scalar_source.dtype
-        if input_dtype != gate_weight.dtype:
-            gate_scalar_source = xp.astype(gate_scalar_source, gate_weight.dtype)
-        gating_scalars = xp_sigmoid(self.gate_linear.call(gate_scalar_source))
-        if gating_scalars.dtype != input_dtype:
-            gating_scalars = xp.astype(gating_scalars, input_dtype)
+        gating_scalars = xp.astype(
+            xp_sigmoid(
+                self.gate_linear(
+                    xp.astype(gate_scalar_source, get_xp_precision(xp, self.precision))
+                )
+            ),
+            input_dtype,
+        )
         gating_scalars = xp.reshape(
             gating_scalars,
             (x.shape[0], gate_scalar_source.shape[1], self.lmax, self.channels),
@@ -228,20 +226,16 @@ def call(self, x: Any, gate: Any = None) -> Any:
         expand_index = xp_asarray_nodetach(
             xp, self.expand_index, device=array_api_compat.device(x)
         )
-        gates = xp.take(gating_scalars, expand_index, axis=2)  # (N, F, D-1, C)
+        gates = xp.take(gating_scalars, expand_index, axis=2)
         if self.layout == "ndfc":
-            gates = xp.permute_dims(gates, (0, 2, 1, 3))  # (N, D-1, F, C)
-            xt = x[:, 1:, :, :] * gates
-        else:
-            xt = x[:, :, 1:, :] * gates
-        return xp.concat([x0, xt], axis=degree_axis)
+            gates = xp.permute_dims(gates, (0, 2, 1, 3))
+        return xp.concat([x0, x[rest_idx] * gates], axis=degree_axis)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the GatedActivation to a dict (pt-compatible format)."""
         variables = {"expand_index": to_numpy_array(self.expand_index)}
-        if self.gate_linear is not None:
+        if self.lmax > 0:
             variables["gate_linear.weight"] = to_numpy_array(self.gate_linear.weight)
-            if self.mlp_bias:
+            if self.gate_linear.bias is not None:
                 variables["gate_linear.bias"] = to_numpy_array(self.gate_linear.bias)
         return {
             "@class": "GatedActivation",
@@ -263,7 +257,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> GatedActivation:
-        """Deserialize a GatedActivation from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "GatedActivation":
@@ -272,61 +265,25 @@ def deserialize(cls, data: dict[str, Any]) -> GatedActivation:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        mmax = config["mmax"]
-        obj = cls(
-            lmax=int(config["lmax"]),
-            mmax=None if mmax is None else int(mmax),
-            channels=int(config["channels"]),
-            n_focus=int(config["n_focus"]),
-            precision=str(config["precision"]),
-            activation_function=str(config["activation_function"]),
-            mlp_bias=bool(config["mlp_bias"]),
-            layout=str(config["layout"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
+        obj = cls(**config)
         prec = PRECISION_DICT[obj.precision.lower()]
-        expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
-        if not np.array_equal(expand_index, to_numpy_array(obj.expand_index)):
-            raise ValueError("expand_index does not match the lmax/mmax tables")
-        if obj.gate_linear is not None:
-            weight = np.asarray(variables["gate_linear.weight"], dtype=prec)
-            if weight.shape != obj.gate_linear.weight.shape:
-                raise ValueError(
-                    f"gate_linear.weight shape {weight.shape} does not match "
-                    f"the expected shape {obj.gate_linear.weight.shape}"
-                )
-            obj.gate_linear.weight = weight
-            if obj.mlp_bias:
+        obj.expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
+        if obj.lmax > 0:
+            obj.gate_linear.weight = np.asarray(
+                variables["gate_linear.weight"], dtype=prec
+            )
+            if obj.gate_linear.bias is not None:
                 obj.gate_linear.bias = np.asarray(
                     variables["gate_linear.bias"], dtype=prec
-                ).reshape(obj.gate_linear.bias.shape)
+                )
         return obj
 
 
 class SwiGLU(NativeOP):
-    """Point-wise SwiGLU on the last feature axis.
-
-    Parameter-free, matching the pt version (which defines no ``serialize()``
-    and contributes no state-dict entries).
-    """
+    """Point-wise SwiGLU on the last feature axis."""
 
     def call(self, inputs: Any) -> Any:
-        """
-        Apply point-wise SwiGLU.
-
-        Parameters
-        ----------
-        inputs : Array
-            Input array with shape ``(..., 2*C)``; the first half of the last
-            axis is the gate, the second half the value.
-
-        Returns
-        -------
-        Array
-            Gated array with shape ``(..., C)``.
-        """
-        # torch.chunk(inputs, 2, dim=-1): first chunk gets ceil(C/2) entries
+        # torch.chunk(inputs, 2, dim=-1): the first half takes ceil(C/2) elements.
         nc = (inputs.shape[-1] + 1) // 2
         gate = inputs[..., :nc]
         value = inputs[..., nc:]
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/attention.py b/deepmd/dpmodel/descriptor/dpa4_nn/attention.py
index 4b7eeb24db..e2af1595e3 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/attention.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/attention.py
@@ -2,19 +2,11 @@
 """
 Attention utilities for DPA4/SeZM message passing.
 
-This module is the dpmodel port of
-``deepmd.pt.model.descriptor.sezm_nn.attention``. It implements the
-destination-wise envelope-gated softmax used by the SO(2) attention path.
+This module implements the destination-wise envelope-gated softmax used by the
+SO(2) attention path in the SeZM descriptor.
 
-Padded-edge adaptation
-----------------------
-The pt version consumes a sparse edge list and reduces per destination node
-with ``scatter_reduce(amax)`` / ``scatter_add`` keyed by ``dst``. In the
-dpmodel padded layout (see ``edge_cache.EdgeCache``) the edge axis is
-``E = n_nodes * nnei`` with slot ``(i, j)`` belonging to node ``i``, so every
-destination-wise reduction becomes a plain reduction over the ``nnei`` axis
-after a ``(n_nodes, nnei, ...)`` reshape, and invalid (padded) slots are
-removed by folding ``edge_mask`` into the non-negative per-edge weight.
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.attention``.
 """
 
 from __future__ import (
@@ -27,6 +19,10 @@
 
 import array_api_compat
 
+from deepmd.dpmodel.array_api import (
+    xp_add_at,
+    xp_maximum_at,
+)
 from deepmd.dpmodel.utils.network import (
     softplus_t,
 )
@@ -35,6 +31,7 @@
 def segment_envelope_gated_softmax(
     logits: Any,
     edge_env: Any,
+    dst: Any,
     n_nodes: int,
     z_bias_raw: Any,
     eps: float,
@@ -44,19 +41,21 @@ def segment_envelope_gated_softmax(
     """
     Compute destination-wise envelope-gated softmax attention.
 
-    All array arguments must live in the same array namespace.
-
     Parameters
     ----------
     logits
-        Attention logits with shape (E, F, H), padded-edge layout with
-        ``E = n_nodes * nnei``.
+        Attention logits with shape (E, F, H).
     edge_env
         Cutoff envelope weights with shape (E, 1) or (E,).
+    dst
+        Destination node indices with shape (E,). The group max and the
+        denominator sum are scattered over these indices, which makes the
+        normalization layout-agnostic: it is correct both for the padded
+        ``call`` (where ``dst == repeat(arange(n_nodes), nnei)``) and for the
+        sparse ``call_with_edges`` (arbitrary ``dst`` order and per-node
+        degree).
     n_nodes
-        Number of nodes. The pt ``dst`` argument is dropped: in the padded
-        layout the destination of edge slot ``(i, j)`` is implicitly node
-        ``i``.
+        Number of nodes.
     z_bias_raw
         Unconstrained denominator bias with shape (F, H).
         Softplus is applied to keep the bias strictly positive.
@@ -75,9 +74,8 @@ def segment_envelope_gated_softmax(
     edge_mask
         Optional padded-edge validity mask with shape (E,) or (E, 1);
         zero marks invalid slots. Folded into the non-negative per-edge
-        weight, so invalid slots drop out of the group max, the numerator,
-        and the denominator exactly like absent edges in the pt sparse
-        layout.
+        weight so invalid slots drop out of the group max, the numerator,
+        and the denominator.
 
     Returns
     -------
@@ -88,18 +86,8 @@ def segment_envelope_gated_softmax(
     n_edge, n_focus, n_head = logits.shape
     n_channel = n_focus * n_head
     eps_f = float(eps)
-    # Keep ``n_nodes`` symbolic (no ``int()``): it is the product ``nf*nloc``,
-    # and casting to a Python int specializes it to the trace-time sample
-    # shape, which breaks torch.export with a dynamic ``nloc`` dim. The
-    # ``Mod`` check below stays statically known (``E == n_nodes*nnei``) and
-    # the ``(n_nodes, nnei, ...)`` reshapes recover the layout symbolically.
-    if n_nodes <= 0 or n_edge % n_nodes != 0:
-        raise ValueError(
-            "padded-edge layout requires E to be a multiple of n_nodes; "
-            f"got E={n_edge}, n_nodes={n_nodes}"
-        )
-    nnei = n_edge // n_nodes
     device = array_api_compat.device(logits)
+    dst = xp.astype(dst, xp.int64)
 
     # === Step 1. Flatten (F, H) and build the effective per-edge weight ===
     logits_2d = xp.reshape(logits, (n_edge, n_channel))
@@ -134,14 +122,16 @@ def segment_envelope_gated_softmax(
     )
 
     # === Step 2. Destination-wise max for stable exponentials ===
-    # pt: scatter_reduce(amax) over dst — padded-edge max over the nnei axis.
-    group_max = xp.max(
-        xp.reshape(logits_for_max, (n_nodes, nnei, n_channel)), axis=1
+    # Destination segment max over ``dst`` (pt ``scatter_reduce`` amax). The
+    # scatter is layout-agnostic and the maximum is order-independent, so the
+    # padded ``call`` stays bit-exact while the sparse ``call_with_edges`` is
+    # handled by the same code path.
+    group_max = xp_maximum_at(
+        xp.full((n_nodes, n_channel), float("-inf"), dtype=logits.dtype, device=device),
+        dst,
+        logits_for_max,
     )  # (N, n_channel)
-    edge_max = xp.reshape(
-        xp.broadcast_to(group_max[:, None, :], (n_nodes, nnei, n_channel)),
-        (n_edge, n_channel),
-    )
+    edge_max = xp.take(group_max, dst, axis=0)
     zeros_en = xp.zeros((n_edge, n_channel), dtype=logits.dtype, device=device)
     zeros_nn = xp.zeros((n_nodes, n_channel), dtype=logits.dtype, device=device)
     edge_max = xp.where(xp.isfinite(edge_max), edge_max, zeros_en)
@@ -152,16 +142,15 @@ def segment_envelope_gated_softmax(
     edge_weighted_exp = edge_weight_sq[:, None] * exp_shifted
 
     # === Step 4. Destination-wise normalization with positive denominator bias ===
-    # pt: scatter_add over dst — padded-edge masked sum over the nnei axis
-    # (invalid slots already carry zero weight).
-    denom_sum = xp.sum(
-        xp.reshape(edge_weighted_exp, (n_nodes, nnei, n_channel)), axis=1
+    # Destination segment sum over ``dst`` (pt ``scatter_add``); invalid slots
+    # already carry zero weight. Layout-agnostic like the group max above.
+    denom_sum = xp_add_at(
+        xp.zeros((n_nodes, n_channel), dtype=logits.dtype, device=device),
+        dst,
+        edge_weighted_exp,
     )  # (N, n_channel)
     denom = denom_sum + zeta * xp.exp(-group_max_safe)
 
-    denom_edge = xp.reshape(
-        xp.broadcast_to(denom[:, None, :], (n_nodes, nnei, n_channel)),
-        (n_edge, n_channel),
-    )
+    denom_edge = xp.take(denom, dst, axis=0)
     alpha = edge_weighted_exp / (denom_edge + eps_f)
     return xp.reshape(alpha, (n_edge, n_focus, n_head))
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/attn_res.py b/deepmd/dpmodel/descriptor/dpa4_nn/attn_res.py
new file mode 100644
index 0000000000..ba6183cc91
--- /dev/null
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/attn_res.py
@@ -0,0 +1,270 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""
+Attention-residual layers for the DPA4/SeZM descriptor.
+
+This module defines the depth-wise attention residual aggregator used to
+combine equivariant states across descriptor and block histories.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.attn_res``.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+import array_api_compat
+import numpy as np
+
+from deepmd.dpmodel import (
+    DEFAULT_PRECISION,
+    PRECISION_DICT,
+    NativeOP,
+)
+from deepmd.dpmodel.array_api import (
+    xp_asarray_nodetach,
+)
+from deepmd.dpmodel.common import (
+    get_xp_precision,
+    to_numpy_array,
+)
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
+
+from .norm import (
+    ScalarRMSNorm,
+)
+from .so3 import (
+    ChannelLinear,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+    )
+
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
+
+class DepthAttnRes(NativeOP):
+    """
+    Depth-wise attention residual aggregation for equivariant tensors.
+
+    Attention logits are computed only from scalar ``l=0`` channels, while the
+    resulting scalar weights are broadcast to the full equivariant value tensors.
+    This keeps the aggregation equivariant as long as all sources share the same
+    representation space.
+
+    Query modes
+    -----------
+    - ``input_dependent=True``: query comes from the current scalar state.
+    - ``input_dependent=False``: use a learned pseudo-query shared across inputs.
+
+    Both query paths are zero-initialized so the initial aggregation is a uniform
+    average over all provided sources.
+
+    Parameters
+    ----------
+    channels
+        Scalar feature dimension used by query and key.
+    input_dependent
+        Whether to project the current scalar state into a query vector.
+    eps
+        Small epsilon for key RMS normalization.
+    bias
+        Whether to use bias in the input-dependent query projection. Only
+        effective when ``input_dependent=True``.
+    precision
+        Parameter and compute precision. Caller should pass compute precision (fp32+).
+    trainable
+        Whether parameters are trainable.
+    seed
+        Random seed reserved for consistency with other modules.
+    """
+
+    if TYPE_CHECKING:
+        query_proj: ChannelLinear
+        adamw_pseudo_query: Array
+
+    def __init__(
+        self,
+        *,
+        channels: int,
+        input_dependent: bool = True,
+        eps: float = 1e-7,
+        bias: bool = True,
+        precision: str = DEFAULT_PRECISION,
+        trainable: bool,
+        seed: int | list[int] | None = None,
+    ) -> None:
+        self.channels = int(channels)
+        self.input_dependent = bool(input_dependent)
+        self.eps = float(eps)
+        self.query_bias = bool(bias)
+        self.precision = precision
+        prec = PRECISION_DICT[self.precision.lower()]
+
+        self.key_norm = ScalarRMSNorm(
+            channels=self.channels,
+            n_focus=1,
+            eps=self.eps,
+            precision=self.precision,
+            trainable=trainable,
+        )
+        if self.input_dependent:
+            self.query_proj = ChannelLinear(
+                in_channels=self.channels,
+                out_channels=self.channels,
+                precision=self.precision,
+                bias=self.query_bias,
+                trainable=trainable,
+                seed=seed,
+                init_std=0.0,
+            )
+        else:
+            self.adamw_pseudo_query = np.zeros(self.channels, dtype=prec)
+
+        self.trainable = bool(trainable)
+
+    def call(
+        self,
+        *,
+        sources: list[Array],
+        scalar_extractor: Callable[[Array], Array],
+        current_x: Array | None = None,
+    ) -> Array:
+        """
+        Aggregate same-shape sources with depth attention.
+
+        Parameters
+        ----------
+        sources
+            Source tensors with identical shape ``(B, ...)``.
+        scalar_extractor
+            Function that extracts scalar features from each source with shape
+            ``(B, C)`` where ``C=channels``.
+        current_x
+            Current tensor state. Required when ``input_dependent=True`` and
+            converted to scalar query features via ``scalar_extractor``.
+
+        Returns
+        -------
+        Array
+            Aggregated tensor with the same shape as each source.
+        """
+        source0 = sources[0]
+        if len(sources) == 1:
+            return source0
+        xp = array_api_compat.array_namespace(source0)
+        device = array_api_compat.device(source0)
+        batch_size = int(source0.shape[0])
+        value_dtype = source0.dtype
+
+        # === Step 1. Build the query vector ===
+        if self.input_dependent:
+            current_x_scalar = scalar_extractor(current_x)
+            query = self.query_proj(
+                xp.astype(current_x_scalar, get_xp_precision(xp, self.precision))
+            )
+        else:
+            query = xp.broadcast_to(
+                xp_asarray_nodetach(xp, self.adamw_pseudo_query[...], device=device)[
+                    None, :
+                ],
+                (batch_size, self.channels),
+            )
+
+        # === Step 2. Extract and normalize scalar keys ===
+        source_count = len(sources)
+        raw_keys = xp.stack(
+            [
+                xp.astype(
+                    scalar_extractor(source), get_xp_precision(xp, self.precision)
+                )
+                for source in sources
+            ],
+            axis=1,
+        )  # (B, S, C)
+        keys = self.key_norm(raw_keys)
+        logits = xp.sum(query[:, None, :] * keys, axis=-1)
+        alpha = xp.exp(logits - xp.max(logits, axis=1, keepdims=True))
+        alpha = alpha / xp.sum(alpha, axis=1, keepdims=True)  # (B, S)
+
+        # === Step 3. Broadcast scalar weights to equivariant values ===
+        value_stack = xp.stack(
+            [
+                xp.astype(source, get_xp_precision(xp, self.precision))
+                for source in sources
+            ],
+            axis=1,
+        )
+        alpha = xp.reshape(
+            alpha,
+            (
+                batch_size,
+                source_count,
+                *([1] * (value_stack.ndim - 2)),
+            ),
+        )
+        aggregated = xp.sum(alpha * value_stack, axis=1)
+        return xp.astype(aggregated, value_dtype)
+
+    def serialize(self) -> dict[str, Any]:
+        variables = {"key_norm.adam_scale": to_numpy_array(self.key_norm.adam_scale)}
+        if self.input_dependent:
+            variables["query_proj.weight"] = to_numpy_array(self.query_proj.weight)
+            if self.query_bias:
+                variables["query_proj.bias"] = to_numpy_array(self.query_proj.bias)
+        else:
+            variables["adamw_pseudo_query"] = to_numpy_array(self.adamw_pseudo_query)
+        return {
+            "@class": "DepthAttnRes",
+            "@version": 1,
+            "config": {
+                "channels": self.channels,
+                "input_dependent": self.input_dependent,
+                "eps": self.eps,
+                "bias": self.query_bias,
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+                "trainable": self.trainable,
+                "seed": None,
+            },
+            "@variables": variables,
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> DepthAttnRes:
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "DepthAttnRes":
+            raise ValueError(f"Invalid class for DepthAttnRes: {data_cls}")
+        version = int(data.pop("@version"))
+        check_version_compatibility(version, 1, 1)
+        config = data.pop("config")
+        variables = data.pop("@variables")
+        obj = cls(**config)
+        prec = PRECISION_DICT[obj.precision.lower()]
+        obj.key_norm.adam_scale = np.asarray(
+            variables["key_norm.adam_scale"], dtype=prec
+        )
+        if obj.input_dependent:
+            obj.query_proj.weight = np.asarray(
+                variables["query_proj.weight"], dtype=prec
+            )
+            if obj.query_bias:
+                obj.query_proj.bias = np.asarray(
+                    variables["query_proj.bias"], dtype=prec
+                )
+        else:
+            obj.adamw_pseudo_query = np.asarray(
+                variables["adamw_pseudo_query"], dtype=prec
+            )
+        return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/block.py b/deepmd/dpmodel/descriptor/dpa4_nn/block.py
index 861a3b34d4..c13acf2617 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/block.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/block.py
@@ -1,31 +1,13 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-Interaction blocks for DPA4/SeZM.
-
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.block``.
-It defines the SeZM interaction block that combines SO(2) message passing and
-equivariant feed-forward subblocks with residual shortcuts.
-
-Branches guarded with ``NotImplementedError`` at this level (flags consumed by
-block.py itself, all unused by the core DPA4 config):
-
-- ``full_attn_res != "none"`` / ``block_attn_res != "none"`` — the pt block
-  builds ``DepthAttnRes`` aggregators and switches the forward implementation
-  (pt block.py:514/541); only the baseline residual-shortcut path
-  (pt block.py:756) is ported.
-- ``layer_scale=True`` — the pt block builds per-channel
-  ``adam_ffn_layer_scales`` on the FFN residual branches (pt block.py:500)
-  in addition to the SO(2)-internal scales; not ported.
-
-Flags merely forwarded to sub-components keep their guards there (delegated,
-not duplicated here): ``so2_attn_res``, ``so2_s2_activation``,
-``atten_f_mix``, ``atten_v_proj``, ``atten_o_proj`` (raised by
-``SO2Convolution``). The cross-mode grid products (``node_wise_s2/so3``,
-``message_node_s2/so3``) are ported and forwarded to ``SO2Convolution``.
-
-The pt eval-time activation-checkpoint / nvtx instrumentation
-(``DP_ACT_INFER``, ``DP_COMPILE_INFER``, ``nvtx_range``) is pt-runtime-only
-and intentionally not ported.
+Interaction blocks for the DPA4/SeZM descriptor.
+
+This module defines the SeZM interaction block that combines SO(2)
+message passing, equivariant feed-forward subblocks, and optional
+attention-residual history aggregation.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.block``.
 """
 
 from __future__ import (
@@ -45,6 +27,15 @@
     PRECISION_DICT,
     NativeOP,
 )
+from deepmd.dpmodel.array_api import (
+    xp_asarray_nodetach,
+)
+from deepmd.dpmodel.common import (
+    to_numpy_array,
+)
+from deepmd.dpmodel.utils.network import (
+    Identity,
+)
 from deepmd.dpmodel.utils.seed import (
     child_seed,
 )
@@ -52,6 +43,9 @@
     check_version_compatibility,
 )
 
+from .attn_res import (
+    DepthAttnRes,
+)
 from .ffn import (
     EquivariantFFN,
 )
@@ -60,15 +54,59 @@
 )
 from .so2 import (
     SO2Convolution,
-    _compute_precision,
 )
 from .utils import (
     ATTN_RES_MODES,
+    get_promoted_dtype,
 )
 
 if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
     from .edge_cache import (
-        EdgeCache,
+        EdgeFeatureCache,
+    )
+
+
+def exchange_ghost_features(
+    x: Array,
+    comm_dict: dict[str, Array],
+) -> Array:
+    """
+    Refresh ghost-node features from their owner ranks via MPI border exchange.
+
+    SeZM node features are SO(3) coefficients expressed in the shared global
+    frame, so a ghost atom and its owner carry identical features and the
+    per-row owner->ghost copy is exact and equivariance-preserving. The opaque
+    ``deepmd_export::border_op`` performs the exchange and carries a registered
+    backward (reverse communication of gradients), so a single
+    ``autograd.grad(energy, edge_vec)`` accumulates cross-rank force
+    contributions when every rank runs the exchange in lockstep.
+
+    This is applied to the SO(2) convolution input — the descriptor's only
+    cross-node operation — so ghost rows are correct exactly where message
+    passing reads them, regardless of how the (per-node) attention-residual
+    history that produced the input populated its ghost rows.
+
+    Parameters
+    ----------
+    x
+        Extended node features with shape (nall, D, 1, channels). Owned-atom rows
+        hold up-to-date values; ghost rows are overwritten by this call.
+    comm_dict
+        Border-exchange tensors ``send_list``, ``send_proc``, ``recv_proc``,
+        ``send_num``, ``recv_num``, ``communicator``, ``nlocal``, ``nghost``.
+
+    Returns
+    -------
+    Array
+        Node features with ghost rows filled, same shape as ``x``.
+    """
+    raise NotImplementedError(
+        "Multi-rank border exchange (comm_dict) is not supported in the "
+        "dpmodel backend."
     )
 
 
@@ -81,15 +119,167 @@ class SeZMInteractionBlock(NativeOP):
     2. FFN branch: repeated subblocks of
        optional pre-norm -> `EquivariantFFN` -> optional post-norm.
 
-    Outer residual shortcuts are applied around the SO(2) unit and each FFN
-    subblock (the pt AttnRes paths are not ported; see the module docstring).
+    In the baseline path, outer residual shortcuts are applied around the SO(2)
+    unit and each FFN subblock. In AttnRes paths, these shortcuts are replaced by
+    selective depth-wise aggregation before each unit.
 
     `SO2Convolution` internally handles the real multi-focus expansion, so this
     block keeps a singleton-focus backbone layout `(N, D, 1, C)` at boundaries.
 
-    Parameters mirror the pt ``SeZMInteractionBlock`` (pt block.py:227) with
-    ``precision`` replacing ``dtype``; see the pt docstring for the full
-    per-parameter description.
+    Parameters
+    ----------
+    lmax
+        Maximum message-passing spherical harmonic degree.
+    node_lmax
+        Maximum node representation degree. If None, equals `lmax`.
+    mmax
+        Maximum SO(2) order (|m|) mixed inside SO(2) convolution.
+    kmax
+        Maximum Wigner-D frame order (|k|) used by SO(3) grid branches.
+    channels
+        Total channels per (l, m) coefficient.
+    n_focus
+        Number of multi-focus streams used only by the internal SO(2) branch.
+    focus_dim
+        Hidden width per focus stream used inside the SO(2) branch.
+        ``focus_dim=0`` means using ``channels``.
+    focus_compete
+        If True, enable cross-focus softmax competition in SO(2) convolution.
+    so2_norm
+        If True, apply intermediate ReducedEquivariantRMSNorm between SO(2) mixing layers.
+        When False (default), no normalization is applied between layers.
+    mixing_layers
+        Number of learnable mixing layers in the per-edge message core. ``0``
+        applies only the edge-condition modulation.
+    so2_attn_res
+        Depth-wise attention residual mode across the internal SO(2) layer
+        history. Must be one of ``"none"``, ``"independent"``, or
+        ``"dependent"``.
+    radial_so2_mode
+        Dynamic radial degree mixer mode inside SO(2) convolution. ``"none"``
+        applies elementwise radial modulation, ``"degree"`` uses a
+        channel-shared edge-conditioned cross-degree kernel, and
+        ``"degree_channel"`` uses a per-channel cross-degree kernel.
+    radial_so2_rank
+        Low-rank channel factorization rank for
+        ``radial_so2_mode="degree_channel"``. ``0`` uses the full
+        per-channel dynamic degree kernel.
+    edge_cartesian
+        If True, replace the per-edge SO(2) rotation-frame tensor product inside
+        ``SO2Convolution`` with the global-frame Cartesian rank-2 tensor
+        product. Requires ``lmax`` in ``{1, 2}``.
+    node_cartesian
+        Per-node global-frame Cartesian rank-2 tensor product on the aggregated
+        message inside ``SO2Convolution``, configured by a ``"<mode>:<layers>"``
+        string (``mode`` is ``"default"`` or ``"parity"``); a bare integer ``N``
+        is shorthand for ``"default:N"``, and ``"none"`` disables it. Requires
+        ``lmax`` in ``{1, 2}`` and is orthogonal to ``edge_cartesian``.
+    n_atten_head
+        Number of attention heads when aggregating messages in SO(2) convolution.
+        0 means no attention is used; >0 enables envelope-gated grouped softmax
+        attention with output-side head gate.
+    atten_f_mix
+        If True, merge SO(2) focus streams into one attention stream after
+        rotate-back. This gives each attention head access to the full
+        multi-focus hidden width.
+    atten_v_proj
+        If True, apply an explicit degree-aware value projection inside SO(2)
+        attention.
+    atten_o_proj
+        If True, apply an explicit degree-aware output projection inside SO(2)
+        attention.
+    so2_pre_norm
+        If True, apply pre-norm before SO(2) convolution.
+    so2_post_norm
+        If True, apply post-norm on SO(2) output before the residual add.
+    ffn_pre_norm
+        If True, apply pre-norm before each FFN subblock.
+    ffn_post_norm
+        If True, apply post-norm on each FFN subblock output before the residual add.
+    ffn_neurons
+        Hidden dimension for each FFN subblock.
+    node_wise_grid_mlp
+        If True, select the polynomial grid MLP operation for the SO(2)
+        convolution node-wise cross-grid path.
+    node_wise_grid_branch
+        Number of scalar-routed polynomial product branches for the node-wise
+        cross-grid path. ``0`` disables branch mixing; positive values take
+        precedence over ``node_wise_grid_mlp``.
+    message_node_grid_mlp
+        If True, select the polynomial grid MLP operation for the SO(2)
+        convolution message-node cross-grid path.
+    message_node_grid_branch
+        Number of scalar-routed polynomial product branches for the
+        message-node cross-grid path. ``0`` disables branch mixing; positive
+        values take precedence over ``message_node_grid_mlp``.
+    ffn_grid_mlp
+        If True, select the polynomial grid MLP operation for the
+        block-internal FFN grid path.
+    ffn_grid_branch
+        Number of scalar-routed polynomial product branches for the FFN grid
+        path. ``0`` disables branch mixing; positive values take precedence
+        over ``ffn_grid_mlp``.
+    ffn_blocks
+        Number of FFN subblocks per block.
+    layer_scale
+        If True, apply learnable LayerScale (init 1e-3) on residual branches:
+        - SO(2) branch: per-focus-channel scales `(n_focus, focus_dim)`
+          on each SO(2) mixing layer.
+        - FFN branch: per-channel scales `(channels,)` on each FFN subblock.
+    full_attn_res
+        Descriptor-level full attention residual mode for this block wrapper.
+        When enabled, the block uses external unit history to build the SO(2)
+        input and the input of each FFN unit.
+    block_attn_res
+        Descriptor-level block attention residual mode for this block wrapper.
+        When enabled, the block uses external block history plus an intra-block
+        partial sum to build the SO(2) input and the input of each FFN unit.
+    so2_s2_activation
+        If True, enable the merged scalar/grid SwiGLU-S2 activation in the SO(2)
+        branch.
+    node_wise_s2
+        If True, enable the edge-local source-destination S2 product branch in
+        the SO(2) convolution.
+    node_wise_so3
+        If True, enable the corresponding edge-local SO(3) Wigner-D grid branch
+        in the SO(2) convolution.
+    message_node_s2
+        If True, enable the post-aggregation message-node S2 product branch in
+        the SO(2) convolution.
+    message_node_so3
+        If True, enable the corresponding post-aggregation SO(3) Wigner-D grid
+        branch in the SO(2) convolution.
+    ffn_s2_activation
+        If True, enable the merged scalar/grid SwiGLU-S2 activation in the
+        default FFN activation path.
+    ffn_so3_grid
+        If True, use the SO(3) Wigner-D grid in the block-internal FFN. This
+        takes precedence over ``ffn_s2_activation``.
+    so2_lebedev_quadrature
+        If True, use Lebedev quadrature for the SO(2) S2 activation projector.
+    ffn_lebedev_quadrature
+        If True, use Lebedev quadrature for the FFN S2 activation projector.
+    so2_activation_function
+        Activation function for the block-internal SO(2) l=0 gated activation
+        path when ``so2_s2_activation=False``.
+    ffn_activation_function
+        Activation function for the block-internal FFN l=0 components.
+    ffn_glu_activation
+        If True, use GLU-style gating in the block-internal FFN
+        (e.g., silu -> swiglu, gelu -> geglu).
+    mlp_bias
+        Whether to use bias in equivariant layers. Controls:
+        - SO3Linear: l=0 bias
+        - SO2Linear: l=0 bias
+        - GatedActivation: gate linear bias
+    eps
+        Small epsilon for numerical stability.
+    precision
+        Parameter precision.
+    seed
+        Random seed for weight initialization.
+    trainable
+        Whether parameters are trainable.
     """
 
     def __init__(
@@ -104,10 +294,12 @@ def __init__(
         focus_dim: int = 0,
         focus_compete: bool = True,
         so2_norm: bool = False,
-        so2_layers: int = 4,
+        mixing_layers: int = 4,
         so2_attn_res: str = "none",
         radial_so2_mode: str = "none",
         radial_so2_rank: int = 0,
+        edge_cartesian: bool = False,
+        node_cartesian: str | int = "none",
         n_atten_head: int = 1,
         atten_f_mix: bool = False,
         atten_v_proj: bool = False,
@@ -142,8 +334,8 @@ def __init__(
         mlp_bias: bool = False,
         eps: float = 1e-7,
         precision: str = DEFAULT_PRECISION,
-        seed: int | list[int] | None = None,
-        trainable: bool = True,
+        seed: int | list[int] | None,
+        trainable: bool,
     ) -> None:
         self.lmax = int(lmax)
         self.node_lmax = self.lmax if node_lmax is None else int(node_lmax)
@@ -168,7 +360,7 @@ def __init__(
             raise ValueError("`focus_dim` must be >= 0")
         self.focus_compete = bool(focus_compete)
         self.so2_norm = bool(so2_norm)
-        self.so2_layers = int(so2_layers)
+        self.mixing_layers = int(mixing_layers)
         self.so2_attn_res_mode = str(so2_attn_res).lower()
         if self.so2_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
@@ -176,6 +368,8 @@ def __init__(
             )
         self.radial_so2_mode = str(radial_so2_mode).lower()
         self.radial_so2_rank = int(radial_so2_rank)
+        self.edge_cartesian = bool(edge_cartesian)
+        self.node_cartesian = str(node_cartesian)
         self.n_atten_head = int(n_atten_head)
         self.atten_f_mix = bool(atten_f_mix)
         self.use_atten_v_proj = bool(atten_v_proj)
@@ -204,9 +398,6 @@ def __init__(
         if self.ffn_blocks < 1:
             raise ValueError("`ffn_blocks` must be >= 1")
         self.layer_scale = bool(layer_scale)
-        if self.layer_scale:
-            # consumed by block.py itself (FFN-branch adam_ffn_layer_scales)
-            raise NotImplementedError("layer_scale=True is not ported to dpmodel")
         self.full_attn_res_mode = str(full_attn_res).lower()
         if self.full_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
@@ -217,13 +408,11 @@ def __init__(
             raise ValueError(
                 "`block_attn_res` must be one of 'none', 'independent', or 'dependent'"
             )
-        if self.full_attn_res_mode != "none":
-            raise NotImplementedError(
-                "full_attn_res != 'none' (DepthAttnRes) is not ported to dpmodel"
-            )
-        if self.block_attn_res_mode != "none":
-            raise NotImplementedError(
-                "block_attn_res != 'none' (DepthAttnRes) is not ported to dpmodel"
+        self.use_full_attn_res = self.full_attn_res_mode != "none"
+        self.use_block_attn_res = self.block_attn_res_mode != "none"
+        if self.use_full_attn_res and self.use_block_attn_res:
+            raise ValueError(
+                "`full_attn_res` and `block_attn_res` cannot both be enabled"
             )
         self.so2_s2_activation = bool(so2_s2_activation)
         self.node_wise_s2 = bool(node_wise_s2)
@@ -240,40 +429,38 @@ def __init__(
         self.mlp_bias = bool(mlp_bias)
         self.eps = float(eps)
         self.precision = precision
-        self.compute_precision = _compute_precision(precision)
-        self.trainable = bool(trainable)
+        self.compute_precision = np.dtype(
+            get_promoted_dtype(PRECISION_DICT[self.precision])
+        ).name
 
         # === Step 0. Split deterministic seeds at the block top-level ===
-        # pt also splits seed_full_attn / seed_block_attn (block.py:378-379);
-        # those consumers are guarded above, so the splits are unused here.
         seed_so2_conv = child_seed(seed, 0)
         seed_ffn = child_seed(seed, 1)
+        seed_full_attn = child_seed(seed, 2)
+        seed_block_attn = child_seed(seed, 3)
 
         # === Step 1. SO(2) convolution branch norms ===
-        # pt uses nn.Identity() for disabled norms (parameter-free); the
-        # dpmodel equivalent is None.
-        self.pre_so2_norm: EquivariantRMSNorm | None = (
-            EquivariantRMSNorm(
+        if self.so2_pre_norm:
+            self.pre_so2_norm = EquivariantRMSNorm(
                 self.lmax,
                 self.channels,
                 n_focus=1,
                 precision=self.compute_precision,
-                trainable=self.trainable,
+                trainable=trainable,
             )
-            if self.so2_pre_norm
-            else None
-        )
-        self.post_so2_norm: EquivariantRMSNorm | None = (
-            EquivariantRMSNorm(
+        else:
+            self.pre_so2_norm = Identity()
+
+        if self.so2_post_norm:
+            self.post_so2_norm = EquivariantRMSNorm(
                 self.lmax,
                 self.channels,
                 n_focus=1,
                 precision=self.compute_precision,
-                trainable=self.trainable,
+                trainable=trainable,
             )
-            if self.so2_post_norm
-            else None
-        )
+        else:
+            self.post_so2_norm = Identity()
 
         self.so2_conv = SO2Convolution(
             lmax=self.lmax,
@@ -284,12 +471,14 @@ def __init__(
             focus_dim=self.focus_dim,
             focus_compete=self.focus_compete,
             so2_norm=self.so2_norm,
-            so2_layers=self.so2_layers,
+            mixing_layers=self.mixing_layers,
             so2_attn_res=self.so2_attn_res_mode,
             radial_so2_mode=self.radial_so2_mode,
             radial_so2_rank=self.radial_so2_rank,
+            edge_cartesian=self.edge_cartesian,
+            node_cartesian=self.node_cartesian,
             layer_scale=self.layer_scale,
-            n_atten_head=self.n_atten_head,
+            n_atten_head=n_atten_head,
             atten_f_mix=self.atten_f_mix,
             atten_v_proj=self.use_atten_v_proj,
             atten_o_proj=self.use_atten_o_proj,
@@ -306,69 +495,210 @@ def __init__(
             activation_function=self.so2_activation_function,
             mlp_bias=self.mlp_bias,
             eps=self.eps,
-            precision=self.precision,
+            precision=precision,
             seed=seed_so2_conv,
-            trainable=self.trainable,
+            trainable=trainable,
         )
 
         # === Step 2. FFN subblock sequence ===
-        pre_ffn_norms: list[EquivariantRMSNorm | None] = []
-        post_ffn_norms: list[EquivariantRMSNorm | None] = []
+        pre_ffn_norms: list = []
+        post_ffn_norms: list = []
         ffns: list[EquivariantFFN] = []
 
         for i in range(self.ffn_blocks):
             seed_ffn_i = child_seed(seed_ffn, i)
-            pre_ffn_norms.append(
-                EquivariantRMSNorm(
-                    self.node_lmax,
-                    self.channels,
-                    n_focus=1,
-                    precision=self.compute_precision,
-                    trainable=self.trainable,
+
+            if self.ffn_pre_norm:
+                pre_ffn_norms.append(
+                    EquivariantRMSNorm(
+                        self.node_lmax,
+                        self.channels,
+                        n_focus=1,
+                        precision=self.compute_precision,
+                        trainable=trainable,
+                    )
                 )
-                if self.ffn_pre_norm
-                else None
-            )
-            post_ffn_norms.append(
-                EquivariantRMSNorm(
-                    self.node_lmax,
-                    self.channels,
-                    n_focus=1,
-                    precision=self.compute_precision,
-                    trainable=self.trainable,
+            else:
+                pre_ffn_norms.append(Identity())
+
+            if self.ffn_post_norm:
+                post_ffn_norms.append(
+                    EquivariantRMSNorm(
+                        self.node_lmax,
+                        self.channels,
+                        n_focus=1,
+                        precision=self.compute_precision,
+                        trainable=trainable,
+                    )
                 )
-                if self.ffn_post_norm
-                else None
-            )
+            else:
+                post_ffn_norms.append(Identity())
+
             ffns.append(
                 EquivariantFFN(
                     lmax=self.node_lmax,
                     channels=self.channels,
-                    hidden_channels=self.ffn_neurons,
+                    hidden_channels=ffn_neurons,
                     kmax=self.kmax,
                     grid_mlp=self.ffn_grid_mlp,
                     grid_branch=self.ffn_grid_branch,
+                    precision=precision,
                     s2_activation=self.ffn_s2_activation,
                     ffn_so3_grid=self.ffn_so3_grid,
                     lebedev_quadrature=self.ffn_lebedev_quadrature,
                     activation_function=self.ffn_activation_function,
                     glu_activation=self.ffn_glu_activation,
                     mlp_bias=self.mlp_bias,
-                    precision=self.precision,
-                    trainable=self.trainable,
+                    trainable=trainable,
                     seed=seed_ffn_i,
                 )
             )
+
         self.pre_ffn_norms = pre_ffn_norms
         self.post_ffn_norms = post_ffn_norms
         self.ffns = ffns
 
+        # Optional per-channel LayerScale on each FFN residual branch
+        if self.layer_scale:
+            self.adam_ffn_layer_scales = [
+                np.ones((self.channels,), dtype=PRECISION_DICT[self.precision]) * 1e-3
+                for _ in range(self.ffn_blocks)
+            ]
+        else:
+            self.adam_ffn_layer_scales = None
+
+        # === Step 3. Optional full attention residuals for block inputs ===
+        if self.use_full_attn_res:
+            self.full_attn_res_so2: DepthAttnRes | None = DepthAttnRes(
+                channels=self.channels,
+                input_dependent=self.full_attn_res_mode == "dependent",
+                eps=self.eps,
+                bias=self.mlp_bias,
+                precision=self.compute_precision,
+                trainable=trainable,
+                seed=child_seed(seed_full_attn, 0),
+            )
+            self.full_attn_res_ffns: list | None = [
+                DepthAttnRes(
+                    channels=self.channels,
+                    input_dependent=self.full_attn_res_mode == "dependent",
+                    eps=self.eps,
+                    bias=self.mlp_bias,
+                    precision=self.compute_precision,
+                    trainable=trainable,
+                    seed=child_seed(seed_full_attn, i + 1),
+                )
+                for i in range(self.ffn_blocks)
+            ]
+            self.block_attn_res_so2 = None
+            self.block_attn_res_ffns = None
+            self._forward_impl = self._forward_with_full_attn_res
+        elif self.use_block_attn_res:
+            self.full_attn_res_so2 = None
+            self.full_attn_res_ffns = None
+            self.block_attn_res_so2: DepthAttnRes | None = DepthAttnRes(
+                channels=self.channels,
+                input_dependent=self.block_attn_res_mode == "dependent",
+                eps=self.eps,
+                bias=self.mlp_bias,
+                precision=self.compute_precision,
+                trainable=trainable,
+                seed=child_seed(seed_block_attn, 0),
+            )
+            self.block_attn_res_ffns: list | None = [
+                DepthAttnRes(
+                    channels=self.channels,
+                    input_dependent=self.block_attn_res_mode == "dependent",
+                    eps=self.eps,
+                    bias=self.mlp_bias,
+                    precision=self.compute_precision,
+                    trainable=trainable,
+                    seed=child_seed(seed_block_attn, i + 1),
+                )
+                for i in range(self.ffn_blocks)
+            ]
+            self._forward_impl = self._forward_with_block_attn_res
+        else:
+            self.full_attn_res_so2 = None
+            self.full_attn_res_ffns = None
+            self.block_attn_res_so2 = None
+            self.block_attn_res_ffns = None
+            self._forward_impl = self._forward_with_residual_shortcuts
+
+        self.trainable = bool(trainable)
+
+    def call(
+        self,
+        x: Array,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: Array,
+        unit_history: list[Array] | None = None,
+        comm_dict: dict[str, Array] | None = None,
+    ) -> tuple[
+        Array,
+        Array | None,
+        Array | None,
+        list[Array] | None,
+    ]:
+        """
+        Parameters
+        ----------
+        x
+            Features with shape `(N, D, 1, C)`.
+        edge_cache
+            Edge cache.
+        radial_feat
+            Per-edge radial features with shape (E, lmax+1, C).
+        unit_history
+            Optional truncated depth history in canonical node layout. When
+            `full_attn_res != "none"`, it is interpreted as completed unit
+            history. When `block_attn_res != "none"`, it is interpreted as
+            completed block history.
+        comm_dict
+            Border-exchange tensors for parallel (LAMMPS multi-rank) inference.
+            When provided, the SO(2) convolution input has its ghost rows
+            refreshed from owner ranks; the depth-attention history may carry
+            stale ghost rows because the exchange happens at the convolution
+            input, after the (per-node) aggregation that consumes it.
+
+        Returns
+        -------
+        tuple[Array, Array | None, Array | None, list[Array] | None]
+            Tuple `(block_output, block_summary, so2_unit_output, ffn_unit_outputs)`
+            in canonical node layout. `block_output` is always returned.
+            Auxiliary outputs are mode-dependent and may be `None` when the
+            current caller does not need them:
+
+            - baseline path returns `(block_output, None, None, None)`
+            - full AttnRes path returns `(block_output, None, so2_unit_output, ffn_unit_outputs)`
+            - block AttnRes path returns `(block_output, block_summary, None, None)`
+        """
+        return self._forward_impl(x, edge_cache, radial_feat, unit_history, comm_dict)
+
+    def _extract_l0_from_canonical(self, value: Array) -> Array:
+        """
+        Extract scalar channels from canonical node layout.
+
+        Parameters
+        ----------
+        value
+            Canonical node features with shape `(N, D, 1, C)`.
+
+        Returns
+        -------
+        Array
+            Scalar channels with shape (N, channels).
+        """
+        xp = array_api_compat.array_namespace(value)
+        return xp.reshape(value[:, 0, :, :], (value.shape[0], self.channels))
+
     def _run_so2_unit(
         self,
-        x: Any,
-        edge_cache: EdgeCache,
-        radial_feat: Any,
-    ) -> Any:
+        x: Array,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: Array,
+        comm_dict: dict[str, Array] | None = None,
+    ) -> Array:
         """
         Run the SO(2) unit without an outer block-level residual shortcut.
 
@@ -377,40 +707,52 @@ def _run_so2_unit(
         x
             Canonical node features with shape `(N, D, 1, C)`.
         edge_cache
-            Edge cache (padded layout; see ``edge_cache.EdgeCache``).
+            Edge cache.
         radial_feat
             Per-edge radial features with shape (E, lmax+1, C).
+        comm_dict
+            Border-exchange tensors for parallel inference. When provided, the
+            convolution input's ghost rows are refreshed from owner ranks
+            immediately before the only cross-node operation in the block, so
+            owned destinations gather up-to-date neighbour features.
 
         Returns
         -------
         Array
             SO(2) unit output with shape `(N, D, 1, C)`.
         """
+        if comm_dict is not None:
+            x = exchange_ghost_features(x, comm_dict)
+        return self._run_so2_unit_impl(x, edge_cache, radial_feat)
+
+    def _run_so2_unit_impl(
+        self,
+        x: Array,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: Array,
+    ) -> Array:
+        """Run the SO(2) unit implementation."""
         xp = array_api_compat.array_namespace(x)
         n_node = x.shape[0]
         channels = self.channels
         use_full_node = self.node_lmax == self.lmax
         x_so2 = x if use_full_node else x[:, : self.mp_ebed_dim, :, :]
-        x_pre = x_so2 if self.pre_so2_norm is None else self.pre_so2_norm(x_so2)
+        x_pre = self.pre_so2_norm(x_so2)
         so2_unit_output = self.so2_conv(
             xp.reshape(x_pre, (n_node, x_so2.shape[1], channels)),
             edge_cache,
             radial_feat,
         )
-        so2_unit_output = so2_unit_output[:, :, None, :]
-        if self.post_so2_norm is not None:
-            so2_unit_output = self.post_so2_norm(so2_unit_output)
+        so2_unit_output = self.post_so2_norm(so2_unit_output[:, :, None, :])
         if use_full_node:
             return so2_unit_output
-        # zero-pad the degrees above lmax (pt writes into x.new_zeros)
-        pad = xp.zeros(
-            (n_node, self.node_ebed_dim - self.mp_ebed_dim, 1, channels),
-            dtype=x.dtype,
-            device=array_api_compat.device(x),
+        output = xp.zeros(x.shape, dtype=x.dtype, device=array_api_compat.device(x))
+        output = xp.concat(
+            [so2_unit_output, output[:, self.mp_ebed_dim :, :, :]], axis=1
         )
-        return xp.concat([so2_unit_output, pad], axis=1)
+        return output
 
-    def _run_ffn_unit(self, x: Any, unit_idx: int) -> Any:
+    def _run_ffn_unit(self, x: Array, unit_idx: int) -> Array:
         """
         Run one FFN subblock without the outer unit-level residual shortcut.
 
@@ -426,111 +768,291 @@ def _run_ffn_unit(self, x: Any, unit_idx: int) -> Any:
         Array
             FFN unit output with shape `(N, D, 1, C)`.
         """
-        pre_norm = self.pre_ffn_norms[unit_idx]
-        post_norm = self.post_ffn_norms[unit_idx]
-        x_pre = x if pre_norm is None else pre_norm(x)
-        y = self.ffns[unit_idx](x_pre)
-        if post_norm is not None:
-            y = post_norm(y)
+        return self._run_ffn_unit_impl(x, unit_idx)
+
+    def _run_ffn_unit_impl(self, x: Array, unit_idx: int) -> Array:
+        """Run one FFN subblock implementation."""
+        xp = array_api_compat.array_namespace(x)
+        n_node = x.shape[0]
+        ebed_dim = x.shape[1]
+        channels = self.channels
+        x_ffn = xp.reshape(x, (n_node, ebed_dim, 1, channels))  # (N, D, 1, C)
+        x_pre = self.pre_ffn_norms[unit_idx](x_ffn)
+        y: Array = self.ffns[unit_idx](x_pre)
+        y = self.post_ffn_norms[unit_idx](y)
+        if self.layer_scale:
+            device = array_api_compat.device(x)
+            y = y * xp_asarray_nodetach(
+                xp, self.adam_ffn_layer_scales[unit_idx][...], device=device
+            )
         return y
 
-    def call(
+    def _forward_with_residual_shortcuts(
         self,
-        x: Any,
-        edge_cache: EdgeCache,
-        radial_feat: Any,
-        unit_history: list[Any] | None = None,
-    ) -> tuple[Any, None, None, None]:
+        x: Array,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: Array,
+        unit_history: list[Array] | None = None,
+        comm_dict: dict[str, Array] | None = None,
+    ) -> tuple[
+        Array,
+        Array | None,
+        Array | None,
+        list[Array] | None,
+    ]:
         """
-        Run the residual-connected block path (pt baseline path).
+        Run the original residual-connected block path.
 
         Parameters
         ----------
         x
-            Features with shape `(N, D, 1, C)`.
+            Canonical node features with shape `(N, D, 1, C)`.
         edge_cache
-            Edge cache (padded layout).
+            Edge cache.
         radial_feat
             Per-edge radial features with shape (E, lmax+1, C).
         unit_history
-            Unused in the residual-connected path (the pt AttnRes paths that
-            consume it are not ported).
+            Unused in the residual-connected path.
+        comm_dict
+            Border-exchange tensors for parallel inference, forwarded to the
+            SO(2) unit. The owned-atom residual reads the original ``x``, which
+            is already correct on owned rows.
 
         Returns
         -------
-        tuple[Array, None, None, None]
-            Tuple `(block_output, None, None, None)` matching the pt
-            baseline-path return convention.
+        tuple[Array, Array | None, Array | None, list[Array] | None]
+            Tuple `(block_output, None, None, None)`.
         """
-        so2_unit_output = self._run_so2_unit(x, edge_cache, radial_feat)
-        ffn_state = x + so2_unit_output
+        so2_unit_output = self._run_so2_unit(x, edge_cache, radial_feat, comm_dict)
+        so2_state = x + so2_unit_output
+
+        ffn_state = so2_state
         for i in range(self.ffn_blocks):
-            ffn_state = ffn_state + self._run_ffn_unit(ffn_state, i)
-        return ffn_state, None, None, None
-
-    def _sub_modules(self) -> list[tuple[str, NativeOP | None]]:
-        """Sub-modules with their pt module names (None = pt nn.Identity)."""
-        subs: list[tuple[str, NativeOP | None]] = [
-            ("pre_so2_norm", self.pre_so2_norm),
-            ("post_so2_norm", self.post_so2_norm),
-            ("so2_conv", self.so2_conv),
-        ]
+            ffn_unit_output = self._run_ffn_unit(ffn_state, i)
+            ffn_state = ffn_state + ffn_unit_output
+
+        block_output = ffn_state
+        return block_output, None, None, None
+
+    def _forward_with_full_attn_res(
+        self,
+        x: Array,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: Array,
+        unit_history: list[Array] | None = None,
+        comm_dict: dict[str, Array] | None = None,
+    ) -> tuple[
+        Array,
+        Array | None,
+        Array | None,
+        list[Array] | None,
+    ]:
+        """
+        Run the block with full attention residuals over unit history.
+
+        Parameters
+        ----------
+        x
+            Current block input with shape `(N, D, 1, C)`.
+        edge_cache
+            Edge cache.
+        radial_feat
+            Per-edge radial features with shape (E, lmax+1, C).
+        unit_history
+            Truncated history in canonical node layout. Each source has shape
+            `(N, D, 1, C)`.
+        comm_dict
+            Border-exchange tensors for parallel inference, forwarded to the
+            SO(2) unit. The attention-residual aggregation is per-node, so the
+            ghost exchange at the convolution input restores ghost correctness
+            even when the history sources carry stale ghost rows.
+
+        Returns
+        -------
+        tuple[Array, Array | None, Array | None, list[Array] | None]
+            Tuple `(block_output, None, so2_unit_output, ffn_unit_outputs)`.
+        """
+        so2_input = self.full_attn_res_so2(
+            sources=unit_history,
+            scalar_extractor=self._extract_l0_from_canonical,
+            current_x=x,
+        )
+        so2_unit_output = self._run_so2_unit(
+            so2_input, edge_cache, radial_feat, comm_dict
+        )
+
+        completed_units = [*unit_history, so2_unit_output]
+        current_x = so2_unit_output
+        ffn_unit_outputs: list[Array] = []
         for i in range(self.ffn_blocks):
-            subs.append((f"pre_ffn_norms.{i}", self.pre_ffn_norms[i]))
-            subs.append((f"post_ffn_norms.{i}", self.post_ffn_norms[i]))
-            subs.append((f"ffns.{i}", self.ffns[i]))
-        return subs
-
-    def _variables(self) -> dict[str, Any]:
-        """Variables keyed by the pt ``state_dict`` key names."""
-        variables: dict[str, Any] = {}
-        for prefix, sub in self._sub_modules():
-            if sub is None:
-                continue
-            if isinstance(sub, SO2Convolution):
-                sub_vars = sub._variables()
-            else:
-                sub_vars = sub.serialize()["@variables"]
-            for key, value in sub_vars.items():
-                variables[f"{prefix}.{key}"] = value
+            ffn_input: Array = self.full_attn_res_ffns[i](
+                sources=completed_units,
+                scalar_extractor=self._extract_l0_from_canonical,
+                current_x=current_x,
+            )
+            ffn_unit_output = self._run_ffn_unit(ffn_input, i)
+            ffn_unit_outputs.append(ffn_unit_output)
+            completed_units.append(ffn_unit_output)
+            current_x = ffn_unit_output
+
+        block_output = current_x
+        return block_output, None, so2_unit_output, ffn_unit_outputs
+
+    def _forward_with_block_attn_res(
+        self,
+        x: Array,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: Array,
+        unit_history: list[Array] | None = None,
+        comm_dict: dict[str, Array] | None = None,
+    ) -> tuple[
+        Array,
+        Array | None,
+        Array | None,
+        list[Array] | None,
+    ]:
+        """
+        Run the block with block attention residuals over block history.
+
+        Parameters
+        ----------
+        x
+            Current block input with shape `(N, D, 1, C)`.
+        edge_cache
+            Edge cache.
+        radial_feat
+            Per-edge radial features with shape (E, lmax+1, C).
+        unit_history
+            Truncated block history in canonical node layout. Each source has shape
+            `(N, D, 1, C)`.
+        comm_dict
+            Border-exchange tensors for parallel inference, forwarded to the
+            SO(2) unit. The attention-residual aggregation is per-node, so the
+            ghost exchange at the convolution input restores ghost correctness
+            even when the history sources carry stale ghost rows.
+
+        Returns
+        -------
+        tuple[Array, Array | None, Array | None, list[Array] | None]
+            Tuple `(block_output, block_summary, None, None)`.
+        """
+        so2_input = self.block_attn_res_so2(
+            sources=unit_history,
+            scalar_extractor=self._extract_l0_from_canonical,
+            current_x=x,
+        )
+        so2_unit_output = self._run_so2_unit(
+            so2_input, edge_cache, radial_feat, comm_dict
+        )
+
+        partial_block = so2_unit_output
+        current_x = so2_unit_output
+        for i in range(self.ffn_blocks):
+            ffn_input: Array = self.block_attn_res_ffns[i](
+                sources=[*unit_history, partial_block],
+                scalar_extractor=self._extract_l0_from_canonical,
+                current_x=current_x,
+            )
+            ffn_unit_output = self._run_ffn_unit(ffn_input, i)
+            partial_block = partial_block + ffn_unit_output
+            current_x = ffn_unit_output
+
+        block_output = current_x
+        block_summary = partial_block
+        return block_output, block_summary, None, None
+
+    def _variables(self) -> dict[str, np.ndarray]:
+        variables: dict[str, np.ndarray] = {}
+        if self.pre_so2_norm is not None:
+            pre_so2_vars = self.pre_so2_norm.serialize().get("@variables", {})
+            for key, value in pre_so2_vars.items():
+                variables[f"pre_so2_norm.{key}"] = value
+        if self.post_so2_norm is not None:
+            post_so2_vars = self.post_so2_norm.serialize().get("@variables", {})
+            for key, value in post_so2_vars.items():
+                variables[f"post_so2_norm.{key}"] = value
+        for key, value in self.so2_conv.serialize()["@variables"].items():
+            variables[f"so2_conv.{key}"] = value
+        for i, ffn in enumerate(self.ffns):
+            for key, value in ffn.serialize()["@variables"].items():
+                variables[f"ffns.{i}.{key}"] = value
+        for i, norm in enumerate(self.pre_ffn_norms):
+            if norm is not None:
+                for key, value in norm.serialize().get("@variables", {}).items():
+                    variables[f"pre_ffn_norms.{i}.{key}"] = value
+        for i, norm in enumerate(self.post_ffn_norms):
+            if norm is not None:
+                for key, value in norm.serialize().get("@variables", {}).items():
+                    variables[f"post_ffn_norms.{i}.{key}"] = value
+        if self.adam_ffn_layer_scales is not None:
+            for i, scale in enumerate(self.adam_ffn_layer_scales):
+                variables[f"adam_ffn_layer_scales.{i}"] = to_numpy_array(scale)
+        if self.full_attn_res_so2 is not None:
+            for key, value in self.full_attn_res_so2.serialize()["@variables"].items():
+                variables[f"full_attn_res_so2.{key}"] = value
+        if self.full_attn_res_ffns is not None:
+            for i, attn in enumerate(self.full_attn_res_ffns):
+                for key, value in attn.serialize()["@variables"].items():
+                    variables[f"full_attn_res_ffns.{i}.{key}"] = value
+        if self.block_attn_res_so2 is not None:
+            for key, value in self.block_attn_res_so2.serialize()["@variables"].items():
+                variables[f"block_attn_res_so2.{key}"] = value
+        if self.block_attn_res_ffns is not None:
+            for i, attn in enumerate(self.block_attn_res_ffns):
+                for key, value in attn.serialize()["@variables"].items():
+                    variables[f"block_attn_res_ffns.{i}.{key}"] = value
         return variables
 
-    def _load_variables(self, variables: dict[str, Any]) -> None:
-        """Load variables keyed by the pt ``state_dict`` key names."""
-        variables = dict(variables)
-        for name, sub in self._sub_modules():
-            if sub is None:
-                continue
-            full = f"{name}."
-            sv = {
-                key[len(full) :]: value
+    def _load_variables(self, variables: dict[str, np.ndarray]) -> None:
+        def load(module: NativeOP, prefix: str) -> NativeOP:
+            data = module.serialize()
+            data["@variables"] = {
+                key[len(prefix) :]: value
                 for key, value in variables.items()
-                if key.startswith(full)
+                if key.startswith(prefix)
             }
-            for key in list(variables):
-                if key.startswith(full):
-                    del variables[key]
-            if not sv:
-                raise KeyError(f"Missing variables with prefix: {full}")
-            if isinstance(sub, SO2Convolution):
-                sub._load_variables(sv)
-            elif isinstance(sub, EquivariantFFN):
-                sub._load_variables(sv)
-            else:
-                # norms: rebuild through the shape-checking deserialize
-                data = sub.serialize()
-                data["@variables"] = sv
-                new_sub = type(sub).deserialize(data)
-                attr, _, idx = name.partition(".")
-                if idx:
-                    getattr(self, attr)[int(idx)] = new_sub
-                else:
-                    setattr(self, attr, new_sub)
-        if variables:
-            raise KeyError(f"Unknown variables: {sorted(variables)}")
+            return type(module).deserialize(data)
+
+        if self.pre_so2_norm is not None:
+            self.pre_so2_norm = load(self.pre_so2_norm, "pre_so2_norm.")
+        if self.post_so2_norm is not None:
+            self.post_so2_norm = load(self.post_so2_norm, "post_so2_norm.")
+        self.so2_conv = load(self.so2_conv, "so2_conv.")
+        self.ffns = [load(ffn, f"ffns.{i}.") for i, ffn in enumerate(self.ffns)]
+        self.pre_ffn_norms = [
+            load(norm, f"pre_ffn_norms.{i}.") if norm is not None else None
+            for i, norm in enumerate(self.pre_ffn_norms)
+        ]
+        self.post_ffn_norms = [
+            load(norm, f"post_ffn_norms.{i}.") if norm is not None else None
+            for i, norm in enumerate(self.post_ffn_norms)
+        ]
+        if self.adam_ffn_layer_scales is not None:
+            self.adam_ffn_layer_scales = [
+                np.asarray(
+                    variables[f"adam_ffn_layer_scales.{i}"],
+                    dtype=PRECISION_DICT[self.precision],
+                )
+                for i in range(len(self.adam_ffn_layer_scales))
+            ]
+        if self.full_attn_res_so2 is not None:
+            self.full_attn_res_so2 = load(self.full_attn_res_so2, "full_attn_res_so2.")
+        if self.full_attn_res_ffns is not None:
+            self.full_attn_res_ffns = [
+                load(attn, f"full_attn_res_ffns.{i}.")
+                for i, attn in enumerate(self.full_attn_res_ffns)
+            ]
+        if self.block_attn_res_so2 is not None:
+            self.block_attn_res_so2 = load(
+                self.block_attn_res_so2, "block_attn_res_so2."
+            )
+        if self.block_attn_res_ffns is not None:
+            self.block_attn_res_ffns = [
+                load(attn, f"block_attn_res_ffns.{i}.")
+                for i, attn in enumerate(self.block_attn_res_ffns)
+            ]
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the SeZMInteractionBlock to a dict (pt-compatible format)."""
         return {
             "@class": "SeZMInteractionBlock",
             "@version": 1,
@@ -544,10 +1066,12 @@ def serialize(self) -> dict[str, Any]:
                 "focus_dim": self.focus_dim,
                 "focus_compete": self.focus_compete,
                 "so2_norm": self.so2_norm,
-                "so2_layers": self.so2_layers,
+                "mixing_layers": self.mixing_layers,
                 "so2_attn_res": self.so2_attn_res_mode,
                 "radial_so2_mode": self.radial_so2_mode,
                 "radial_so2_rank": self.radial_so2_rank,
+                "edge_cartesian": self.edge_cartesian,
+                "node_cartesian": self.node_cartesian,
                 "n_atten_head": self.n_atten_head,
                 "atten_f_mix": self.atten_f_mix,
                 "atten_v_proj": self.use_atten_v_proj,
@@ -590,16 +1114,14 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> SeZMInteractionBlock:
-        """Deserialize a SeZMInteractionBlock from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "SeZMInteractionBlock":
             raise ValueError(f"Invalid class for SeZMInteractionBlock: {data_cls}")
         version = int(data.pop("@version"))
         check_version_compatibility(version, 1, 1)
-        config = dict(data.pop("config"))
+        config = data.pop("config")
         variables = data.pop("@variables")
-        config["precision"] = str(config.pop("precision"))
         obj = cls(**config)
         obj._load_variables(variables)
         return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/cartesian.py b/deepmd/dpmodel/descriptor/dpa4_nn/cartesian.py
new file mode 100644
index 0000000000..4b666c6d25
--- /dev/null
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/cartesian.py
@@ -0,0 +1,762 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""
+Cartesian rank-2 tensor-product mixers for DPA4/SeZM.
+
+For message-passing degree ``lmax <= 2`` the per-channel spherical-harmonic
+feature ``(l = 0, 1, 2)`` is isomorphic to a rank-2 Cartesian tensor (a ``3x3``
+matrix) that decomposes into a scalar (the trace), a vector (the antisymmetric
+part), and a symmetric-traceless tensor. A matrix product of two such tensors
+mixes these irreducible components while staying SO(3)-equivariant in the global
+frame, because ``(R X R^T)(R Y R^T) = R (X Y) R^T`` for any rotation ``R``. This
+replaces the rotate-to-local / ``SO2Linear`` stack / rotate-back core of
+:class:`SO2Convolution` without constructing any Wigner-D rotation.
+
+Two placements share the same scaffold (a per-degree channel linear, a gated
+nonlinearity, and a residual stack) but differ in the right operand of the
+``3x3`` product:
+
+* :class:`EdgeCartesianTensorProduct` runs per edge, before aggregation. The
+  right operand is the edge tensor
+  ``T_e = f_iso I + f_aniso A(r_hat) + f_sym S(r_hat)``, whose per-degree radial
+  weights ``f_*`` carry the edge condition. Because ``T_e`` depends only on the
+  edge direction it is shared across channels, so the product is evaluated
+  through channel-shared packed operators (below) without materializing any
+  ``3x3`` matrix per channel. With ``n_layers = 0`` the message is the single
+  modulation ``x @ T_e`` (no learnable channel-mixing layers); ``n_layers > 0``
+  refines it with the residual stack.
+* :class:`NodeCartesianTensorProduct` runs per node, after aggregation. It
+  couples the aggregated message with the destination node feature through the
+  product of ``linear(message)`` with ``node`` lifted by the orthonormal basis,
+  serving as the Cartesian counterpart of the ``message_node`` grid product.
+  Both operands are per-channel, so the product is the literal ``3x3`` form. The
+  one-sided product ``linear(message) @ node`` is SO(3)-equivariant; the
+  symmetrized product ``linear(message) @ node + node @ linear(message)``
+  additionally preserves the parity of each irreducible component.
+
+Placing the product per node makes its cost scale with the number of nodes
+rather than the number of edges, which is the regime where the Cartesian form is
+cheaper than the per-edge SO(2) rotation.
+
+Channel-shared edge evaluation
+------------------------------
+A literal ``to_cart -> Y @ T_e -> from_cart`` round trip materializes a ``3x3``
+matrix for every (edge, channel) pair and runs both basis changes once per
+layer, which is memory-bandwidth and kernel-launch bound. Instead, for a fixed
+edge the map ``y -> from_cart(to_cart(y) @ T_e)`` is linear in the packed
+coefficient ``y`` and splits, by linearity of ``T_e``, into
+
+    m = (f_iso / sqrt(3)) y + f_aniso (K_A y) + f_sym (K_S y),
+
+where ``K_A`` and ``K_S`` are ``(D, D)`` packed-basis operators for
+"right-multiply by ``A(r_hat)`` / ``S(r_hat)``". They depend only on the edge
+direction, hence are shared across channels: building them once per edge turns
+the per-layer geometry into a single channel-batched ``bmm(K, y)`` instead of
+two per-channel basis changes plus a per-channel ``3x3`` product. The identity
+component collapses to a scalar rescaling because the basis is orthonormal
+(``<B_p, B_d> = delta_{pd}``).
+
+With ``B`` the orthonormal packed-to-Cartesian basis, the projection from an
+edge component ``G`` to its packed right-multiply operator is
+``K_G[p, d] = sum_{k,j} W[p, d, k, j] G[k, j]`` with the fixed tensor
+``W[p, d, k, j] = sum_i B[p, i, j] B[d, i, k]``. The per-degree overall scale of
+``B`` is arbitrary (absorbed by the learnable layers), so it is chosen
+orthonormal for an exact, transpose-free round trip.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.cartesian``.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import math
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+import array_api_compat
+import numpy as np
+
+from deepmd.dpmodel import (
+    DEFAULT_PRECISION,
+    PRECISION_DICT,
+    NativeOP,
+)
+from deepmd.dpmodel.array_api import (
+    xp_asarray_nodetach,
+)
+from deepmd.dpmodel.utils.network import (
+    Identity,
+)
+from deepmd.dpmodel.utils.seed import (
+    child_seed,
+)
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
+
+from .activation import (
+    GatedActivation,
+)
+from .indexing import (
+    get_so3_dim_of_lmax,
+)
+from .so3 import (
+    SO3Linear,
+)
+from .utils import (
+    get_promoted_dtype,
+    safe_norm,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+    )
+
+
+def build_cartesian_basis(
+    lmax: int,
+    *,
+    dtype: Any,
+) -> np.ndarray:
+    """
+    Build the orthonormal ``3x3`` basis aligned with the SeZM packed (l, m) layout.
+
+    Entry ``basis[d]`` is the Cartesian image of the d-th packed
+    spherical-harmonic coefficient, ordered ``l = 0``, ``l = 1`` (``m = -1, 0,
+    +1``), ``l = 2`` (``m = -2 .. +2``). The basis is orthonormal under the
+    Frobenius inner product; the inverse map reuses the same basis and the
+    coefficient round trip is exact.
+
+    The convention-critical part is the within-degree sign and ordering: it
+    matches the SeZM ``WignerDCalculator`` (``l = 1`` follows its
+    ``l1_perm``/``l1_sign``), which makes the basis intertwine the packed
+    Wigner-D rotation with the Cartesian rotation ``X -> R X R^T``. The
+    per-degree overall scale is free and absorbed by the learnable layers.
+
+    Parameters
+    ----------
+    lmax : int
+        Message-passing degree, must be 1 or 2.
+    dtype : np.dtype
+        Output dtype.
+
+    Returns
+    -------
+    np.ndarray
+        Basis with shape ``(D, 3, 3)`` where ``D = (lmax + 1) ** 2``.
+
+    Raises
+    ------
+    ValueError
+        If ``lmax`` is not 1 or 2.
+    """
+    if lmax not in (1, 2):
+        raise ValueError("Cartesian tensor product requires lmax in {1, 2}")
+    a = 1.0 / math.sqrt(2.0)
+    b = 1.0 / math.sqrt(3.0)
+    c = 1.0 / math.sqrt(6.0)
+    matrices: list[list[list[float]]] = [
+        # l = 0 : isotropic (trace)
+        [[b, 0.0, 0.0], [0.0, b, 0.0], [0.0, 0.0, b]],
+        # l = 1 : antisymmetric, m = -1, 0, +1
+        [[0.0, 0.0, -a], [0.0, 0.0, 0.0], [a, 0.0, 0.0]],
+        [[0.0, a, 0.0], [-a, 0.0, 0.0], [0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 0.0], [0.0, 0.0, -a], [0.0, a, 0.0]],
+    ]
+    if lmax == 2:
+        matrices += [
+            # l = 2 : symmetric traceless, m = -2 .. +2
+            [[0.0, -a, 0.0], [-a, 0.0, 0.0], [0.0, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0], [0.0, 0.0, a], [0.0, a, 0.0]],
+            [[-c, 0.0, 0.0], [0.0, -c, 0.0], [0.0, 0.0, 2.0 * c]],
+            [[0.0, 0.0, -a], [0.0, 0.0, 0.0], [-a, 0.0, 0.0]],
+            [[a, 0.0, 0.0], [0.0, -a, 0.0], [0.0, 0.0, 0.0]],
+        ]
+    return np.array(matrices, dtype=dtype)
+
+
+def build_edge_cartesian_tensors(
+    r_hat: Any,
+) -> tuple[Any, Any]:
+    """
+    Build the antisymmetric and symmetric-traceless edge tensors from unit vectors.
+
+    Parameters
+    ----------
+    r_hat : Array
+        Unit edge vectors with shape ``(E, 3)``.
+
+    Returns
+    -------
+    tuple[Array, Array]
+        Tuple containing (A0, S0), each with shape (E, 3, 3).
+        - A0: The antisymmetric (l=1, vector) part, computed as skew(r_hat).
+        - S0: The symmetric traceless (l=2, tensor) part, given by r_hat r_hat^T minus the identity matrix divided by 3.
+        Both are 3x3 matrices which transform via matrix conjugation (M -> R M R^T) under rotation of r_hat, but occupy different irreducible SO(3) subspaces (l=1 for A0, l=2 for S0).
+    """
+    xp = array_api_compat.array_namespace(r_hat)
+    rx, ry, rz = r_hat[:, 0], r_hat[:, 1], r_hat[:, 2]
+    zero = xp.zeros_like(rx)
+    a0 = xp.stack(
+        [
+            xp.stack([zero, -rz, ry], axis=-1),
+            xp.stack([rz, zero, -rx], axis=-1),
+            xp.stack([-ry, rx, zero], axis=-1),
+        ],
+        axis=-2,
+    )  # (E, 3, 3)
+    eye = xp.eye(3, dtype=r_hat.dtype, device=array_api_compat.device(r_hat))
+    s0 = r_hat[..., None] * r_hat[..., None, :] - eye / 3.0  # (E, 3, 3)
+    return a0, s0
+
+
+class _CartesianTensorProduct(NativeOP):
+    """
+    Shared scaffold for the Cartesian rank-2 tensor-product mixers.
+
+    Holds the per-degree channel linears, the gated nonlinearities, and the
+    residual layer loop. Subclasses register the geometry buffer they need and
+    define ``forward``; the only per-layer difference is the equivariant ``3x3``
+    product supplied to :meth:`_run_layers`.
+
+    Parameters
+    ----------
+    lmax : int
+        Message-passing degree, must be 1 or 2.
+    focus_dim : int
+        Channel width per focus stream.
+    n_focus : int
+        Number of focus streams; the flattened channel width is
+        ``n_focus * focus_dim``.
+    n_layers : int
+        Number of stacked tensor-product layers.
+    activation_function : str
+        Activation function for the intermediate gated nonlinearities.
+    mlp_bias : bool
+        Whether the per-degree channel linear carries an ``l = 0`` bias.
+    precision : str
+        Parameter precision.
+    seed : int | list[int] | None
+        Base seed for deterministic initialization.
+    trainable : bool
+        Whether parameters are trainable.
+
+    Raises
+    ------
+    ValueError
+        If ``lmax`` is not 1 or 2, or ``n_layers`` is negative.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        focus_dim: int,
+        n_focus: int,
+        n_layers: int,
+        activation_function: str,
+        mlp_bias: bool,
+        precision: str = DEFAULT_PRECISION,
+        seed: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        if lmax not in (1, 2):
+            raise ValueError("`lmax` must be 1 or 2 for the Cartesian tensor product")
+        self.lmax = int(lmax)
+        self.focus_dim = int(focus_dim)
+        self.n_focus = int(n_focus)
+        self.n_layers = int(n_layers)
+        if self.n_layers < 0:
+            raise ValueError("`n_layers` must be >= 0")
+        self.ebed_dim = get_so3_dim_of_lmax(self.lmax)
+        self.c_wide = self.n_focus * self.focus_dim
+        self.precision = precision
+        self.compute_precision = str(
+            np.dtype(get_promoted_dtype(PRECISION_DICT[self.precision])).name
+        )
+        self.activation_function = str(activation_function)
+        self.mlp_bias = bool(mlp_bias)
+        self.trainable = bool(trainable)
+
+        # Separate seed namespaces so the linear and activation seeds never
+        # collide regardless of ``n_layers``.
+        seed_linears = child_seed(seed, 0)
+        seed_activations = child_seed(seed, 1)
+
+        # === Step 1. Per-degree channel linears (cross-degree mixing comes from
+        # the matrix product, not the linear) ===
+        self.linears = [
+            SO3Linear(
+                lmax=self.lmax,
+                in_channels=self.focus_dim,
+                out_channels=self.focus_dim,
+                n_focus=self.n_focus,
+                precision=self.precision,
+                mlp_bias=mlp_bias,
+                trainable=trainable,
+                seed=child_seed(seed_linears, i),
+            )
+            for i in range(self.n_layers)
+        ]
+
+        # === Step 2. Gated nonlinearities; the last layer stays linear to mirror
+        # the trailing identity of the SO(2) mixing stack ===
+        activations: list[NativeOP] = []
+        for i in range(self.n_layers):
+            if i < self.n_layers - 1:
+                activations.append(
+                    GatedActivation(
+                        lmax=self.lmax,
+                        channels=self.focus_dim,
+                        n_focus=self.n_focus,
+                        precision=self.compute_precision,
+                        activation_function=activation_function,
+                        mlp_bias=mlp_bias,
+                        layout="ndfc",
+                        trainable=trainable,
+                        seed=child_seed(seed_activations, i),
+                    )
+                )
+            else:
+                activations.append(Identity())
+        self.activations = activations
+
+    def _run_layers(
+        self,
+        h: Any,
+        apply_product: Callable[[Any], Any],
+    ) -> Any:
+        """
+        Run the residual tensor-product stack in packed ``(B, D, C_wide)`` layout.
+
+        Each layer mixes channels per degree (``linear``), forms the equivariant
+        ``3x3`` product (``apply_product``), and adds a gated-nonlinear residual.
+
+        Parameters
+        ----------
+        h : Array
+            Input features with shape ``(B, D, C_wide)``.
+        apply_product : Callable[[Array], Array]
+            Maps the per-degree channel-mixed feature ``y`` to the equivariant
+            product term, both in ``(B, D, C_wide)`` layout.
+
+        Returns
+        -------
+        Array
+            Mixed features with shape ``(B, D, C_wide)``.
+        """
+        xp = array_api_compat.array_namespace(h)
+        n = h.shape[0]
+        d, f, cf, cw = self.ebed_dim, self.n_focus, self.focus_dim, self.c_wide
+        for linear, activation in zip(self.linears, self.activations, strict=True):
+            y = xp.reshape(linear(xp.reshape(h, (n, d, f, cf))), (n, d, cw))
+            m = apply_product(y)
+            h = h + xp.reshape(activation(xp.reshape(m, (n, d, f, cf))), (n, d, cw))
+        return h
+
+    def _sub_modules(self) -> list[tuple[str, NativeOP]]:
+        """Sub-modules with their pt module names."""
+        subs: list[tuple[str, NativeOP]] = []
+        for i, linear in enumerate(self.linears):
+            subs.append((f"linears.{i}", linear))
+        for i, activation in enumerate(self.activations):
+            subs.append((f"activations.{i}", activation))
+        return subs
+
+    def _variables(self) -> dict[str, Any]:
+        """Variables keyed by the pt ``state_dict`` key names."""
+        variables: dict[str, Any] = {}
+        for prefix, sub in self._sub_modules():
+            for key, value in sub.serialize().get("@variables", {}).items():
+                variables[f"{prefix}.{key}"] = value
+        return variables
+
+    def _load_variables(self, variables: dict[str, Any]) -> None:
+        """Load variables keyed by the pt ``state_dict`` key names."""
+        for attr, sub in self._sub_modules():
+            full = f"{attr}."
+            sv = {
+                key[len(full) :]: value
+                for key, value in variables.items()
+                if key.startswith(full)
+            }
+            data = sub.serialize()
+            data["@variables"] = sv
+            list_name, _, idx = attr.partition(".")
+            getattr(self, list_name)[int(idx)] = type(sub).deserialize(data)
+
+
+class EdgeCartesianTensorProduct(_CartesianTensorProduct):
+    """
+    Edge-wise Cartesian rank-2 tensor-product mixer (SO(3)-equivariant).
+
+    Per edge, the source spherical-harmonic feature is mixed with the edge tensor
+    ``T_e = f_iso I + f_aniso A(r_hat) + f_sym S(r_hat)``, whose per-degree radial
+    weights ``f_*`` carry the edge condition. The product is evaluated through
+    channel-shared packed operators (see the module docstring) so no ``3x3``
+    matrix is materialized per channel. Stacking ``n_layers`` such products
+    supplies the cross-degree mixing that the local-frame ``SO2Linear`` provided,
+    but in the global frame and without any Wigner-D rotation.
+
+    Parameters
+    ----------
+    lmax : int
+        Message-passing degree, must be 1 or 2.
+    focus_dim : int
+        Channel width per focus stream.
+    n_focus : int
+        Number of focus streams; the flattened channel width is
+        ``n_focus * focus_dim``.
+    n_layers : int
+        Number of stacked tensor-product layers.
+    activation_function : str
+        Activation function for the intermediate gated nonlinearities.
+    mlp_bias : bool
+        Whether the per-degree channel linear carries an ``l = 0`` bias.
+    eps : float
+        Epsilon for the edge-vector normalization.
+    precision : str
+        Parameter precision.
+    seed : int | list[int] | None
+        Base seed for deterministic initialization.
+    trainable : bool
+        Whether parameters are trainable.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        focus_dim: int,
+        n_focus: int,
+        n_layers: int,
+        activation_function: str,
+        mlp_bias: bool,
+        eps: float,
+        precision: str = DEFAULT_PRECISION,
+        seed: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        super().__init__(
+            lmax=lmax,
+            focus_dim=focus_dim,
+            n_focus=n_focus,
+            n_layers=n_layers,
+            activation_function=activation_function,
+            mlp_bias=mlp_bias,
+            precision=precision,
+            seed=seed,
+            trainable=trainable,
+        )
+        self.eps = float(eps)
+
+        # Non-persistent: a deterministic constant rebuilt on construction, so it
+        # never enters the serialized state. The orthonormal basis ``B`` is
+        # contracted into the right-multiply projection
+        # ``W[p, d, k, j] = sum_i B[p, i, j] B[d, i, k]`` that maps an edge
+        # component to its channel-shared packed operator (see ``call``).
+        basis = build_cartesian_basis(
+            self.lmax, dtype=PRECISION_DICT[self.precision.lower()]
+        )
+        self.right_mult_proj = np.einsum("pij,dik->pdkj", basis, basis)
+
+    def call(
+        self,
+        x: Any,
+        edge_vec: Any,
+        rad_feat: Any,
+    ) -> Any:
+        """
+        Parameters
+        ----------
+        x : Array
+            Source node features in packed SO(3) layout with shape
+            ``(E, D, C_wide)``, where ``D = (lmax + 1) ** 2`` and
+            ``C_wide = n_focus * focus_dim``.
+        edge_vec : Array
+            Edge vectors with shape ``(E, 3)``, in Å.
+        rad_feat : Array
+            Per-degree radial weights with shape ``(E, lmax + 1, C_wide)``,
+            already projected to the hidden width.
+
+        Returns
+        -------
+        Array
+            Edge messages in packed SO(3) layout with shape ``(E, D, C_wide)``.
+        """
+        xp = array_api_compat.array_namespace(x)
+        device = array_api_compat.device(x)
+        d = self.ebed_dim
+        proj = xp.astype(
+            xp_asarray_nodetach(xp, self.right_mult_proj[...], device=device),
+            x.dtype,
+        )
+
+        # === Step 1. Channel-shared packed operators for the edge tensor ===
+        # A(r_hat) and S(r_hat) are rescaled to unit Frobenius norm (raw norms are
+        # sqrt(2) and sqrt(2/3)) so the per-degree radial weights modulate
+        # components of equal magnitude. Each is projected into a packed
+        # right-multiply operator of shape (E, D, D), shared by all channels; the
+        # identity component reduces to the scalar ``c_iso`` rescaling in Step 3.
+        r_hat = edge_vec / safe_norm(edge_vec, self.eps)  # (E, 3)
+        a0, s0 = build_edge_cartesian_tensors(xp.astype(r_hat, x.dtype))
+        a_hat = a0 / math.sqrt(2.0)
+        # einsum "pdkj,ekj->epd" as a flattened matmul contracting the Cartesian
+        # (k, j) entries against ``proj`` reshaped to ``(D * D, 3 * 3)``.
+        k_op = xp.reshape(
+            xp.matmul(
+                xp.reshape(a_hat, (a_hat.shape[0], -1)),
+                xp.permute_dims(xp.reshape(proj, (d * d, -1)), (1, 0)),
+            ),
+            (a_hat.shape[0], d, d),
+        )  # (E, D, D)
+        if self.lmax == 2:
+            s_hat = s0 / math.sqrt(2.0 / 3.0)
+            k_sym = xp.reshape(
+                xp.matmul(
+                    xp.reshape(s_hat, (s_hat.shape[0], -1)),
+                    xp.permute_dims(xp.reshape(proj, (d * d, -1)), (1, 0)),
+                ),
+                (s_hat.shape[0], d, d),
+            )  # (E, D, D)
+            # Stack so one batched matmul per layer covers both components.
+            k_op = xp.concat((k_op, k_sym), axis=1)  # (E, 2D, D)
+
+        # === Step 2. Per-degree radial weights, broadcast over the degree axis ===
+        c_iso = (rad_feat[:, 0, :] / math.sqrt(3.0))[:, None, :]  # (E, 1, C_wide)
+        c_aniso = rad_feat[:, 1, :][:, None, :]  # (E, 1, C_wide)
+        c_sym = (
+            rad_feat[:, 2, :][:, None, :] if self.lmax == 2 else None
+        )  # (E, 1, C_wide)
+
+        # === Step 3. Edge-tensor modulation, optionally refined by a mixing stack ===
+        def apply_product(y: Any) -> Any:
+            ky = xp.matmul(k_op, y)  # (E, lmax * D, C_wide)
+            m = c_iso * y + c_aniso * ky[:, :d, :]
+            if c_sym is not None:
+                m = m + c_sym * ky[:, d:, :]
+            return m
+
+        # ``n_layers == 0`` keeps only the edge-condition modulation ``x @ T_e``
+        # (radial scale + directional cross-degree coupling), with no learnable
+        # channel-mixing layers; ``n_layers > 0`` refines it with the residual
+        # stack of per-degree channel linears.
+        if self.n_layers == 0:
+            return apply_product(x)
+
+        return self._run_layers(x, apply_product)
+
+    def serialize(self) -> dict[str, Any]:
+        """Serialize the EdgeCartesianTensorProduct to a dict."""
+        return {
+            "@class": "EdgeCartesianTensorProduct",
+            "@version": 1,
+            "config": {
+                "lmax": self.lmax,
+                "focus_dim": self.focus_dim,
+                "n_focus": self.n_focus,
+                "n_layers": self.n_layers,
+                "activation_function": self.activation_function,
+                "mlp_bias": self.mlp_bias,
+                "eps": self.eps,
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+                "trainable": self.trainable,
+                "seed": None,
+            },
+            "@variables": self._variables(),
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> EdgeCartesianTensorProduct:
+        """Deserialize an EdgeCartesianTensorProduct from a dict."""
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "EdgeCartesianTensorProduct":
+            raise ValueError(
+                f"Invalid class for EdgeCartesianTensorProduct: {data_cls}"
+            )
+        version = int(data.pop("@version"))
+        check_version_compatibility(version, 1, 1)
+        config = dict(data.pop("config"))
+        variables = data.pop("@variables")
+        config["precision"] = str(config["precision"])
+        obj = cls(**config)
+        obj._load_variables(variables)
+        return obj
+
+
+class NodeCartesianTensorProduct(_CartesianTensorProduct):
+    """
+    Node-wise Cartesian rank-2 tensor-product mixer (SO(3)-equivariant).
+
+    Applied per node after aggregation, this couples the aggregated message with
+    the destination node feature, serving as the Cartesian counterpart of the
+    ``message_node`` grid product. The node feature is the fixed operator and the
+    message is the residual stream, so each layer forms the product of
+    ``linear(message)`` with ``node`` lifted by the orthonormal basis, then adds
+    a gated-nonlinear residual. There is no per-edge geometry, so the cost scales
+    with the number of nodes instead of the number of edges.
+
+    The ``symmetric`` flag selects the product form. The one-sided product
+    ``linear(message) @ node`` is SO(3)-equivariant and cheapest. The symmetrized
+    product ``linear(message) @ node + node @ linear(message)`` additionally
+    gives each irreducible component a definite parity under spatial inversion
+    (even scalar and symmetric-traceless parts, odd skew-symmetric part), which
+    the one-sided product mixes, at the cost of a second matrix product.
+
+    Parameters
+    ----------
+    lmax : int
+        Node degree, must be 1 or 2.
+    focus_dim : int
+        Channel width per focus stream.
+    n_focus : int
+        Number of focus streams; the flattened channel width is
+        ``n_focus * focus_dim``.
+    n_layers : int
+        Number of stacked tensor-product layers.
+    symmetric : bool
+        If True, use the parity-preserving symmetrized product ``Y N + N Y``;
+        if False, use the one-sided product ``Y N``.
+    activation_function : str
+        Activation function for the intermediate gated nonlinearities.
+    mlp_bias : bool
+        Whether the per-degree channel linear carries an ``l = 0`` bias.
+    precision : str
+        Parameter precision.
+    seed : int | list[int] | None
+        Base seed for deterministic initialization.
+    trainable : bool
+        Whether parameters are trainable.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        focus_dim: int,
+        n_focus: int,
+        n_layers: int,
+        symmetric: bool,
+        activation_function: str,
+        mlp_bias: bool,
+        precision: str = DEFAULT_PRECISION,
+        seed: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        super().__init__(
+            lmax=lmax,
+            focus_dim=focus_dim,
+            n_focus=n_focus,
+            n_layers=n_layers,
+            activation_function=activation_function,
+            mlp_bias=mlp_bias,
+            precision=precision,
+            seed=seed,
+            trainable=trainable,
+        )
+        self.symmetric = bool(symmetric)
+        self.basis = build_cartesian_basis(
+            self.lmax, dtype=PRECISION_DICT[self.precision.lower()]
+        )
+
+    def call(self, message: Any, node: Any) -> Any:
+        """
+        Parameters
+        ----------
+        message : Array
+            Aggregated message in packed SO(3) layout with shape
+            ``(N, D, C_wide)``, where ``D = (lmax + 1) ** 2`` and
+            ``C_wide = n_focus * focus_dim``. This is the residual stream.
+        node : Array
+            Destination node feature in the same packed layout and shape. It is
+            the fixed right operand of the product across all layers.
+
+        Returns
+        -------
+        Array
+            Mixed message in packed SO(3) layout with shape ``(N, D, C_wide)``.
+        """
+        xp = array_api_compat.array_namespace(message)
+        device = array_api_compat.device(message)
+        basis = xp.astype(
+            xp_asarray_nodetach(xp, self.basis[...], device=device),
+            message.dtype,
+        )
+
+        # The node feature is the fixed per-node operator; lifting it to its
+        # per-(node, channel) 3x3 form once lets every layer reuse it.
+        # einsum "ndc,dij->ncij" as a flattened matmul over the degree axis.
+        node_cart = xp.reshape(
+            xp.matmul(
+                xp.permute_dims(node, (0, 2, 1)),
+                xp.reshape(basis, (basis.shape[0], -1)),
+            ),
+            (node.shape[0], node.shape[2], 3, 3),
+        )  # (N, C_wide, 3, 3)
+
+        def apply_product(y: Any) -> Any:
+            # einsum "ndc,dij->ncij" as a flattened matmul over the degree axis.
+            y_cart = xp.reshape(
+                xp.matmul(
+                    xp.permute_dims(y, (0, 2, 1)),
+                    xp.reshape(basis, (basis.shape[0], -1)),
+                ),
+                (y.shape[0], y.shape[2], 3, 3),
+            )  # (N, C_wide, 3, 3)
+            m_cart = xp.matmul(y_cart, node_cart)
+            if self.symmetric:
+                m_cart = m_cart + xp.matmul(node_cart, y_cart)
+            # einsum "ncij,dij->ndc" as a flattened matmul over the Cartesian
+            # (i, j) entries, then transpose back to packed (N, D, C_wide).
+            return xp.permute_dims(
+                xp.matmul(
+                    xp.reshape(m_cart, (m_cart.shape[0], m_cart.shape[1], -1)),
+                    xp.permute_dims(xp.reshape(basis, (basis.shape[0], -1)), (1, 0)),
+                ),
+                (0, 2, 1),
+            )  # (N, D, C_wide)
+
+        return self._run_layers(message, apply_product)
+
+    def serialize(self) -> dict[str, Any]:
+        """Serialize the NodeCartesianTensorProduct to a dict."""
+        return {
+            "@class": "NodeCartesianTensorProduct",
+            "@version": 1,
+            "config": {
+                "lmax": self.lmax,
+                "focus_dim": self.focus_dim,
+                "n_focus": self.n_focus,
+                "n_layers": self.n_layers,
+                "symmetric": self.symmetric,
+                "activation_function": self.activation_function,
+                "mlp_bias": self.mlp_bias,
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+                "trainable": self.trainable,
+                "seed": None,
+            },
+            "@variables": self._variables(),
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> NodeCartesianTensorProduct:
+        """Deserialize a NodeCartesianTensorProduct from a dict."""
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "NodeCartesianTensorProduct":
+            raise ValueError(
+                f"Invalid class for NodeCartesianTensorProduct: {data_cls}"
+            )
+        version = int(data.pop("@version"))
+        check_version_compatibility(version, 1, 1)
+        config = dict(data.pop("config"))
+        variables = data.pop("@variables")
+        config["precision"] = str(config["precision"])
+        obj = cls(**config)
+        obj._load_variables(variables)
+        return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/edge_cache.py b/deepmd/dpmodel/descriptor/dpa4_nn/edge_cache.py
index 8d2b5cabf1..3ba5dde121 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/edge_cache.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/edge_cache.py
@@ -1,28 +1,13 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-Edge cache construction for the dpmodel DPA4/SeZM descriptor.
-
-This module defines the :class:`EdgeCache` dataclass (the dpmodel
-counterpart of the pt ``EdgeFeatureCache`` NamedTuple from
-``deepmd.pt.model.descriptor.sezm_nn.edge_cache``) and
-:func:`build_edge_cache`, the padded-layout counterpart of pt's
-sparse ``build_edge_cache``.
-
-Padded-edge layout
-------------------
-The pt implementation extracts a *sparse* edge list with ``torch.nonzero``:
-only valid neighbor slots become edges, and per-edge tensors have a
-data-dependent length ``E``. The dpmodel implementation instead uses a
-*padded* and frame-explicit edge layout: every neighbor slot of the DeePMD
-neighbor list contributes one edge, so
-
-    ``E = nf * nloc * nnei``
-
-with per-edge tensors flattened from ``(nf, nloc, nnei, ...)`` in row-major
-order. Invalid slots (``nlist == -1`` padding, excluded type pairs) stay in
-the arrays and are marked by ``edge_mask == 0``. Edge slot ``(f, i, j)``
-always belongs to destination node ``f * nloc + i``, so destination
-aggregation is a masked sum over the ``nnei`` axis instead of a scatter.
+Edge cache construction utilities for DPA4/SeZM.
+
+This module defines the shared procedures that assemble per-edge geometry,
+radial features, rotation blocks, and normalization terms used by the SeZM
+descriptor.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.edge_cache``.
 """
 
 from __future__ import (
@@ -30,6 +15,9 @@
 )
 
 import math
+from collections.abc import (
+    Callable,
+)
 from dataclasses import (
     dataclass,
     field,
@@ -42,6 +30,7 @@
 import numpy as np
 
 from deepmd.dpmodel.array_api import (
+    xp_add_at,
     xp_asarray_nodetach,
 )
 
@@ -54,33 +43,23 @@
     quaternion_z_rotation,
 )
 
+WignerCalculatorFn = Callable[[Any], "tuple[Any, Any]"]
+EdgeTypeKeepMaskFn = Callable[[Any, Any, Any], Any]
+
 
 @dataclass
 class EdgeCache:
     """
     Global edge feature cache created once per forward().
 
-    All per-edge arrays are aligned on the same padded edge axis
-    (``E = nf * nloc * nnei``); see the module docstring for the layout
-    contract. Node-level arrays use the local node axis ``N = nf * nloc``.
-
-    An ``EdgeCache`` must not be reused across forward passes:
-    ``D_to_m_cache``/``Dt_from_m_cache`` are keyed only by ``"lmax:mmax"``,
-    not by the contents of ``D_full``, so reuse with different Wigner blocks
-    would silently return stale projections.
+    All per-edge arrays are aligned on the same edge axis (E).
 
     Parameters
     ----------
     src
-        Source (neighbor) node indices with shape (E,), pointing into the
-        local node axis ``N = nf * nloc``. Invalid slots must hold a safe
-        in-range index (their contribution is masked out by ``edge_mask``).
+        Source node indices with shape (E,).
     dst
-        Destination (center) node indices with shape (E,). In the padded
-        layout this is slot-implicit and MUST equal
-        ``arange(nf * nloc)`` with each index repeated ``nnei`` consecutive
-        times (i.e. ``np.repeat(np.arange(nf * nloc), nnei)``;
-        node-contiguous order); aggregation code relies on this ordering.
+        Destination node indices with shape (E,).
     edge_type_feat
         Per-edge type embeddings with shape (E, C), computed as src+dst.
     edge_vec
@@ -89,10 +68,11 @@ class EdgeCache:
         Radial basis with shape (E, n_radial).
         The C^3 cutoff envelope is already baked in.
     edge_env
-        C^3 cutoff envelope weights with shape (E, 1). Zero on invalid slots.
+        C^3 cutoff envelope weights with shape (E, 1).
     deg
-        Envelope-squared smooth degree with shape (N,), computed as the
-        masked ``sum(edge_env**2)`` over each node's ``nnei`` slots.
+        Envelope-squared smooth degree with shape (N,), computed as
+        ``sum(edge_env**2)`` over incoming edges.
+        Used for smooth normalization in EnvironmentInitialEmbedding.
     inv_sqrt_deg
         Inverse square root smooth degree normalization with shape (N, 1, 1).
     D_full
@@ -100,27 +80,32 @@ class EdgeCache:
         Used for efficient batched rotation. None if not available.
     Dt_full
         Transpose of D_full with shape (E, D, D). None if not available.
+    edge_quat
+        Per-edge global-to-local quaternion actually used to build ``D_full`` and
+        ``Dt_full`` with shape (E, 4). Includes the optional random local-Z roll.
     D_to_m_cache
         Lazy cache for projected D matrices keyed by a normalized
-        ``"lmax:mmax"`` identifier. The key does not capture the contents
-        of ``D_full``, so the cache is only valid for the forward pass
-        that created this ``EdgeCache`` (see the class docstring).
+        ``"lmax:mmax"`` identifier.
     Dt_from_m_cache
         Lazy cache for projected Dt matrices keyed by a normalized
-        ``"lmax:mmax"`` identifier. Same single-forward-pass validity
-        caveat as ``D_to_m_cache``.
+        ``"lmax:mmax"`` identifier.
     edge_src_gate
         Optional per-edge Source Freeze Propagation Gate (SFPG) weight with
-        shape (E, 1). Present only in bridging mode; ``None`` otherwise.
-    edge_quat
-        Per-edge global-to-local quaternion used to build ``D_full`` and
-        ``Dt_full`` with shape (E, 4). None if not available.
+        shape (E, 1). Equals ``eta[src]`` where
+        ``eta[j] = prod_{k in N(j)} w(r_{jk})`` and ``w`` is the
+        :class:`BridgingSwitch` C3 switching amplitude. Present only when
+        the model runs in bridging mode; ``None`` otherwise. Aggregation
+        sites (``GeometricInitialEmbedding``, ``EnvironmentInitialEmbedding``,
+        ``SO2Convolution``) multiply their per-edge message contribution
+        by this gate to forbid any node whose local neighborhood enters
+        the frozen zone from propagating information along its outgoing
+        edges.
     edge_mask
-        Validity mask for the padded-edge layout with shape (E,) or (E, 1);
-        nonzero (1) marks a real edge, zero marks a padded/invalid slot.
-        ``None`` means all slots are valid. This field has no pt counterpart:
-        pt's sparse edge list contains valid edges only, while dpmodel keeps
-        the padded ``nf * nloc * nnei`` slots and masks the invalid ones.
+        Validity mask for the padded standard-path layout with shape (E,) or
+        (E, 1); 1 marks a real edge, 0 a padded/invalid slot. ``None`` means
+        all slots are valid (e.g. the sparse
+        :func:`build_edge_cache_from_edges` path, where masking is folded into
+        the per-edge weights). This field has no pt counterpart.
     """
 
     src: Any
@@ -140,76 +125,107 @@ class EdgeCache:
     edge_mask: Any = None
 
 
-def _build_edge_mask_and_src(
-    xp: Any,
-    nlist: Any,
-    mapping: Any,
-    pair_keep_mask: Any,
-    nall: int,
-) -> tuple[Any, Any, Any]:
+def compute_edge_src_gate(
+    *,
+    edge_len: Any,
+    src: Any,
+    n_nodes: int,
+    bridging_switch: Callable[[Any], Any],
+    edge_keep_f: Any = None,
+) -> Any:
     """
-    Build the padded edge validity mask and safe source-local indices.
-
-    Mirrors the pt edge-keep semantics of
-    ``sezm_nn.edge_cache._build_standard_edge_index`` exactly:
-
-    - padding slots (``nlist == -1``) are invalid;
-    - excluded type pairs (``pair_keep_mask == False``) are invalid;
-    - after mapping the neighbor's extended index to a local index, slots
-      whose source falls outside ``[0, nloc)`` are invalid (pt's ``src_ok``
-      filter; e.g. broken mapping or ghost-only neighbors);
-    - no distance-based filtering: edges beyond ``rcut`` stay valid and are
-      zeroed naturally by the smooth envelope.
-
-    Instead of dropping invalid slots (pt's ``torch.nonzero``), they are kept
-    with ``mask == False`` and safe (index 0) placeholder indices.
+    Compute the per-edge source gate for SFPG from edge lengths.
+
+    The gate implements a per-node "non-frozen confidence" and broadcasts
+    it back to edges along the source axis::
+
+        w_e      = bridging_switch(edge_len_e)               in [0, 1]
+        eta_j    = prod_{e: src_e = j} w_e                   in [0, 1]
+        gate_e   = eta_{src_e}                               in [0, 1]
+
+    ``w_e = 0`` at ``r_{jk} <= r_inner`` ensures ``eta_j = 0`` for any
+    node with at least one neighbor in the frozen zone. Masked edges
+    (padding, excluded type pairs) must contribute the multiplicative
+    identity ``1`` so they never spuriously mute a valid source node;
+    callers supply ``edge_keep_f`` for this.
+
+    The product is **not** realised by ``scatter_reduce(reduce="prod")``:
+    its registered backward handles exact zeros with a data-dependent
+    "count leave-one-out" branch that creates unbacked symints under
+    ``make_fx(tracing_mode="symbolic")`` and breaks the SeZM compile
+    path's double-backward tracing. Instead, the product is decomposed
+    into a log-sum on non-zero contributions combined with an explicit
+    "any zero per group" indicator that routes the frozen case through
+    ``where``. Both branches use only shape-preserving standard
+    ops (``scatter_add``, ``where``, ``exp``, ``log``) with backed
+    symints, so the graph survives symbolic tracing cleanly.
+
+    The gradient consequence at the plateau is exact: ``BridgingSwitch``
+    places ``w'(r) = 0`` for every ``r <= r_inner``, so the chain rule
+    ``d eta / d r = (leave-one-out factor) * w'(r) = anything * 0 = 0``
+    holds regardless of how the muted ``where`` branch treats the
+    upstream gradient. In the transition zone every edge has strictly
+    positive ``w`` and the log-sum branch gives the standard product
+    gradient.
 
     Parameters
     ----------
-    xp
-        Array namespace.
-    nlist
-        Neighbor list with shape (nf, nloc, nnei); -1 marks padding.
-    mapping
-        Extended-to-local mapping with shape (nf, nall), or None if the
-        neighbor indices are already local.
-    pair_keep_mask
-        Pair exclusion keep mask with shape (nf, nloc, nnei). True means keep.
-    nall
-        Number of atoms on the extended axis per frame.
+    edge_len
+        Per-edge distances with shape (E, 1).
+    src
+        Source node indices with shape (E,).
+    n_nodes
+        Total number of nodes N.
+    bridging_switch
+        Callable ``r -> w(r)`` with ``w: [0, ∞) -> [0, 1]``, typically a
+        :class:`BridgingSwitch` instance.
+    edge_keep_f
+        Optional per-edge keep weights with shape (E, 1), with ``0`` on
+        masked edges and ``1`` on kept edges. If provided, masked edges
+        are rewritten to ``w = 1`` before the product reduction.
 
     Returns
     -------
-    tuple[Any, Any, Any]
-        ``(mask, nlist_safe, src_local_safe)``, all with shape
-        (nf, nloc, nnei). ``mask`` is boolean; the two index arrays are int64
-        with 0 substituted on invalid slots.
+    Array
+        Per-edge source gate with shape (E, 1), aligned on the same edge
+        axis as the rest of the cache.
     """
-    nf, nloc, nnei = nlist.shape
-    nlist = xp.astype(nlist, xp.int64)
-    mask = (nlist >= 0) & pair_keep_mask
-    nlist_safe = xp.where(mask, nlist, xp.zeros_like(nlist))
-
-    if mapping is None:
-        # Neighbor indices are already local indices in [0, nloc).
-        src_local = nlist_safe
-    else:
-        # Map extended index -> local index for each frame.
-        mapping_flat = xp.astype(xp.reshape(mapping, (-1,)), xp.int64)
-        frame_idx = xp.reshape(
-            xp.arange(nf, dtype=xp.int64, device=array_api_compat.device(nlist)),
-            (nf, 1, 1),
-        )
-        flat_idx = xp.reshape(frame_idx * nall + nlist_safe, (-1,))
-        src_local = xp.reshape(xp.take(mapping_flat, flat_idx, axis=0), nlist.shape)
+    xp = array_api_compat.array_namespace(edge_len, src)
+    device = array_api_compat.device(edge_len)
+    # === Step 1. Per-edge switching amplitude w(r) in [0, 1] ===
+    edge_w = bridging_switch(edge_len)  # (E, 1)
+    if edge_keep_f is not None:
+        # Force w = 1 on masked edges so they are neutral for the product.
+        edge_w = edge_w * edge_keep_f + (1.0 - edge_keep_f)
+
+    edge_w_flat = edge_w[..., 0]  # (E,)
+    is_zero = edge_w_flat <= 0.0  # (E,) bool
+
+    # === Step 2. Log-sum reduction on non-zero contributions ===
+    # Replace exact zeros with the multiplicative identity 1 so their
+    # ``log`` contribution is 0 and the group-wise sum equals the log of
+    # the product of non-zero ``w`` values.
+    safe_w = xp.where(is_zero, xp.ones_like(edge_w_flat), edge_w_flat)
+    log_safe = xp.log(safe_w)
+    log_eta = xp_add_at(
+        xp.zeros((n_nodes,), dtype=edge_w.dtype, device=device), src, log_safe
+    )
+    eta_nonzero_path = xp.exp(log_eta)
+
+    # === Step 3. Exact-zero indicator per source node ===
+    # ``scatter_add`` over an ``int64`` cast of the zero mask counts how
+    # many frozen edges each source node owns. A strictly positive count
+    # means the product is 0 by the hard-freeze rule.
+    zero_count = xp_add_at(
+        xp.zeros((n_nodes,), dtype=xp.int64, device=device),
+        src,
+        xp.astype(is_zero, xp.int64),
+    )
+    any_zero = zero_count > 0
 
-    # pt's src_ok filter: drop (here: mask) edges mapping outside [0, nloc).
-    mask = mask & (src_local >= 0) & (src_local < nloc)
-    src_local_safe = xp.where(mask, src_local, xp.zeros_like(src_local))
-    # Re-zero nlist_safe after the src_ok update so coordinate gathers stay
-    # in-bounds when callers pass local nlists with out-of-range entries.
-    nlist_safe = xp.where(mask, nlist_safe, xp.zeros_like(nlist_safe))
-    return mask, nlist_safe, src_local_safe
+    # === Step 4. Combine and broadcast back to edges via source ===
+    eta = xp.where(any_zero, xp.zeros_like(eta_nonzero_path), eta_nonzero_path)
+    return xp.take(eta, src, axis=0)[:, None]
 
 
 def build_edge_cache(
@@ -221,66 +237,86 @@ def build_edge_cache(
     pair_keep_mask: Any,
     eps: float,
     deg_norm_floor: float,
-    edge_envelope: Any,
-    radial_basis: Any,
-    n_radial: int,  # unused: kept for pt signature parity (pt sizes its empty cache)
+    edge_envelope: Callable[[Any], Any],
+    radial_basis: Callable[[Any], Any],
+    n_radial: int,  # unused in padded layout; kept for pt signature parity
     random_gamma: bool,
-    wigner_calc: Any,
+    wigner_calc: WignerCalculatorFn,
+    build_wigner: bool = True,
     gamma: Any = None,
 ) -> EdgeCache:
     """
-    Build the global padded edge cache from a DeePMD padded neighbor list.
-
-    Padded counterpart of pt ``sezm_nn.edge_cache.build_edge_cache``. Instead
-    of extracting a sparse edge list with ``torch.nonzero`` (data-dependent
-    length), every neighbor slot becomes one edge slot:
-    ``E = nf * nloc * nnei`` flattened row-major, with invalid slots marked by
-    ``edge_mask == 0`` (see the :class:`EdgeCache` layout contract). In
-    particular ``dst == np.repeat(arange(nf * nloc), nnei)`` always, and there
-    is no empty-cache special case (E is shape-determined).
-
-    Masked-slot safety: gathered edge vectors on invalid slots are garbage
-    (placeholder index 0), and could even be exactly zero (self-difference),
-    which would produce a 0/0 in the normalization inside the quaternion
-    construction. Although the *forward* contribution of such slots is masked
-    out downstream, a NaN there would still poison the *backward* pass
-    (``where`` propagates NaN gradients from the unselected branch). Invalid
-    slots are therefore rewritten to the safe dummy unit vector ``+z`` BEFORE
-    any norm/quaternion/Wigner evaluation, and their envelope, radial basis,
-    and type features are multiplied by the mask so they are exactly zero.
+    Build the global edge cache from a DeePMD padded neighbor list.
+
+    This converts DeePMD's per-frame padded neighbor list into the per-edge
+    tensors reused across blocks. Where pt extracts a sparse list of valid
+    edges with ``torch.nonzero`` (data-dependent length), the array-API port
+    keeps one edge slot for every neighbor slot, so ``E = nf * nloc * nnei``
+    flattened row-major and ``dst == repeat(arange(nf * nloc), nnei)``. Invalid
+    slots (``nlist == -1`` padding, excluded type pairs, out-of-range mapped
+    sources) stay in the arrays, flagged by ``edge_mask``; their geometry,
+    envelope, radial basis, and type features are masked to zero.
+
+    The resulting cache contains:
+
+    - per-edge endpoints: ``src``, ``dst`` and per-edge type features: ``edge_type_feat`` (src+dst)
+    - per-edge geometry: ``edge_vec``
+    - per-edge smooth weights: C^3 cutoff envelope ``edge_env``
+    - per-edge radial basis: ``edge_rbf`` (envelope already baked in)
+    - per-edge rotation blocks: block-diagonal Wigner-D matrices ``D_full`` and ``Dt_full``
+    - destination-node smooth normalization: ``inv_sqrt_deg`` from
+      envelope-squared degree ``sum(edge_env**2)``
+
+    Notes
+    -----
+    Input formats follow DeePMD conventions:
+
+    - ``extended_coord`` has shape ``(nf, nall, 3)``.
+    - ``nlist`` has shape ``(nf, nloc, nnei)`` and stores indices into the extended axis
+      (``0..nall-1``), with ``-1`` indicating padding.
+    - ``mapping`` (when provided) maps extended indices to local indices ``0..nloc-1``.
+      When ``mapping`` is ``None``, the function assumes the neighbor indices are already local.
+
+    Gathered edge vectors on invalid slots are garbage (placeholder index 0)
+    and may even be exactly zero (self-difference), which would produce a 0/0
+    in the normalization inside the quaternion construction. Although the
+    forward contribution of such slots is masked out downstream, a NaN there
+    would still poison the backward pass (``where`` propagates NaN gradients
+    from the unselected branch). Invalid slots are therefore rewritten to the
+    safe dummy unit vector ``+z`` before any norm/quaternion/Wigner evaluation,
+    and their envelope, radial basis, and type features are multiplied by the
+    mask so they are exactly zero.
 
     Parameters
     ----------
     type_ebed
-        Per-node type embedding with shape (N, C), where N = nf * nloc.
+        Per-node type embedding with shape (N, C), where N=nf*nloc.
     extended_coord
         Extended coordinates with shape (nf, nall, 3).
     nlist
-        Neighbor list with shape (nf, nloc, nnei); -1 marks padding.
+        Neighbor list with shape (nf, nloc, nnei).
     mapping
-        Mapping from extended to local indices with shape (nf, nall), or None
-        when the neighbor indices are already local.
+        Mapping from extended indices to local indices with shape (nf, nall), or None.
     pair_keep_mask
-        Pair keep mask from ``PairExcludeMask`` with shape (nf, nloc, nnei).
-        True means keep.
+        Pair keep mask from `PairExcludeMask` with shape (nf, nloc, nnei). True means keep.
     eps
-        Small positive epsilon for safe norm / quaternion construction.
+        Small positive epsilon for safe norm.
     deg_norm_floor
-        Floor added to the envelope-squared degree before the inverse-sqrt
+        Floor added to the envelope-squared degree before inverse-sqrt
         normalization.
     edge_envelope
-        C^3 edge envelope callable ``(E, 1) -> (E, 1)``.
+        C^3 edge envelope module.
     radial_basis
-        Radial basis callable ``(E, 1) -> (E, n_radial)`` (envelope baked in).
+        Radial basis module.
     n_radial
-        Number of radial basis channels. Unused in the padded layout (kept
-        for signature parity with pt, where it sizes the empty cache).
+        Number of radial basis channels. Unused here; kept for signature
+        parity with pt.
     random_gamma
         Whether to apply a random roll around the local +Z axis before
         constructing Wigner-D blocks.
     wigner_calc
-        Callable converting edge quaternions (E, 4) into packed Wigner-D
-        blocks ``(D_full, Dt_full)``.
+        Callable that converts edge-aligned quaternions into packed Wigner-D
+        blocks.
     gamma
         Optional per-edge roll angles with shape (E,), used only when
         ``random_gamma`` is True. pt draws gamma internally with
@@ -299,7 +335,6 @@ def build_edge_cache(
     nf, nloc, nnei = nlist.shape
     nall = extended_coord.shape[1]
     n_nodes = nf * nloc
-    n_edge = n_nodes * nnei
 
     # === Step 1. Validity mask and safe indices (pt edge_keep semantics) ===
     mask, nlist_safe, src_local_safe = _build_edge_mask_and_src(
@@ -315,6 +350,7 @@ def build_edge_cache(
     dst = xp.reshape(xp.broadcast_to(node_idx[:, None], (n_nodes, nnei)), (-1,))
 
     # === Step 3. Gather per-edge geometry from extended coordinates ===
+    # edge_vec points from center -> neighbor: r_ij = r_j - r_i (in Å).
     coord_flat = xp.reshape(extended_coord, (nf * nall, 3))
     neighbor_coord_index = xp.reshape(frame_idx * nall + nlist_safe, (-1,))
     loc_idx = xp.reshape(xp.arange(nloc, dtype=xp.int64, device=device), (1, nloc, 1))
@@ -325,7 +361,8 @@ def build_edge_cache(
     vec = neighbor_pos - center_pos  # (E, 3)
 
     # === Step 4. Rewrite invalid slots to the safe +z dummy vector ===
-    # Gradient safety: see the function docstring.
+    # Gradient safety: see the function docstring. edge_len is the scalar
+    # distance, computed only after the safe rewrite so it stays finite.
     maskf = xp.astype(mask_flat, vec.dtype)[:, None]  # (E, 1)
     z_unit = xp_asarray_nodetach(
         xp, np.array([[0.0, 0.0, 1.0]]), dtype=vec.dtype, device=device
@@ -334,32 +371,37 @@ def build_edge_cache(
     edge_len = safe_norm(edge_vec, eps)  # (E, 1)
 
     # === Step 5. Envelope and radial basis, masked to zero on invalid slots ===
+    # Edges with r >= rcut are not removed from the cache. Their envelope is
+    # exactly zero, so messages vanish naturally while degree normalization
+    # remains smooth at the cutoff boundary.
     edge_env = edge_envelope(edge_len) * maskf  # (E, 1)
     edge_rbf = radial_basis(edge_len) * maskf  # (E, n_radial)
 
     # === Step 6. Edge quaternion -> Wigner-D blocks ===
-    edge_quat = build_edge_quaternion(edge_vec, edge_len=edge_len, eps=eps)
-    if random_gamma:
-        if gamma is None:
-            gamma = np.random.default_rng().uniform(0.0, 2.0 * math.pi, n_edge)
-        gamma = xp.astype(
-            xp_asarray_nodetach(xp, gamma, device=device), edge_quat.dtype
-        )
-        edge_quat = quaternion_multiply(quaternion_z_rotation(gamma), edge_quat)
-    D_full, Dt_full = wigner_calc(edge_quat)
+    D_full, Dt_full, edge_quat = _build_edge_wigner(
+        edge_vec=edge_vec,
+        edge_len=edge_len,
+        eps=eps,
+        random_gamma=random_gamma,
+        wigner_calc=wigner_calc,
+        gamma=gamma,
+        build_full=build_wigner,
+    )  # (E, D, D), (E, D, D), (E, 4)
 
     # === Step 7. Edge type features (src + dst), masked ===
-    edge_type_feat = (
-        xp.take(type_ebed, src, axis=0) + xp.take(type_ebed, dst, axis=0)
-    ) * xp.astype(maskf, type_ebed.dtype)
+    edge_type_feat = build_edge_type_feat(type_ebed, src, dst) * xp.astype(
+        maskf, type_ebed.dtype
+    )  # (E, C)
 
     # === Step 8. Smooth destination degrees ===
-    # pt accumulates env^2 with index_add_ over dst (edge_cache.py:622); in the
-    # padded node-contiguous layout this is a plain sum over the nnei axis.
+    # pt accumulates env^2 with ``index_add_`` over dst; in the padded
+    # node-contiguous layout this is a plain masked sum over the nnei axis.
     # edge_env is already exactly zero on invalid slots.
     env_sq = xp.reshape(edge_env[:, 0] * edge_env[:, 0], (n_nodes, nnei))
     deg = xp.sum(env_sq, axis=1)  # (N,)
-    inv_sqrt_deg = xp.reshape(1.0 / xp.sqrt(deg + deg_norm_floor), (n_nodes, 1, 1))
+    inv_sqrt_deg = xp.reshape(
+        1.0 / xp.sqrt(deg + deg_norm_floor), (n_nodes, 1, 1)
+    )  # (N, 1, 1)
 
     return EdgeCache(
         src=src,
@@ -378,3 +420,479 @@ def build_edge_cache(
         edge_quat=edge_quat,
         edge_mask=mask_flat,
     )
+
+
+def build_edge_cache_from_edges(
+    *,
+    type_ebed: Any,
+    atype_flat: Any,
+    edge_index: Any,
+    edge_vec: Any,
+    edge_mask: Any,
+    compute_dtype: Any,
+    eps: float,
+    deg_norm_floor: float,
+    inner_clamp: Callable[[Any], Any] | None,
+    bridging_switch: Callable[[Any], Any] | None,
+    edge_envelope: Callable[[Any], Any],
+    radial_basis: Callable[[Any], Any],
+    has_exclude_types: bool,
+    edge_type_keep_mask: EdgeTypeKeepMaskFn,
+    random_gamma: bool,
+    wigner_calc: WignerCalculatorFn,
+    build_wigner: bool = True,
+    gamma: Any = None,
+) -> EdgeCache:
+    """
+    Build the global edge cache from a sparse edge list.
+
+    Parameters
+    ----------
+    type_ebed
+        Per-node type embedding with shape (N, C), where N=nf*nloc.
+    atype_flat
+        Flattened local atom types with shape (N,).
+    edge_index
+        Edge indices with shape (2, E).
+    edge_vec
+        Edge vectors with shape (E, 3) in Å.
+    edge_mask
+        Edge mask with shape (E,). True means keep.
+    compute_dtype
+        Promoted compute dtype used for geometry and radial features.
+    eps
+        Small positive epsilon for safe norm.
+    deg_norm_floor
+        Floor added to the envelope-squared degree before inverse-sqrt
+        normalization (see :func:`_finalize_edge_cache`).
+    inner_clamp
+        Optional inner clamp used to freeze short-range geometry below `r_inner`.
+    bridging_switch
+        Optional C3 switching amplitude ``w(r) -> [0, 1]`` that drives
+        the Source Freeze Propagation Gate. When provided, a per-edge
+        ``edge_src_gate`` is computed from the node-wise product of
+        ``w(r_{jk})`` along each source node's outgoing edges. Masked
+        edges (``edge_keep=False``) are forced to ``w=1`` so they never
+        leak into the product.
+    edge_envelope
+        C^3 edge envelope module.
+    radial_basis
+        Radial basis module.
+    has_exclude_types
+        Whether excluded type pairs should be filtered in this path.
+    edge_type_keep_mask
+        Callable that builds the keep mask for edge type exclusions.
+    random_gamma
+        Whether to apply a random roll around the local +Z axis before
+        constructing Wigner-D blocks.
+    wigner_calc
+        Callable that converts edge-aligned quaternions into packed Wigner-D
+        blocks.
+    gamma
+        Optional per-edge roll angles with shape (E,), used only when
+        ``random_gamma`` is True. pt draws gamma internally with
+        ``torch.rand`` and the draw cannot be reproduced here, so callers
+        needing determinism (e.g. tests) inject the angles explicitly. When
+        None, angles are drawn from ``numpy.random.default_rng()`` uniformly
+        in ``[0, 2*pi)``, matching pt's distribution.
+
+    Returns
+    -------
+    EdgeCache
+        Per-edge cache.
+    """
+    xp = array_api_compat.array_namespace(type_ebed, edge_index, edge_vec)
+    device = array_api_compat.device(edge_vec)
+    n_nodes = type_ebed.shape[0]
+    src = xp.astype(edge_index[0], xp.int64)
+    dst = xp.astype(edge_index[1], xp.int64)
+
+    # === Step 1. Normalize mask and apply type exclusions ===
+    edge_keep = xp.astype(edge_mask, xp.bool)
+    if has_exclude_types:
+        edge_keep = edge_keep & edge_type_keep_mask(atype_flat, src, dst)
+
+    # === Step 2. Promote geometry dtype ===
+    edge_vec = xp.astype(edge_vec, compute_dtype)
+    edge_keep_f = xp.astype(edge_keep, compute_dtype)[:, None]
+    edge_vec = edge_vec * edge_keep_f
+    # Masked-out edges (zeroed above) are assigned the canonical +z direction so the
+    # length normalization and quaternion construction remain finite. Padding the
+    # keep-complement into the z channel constructs this term entirely on device.
+    zeros2 = xp.zeros((edge_keep_f.shape[0], 2), dtype=edge_vec.dtype, device=device)
+    edge_vec = edge_vec + xp.concat([zeros2, 1.0 - edge_keep_f], axis=-1)
+
+    # === Step 3. Edge length, envelope, and radial basis ===
+    edge_len = safe_norm(edge_vec, eps)
+    if inner_clamp is not None:
+        clamped = inner_clamp(edge_len)
+        scale = clamped / edge_len
+        edge_vec = edge_vec * scale
+        edge_len = clamped
+    edge_env = edge_envelope(edge_len) * edge_keep_f  # (E, 1)
+    edge_rbf = radial_basis(edge_len) * edge_keep_f  # (E, n_radial)
+
+    # === Step 4. Edge quaternion -> Wigner-D blocks ===
+    D_full, Dt_full, edge_quat = _build_edge_wigner(
+        edge_vec=edge_vec,
+        edge_len=edge_len,
+        eps=eps,
+        random_gamma=random_gamma,
+        wigner_calc=wigner_calc,
+        gamma=gamma,
+        build_full=build_wigner,
+    )  # (E, D, D), (E, D, D), (E, 4)
+
+    # === Step 5. Edge type features ===
+    edge_type_feat = build_edge_type_feat(type_ebed, src, dst)
+    edge_type_feat = edge_type_feat * xp.astype(edge_keep_f, edge_type_feat.dtype)
+
+    # === Step 6. Source Freeze Propagation Gate (optional) ===
+    # The sparse-edge path packs masked dummy edges so the compiled graph sees
+    # a statically non-empty, non-singular edge tensor. ``edge_keep_f`` rewrites
+    # any such slot to ``w=1`` inside ``compute_edge_src_gate``, keeping the
+    # product reduction unaffected by padding.
+    edge_src_gate: Any = None
+    if bridging_switch is not None:
+        edge_src_gate = compute_edge_src_gate(
+            edge_len=edge_len,
+            src=src,
+            n_nodes=n_nodes,
+            bridging_switch=bridging_switch,
+            edge_keep_f=edge_keep_f,
+        )
+
+    return _finalize_edge_cache(
+        n_nodes=n_nodes,
+        src=src,
+        dst=dst,
+        edge_type_feat=edge_type_feat,
+        edge_vec=edge_vec,
+        edge_rbf=edge_rbf,
+        edge_env=edge_env,
+        D_full=D_full,
+        Dt_full=Dt_full,
+        edge_quat=edge_quat,
+        deg_norm_floor=deg_norm_floor,
+        edge_src_gate=edge_src_gate,
+    )
+
+
+def _build_edge_wigner(
+    *,
+    edge_vec: Any,
+    edge_len: Any,
+    eps: float,
+    random_gamma: bool,
+    wigner_calc: WignerCalculatorFn,
+    gamma: Any = None,
+    build_full: bool = True,
+) -> tuple[Any, Any, Any]:
+    """
+    Build packed Wigner-D blocks from edge vectors.
+
+    Parameters
+    ----------
+    edge_vec
+        Edge vectors with shape (E, 3) in Å.
+    edge_len
+        Edge lengths with shape (E, 1).
+    eps
+        Small positive epsilon used in quaternion construction.
+    random_gamma
+        Whether to apply a random roll around the local +Z axis.
+    wigner_calc
+        Callable that converts edge-aligned quaternions into packed Wigner-D
+        blocks.
+    gamma
+        Optional per-edge roll angles with shape (E,), used only when
+        ``random_gamma`` is True. When None, angles are drawn from
+        ``numpy.random.default_rng()`` uniformly in ``[0, 2*pi)``, matching
+        pt's ``torch.rand`` distribution.
+    build_full
+        Whether to materialize the full ``(E, D, D)`` Wigner-D blocks. When
+        False (all message-passing blocks take the Cartesian path), only the
+        quaternion is returned and the blocks are ``None``; the geometric
+        initial embedding reconstructs the zonal coupling from the quaternion.
+
+    Returns
+    -------
+    tuple[Array, Array, Array]
+        Packed Wigner-D matrices ``(D_full, Dt_full)`` with shape ``(E, D, D)``
+        (or ``None`` when ``build_full`` is False) and the quaternion used to
+        build them with shape ``(E, 4)``.
+    """
+    xp = array_api_compat.array_namespace(edge_vec)
+    device = array_api_compat.device(edge_vec)
+    # === Step 1. Build edge-aligned quaternions ===
+    edge_quat = build_edge_quaternion(
+        edge_vec,
+        edge_len=edge_len,
+        eps=eps,
+    )
+
+    # === Step 2. Apply optional random local-Z roll ===
+    # pt draws the roll with ``torch.rand``; here it is injected or drawn from
+    # numpy so the array-API call site stays reproducible.
+    if random_gamma:
+        if gamma is None:
+            gamma = np.random.default_rng().uniform(
+                0.0, 2.0 * math.pi, edge_quat.shape[0]
+            )
+        gamma = xp.astype(
+            xp_asarray_nodetach(xp, gamma, device=device), edge_quat.dtype
+        )
+        edge_quat = quaternion_multiply(quaternion_z_rotation(gamma), edge_quat)
+
+    # === Step 3. Convert quaternions to packed Wigner-D blocks ===
+    if not build_full:
+        return None, None, edge_quat
+    D_full, Dt_full = wigner_calc(edge_quat)
+    return D_full, Dt_full, edge_quat
+
+
+def _finalize_edge_cache(
+    *,
+    n_nodes: int,
+    src: Any,
+    dst: Any,
+    edge_type_feat: Any,
+    edge_vec: Any,
+    edge_rbf: Any,
+    edge_env: Any,
+    D_full: Any,
+    Dt_full: Any,
+    edge_quat: Any,
+    deg_norm_floor: float,
+    edge_src_gate: Any = None,
+) -> EdgeCache:
+    """
+    Assemble the shared `EdgeCache` layout.
+
+    Parameters
+    ----------
+    n_nodes
+        Number of local nodes in the flattened frame-major layout.
+    src
+        Source node indices with shape (E,).
+    dst
+        Destination node indices with shape (E,).
+    edge_type_feat
+        Per-edge type features with shape (E, C).
+    edge_vec
+        Edge vectors with shape (E, 3).
+    edge_rbf
+        Radial basis features with shape (E, n_radial).
+    edge_env
+        Smooth edge envelope weights with shape (E, 1).
+    D_full
+        Packed Wigner-D matrices with shape (E, D, D), or None when the
+        full Wigner-D construction is skipped (all-Cartesian model).
+    Dt_full
+        Transposed packed Wigner-D matrices with shape (E, D, D), or None
+        when the full Wigner-D construction is skipped.
+    edge_quat
+        Global-to-local quaternions used to build the Wigner-D matrices with
+        shape (E, 4).
+    deg_norm_floor
+        Floor added to the envelope-squared degree before the inverse-sqrt
+        normalization. A tiny ``eps`` reproduces the legacy behavior; an
+        ``O(1)`` value makes sparse-neighborhood features vanish smoothly at
+        ``rcut`` instead of saturating and kinking.
+    edge_src_gate
+        Optional per-edge SFPG weight with shape (E, 1). ``None`` in
+        non-bridging mode.
+
+    Returns
+    -------
+    EdgeCache
+        Finalized per-edge cache shared by eager and compile paths.
+    """
+    xp = array_api_compat.array_namespace(edge_vec, dst)
+    device = array_api_compat.device(edge_vec)
+    # === Step 1. Build smooth destination degrees ===
+    deg = xp.zeros((n_nodes,), dtype=edge_vec.dtype, device=device)  # (N,)
+    env_flat = xp.astype(edge_env[..., 0], edge_vec.dtype)
+    deg = xp_add_at(deg, dst, env_flat * env_flat)
+    inv_sqrt_deg = xp.reshape(
+        1.0 / xp.sqrt(deg + deg_norm_floor), (n_nodes, 1, 1)
+    )  # (N, 1, 1)
+
+    return EdgeCache(
+        src=src,
+        dst=dst,
+        edge_type_feat=edge_type_feat,
+        edge_vec=edge_vec,
+        edge_rbf=edge_rbf,
+        edge_env=edge_env,
+        deg=deg,
+        inv_sqrt_deg=inv_sqrt_deg,
+        D_full=D_full,
+        Dt_full=Dt_full,
+        D_to_m_cache={},
+        Dt_from_m_cache={},
+        edge_src_gate=edge_src_gate,
+        edge_quat=edge_quat,
+    )
+
+
+def _build_edge_mask_and_src(
+    xp: Any,
+    nlist: Any,
+    mapping: Any,
+    pair_keep_mask: Any,
+    nall: int,
+) -> tuple[Any, Any, Any]:
+    """
+    Build the padded edge validity mask and safe source-local indices.
+
+    This reproduces the pt edge-keep rules for the padded layout:
+
+    - padding slots (``nlist == -1``) are invalid;
+    - excluded type pairs (``pair_keep_mask == False``) are invalid;
+    - after mapping the neighbor's extended index to a local index, slots
+      whose source falls outside ``[0, nloc)`` are invalid (pt's ``src_ok``
+      filter; e.g. broken mapping or ghost-only neighbors);
+    - no distance-based filtering: edges beyond ``rcut`` stay valid and are
+      zeroed naturally by the smooth envelope.
+
+    Instead of dropping invalid slots (pt's ``torch.nonzero``), they are kept
+    with ``mask == False`` and safe (index 0) placeholder indices.
+
+    Parameters
+    ----------
+    xp
+        Array namespace.
+    nlist
+        Neighbor list with shape (nf, nloc, nnei); -1 marks padding.
+    mapping
+        Extended-to-local mapping with shape (nf, nall), or None if the
+        neighbor indices are already local.
+    pair_keep_mask
+        Pair exclusion keep mask with shape (nf, nloc, nnei). True means keep.
+    nall
+        Number of atoms on the extended axis per frame.
+
+    Returns
+    -------
+    tuple[Array, Array, Array]
+        ``(mask, nlist_safe, src_local_safe)``, all with shape
+        (nf, nloc, nnei). ``mask`` is boolean; the two index arrays are int64
+        with 0 substituted on invalid slots.
+    """
+    nf, nloc, nnei = nlist.shape
+    nlist = xp.astype(nlist, xp.int64)
+    mask = (nlist >= 0) & pair_keep_mask
+    nlist_safe = xp.where(mask, nlist, xp.zeros_like(nlist))
+
+    if mapping is None:
+        # Neighbor indices are already local indices in [0, nloc).
+        src_local = nlist_safe
+    else:
+        # Map extended index -> local index for each frame.
+        mapping_flat = xp.astype(xp.reshape(mapping, (-1,)), xp.int64)
+        frame_idx = xp.reshape(
+            xp.arange(nf, dtype=xp.int64, device=array_api_compat.device(nlist)),
+            (nf, 1, 1),
+        )
+        flat_idx = xp.reshape(frame_idx * nall + nlist_safe, (-1,))
+        src_local = xp.reshape(xp.take(mapping_flat, flat_idx, axis=0), nlist.shape)
+
+    # pt's src_ok filter: drop (here: mask) edges mapping outside [0, nloc).
+    mask = mask & (src_local >= 0) & (src_local < nloc)
+    src_local_safe = xp.where(mask, src_local, xp.zeros_like(src_local))
+    # Re-zero nlist_safe after the src_ok update so coordinate gathers stay
+    # in-bounds when callers pass local nlists with out-of-range entries.
+    nlist_safe = xp.where(mask, nlist_safe, xp.zeros_like(nlist_safe))
+    return mask, nlist_safe, src_local_safe
+
+
+def build_edge_type_feat(
+    type_ebed: Any,
+    src: Any,
+    dst: Any,
+) -> Any:
+    """
+    Build per-edge type features by summing src/dst embeddings.
+
+    Parameters
+    ----------
+    type_ebed
+        Per-node type embedding with shape (N, C).
+    src
+        Source node indices with shape (E,).
+    dst
+        Destination node indices with shape (E,).
+
+    Returns
+    -------
+    Array
+        Per-edge type features with shape (E, C).
+    """
+    xp = array_api_compat.array_namespace(type_ebed, src, dst)
+    # === Step 1. Normalize index dtypes ===
+    if src.dtype != xp.int64:
+        src = xp.astype(src, xp.int64)
+    if dst.dtype != xp.int64:
+        dst = xp.astype(dst, xp.int64)
+
+    # === Step 2. Sum source and destination embeddings ===
+    return xp.take(type_ebed, src, axis=0) + xp.take(type_ebed, dst, axis=0)
+
+
+def edge_cache_to_dtype(cache: EdgeCache, dtype: Any) -> EdgeCache:
+    """
+    Convert all floating-point tensors in EdgeCache to the specified dtype.
+
+    Integer tensors (src, dst) are unchanged. This is a standalone function
+    (not a method) to keep it side-effect free.
+
+    Parameters
+    ----------
+    cache
+        The edge feature cache to convert.
+    dtype
+        Target dtype for floating-point tensors.
+
+    Returns
+    -------
+    EdgeCache
+        New cache with converted tensors.
+    """
+    xp = array_api_compat.array_namespace(cache.edge_vec)
+    # Handle Optional tensors explicitly.
+    # Use local variables with explicit None check and assignment.
+    _D_full = cache.D_full
+    _Dt_full = cache.Dt_full
+    _edge_src_gate = cache.edge_src_gate
+    _edge_quat = cache.edge_quat
+    D_full: Any = None
+    Dt_full: Any = None
+    edge_src_gate: Any = None
+    edge_quat: Any = None
+    if _D_full is not None:
+        D_full = xp.astype(_D_full, dtype)
+    if _Dt_full is not None:
+        Dt_full = xp.astype(_Dt_full, dtype)
+    if _edge_src_gate is not None:
+        edge_src_gate = xp.astype(_edge_src_gate, dtype)
+    if _edge_quat is not None:
+        edge_quat = xp.astype(_edge_quat, dtype)
+
+    return EdgeCache(
+        src=cache.src,
+        dst=cache.dst,
+        edge_type_feat=xp.astype(cache.edge_type_feat, dtype),
+        edge_vec=xp.astype(cache.edge_vec, dtype),
+        edge_rbf=xp.astype(cache.edge_rbf, dtype),
+        edge_env=xp.astype(cache.edge_env, dtype),
+        deg=xp.astype(cache.deg, dtype),
+        inv_sqrt_deg=xp.astype(cache.inv_sqrt_deg, dtype),
+        D_full=D_full,
+        Dt_full=Dt_full,
+        D_to_m_cache=None if cache.D_to_m_cache is None else {},
+        Dt_from_m_cache=None if cache.Dt_from_m_cache is None else {},
+        edge_src_gate=edge_src_gate,
+        edge_quat=edge_quat,
+        edge_mask=cache.edge_mask,
+    )
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py b/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py
index 7cfec7f2bd..9d16db2d14 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py
@@ -1,31 +1,12 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-Embedding layers for the dpmodel DPA4/SeZM descriptor.
-
-This module is the dpmodel port of
-``deepmd.pt.model.descriptor.sezm_nn.embedding``. It defines the type
-embedding, geometric initial embedding, and environment-seed embedding used
-to initialize SeZM node features.
-
-Padded-edge layout
-------------------
-The pt implementation aggregates sparse per-edge messages into nodes with
-``index_add_``. The dpmodel port uses the padded, frame-explicit edge layout
-of :class:`~deepmd.dpmodel.descriptor.dpa4_nn.edge_cache.EdgeCache`
-(``E = nf * nloc * nnei`` with invalid slots marked by ``edge_mask == 0``),
-so every destination aggregation becomes a masked sum over the ``nnei`` axis
-of the ``(N, nnei, ...)`` reshape. Each rewrite is commented with the pt
-line it replaces.
-
-Ported / skipped classes
-------------------------
-- ``SeZMTypeEmbedding``, ``GeometricInitialEmbedding`` and
-  ``EnvironmentInitialEmbedding`` are ported (core consumers:
-  ``sezm.py:710``, ``sezm.py:826`` and ``sezm.py:733`` respectively).
-- ``ChargeSpinEmbedding`` (pt ``embedding.py:591``) is NOT ported: it is
-  constructed only when ``add_chg_spin_ebd=True`` (``sezm.py:717``), and the
-  flag defaults to ``False`` (``sezm.py:440``), so it is outside the core
-  DPA4 configuration targeted by this port.
+Embedding layers for the DPA4/SeZM descriptor.
+
+This module defines the type embedding, geometric initial embedding, and
+environment-seed embedding used to initialize SeZM node features.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.embedding``.
 """
 
 from __future__ import (
@@ -47,6 +28,7 @@
     NativeOP,
 )
 from deepmd.dpmodel.array_api import (
+    xp_add_at,
     xp_asarray_nodetach,
 )
 from deepmd.dpmodel.common import (
@@ -73,16 +55,6 @@
     )
 
 
-def _edge_layout(n_edge: int, n_nodes: int) -> int:
-    """Validate the padded-edge layout and return ``nnei = E // N``."""
-    if n_nodes <= 0 or n_edge % n_nodes != 0:
-        raise ValueError(
-            "padded-edge layout requires E to be a multiple of N; "
-            f"got E={n_edge}, N={n_nodes}"
-        )
-    return n_edge // n_nodes
-
-
 class SeZMTypeEmbedding(NativeOP):
     """
     Minimal SeZM type embedding with Adam-routed parameter naming.
@@ -94,7 +66,7 @@ class SeZMTypeEmbedding(NativeOP):
     embed_dim
         Embedding dimension.
     precision
-        Floating-point precision of the embedding table.
+        Parameter precision.
     seed
         Random seed for initialization.
     trainable
@@ -104,8 +76,7 @@ class SeZMTypeEmbedding(NativeOP):
 
     Notes
     -----
-    The parameter is named with ``adam_`` prefix so HybridMuon routes it to
-    Adam (the name matches the pt ``state_dict`` key ``adam_type_embedding``).
+    The parameter is named with ``adam_`` prefix so HybridMuon routes it to Adam.
     """
 
     def __init__(
@@ -130,17 +101,24 @@ def __init__(
             raise ValueError("`embed_dim` must be positive")
         prec = PRECISION_DICT[self.precision.lower()]
 
-        # === Step 1+2. Build the table; active rows N(0, init_std), padding
-        # row zero (pt embedding.py:103-124). The numpy RNG stream differs
-        # from pt's torch generator; weight values are not bit-compatible.
+        # === Step 1. Build the full embedding table in a local array ===
+        # The table is assembled locally and assigned to ``self`` exactly once.
+        # The pt_expt backend converts ``self`` attributes into torch buffers on
+        # assignment, so a later in-place slice write into
+        # ``self.adam_type_embedding`` would raise; the local-then-assign pattern
+        # keeps the produced values identical while staying backend-agnostic.
+        n_rows = self.ntypes + int(self.padding)
         init_std = 1.0 / math.sqrt(float(self.ntypes + self.embed_dim))
         rng = np.random.default_rng(child_seed(seed, 0))
-        table = rng.normal(scale=init_std, size=(self.ntypes, self.embed_dim))
+        table = np.empty((n_rows, self.embed_dim), dtype=prec)
+        table[: self.ntypes] = rng.normal(
+            0.0, init_std, size=(self.ntypes, self.embed_dim)
+        )
         if self.padding:
-            table = np.concatenate(
-                [table, np.zeros((1, self.embed_dim), dtype=table.dtype)], axis=0
-            )
-        self.adam_type_embedding = table.astype(prec)
+            table[self.ntypes] = 0.0
+
+        # === Step 2. Register the embedding table parameter ===
+        self.adam_type_embedding = table
 
     def call(self, atype: Any) -> Any:
         """
@@ -149,10 +127,7 @@ def call(self, atype: Any) -> Any:
         Parameters
         ----------
         atype
-            Atom types with shape (...,). Valid type range is [0, ntypes-1]
-            (plus the padding row index ``ntypes`` when ``padding=True``).
-            Negative type ids are invalid input and are NOT validated here
-            (caller contract).
+            Atom types with shape (...,). Valid type range is [0, ntypes-1].
 
         Returns
         -------
@@ -163,36 +138,33 @@ def call(self, atype: Any) -> Any:
         weight = xp_asarray_nodetach(
             xp, self.adam_type_embedding[...], device=array_api_compat.device(atype)
         )
-        # pt embedding.py:143 torch.embedding -> flat int64 take + reshape.
+        # torch.embedding gather: flatten the indices to int64, take the rows,
+        # then restore the original index shape.
         index = xp.astype(xp.reshape(atype, (-1,)), xp.int64)
         out = xp.take(weight, index, axis=0)
         return xp.reshape(out, (*atype.shape, self.embed_dim))
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize to a dict.
-
-        The pt class has no ``serialize()``; the ``@variables`` key here
-        matches the pt ``state_dict()`` key (``adam_type_embedding``).
-        """
+        """Serialize the SeZMTypeEmbedding to a dict."""
         return {
             "@class": "SeZMTypeEmbedding",
             "@version": 1,
             "config": {
                 "ntypes": self.ntypes,
                 "embed_dim": self.embed_dim,
-                "padding": self.padding,
-                "precision": self.precision.lower(),
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
                 "trainable": self.trainable,
+                "padding": self.padding,
                 "seed": None,
             },
             "@variables": {
-                "adam_type_embedding": to_numpy_array(self.adam_type_embedding)
+                "adam_type_embedding": to_numpy_array(self.adam_type_embedding),
             },
         }
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> SeZMTypeEmbedding:
-        """Deserialize from a dict."""
+        """Deserialize a SeZMTypeEmbedding from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "SeZMTypeEmbedding":
@@ -203,13 +175,9 @@ def deserialize(cls, data: dict[str, Any]) -> SeZMTypeEmbedding:
         variables = data.pop("@variables")
         obj = cls(**config)
         prec = PRECISION_DICT[obj.precision.lower()]
-        table = np.asarray(variables["adam_type_embedding"], dtype=prec)
-        if table.shape != obj.adam_type_embedding.shape:
-            raise ValueError(
-                f"adam_type_embedding shape {table.shape} does not match "
-                f"the expected shape {obj.adam_type_embedding.shape}"
-            )
-        obj.adam_type_embedding = table
+        obj.adam_type_embedding = np.asarray(
+            variables["adam_type_embedding"], dtype=prec
+        )
         return obj
 
 
@@ -217,10 +185,9 @@ class GeometricInitialEmbedding(NativeOP):
     """
     Geometric initial embedding that adds zonal (m=0) rotated features.
 
-    This module rotates pre-computed radial features for each degree l >= 1
-    using the zonal (m=0) column of the cached inverse Wigner-D blocks
-    (local->global). The l=0 component is not computed here since it comes
-    from type embedding.
+    This module rotates pre-computed radial features for each degree l >= 1 using the
+    zonal (m=0) column of the cached inverse Wigner-D blocks (local->global).
+    The l=0 component is not computed here since it comes from type embedding.
 
     Parameters
     ----------
@@ -229,8 +196,7 @@ class GeometricInitialEmbedding(NativeOP):
     channels
         Number of channels per (l, m) coefficient.
     precision
-        Floating-point precision label (kept for config parity with pt; the
-        computation follows the input dtype).
+        Parameter precision.
     """
 
     def __init__(
@@ -244,14 +210,16 @@ def __init__(
         self.channels = int(channels)
         self.ebed_dim = get_so3_dim_of_lmax(self.lmax)
         self.precision = precision
-        # One aligned entry per non-scalar node row: output row, local m=0
-        # column, and the matching radial degree slot (static int64 tables;
-        # pt registers them as persistent buffers, embedding.py:185-195).
         (
-            self.non_scalar_row_index,
-            self.zonal_m0_col_index_for_row,
-            self.radial_slot_index_for_row,
+            node_row_index,
+            node_zonal_m0_col_index,
+            node_radial_l_index,
         ) = build_gie_zonal_index(self.lmax)
+        # One aligned entry per non-scalar node row: output row, local m=0
+        # column, and the matching radial degree slot.
+        self.non_scalar_row_index = node_row_index
+        self.zonal_m0_col_index_for_row = node_zonal_m0_col_index
+        self.radial_slot_index_for_row = node_radial_l_index
 
     def call(
         self,
@@ -267,8 +235,7 @@ def call(
         n_nodes
             Number of nodes (nf*nloc).
         edge_cache
-            Per-edge cache containing geometry, weights, and Wigner-D blocks
-            in the padded layout (``E = n_nodes * nnei``).
+            Per-edge cache containing geometry, weights, and Wigner-D blocks.
         radial_feat
             Per-edge radial features with shape (E, lmax, C) for l=1..lmax.
         zonal_coupling
@@ -278,31 +245,21 @@ def call(
         Returns
         -------
         Array
-            Initial features to add with shape (N, D, C). l=0 is guaranteed
-            zero.
+            Initial features to add with shape (N, D, C). l=0 is guaranteed zero.
         """
         # === Step 1. Initialize output ===
         xp = array_api_compat.array_namespace(edge_cache.edge_vec)
         device = array_api_compat.device(edge_cache.edge_vec)
         dtype = edge_cache.edge_vec.dtype
         if self.lmax == 0:
-            # pt embedding.py:226-230: zeros short-circuit.
             return xp.zeros(
                 (n_nodes, self.ebed_dim, self.channels), dtype=dtype, device=device
-            )
-        # Keep ``n_edge``/``n_nodes`` symbolic (no ``int()``): they are the
-        # products ``nf*nloc*nnei`` / ``nf*nloc``. Casting to a Python int
-        # specializes them to the trace-time sample shape (e.g. nf*nloc==14),
-        # which breaks torch.export with a dynamic ``nloc`` dim. ``_edge_layout``
-        # returns a symbolic ``nnei`` and the masked-sum reshapes below use
-        # ``-1`` for the node axis to recover it symbolically.
+            )  # (N, D, C)
         n_edge = edge_cache.dst.shape[0]
-        nnei = _edge_layout(n_edge, n_nodes)
 
         # === Step 2. Gather all m=0 columns (l >= 1) in one shot ===
-        # pt embedding.py:235-241 pairs one packed non-scalar row with the
-        # zonal m=0 column from the same degree block via advanced indexing
-        # Dt_full[:, rows, cols]; here this becomes a flat row-major take.
+        # Advanced indexing pairs one packed non-scalar row with the zonal m=0 column
+        # from the same degree block in Dt_full.
         if zonal_coupling is None:
             Dt_full = edge_cache.Dt_full  # (E, D, D)
             dim_full = Dt_full.shape[-1]
@@ -318,8 +275,7 @@ def call(
             )  # (E, D-1)
 
         # === Step 3. Broadcast radial features per row ===
-        # Each non-scalar packed row reuses the radial feature of its degree l
-        # (pt embedding.py:245-250, index_select on axis 1).
+        # Each non-scalar packed row reuses the radial feature of its degree l.
         radial_slot_index = xp_asarray_nodetach(
             xp, self.radial_slot_index_for_row, device=device
         )
@@ -331,34 +287,38 @@ def call(
         )  # (E, D-1, C)
 
         # === Step 4. Source Freeze Propagation Gate (optional) ===
-        # pt embedding.py:256-260: mute messages emitted by nodes whose local
-        # neighborhood enters the frozen zone; ``edge_src_gate`` is ``None``
-        # outside bridging mode so this is a no-op in normal training.
+        # Mute messages emitted by nodes whose local neighborhood enters
+        # the frozen zone. ``edge_src_gate`` is ``None`` outside bridging
+        # mode so this is a no-op in normal training.
         src_gate = edge_cache.edge_src_gate
         if src_gate is not None:
             non_scalar_message = non_scalar_message * xp.astype(
                 xp.reshape(src_gate, (n_edge, 1, 1)), non_scalar_message.dtype
             )
 
-        # === Step 5. Aggregate to nodes and normalize ===
-        # pt embedding.py:264-267: non_scalar_out.index_add_(0, dst, msg) —
-        # padded-edge masked sum over the nnei axis (dst is slot-implicit).
+        # === Step 5. Scatter to nodes and normalize ===
+        # Destination scatter-add over ``edge_cache.dst`` (pt ``index_add_``),
+        # applied after the validity masking below. This reduction is
+        # layout-agnostic: it is correct both for the padded ``call`` (row-major
+        # ``dst`` makes the accumulation order identical to a sum over the
+        # ``nnei`` axis, hence bit-exact) and for the sparse ``call_with_edges``
+        # (arbitrary ``dst`` order and per-node degree). The l=0 row is left at
+        # its zero initialization by concatenating it below the contiguous
+        # non-scalar rows 1..D-1.
         edge_mask = edge_cache.edge_mask
         if edge_mask is not None:
             non_scalar_message = non_scalar_message * xp.astype(
                 xp.reshape(edge_mask, (n_edge, 1, 1)), non_scalar_message.dtype
             )
-        non_scalar_out = xp.sum(
-            xp.reshape(
-                non_scalar_message,
-                (-1, nnei, self.ebed_dim - 1, self.channels),
+        non_scalar_out = xp_add_at(
+            xp.zeros(
+                (n_nodes, self.ebed_dim - 1, self.channels),
+                dtype=non_scalar_message.dtype,
+                device=device,
             ),
-            axis=1,
+            edge_cache.dst,
+            non_scalar_message,
         )  # (N, D-1, C)
-        # pt embedding.py:268: out[:, non_scalar_row_index, :] = non_scalar_out
-        # with row 0 (l=0) left at its zeros init (pt embedding.py:226).
-        # ``non_scalar_row_index`` is the contiguous arange(1, D), so the
-        # writeback is a concat with a zero l=0 row.
         out = xp.concat(
             [
                 xp.zeros(
@@ -370,34 +330,27 @@ def call(
             ],
             axis=1,
         )  # (N, D, C)
-        # pt embedding.py:269: out.mul_(inv_sqrt_deg).
         out = out * xp.astype(edge_cache.inv_sqrt_deg, out.dtype)
         return xp.astype(out, dtype)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize to a dict (config only; same flat layout as pt)."""
         return {
             "@class": "GeometricInitialEmbedding",
             "@version": 1,
             "lmax": self.lmax,
             "channels": self.channels,
-            "precision": self.precision.lower(),
+            "precision": np.dtype(PRECISION_DICT[self.precision]).name,
         }
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> GeometricInitialEmbedding:
-        """Deserialize from a dict (accepts the pt ``serialize()`` output)."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "GeometricInitialEmbedding":
             raise ValueError(f"Invalid class for GeometricInitialEmbedding: {data_cls}")
         version = int(data.pop("@version"))
         check_version_compatibility(version, 1, 1)
-        return cls(
-            lmax=int(data.pop("lmax")),
-            channels=int(data.pop("channels")),
-            precision=str(data.pop("precision")),
-        )
+        return cls(**data)
 
 
 class EnvironmentInitialEmbedding(NativeOP):
@@ -413,12 +366,9 @@ class EnvironmentInitialEmbedding(NativeOP):
 
     The computation follows the environment matrix approach where::
 
-        1. Build `r_tilde = [s, s*r_hat]` where `s = edge_env / r` and
-           `r_hat = edge_vec / r`
-        2. G network: `g = G(rbf_proj(edge_rbf), type_src, type_dst)` produces
-           per-edge features
-           - Uses independent `env_type_embed` instead of projecting from the
-             main type embedding
+        1. Build `r_tilde = [s, s*r_hat]` where `s = edge_env / r` and `r_hat = edge_vec / r`
+        2. G network: `g = G(rbf_proj(edge_rbf), type_src, type_dst)` produces per-edge features
+           - Uses independent `env_type_embed` instead of projecting from main type embedding
            - Uses `rbf_proj` to project edge_rbf to `rbf_out_dim`
         3. env_agg: aggregate outer product `r_tilde ⊗ g` by destination node
         4. D matrix: `D = env_agg^T @ env_agg[:, :, :axis_dim]`
@@ -448,7 +398,7 @@ class EnvironmentInitialEmbedding(NativeOP):
     eps : float
         Small epsilon for numerical stability.
     precision : str
-        Floating-point precision of the parameters.
+        Parameter precision.
     trainable : bool
         Whether parameters are trainable.
     seed : int | list[int] | None
@@ -492,9 +442,9 @@ def __init__(
         self.trainable = bool(trainable)
 
         # === RBF projection: n_radial -> rbf_out_dim (two-layer MLP) ===
-        # rbf_out_dim = max(32, embed_dim - 2*type_dim) to align G-network
-        # width to embed_dim. First layer: n_radial -> rbf_out_dim with
-        # activation. Second layer: rbf_out_dim -> rbf_out_dim linear.
+        # rbf_out_dim = max(32, embed_dim - 2*type_dim) to align G-network width to embed_dim
+        # First layer: n_radial -> rbf_out_dim with activation
+        # Second layer: rbf_out_dim -> rbf_out_dim linear
         self.rbf_out_dim = max(32, self.embed_dim - 2 * self.type_dim)
         seed_rbf_proj = child_seed(seed, 0)
         self.rbf_proj_layer1 = NativeLayer(
@@ -518,11 +468,12 @@ def __init__(
 
         # === Independent type embedding: ntypes -> type_dim ===
         # Individual type embedding
+        seed_type_embed = child_seed(seed, 1)
         self.env_type_embed = SeZMTypeEmbedding(
             ntypes=self.ntypes,
             embed_dim=self.type_dim,
             precision=self.precision,
-            seed=child_seed(seed, 1),
+            seed=seed_type_embed,
             trainable=self.trainable,
         )
 
@@ -549,20 +500,18 @@ def __init__(
         )
 
         # === Output projection: embed_dim * axis_dim -> 2*channels ===
-        # Zero init so FiLM logits start at zero (pt init="final",
-        # embedding.py:447-455); strengths control magnitude.
+        # Zero init so FiLM logits start at zero; strengths control magnitude.
+        seed_out = child_seed(seed, 3)
         self.output_proj = NativeLayer(
             self.embed_dim * self.axis_dim,
             2 * self.channels,
             bias=False,
             activation_function=None,
             precision=self.precision,
-            seed=child_seed(seed, 3),
+            seed=seed_out,
             trainable=self.trainable,
         )
-        # Use an explicit shape/dtype instead of np.zeros_like(self.output_proj.w):
-        # in pt_expt the attribute is a requires-grad torch Parameter, on which
-        # numpy __array__ conversion raises.
+        # NativeLayer has no ``init="final"``; replicate it by zeroing the weight.
         self.output_proj.w = np.zeros(
             (self.embed_dim * self.axis_dim, 2 * self.channels),
             dtype=PRECISION_DICT[self.precision.lower()],
@@ -581,8 +530,7 @@ def call(
         Parameters
         ----------
         edge_cache : EdgeCache
-            Edge cache containing src, dst, edge_vec, edge_rbf, edge_env in
-            the padded layout (``E = n_nodes * nnei``).
+            Edge cache containing src, dst, edge_vec, edge_rbf, edge_env.
         atype_flat : Array
             Flattened atom types with shape (N,), where N = nf * nloc.
         n_nodes : int
@@ -598,14 +546,10 @@ def call(
         edge_vec = edge_cache.edge_vec  # (E, 3)
         edge_rbf = edge_cache.edge_rbf  # (E, n_radial)
         edge_env = edge_cache.edge_env  # (E, 1)
-        # Keep ``n_edge``/``n_nodes`` symbolic (no ``int()``); see the matching
-        # comment in ``GeometricInitialEmbedding.call`` for why casting to a
-        # Python int breaks torch.export with a dynamic ``nloc`` dim.
         n_edge = dst.shape[0]
-        nnei = _edge_layout(n_edge, n_nodes)
 
         # === Step 1. Construct r_tilde = [s, s*r_hat] ===
-        # s = edge_env * (1/r), r_hat = edge_vec / r (pt embedding.py:489-495)
+        # s = edge_env * (1/r), r_hat = edge_vec / r
         r_sq = xp.sum(edge_vec * edge_vec, axis=-1, keepdims=True)  # (E, 1)
         inv_r = 1.0 / xp.sqrt(r_sq + self.eps * self.eps)  # (E, 1)
         s = edge_env * inv_r  # (E, 1)
@@ -614,10 +558,8 @@ def call(
 
         # === Step 2. Compute G network input and output ===
         # Use independent type embeddings (decoupled from main type embedding)
-        src_index = xp.astype(xp.reshape(src, (n_edge,)), xp.int64)
-        dst_index = xp.astype(xp.reshape(dst, (n_edge,)), xp.int64)
-        atype_src = xp.take(atype_flat, src_index, axis=0)  # (E,)
-        atype_dst = xp.take(atype_flat, dst_index, axis=0)  # (E,)
+        atype_src = xp.take(atype_flat, xp.astype(src, xp.int64), axis=0)  # (E,)
+        atype_dst = xp.take(atype_flat, xp.astype(dst, xp.int64), axis=0)  # (E,)
         type_src = self.env_type_embed(atype_src)  # (E, type_dim)
         type_dst = self.env_type_embed(atype_dst)  # (E, type_dim)
 
@@ -631,72 +573,98 @@ def call(
         g = self.g_layer2(self.g_layer1(g_input))  # (E, embed_dim)
 
         # === Step 3. Aggregate outer product by destination node ===
-        # pt embedding.py:515 einsum("ei,ej->eij") -> broadcast product.
+        # outer = r_tilde[:, :, None] * g[:, None, :], einsum "ei,ej->eij".
         outer = r_tilde[:, :, None] * g[:, None, :]  # (E, 4, embed_dim)
-        outer_flat = xp.reshape(outer, (n_edge, 4 * self.embed_dim))
-        # Source Freeze Propagation Gate (pt embedding.py:519-521): mute the
-        # outer-product contribution of any edge whose source node has a
-        # neighbor in the frozen zone.
+        outer_flat = xp.reshape(outer, (n_edge, 4 * self.embed_dim))  # (E, 4*embed_dim)
+        # Source Freeze Propagation Gate: mute the outer-product contribution
+        # of any edge whose source node has a neighbor in the frozen zone.
         src_gate = edge_cache.edge_src_gate
         if src_gate is not None:
             outer_flat = outer_flat * xp.astype(
                 xp.reshape(src_gate, (n_edge, 1)), outer_flat.dtype
             )
-        # pt embedding.py:522-523: env_agg.index_add_(0, dst, outer_flat) —
-        # padded-edge masked sum over the nnei axis (dst is slot-implicit).
+        # Destination scatter-add over ``dst`` (pt ``index_add_``), applied after
+        # the validity masking below. Layout-agnostic: correct for the padded
+        # ``call`` (row-major ``dst`` keeps the accumulation order identical to a
+        # sum over the ``nnei`` axis, hence bit-exact) and for the sparse
+        # ``call_with_edges`` (arbitrary ``dst`` order and per-node degree).
         edge_mask = edge_cache.edge_mask
         if edge_mask is not None:
             outer_flat = outer_flat * xp.astype(
                 xp.reshape(edge_mask, (n_edge, 1)), outer_flat.dtype
             )
-        env_agg = xp.sum(
-            xp.reshape(outer_flat, (-1, nnei, 4 * self.embed_dim)),
-            axis=1,
+        env_agg = xp_add_at(
+            xp.zeros(
+                (n_nodes, 4 * self.embed_dim),
+                dtype=outer_flat.dtype,
+                device=array_api_compat.device(outer_flat),
+            ),
+            dst,
+            outer_flat,
         )  # (N, 4*embed_dim)
-        env_agg = xp.reshape(env_agg, (n_nodes, 4, self.embed_dim))
+        env_agg = xp.reshape(env_agg, (n_nodes, 4, self.embed_dim))  # (N, 4, embed_dim)
 
         # === Step 4. Smooth normalization by envelope-squared degree ===
         # Reuse the cache's inverse-sqrt degree so the version-aware
         # ``deg_norm_floor`` is applied consistently with GIE.
         env_agg = env_agg * xp.astype(edge_cache.inv_sqrt_deg, env_agg.dtype)
 
-        # === Step 5. D matrix: D = env_agg^T @ env_agg[:, :, :axis_dim] ===
+        # === Step 5. D matrix construction: D = env_agg^T @ env_agg[:,:,:axis_dim] ===
         env_agg_t = xp.permute_dims(env_agg, (0, 2, 1))  # (N, embed_dim, 4)
         env_agg_axis = env_agg[:, :, : self.axis_dim]  # (N, 4, axis_dim)
-        mat_d = xp.matmul(env_agg_t, env_agg_axis)  # (N, embed_dim, axis_dim)
+        D = xp.matmul(env_agg_t, env_agg_axis)  # (N, embed_dim, axis_dim)
 
         # === Step 6. Output projection for FiLM logits ===
-        d_flat = xp.reshape(
-            mat_d, (n_nodes, self.embed_dim * self.axis_dim)
+        D_flat = xp.reshape(
+            D, (n_nodes, self.embed_dim * self.axis_dim)
         )  # (N, embed_dim*axis_dim)
-        return self.output_proj(d_flat)
-
-    def _variable_slots(self) -> dict[str, tuple[Any, str]]:
-        """Map pt ``state_dict`` keys to (owner object, attribute name)."""
-        slots: dict[str, tuple[Any, str]] = {}
-        for name in ("rbf_proj_layer1", "rbf_proj_layer2", "g_layer1", "g_layer2"):
-            layer = getattr(self, name)
-            slots[f"{name}.matrix"] = (layer, "w")
-            if self.mlp_bias:
-                slots[f"{name}.bias"] = (layer, "b")
-        slots["env_type_embed.adam_type_embedding"] = (
-            self.env_type_embed,
-            "adam_type_embedding",
-        )
-        slots["output_proj.matrix"] = (self.output_proj, "w")
-        return slots
-
-    def serialize(self) -> dict[str, Any]:
-        """Serialize to a dict.
+        return self.output_proj(D_flat)
 
-        The ``@variables`` keys match the pt ``state_dict()`` key names, so
-        the pt ``serialize()`` output deserializes directly into this class
-        (and vice versa).
-        """
+    def _variables(self) -> dict[str, np.ndarray]:
+        """Variables keyed by the pt ``state_dict`` key names."""
         variables = {
-            key: to_numpy_array(getattr(owner, attr))
-            for key, (owner, attr) in self._variable_slots().items()
+            "rbf_proj_layer1.matrix": to_numpy_array(self.rbf_proj_layer1.w),
+            "rbf_proj_layer2.matrix": to_numpy_array(self.rbf_proj_layer2.w),
+            "env_type_embed.adam_type_embedding": to_numpy_array(
+                self.env_type_embed.adam_type_embedding
+            ),
+            "g_layer1.matrix": to_numpy_array(self.g_layer1.w),
+            "g_layer2.matrix": to_numpy_array(self.g_layer2.w),
+            "output_proj.matrix": to_numpy_array(self.output_proj.w),
         }
+        if self.mlp_bias:
+            variables["rbf_proj_layer1.bias"] = to_numpy_array(self.rbf_proj_layer1.b)
+            variables["rbf_proj_layer2.bias"] = to_numpy_array(self.rbf_proj_layer2.b)
+            variables["g_layer1.bias"] = to_numpy_array(self.g_layer1.b)
+            variables["g_layer2.bias"] = to_numpy_array(self.g_layer2.b)
+        return variables
+
+    def _load_variables(self, variables: dict[str, Any]) -> None:
+        """Load variables keyed by the pt ``state_dict`` key names."""
+        prec = PRECISION_DICT[self.precision.lower()]
+        self.rbf_proj_layer1.w = np.asarray(
+            variables["rbf_proj_layer1.matrix"], dtype=prec
+        )
+        self.rbf_proj_layer2.w = np.asarray(
+            variables["rbf_proj_layer2.matrix"], dtype=prec
+        )
+        self.env_type_embed.adam_type_embedding = np.asarray(
+            variables["env_type_embed.adam_type_embedding"], dtype=prec
+        )
+        self.g_layer1.w = np.asarray(variables["g_layer1.matrix"], dtype=prec)
+        self.g_layer2.w = np.asarray(variables["g_layer2.matrix"], dtype=prec)
+        self.output_proj.w = np.asarray(variables["output_proj.matrix"], dtype=prec)
+        if self.mlp_bias:
+            self.rbf_proj_layer1.b = np.asarray(
+                variables["rbf_proj_layer1.bias"], dtype=prec
+            )
+            self.rbf_proj_layer2.b = np.asarray(
+                variables["rbf_proj_layer2.bias"], dtype=prec
+            )
+            self.g_layer1.b = np.asarray(variables["g_layer1.bias"], dtype=prec)
+            self.g_layer2.b = np.asarray(variables["g_layer2.bias"], dtype=prec)
+
+    def serialize(self) -> dict[str, Any]:
         return {
             "@class": "EnvironmentInitialEmbedding",
             "@version": 1,
@@ -711,16 +679,16 @@ def serialize(self) -> dict[str, Any]:
                 "mlp_bias": self.mlp_bias,
                 "activation_function": self.activation_function,
                 "eps": self.eps,
-                "precision": self.precision.lower(),
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
                 "trainable": self.trainable,
                 "seed": None,
             },
-            "@variables": variables,
+            "@variables": self._variables(),
         }
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> EnvironmentInitialEmbedding:
-        """Deserialize from a dict (accepts the pt ``serialize()`` output)."""
+        """Deserialize from dictionary."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "EnvironmentInitialEmbedding":
@@ -730,20 +698,141 @@ def deserialize(cls, data: dict[str, Any]) -> EnvironmentInitialEmbedding:
         config = data.pop("config")
         variables = data.pop("@variables")
         obj = cls(**config)
-        prec = PRECISION_DICT[obj.precision.lower()]
-        slots = obj._variable_slots()
-        if set(variables) != set(slots):
-            raise ValueError(
-                f"variable keys {sorted(variables)} do not match the expected "
-                f"keys {sorted(slots)}"
-            )
-        for key, (owner, attr) in slots.items():
-            value = np.asarray(variables[key], dtype=prec)
-            expected_shape = getattr(owner, attr).shape
-            if value.shape != expected_shape:
-                raise ValueError(
-                    f"shape of {key} {value.shape} does not match "
-                    f"the expected shape {expected_shape}"
-                )
-            setattr(owner, attr, value)
+        obj._load_variables(variables)
+        return obj
+
+
+class ChargeSpinEmbedding(NativeOP):
+    """
+    Frame-level charge and spin embedding for scalar type features.
+
+    Parameters
+    ----------
+    embed_dim
+        Embedding dimension.
+    activation_function
+        Activation function used by the mixing layer.
+    precision
+        Parameter precision.
+    seed
+        Random seed for initialization.
+    trainable
+        Whether parameters are trainable.
+    """
+
+    def __init__(
+        self,
+        *,
+        embed_dim: int,
+        activation_function: str,
+        precision: str = DEFAULT_PRECISION,
+        seed: int | list[int] | None = None,
+        trainable: bool = True,
+    ) -> None:
+        self.embed_dim = int(embed_dim)
+        self.activation_function = str(activation_function)
+        self.precision = precision
+        self.trainable = bool(trainable)
+        if self.embed_dim <= 0:
+            raise ValueError("`embed_dim` must be positive")
+
+        self.charge_embedding = SeZMTypeEmbedding(
+            ntypes=200,
+            embed_dim=self.embed_dim,
+            precision=self.precision,
+            seed=child_seed(seed, 0),
+            trainable=self.trainable,
+            padding=False,
+        )
+        self.spin_embedding = SeZMTypeEmbedding(
+            ntypes=100,
+            embed_dim=self.embed_dim,
+            precision=self.precision,
+            seed=child_seed(seed, 1),
+            trainable=self.trainable,
+            padding=False,
+        )
+        self.mix_layer = NativeLayer(
+            2 * self.embed_dim,
+            self.embed_dim,
+            activation_function=self.activation_function,
+            precision=self.precision,
+            seed=child_seed(seed, 2),
+            trainable=self.trainable,
+        )
+
+    def call(self, charge_spin: Any) -> Any:
+        """
+        Embed frame-level charge and spin.
+
+        Parameters
+        ----------
+        charge_spin
+            Frame charge and spin values with shape (nf, 2).
+
+        Returns
+        -------
+        Array
+            Mixed condition embedding with shape (nf, embed_dim).
+        """
+        xp = array_api_compat.array_namespace(charge_spin)
+        charge = xp.astype(charge_spin[:, 0], xp.int64) + 100
+        spin = xp.astype(charge_spin[:, 1], xp.int64)
+        charge_embed = self.charge_embedding(charge)
+        spin_embed = self.spin_embedding(spin)
+        return self.mix_layer(xp.concat((charge_embed, spin_embed), axis=-1))
+
+    def _variables(self) -> dict[str, np.ndarray]:
+        """Variables keyed by the pt ``state_dict`` key names."""
+        return {
+            "charge_embedding.adam_type_embedding": to_numpy_array(
+                self.charge_embedding.adam_type_embedding
+            ),
+            "spin_embedding.adam_type_embedding": to_numpy_array(
+                self.spin_embedding.adam_type_embedding
+            ),
+            "mix_layer.matrix": to_numpy_array(self.mix_layer.w),
+            "mix_layer.bias": to_numpy_array(self.mix_layer.b),
+        }
+
+    def _load_variables(self, variables: dict[str, Any]) -> None:
+        """Load variables keyed by the pt ``state_dict`` key names."""
+        prec = PRECISION_DICT[self.precision.lower()]
+        self.charge_embedding.adam_type_embedding = np.asarray(
+            variables["charge_embedding.adam_type_embedding"], dtype=prec
+        )
+        self.spin_embedding.adam_type_embedding = np.asarray(
+            variables["spin_embedding.adam_type_embedding"], dtype=prec
+        )
+        self.mix_layer.w = np.asarray(variables["mix_layer.matrix"], dtype=prec)
+        self.mix_layer.b = np.asarray(variables["mix_layer.bias"], dtype=prec)
+
+    def serialize(self) -> dict[str, Any]:
+        """Serialize the ChargeSpinEmbedding to a dict."""
+        return {
+            "@class": "ChargeSpinEmbedding",
+            "@version": 1,
+            "config": {
+                "embed_dim": self.embed_dim,
+                "activation_function": self.activation_function,
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+                "trainable": self.trainable,
+                "seed": None,
+            },
+            "@variables": self._variables(),
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> ChargeSpinEmbedding:
+        """Deserialize a ChargeSpinEmbedding from a dict."""
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "ChargeSpinEmbedding":
+            raise ValueError(f"Invalid class for ChargeSpinEmbedding: {data_cls}")
+        version = int(data.pop("@version"))
+        check_version_compatibility(version, 1, 1)
+        config = data.pop("config")
+        variables = data.pop("@variables")
+        obj = cls(**config)
+        obj._load_variables(variables)
         return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/ffn.py b/deepmd/dpmodel/descriptor/dpa4_nn/ffn.py
index 984573ffed..84924687b1 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/ffn.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/ffn.py
@@ -2,9 +2,11 @@
 """
 Equivariant feed-forward layers for DPA4/SeZM.
 
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.ffn``.
-It defines the full SO(3)-equivariant feed-forward network used inside SeZM
-interaction blocks.
+This module defines the full SO(3)-equivariant feed-forward network used
+inside SeZM interaction blocks and the descriptor output head.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.ffn``.
 """
 
 from __future__ import (
@@ -39,12 +41,12 @@
 from .projection import (
     resolve_s2_grid_resolution,
 )
-from .so2 import (
-    _compute_precision,
-)
 from .so3 import (
     SO3Linear,
 )
+from .utils import (
+    get_promoted_dtype,
+)
 
 
 class EquivariantFFN(NativeOP):
@@ -57,14 +59,23 @@ class EquivariantFFN(NativeOP):
     Default structure (glu_activation=True):
         SO3 linear (in -> 2*hidden) -> split -> GatedActivation(val, gate) -> SO3 linear (hidden -> out)
 
-    Optional grid-FFN structure (s2_activation=True):
-        SO3 linear (in -> 2*hidden)
-        -> project packed SO(3) coefficients to the S2 grid
-        -> grid GLU or scalar-routed polynomial branch on hidden features
+    Optional grid-FFN structure (s2_activation=True or ffn_so3_grid=True):
+        SO3 linear (in -> hidden)
+        -> project packed SO(3) coefficients to the S2 or SO3 grid
+        -> grid GLU, polynomial MLP, or scalar-routed attention on hidden features
         -> project grid features back to packed SO(3) coefficients
         -> add scalar LinearSwiGLU branch to l=0
         -> SO3 linear (hidden -> out)
 
+    GatedActivation serves as the unified "activation" for equivariant networks,
+    analogous to SiLU in standard MLPs, but respecting SO(3) equivariance:
+    - l=0: Uses the specified activation function (or GLU variant when glu_activation=True)
+    - l>0: sigmoid gate from l=0 scalar features
+
+    When glu_activation=True, the first linear outputs 2*hidden_channels, then splits into
+    value and gate branches. This transforms activations like silu->swiglu, gelu->geglu.
+    The split approach is more efficient than two separate linear layers.
+
     Parameters
     ----------
     lmax
@@ -76,13 +87,14 @@ class EquivariantFFN(NativeOP):
     kmax
         Maximum Wigner-D frame order (|k|) used by the SO3 Wigner-D FFN grid.
     grid_mlp
-        If True, select the polynomial grid MLP operation (``op_type='mlp'``)
-        when the block-internal FFN grid path is enabled. ``grid_branch`` takes
-        precedence when positive.
+        If True, select the polynomial grid MLP operation when the
+        block-internal FFN grid path is enabled.
     grid_branch
         Number of scalar-routed polynomial product branches used when the
         block-internal FFN grid path is enabled. ``0`` disables this branch
         mixer. Positive values take precedence over ``grid_mlp``.
+    precision
+        Parameter precision.
     s2_activation
         If True, enable the S2 FFN grid path.
     ffn_so3_grid
@@ -95,9 +107,8 @@ class EquivariantFFN(NativeOP):
         If True, use GLU-style gating (e.g., silu -> swiglu, gelu -> geglu).
     mlp_bias
         Whether to use bias in SO3Linear (l=0 bias), GatedActivation
-        (gate linear bias).
-    precision
-        Parameter precision.
+        (gate linear bias), and the scalar point-wise projection when
+        ``grid_mlp=True``.
     trainable
         Whether parameters are trainable.
     seed
@@ -152,8 +163,9 @@ def __init__(
         self.glu_activation = bool(glu_activation)
         self.mlp_bias = bool(mlp_bias)
         self.precision = precision
-        self.compute_precision = _compute_precision(precision)
-        self.trainable = bool(trainable)
+        self.compute_precision = np.dtype(
+            get_promoted_dtype(PRECISION_DICT[precision])
+        ).name
         self.grid_n_frames = 2 * self.kmax + 1 if self.ffn_so3_grid else 1
 
         # === Step 0. Split deterministic seeds at the module top-level ===
@@ -178,7 +190,7 @@ def __init__(
             n_focus=1,
             precision=self.precision,
             mlp_bias=self.mlp_bias,
-            trainable=self.trainable,
+            trainable=trainable,
             seed=seed_so3_in,
         )
 
@@ -189,7 +201,6 @@ def __init__(
                 if self.use_grid_branch
                 else ("mlp" if self.use_grid_mlp else "glu")
             )
-            self.act: NativeOP
             if self.ffn_so3_grid:
                 self.act = SO3GridNet(
                     lmax=self.lmax,
@@ -202,7 +213,7 @@ def __init__(
                     layout="ndfc",
                     grid_branches=max(1, self.grid_branch),
                     mlp_bias=self.mlp_bias,
-                    trainable=self.trainable,
+                    trainable=trainable,
                     seed=seed_act,
                 )
             else:
@@ -219,7 +230,7 @@ def __init__(
                     grid_method=self.s2_grid_method,
                     grid_branches=max(1, self.grid_branch),
                     mlp_bias=self.mlp_bias,
-                    trainable=self.trainable,
+                    trainable=trainable,
                     seed=seed_act,
                 )
         else:
@@ -227,10 +238,10 @@ def __init__(
                 lmax=self.lmax,
                 channels=self.hidden_channels,
                 precision=self.compute_precision,
-                activation_function=self.activation_function,
+                activation_function=activation_function,
                 mlp_bias=self.mlp_bias,
                 layout="ndfc",
-                trainable=self.trainable,
+                trainable=trainable,
                 seed=seed_act,
             )
 
@@ -243,11 +254,13 @@ def __init__(
             n_focus=1,
             precision=self.precision,
             mlp_bias=self.mlp_bias,
-            trainable=self.trainable,
+            trainable=trainable,
             seed=seed_so3_out,
             init_std=0.0,
         )
 
+        self.trainable = bool(trainable)
+
     def call(self, x: Any) -> Any:
         """
         Parameters
@@ -268,9 +281,9 @@ def call(self, x: Any) -> Any:
             x = self.act(x)
         elif self.glu_activation:
             # Split into value and gate branches along channel dimension
-            # (pt uses x.chunk(2, dim=-1); slicing is array-API portable)
-            x_val = x[..., : self.hidden_channels]
-            x_gate = x[..., self.hidden_channels :]
+            nc = (x.shape[-1] + 1) // 2
+            x_val = x[..., :nc]
+            x_gate = x[..., nc:]
             # Pass gate to GatedActivation for GLU-style gating
             x = self.act(x_val, gate=x_gate)
         else:
@@ -299,29 +312,21 @@ def _variables(self) -> dict[str, Any]:
 
     def _load_variables(self, variables: dict[str, Any]) -> None:
         """Load variables keyed by the pt ``state_dict`` key names."""
-        variables = dict(variables)
         for attr, sub in self._sub_modules():
-            full = f"{attr}."
-            sv = {
-                key[len(full) :]: value
+            prefix = f"{attr}."
+            sub_variables = {
+                key[len(prefix) :]: value
                 for key, value in variables.items()
-                if key.startswith(full)
+                if key.startswith(prefix)
             }
-            for key in list(variables):
-                if key.startswith(full):
-                    del variables[key]
-            if not sv:
-                raise KeyError(f"Missing variables with prefix: {full}")
-            # rebuild the sub-module through its own (shape-checking)
-            # deserialize, reusing its serialized config
+            # Rebuild each sub-module via its own deserialize, reusing the freshly
+            # serialized config and injecting the loaded @variables.
             data = sub.serialize()
-            data["@variables"] = sv
+            data["@variables"] = sub_variables
             setattr(self, attr, type(sub).deserialize(data))
-        if variables:
-            raise KeyError(f"Unknown variables: {sorted(variables)}")
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the EquivariantFFN to a dict (pt-compatible format)."""
+        """Serialize the EquivariantFFN to a dict."""
         return {
             "@class": "EquivariantFFN",
             "@version": 1,
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/grid_net.py b/deepmd/dpmodel/descriptor/dpa4_nn/grid_net.py
index 407cf75b3f..87338fa744 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/grid_net.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/grid_net.py
@@ -2,38 +2,20 @@
 """
 Grid-space nonlinearities for DPA4/SeZM coefficient tensors.
 
-This module is the dpmodel port of
-``deepmd.pt.model.descriptor.sezm_nn.grid_net``. A grid net receives
-coefficient tensors, converts them to quadrature values, applies one
-point-wise grid operation, and projects the result back to coefficients. The
-public shapes are:
+A grid net receives coefficient tensors, converts them to quadrature values,
+applies one point-wise grid operation, and projects the result back to
+coefficients.  The public shapes are:
 
 * ``mode='self'``: one input ``(N, D, F, 2*C)`` or ``(N, F, D, 2*C)``.
-* ``mode='cross'``: separate query/context inputs each with ``C`` channels.
-* grid values: ``(N, G, F, C)`` after S2 or SO(3) projection.
-
-Ported names: ``BaseGridNet`` (``mode`` 'self'/'cross'; ``op_type``
-'glu'/'mlp'/'branch'; ``layout`` 'ndfc'/'nfdc'/'flat'; ``residual_scale_init``;
-general ``n_frames``), ``S2GridNet``, ``GridProduct``, ``GridMLP``,
-``GridBranch``.
-
-``BaseGridNet`` mirrors the current pt ``BaseGridNet`` for arbitrary
-``n_frames`` (the ``_to_grid``/``_from_grid`` frame-axis contraction). The S2
-path (``n_frames == 1``, ``mode='self'``) keeps a dedicated fast branch that is
-byte-identical to the previous S2-only specialization. The SO(3) frame
-machinery (``SO3GridNet``, ``FrameContract``, ``FrameExpand``) is ported here;
-``SO3GridNet`` builds an ``SO3GridProjector`` (``n_frames = 2 * kmax + 1``) and,
-in ``mode='cross'``, plugs ``FrameExpand``/``FrameContract`` into the
-``BaseGridNet`` ``frame_expand``/``frame_contract`` seams (kept ``None`` for S2).
-
-Serialization contract: the pt ``S2GridNet`` and ``GridBranch`` define no
-``serialize()`` (they only appear nested inside larger modules'
-state-dicts); the dpmodel ``serialize()``/``deserialize()`` use
-``@variables`` keys equal to the pt ``state_dict`` key names
-(``scalar_gate.weight``, ``grid_op.left_proj.weight``, ...) so that pt
-state-dict fragments load directly. The fixed projector matrices are
-non-persistent buffers in pt (not in the state dict) and are rebuilt from
-the config on deserialization.
+* ``mode='cross'``: query and context inputs with separate ``C`` channels.
+* grid values: ``(N, G, F, C)`` after S2 or SO3 projection.
+
+The only nonlinear scalar functions are SwiGLU, sigmoid, and softmax on the
+``l=0`` scalar branch.  Non-scalar grid values use channel-linear maps and
+point-wise products so equivariance is governed by the projector quadrature.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.grid_net``.
 """
 
 from __future__ import (
@@ -44,6 +26,7 @@
 from typing import (
     TYPE_CHECKING,
     Any,
+    Literal,
 )
 
 import array_api_compat
@@ -92,12 +75,28 @@
         Callable,
     )
 
+GridNetLayout = Literal["ndfc", "nfdc", "flat"]
+GridNetMode = Literal["self", "cross"]
+GridNetOp = Literal["glu", "mlp", "branch"]
 
-def _softmax_last_axis(x: Any) -> Any:
-    """Numerically stable softmax on the last axis (matches torch.softmax)."""
-    xp = array_api_compat.array_namespace(x)
-    e_x = xp.exp(x - xp.max(x, axis=-1, keepdims=True))
-    return e_x / xp.sum(e_x, axis=-1, keepdims=True)
+
+def _build_frame_degree_index(
+    *,
+    lmax: int,
+    mmax: int,
+    coefficient_layout: str,
+) -> np.ndarray:
+    """Build the per-coefficient degree index used by frame channel mixers."""
+    coefficient_layout = str(coefficient_layout).lower()
+    if coefficient_layout == "m_major":
+        return build_m_major_l_index(lmax, mmax)
+    if coefficient_layout == "packed":
+        degree_index = map_degree_idx(lmax)
+        if int(mmax) == int(lmax):
+            return degree_index
+        coeff_index = build_l_major_index(lmax, mmax)
+        return degree_index[coeff_index]
+    raise ValueError("`coefficient_layout` must be either 'packed' or 'm_major'")
 
 
 def _project_frames(coeff: Any, proj: ChannelLinear, n_frames: int) -> Any:
@@ -106,7 +105,7 @@ def _project_frames(coeff: Any, proj: ChannelLinear, n_frames: int) -> Any:
 
     Parameters
     ----------
-    coeff
+    coeff : Array
         Frame-packed coefficients with shape ``(N, D, F, n_frames * C_in)``.
     proj : ChannelLinear
         Linear map acting on the per-frame channel axis (``C_in -> C_out``).
@@ -131,246 +130,6 @@ def _project_frames(coeff: Any, proj: ChannelLinear, n_frames: int) -> Any:
     return xp.reshape(projected, (n_batch, coeff_dim, n_focus, -1))
 
 
-def _build_frame_degree_index(
-    *,
-    lmax: int,
-    mmax: int,
-    coefficient_layout: str,
-) -> np.ndarray:
-    """Build the per-coefficient degree index used by the frame channel mixers.
-
-    The pt version's ``device`` parameter is dropped: the output is a static
-    ``np.int64`` table mapping each coefficient row to its degree ``l`` for the
-    packed / truncated / m-major layouts.
-    """
-    coefficient_layout = str(coefficient_layout).lower()
-    if coefficient_layout == "m_major":
-        return build_m_major_l_index(lmax, mmax)
-    if coefficient_layout == "packed":
-        degree_index = map_degree_idx(lmax)
-        if int(mmax) == int(lmax):
-            return degree_index
-        coeff_index = build_l_major_index(lmax, mmax)
-        return degree_index[coeff_index]
-    raise ValueError("`coefficient_layout` must be either 'packed' or 'm_major'")
-
-
-class _FrameMixer(NativeOP):
-    """Shared base for the per-degree frame channel mixers.
-
-    The pt ``FrameContract`` / ``FrameExpand`` are ``nn.Module`` wrappers around
-    a per-degree weight of shape ``(lmax + 1, in_ch, out_ch)`` selected by a
-    static degree-index buffer; they realise an
-    ``einsum("ndfi,dio->ndfo", coeff, weight[degree_index])``. ``mode='self'``
-    S2 grid nets have ``n_frames == 1`` and never construct these; they back the
-    SO(3) cross-mode grid products only. Subclasses set ``in_channels`` /
-    ``out_channels`` and the init ``bound`` (matching the pt weight init).
-    """
-
-    def __init__(
-        self,
-        *,
-        lmax: int,
-        mmax: int,
-        coefficient_layout: str,
-        n_frames: int,
-        channels: int,
-        in_channels: int,
-        out_channels: int,
-        init_bound: float,
-        precision: str = DEFAULT_PRECISION,
-        trainable: bool = True,
-        seed: int | list[int] | None = None,
-    ) -> None:
-        self.lmax = int(lmax)
-        self.mmax = int(mmax)
-        self.coefficient_layout = str(coefficient_layout).lower()
-        self.n_frames = int(n_frames)
-        self.channels = int(channels)
-        self.precision = precision
-        self.trainable = bool(trainable)
-        # static np.int64 table; rebuilt from config on deserialize (the pt
-        # degree_index is a non-persistent buffer, not in the state dict)
-        self.degree_index = _build_frame_degree_index(
-            lmax=self.lmax,
-            mmax=self.mmax,
-            coefficient_layout=self.coefficient_layout,
-        )
-        prec = PRECISION_DICT[self.precision.lower()]
-        rng = np.random.default_rng(seed)
-        shape = (self.lmax + 1, int(in_channels), int(out_channels))
-        self.weight = rng.uniform(-init_bound, init_bound, size=shape).astype(prec)
-
-    def call(self, coeff: Any) -> Any:
-        """Apply the per-degree frame/channel map preserving the order index.
-
-        ``einsum("ndfi,dio->ndfo", coeff, weight[degree_index])`` is realised as
-        a broadcast batched matmul: the gathered weight ``(D, i, o)`` broadcasts
-        over the leading frame batch dim of ``coeff``.
-        """
-        xp = array_api_compat.array_namespace(coeff)
-        device = array_api_compat.device(coeff)
-        weight = xp_asarray_nodetach(xp, self.weight[...], device=device)
-        if weight.dtype != coeff.dtype:
-            weight = xp.astype(weight, coeff.dtype)
-        degree_index = xp_asarray_nodetach(xp, self.degree_index, device=device)
-        weight = xp.take(weight, degree_index, axis=0)  # (D, i, o)
-        # (N, D, F, i) @ (1, D, i, o) -> (N, D, F, o)
-        return xp.matmul(coeff, weight[None, ...])
-
-    def _serialize_config(self) -> dict[str, Any]:
-        return {
-            "lmax": self.lmax,
-            "mmax": self.mmax,
-            "coefficient_layout": self.coefficient_layout,
-            "n_frames": self.n_frames,
-            "channels": self.channels,
-            "precision": np.dtype(PRECISION_DICT[self.precision]).name,
-            "trainable": self.trainable,
-            "seed": None,
-        }
-
-    @classmethod
-    def _deserialize(cls, data: dict[str, Any]) -> Any:
-        data = data.copy()
-        data_cls = data.pop("@class")
-        if data_cls != cls.__name__:
-            raise ValueError(f"Invalid class for {cls.__name__}: {data_cls}")
-        version = int(data.pop("@version"))
-        check_version_compatibility(version, 1, 1)
-        config = data.pop("config")
-        variables = data.pop("@variables")
-        obj = cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            coefficient_layout=str(config["coefficient_layout"]),
-            n_frames=int(config["n_frames"]),
-            channels=int(config["channels"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
-        prec = PRECISION_DICT[obj.precision.lower()]
-        weight = np.asarray(variables["weight"], dtype=prec)
-        if weight.shape != obj.weight.shape:
-            raise ValueError(
-                f"weight shape {weight.shape} does not match "
-                f"the expected shape {obj.weight.shape}"
-            )
-        obj.weight = weight
-        return obj
-
-
-class FrameContract(_FrameMixer):
-    """Per-degree frame/channel contraction that preserves the order index.
-
-    Maps ``(N, D, F, K*C) -> (N, D, F, C)`` with a per-degree weight of shape
-    ``(lmax + 1, K*C, C)`` where ``K`` is ``n_frames``.
-    """
-
-    def __init__(
-        self,
-        *,
-        lmax: int,
-        mmax: int,
-        coefficient_layout: str,
-        n_frames: int,
-        channels: int,
-        precision: str = DEFAULT_PRECISION,
-        trainable: bool = True,
-        seed: int | list[int] | None = None,
-    ) -> None:
-        n_frames = int(n_frames)
-        channels = int(channels)
-        super().__init__(
-            lmax=lmax,
-            mmax=mmax,
-            coefficient_layout=coefficient_layout,
-            n_frames=n_frames,
-            channels=channels,
-            in_channels=n_frames * channels,
-            out_channels=channels,
-            init_bound=1.0 / math.sqrt(n_frames * channels),
-            precision=precision,
-            trainable=trainable,
-            seed=seed,
-        )
-
-    def serialize(self) -> dict[str, Any]:
-        """Serialize the FrameContract to a dict.
-
-        The pt ``FrameContract`` has no ``serialize()``; the ``@variables`` key
-        (``weight``) matches the pt ``state_dict`` key name. ``degree_index`` is
-        a non-persistent buffer in pt and is rebuilt from the config.
-        """
-        return {
-            "@class": "FrameContract",
-            "@version": 1,
-            "config": self._serialize_config(),
-            "@variables": {"weight": to_numpy_array(self.weight)},
-        }
-
-    @classmethod
-    def deserialize(cls, data: dict[str, Any]) -> FrameContract:
-        """Deserialize a FrameContract from a dict."""
-        return cls._deserialize(data)
-
-
-class FrameExpand(_FrameMixer):
-    """Per-degree frame/channel expansion that preserves the order index.
-
-    Maps ``(N, D, F, C) -> (N, D, F, K*C)`` with a per-degree weight of shape
-    ``(lmax + 1, C, K*C)`` where ``K`` is ``n_frames``.
-    """
-
-    def __init__(
-        self,
-        *,
-        lmax: int,
-        mmax: int,
-        coefficient_layout: str,
-        n_frames: int,
-        channels: int,
-        precision: str = DEFAULT_PRECISION,
-        trainable: bool = True,
-        seed: int | list[int] | None = None,
-    ) -> None:
-        n_frames = int(n_frames)
-        channels = int(channels)
-        super().__init__(
-            lmax=lmax,
-            mmax=mmax,
-            coefficient_layout=coefficient_layout,
-            n_frames=n_frames,
-            channels=channels,
-            in_channels=channels,
-            out_channels=n_frames * channels,
-            init_bound=1.0 / math.sqrt(channels),
-            precision=precision,
-            trainable=trainable,
-            seed=seed,
-        )
-
-    def serialize(self) -> dict[str, Any]:
-        """Serialize the FrameExpand to a dict.
-
-        The pt ``FrameExpand`` has no ``serialize()``; the ``@variables`` key
-        (``weight``) matches the pt ``state_dict`` key name. ``degree_index`` is
-        a non-persistent buffer in pt and is rebuilt from the config.
-        """
-        return {
-            "@class": "FrameExpand",
-            "@version": 1,
-            "config": self._serialize_config(),
-            "@variables": {"weight": to_numpy_array(self.weight)},
-        }
-
-    @classmethod
-    def deserialize(cls, data: dict[str, Any]) -> FrameExpand:
-        """Deserialize a FrameExpand from a dict."""
-        return cls._deserialize(data)
-
-
 class GridProduct(NativeOP):
     """Parameter-free quadratic grid product ``u(g) * v(g)``."""
 
@@ -388,67 +147,32 @@ def call(
 
         Parameters
         ----------
-        left, right
-            Coefficient operands with shape ``(N, D, F, C)``.
-        scalar_pair
+        left, right : Array
+            Coefficient operands with shape ``(N, D, F, n_frames * C)``.
+        scalar_pair : Array
             Invariant routing signal; unused on this path.
-        to_grid, from_grid
+        to_grid, from_grid : Callable
             Coefficient/grid projectors supplied by the owning grid net.
+
+        Returns
+        -------
+        Array
+            Coefficient result with shape ``(N, D, F, n_frames * C)``.
         """
         return from_grid(to_grid(left) * to_grid(right))
 
-    def serialize(self) -> dict[str, Any]:
-        """Serialize the parameter-free grid product to a dict."""
-        return {
-            "@class": "GridProduct",
-            "@version": 1,
-        }
-
-    @classmethod
-    def deserialize(cls, data: dict[str, Any]) -> GridProduct:
-        """Deserialize a GridProduct from a dict."""
-        data = data.copy()
-        data_cls = data.pop("@class", "GridProduct")
-        if data_cls != "GridProduct":
-            raise ValueError(f"Invalid class for GridProduct: {data_cls}")
-        check_version_compatibility(int(data.pop("@version", 1)), 1, 1)
-        return cls()
-
 
 class GridMLP(NativeOP):
-    """
-    Polynomial point-wise MLP applied independently at every grid point.
-
-    Frame-aware port of the pt ``GridMLP``: operands are packed as
-    ``(N, D, F, n_frames * C)`` and every channel projection is applied to each
-    Wigner-D frame independently through :func:`_project_frames`. The S2 case
-    (``n_frames == 1``) reduces to a plain per-channel projection, byte-for-byte
-    identical to the previous S2-only specialization.
-
-    Parameters
-    ----------
-    channels : int
-        Number of channels per grid point (per frame).
-    mode : str
-        Pairing mode, either ``"self"`` or ``"cross"``.
-    n_frames : int
-        Number of Wigner-D frames packed along the trailing channel axis.
-    precision : str
-        Parameter precision.
-    trainable : bool
-        Whether parameters are trainable.
-    seed : int | list[int] | None
-        Random seed for weight initialization.
-    """
+    """Polynomial point-wise MLP applied independently at every grid point."""
 
     def __init__(
         self,
         *,
         channels: int,
-        mode: str,
+        mode: GridNetMode,
         n_frames: int,
         precision: str = DEFAULT_PRECISION,
-        trainable: bool = True,
+        trainable: bool,
         seed: int | list[int] | None = None,
     ) -> None:
         self.channels = int(channels)
@@ -491,7 +215,7 @@ def call(
         self,
         left: Any,
         right: Any,
-        scalar_pair: Any = None,
+        scalar_pair: Any,
         *,
         to_grid: Callable[[Any], Any],
         from_grid: Callable[[Any], Any],
@@ -499,29 +223,33 @@ def call(
         """
         Apply the polynomial point-wise MLP on coefficient operands.
 
-        In self mode both projections see the per-frame concatenation of the
-        two operands and can form self and cross quadratic channel terms. In
+        In self mode, both projections see the per-frame concatenation of the
+        two operands and can form self and cross quadratic channel terms.  In
         cross mode the query and context roles stay separate:
         ``(W_q query) * (W_c context)``.
 
         Parameters
         ----------
-        left, right
+        left, right : Array
             Coefficient operands with shape ``(N, D, F, n_frames * C)``.
-        scalar_pair
+        scalar_pair : Array
             Invariant routing signal; unused on this path.
-        to_grid, from_grid
+        to_grid, from_grid : Callable
             Coefficient/grid projectors supplied by the owning grid net.
+
+        Returns
+        -------
+        Array
+            Coefficient result with shape ``(N, D, F, n_frames * C)``.
         """
+        xp = array_api_compat.array_namespace(left)
         # === Step 1. Channel projections at coefficient resolution ===
         if self.mode == "self":
-            xp = array_api_compat.array_namespace(left)
-            left_shape = tuple(left.shape)
-            shape = (*left_shape[:-1], self.n_frames, -1)
-            fused = xp.concat(
-                [xp.reshape(left, shape), xp.reshape(right, shape)], axis=-1
-            )  # per-frame concat -> (N, D, F, n_frames, 2C)
-            fused = xp.reshape(fused, (*left_shape[:-1], -1))  # (N, D, F, n_frames*2C)
+            shape = (*left.shape[:-1], self.n_frames, -1)
+            fused = xp.reshape(
+                xp.concat([xp.reshape(left, shape), xp.reshape(right, shape)], axis=-1),
+                (*left.shape[:-1], -1),
+            )  # per-frame concat -> (N, D, F, K*2C)
             left = _project_frames(fused, self.left_proj, self.n_frames)
             right = _project_frames(fused, self.right_proj, self.n_frames)
         else:
@@ -533,11 +261,7 @@ def call(
         return _project_frames(coeff, self.out_proj, self.n_frames)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the GridMLP to a dict.
-
-        The pt ``GridMLP`` has no ``serialize()``; the ``@variables`` keys here
-        match the pt ``state_dict`` key names.
-        """
+        """Serialize the GridMLP to a dict."""
         return {
             "@class": "GridMLP",
             "@version": 1,
@@ -563,65 +287,27 @@ def deserialize(cls, data: dict[str, Any]) -> GridMLP:
         data_cls = data.pop("@class")
         if data_cls != "GridMLP":
             raise ValueError(f"Invalid class for GridMLP: {data_cls}")
-        version = int(data.pop("@version"))
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(int(data.pop("@version")), 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            channels=int(config["channels"]),
-            mode=str(config["mode"]),
-            n_frames=int(config["n_frames"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
+        obj = cls(**config)
         obj._load_variables(variables)
         return obj
 
     def _load_variables(self, variables: dict[str, Any]) -> None:
         prec = PRECISION_DICT[self.precision.lower()]
-        for name, proj in (
-            ("left_proj", self.left_proj),
-            ("right_proj", self.right_proj),
-            ("out_proj", self.out_proj),
-        ):
-            weight = np.asarray(variables[f"{name}.weight"], dtype=prec)
-            if weight.shape != proj.weight.shape:
-                raise ValueError(
-                    f"{name}.weight shape {weight.shape} does not match "
-                    f"the expected shape {proj.weight.shape}"
-                )
-            proj.weight = weight
+        self.left_proj.weight = np.asarray(variables["left_proj.weight"], dtype=prec)
+        self.right_proj.weight = np.asarray(variables["right_proj.weight"], dtype=prec)
+        self.out_proj.weight = np.asarray(variables["out_proj.weight"], dtype=prec)
 
 
 class GridBranch(NativeOP):
     """
     Scalar-routed polynomial mixer over grid product branches.
 
-    The softmax sees only invariant scalar inputs. Each branch is a
+    The softmax sees only invariant scalar inputs.  Each branch is a
     quadratic product of grid fields, so rotations only act through the grid
     argument and the operation remains as band-limited as the product path.
-
-    Frame-aware port of the pt ``GridBranch``: operands are packed as
-    ``(N, D, F, n_frames * C)`` and every channel projection is applied to each
-    Wigner-D frame independently through :func:`_project_frames`. The S2 case
-    (``n_frames == 1``) reduces to a plain per-channel projection, byte-for-byte
-    identical to the previous S2-only specialization.
-
-    Parameters
-    ----------
-    channels : int
-        Number of channels per grid point (per frame).
-    n_branches : int
-        Number of scalar-routed product branches.
-    n_frames : int
-        Number of Wigner-D frames packed along the trailing channel axis.
-    precision : str
-        Parameter precision.
-    trainable : bool
-        Whether parameters are trainable.
-    seed : int | list[int] | None
-        Random seed for weight initialization.
     """
 
     def __init__(
@@ -631,7 +317,7 @@ def __init__(
         n_branches: int,
         n_frames: int,
         precision: str = DEFAULT_PRECISION,
-        trainable: bool = True,
+        trainable: bool,
         seed: int | list[int] | None = None,
     ) -> None:
         self.channels = int(channels)
@@ -686,20 +372,19 @@ def call(
         """
         Apply scalar-routed grid branch mixing on coefficient operands.
 
-        The channel maps are applied at coefficient resolution (per Wigner-D
-        frame via :func:`_project_frames`) and the grid transform is deferred to
-        the injected ``to_grid``/``from_grid`` callables, matching the pt
-        ``GridBranch``. The router operates on invariant scalars only, so the
-        softmax is frame-independent.
-
         Parameters
         ----------
-        left, right
+        left, right : Array
             Coefficient operands with shape ``(N, D, F, n_frames * C)``.
-        scalar_pair
+        scalar_pair : Array
             Invariant router source with shape ``(N, F, 2*C)``.
-        to_grid, from_grid
+        to_grid, from_grid : Callable
             Coefficient/grid projectors supplied by the owning grid net.
+
+        Returns
+        -------
+        Array
+            Coefficient result with shape ``(N, D, F, n_frames * C)``.
         """
         xp = array_api_compat.array_namespace(left)
         # === Step 1. Branch channel projections at coefficient resolution ===
@@ -710,10 +395,12 @@ def call(
         value = to_grid(left) * to_grid(right)  # (N, G, F, N_branches * C)
         n_batch, n_grid, n_focus, _ = value.shape
         value = xp.reshape(
-            value,
-            (n_batch, n_grid, n_focus, self.n_branches, self.channels),
-        )  # (N, G, F, N_branches, C)
-        router = _softmax_last_axis(self.router(scalar_pair))  # (N, F, N_branches)
+            value, (n_batch, n_grid, n_focus, self.n_branches, self.channels)
+        )
+        # torch.softmax over the branch axis -> (N, F, N_branches)
+        router = self.router(scalar_pair)
+        router = xp.exp(router - xp.max(router, axis=-1, keepdims=True))
+        router = router / xp.sum(router, axis=-1, keepdims=True)
         # einsum "ngfhc,nfh->ngfc" as a broadcast sum over the branch axis
         out = xp.sum(value * router[:, None, :, :, None], axis=3)  # (N, G, F, C)
 
@@ -721,11 +408,7 @@ def call(
         return _project_frames(from_grid(out), self.out_proj, self.n_frames)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the GridBranch to a dict.
-
-        The pt ``GridBranch`` has no ``serialize()``; the ``@variables`` keys
-        here match the pt ``state_dict`` key names.
-        """
+        """Serialize the GridBranch to a dict."""
         return {
             "@class": "GridBranch",
             "@version": 1,
@@ -752,36 +435,183 @@ def deserialize(cls, data: dict[str, Any]) -> GridBranch:
         data_cls = data.pop("@class")
         if data_cls != "GridBranch":
             raise ValueError(f"Invalid class for GridBranch: {data_cls}")
-        version = int(data.pop("@version"))
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(int(data.pop("@version")), 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            channels=int(config["channels"]),
-            n_branches=int(config["n_branches"]),
-            n_frames=int(config["n_frames"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
+        obj = cls(**config)
         obj._load_variables(variables)
         return obj
 
     def _load_variables(self, variables: dict[str, Any]) -> None:
         prec = PRECISION_DICT[self.precision.lower()]
-        for name, proj in (
-            ("left_proj", self.left_proj),
-            ("right_proj", self.right_proj),
-            ("router", self.router),
-            ("out_proj", self.out_proj),
-        ):
-            weight = np.asarray(variables[f"{name}.weight"], dtype=prec)
-            if weight.shape != proj.weight.shape:
-                raise ValueError(
-                    f"{name}.weight shape {weight.shape} does not match "
-                    f"the expected shape {proj.weight.shape}"
-                )
-            proj.weight = weight
+        self.left_proj.weight = np.asarray(variables["left_proj.weight"], dtype=prec)
+        self.right_proj.weight = np.asarray(variables["right_proj.weight"], dtype=prec)
+        self.router.weight = np.asarray(variables["router.weight"], dtype=prec)
+        self.out_proj.weight = np.asarray(variables["out_proj.weight"], dtype=prec)
+
+
+class FrameContract(NativeOP):
+    """Per-degree frame/channel contraction that preserves the order index."""
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        mmax: int,
+        coefficient_layout: str,
+        n_frames: int,
+        channels: int,
+        precision: str = DEFAULT_PRECISION,
+        trainable: bool,
+        seed: int | list[int] | None = None,
+    ) -> None:
+        self.lmax = int(lmax)
+        self.mmax = int(mmax)
+        self.coefficient_layout = str(coefficient_layout).lower()
+        self.n_frames = int(n_frames)
+        self.channels = int(channels)
+        self.precision = precision
+        self.trainable = bool(trainable)
+        self.degree_index = _build_frame_degree_index(
+            lmax=self.lmax,
+            mmax=self.mmax,
+            coefficient_layout=self.coefficient_layout,
+        )
+        prec = PRECISION_DICT[self.precision.lower()]
+        rng = np.random.default_rng(seed)
+        bound = 1.0 / math.sqrt(self.n_frames * self.channels)
+        self.weight = rng.uniform(
+            -bound,
+            bound,
+            size=(self.lmax + 1, self.n_frames * self.channels, self.channels),
+        ).astype(prec)
+
+    def call(self, coeff: Any) -> Any:
+        """Contract ``(N, D, F, K*C)`` frame coefficients to ``(N, D, F, C)``."""
+        xp = array_api_compat.array_namespace(coeff)
+        device = array_api_compat.device(coeff)
+        weight = xp_asarray_nodetach(xp, self.weight[...], device=device)
+        degree_index = xp_asarray_nodetach(xp, self.degree_index, device=device)
+        weight = xp.take(weight, degree_index, axis=0)
+        # einsum "ndfi,dio->ndfo" as a broadcast batched matmul:
+        # (N, D, F, i) @ (1, D, i, o) -> (N, D, F, o)
+        return xp.matmul(coeff, weight[None, ...])
+
+    def serialize(self) -> dict[str, Any]:
+        """Serialize the FrameContract to a dict."""
+        return {
+            "@class": "FrameContract",
+            "@version": 1,
+            "config": {
+                "lmax": self.lmax,
+                "mmax": self.mmax,
+                "coefficient_layout": self.coefficient_layout,
+                "n_frames": self.n_frames,
+                "channels": self.channels,
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+                "trainable": self.trainable,
+                "seed": None,
+            },
+            "@variables": {"weight": to_numpy_array(self.weight)},
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> FrameContract:
+        """Deserialize a FrameContract from a dict."""
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "FrameContract":
+            raise ValueError(f"Invalid class for FrameContract: {data_cls}")
+        check_version_compatibility(int(data.pop("@version")), 1, 1)
+        config = data.pop("config")
+        variables = data.pop("@variables")
+        obj = cls(**config)
+        obj.weight = np.asarray(
+            variables["weight"], dtype=PRECISION_DICT[obj.precision.lower()]
+        )
+        return obj
+
+
+class FrameExpand(NativeOP):
+    """Per-degree frame/channel expansion that preserves the order index."""
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        mmax: int,
+        coefficient_layout: str,
+        n_frames: int,
+        channels: int,
+        precision: str = DEFAULT_PRECISION,
+        trainable: bool,
+        seed: int | list[int] | None = None,
+    ) -> None:
+        self.lmax = int(lmax)
+        self.mmax = int(mmax)
+        self.coefficient_layout = str(coefficient_layout).lower()
+        self.n_frames = int(n_frames)
+        self.channels = int(channels)
+        self.precision = precision
+        self.trainable = bool(trainable)
+        self.degree_index = _build_frame_degree_index(
+            lmax=self.lmax,
+            mmax=self.mmax,
+            coefficient_layout=self.coefficient_layout,
+        )
+        prec = PRECISION_DICT[self.precision.lower()]
+        rng = np.random.default_rng(seed)
+        bound = 1.0 / math.sqrt(self.channels)
+        self.weight = rng.uniform(
+            -bound,
+            bound,
+            size=(self.lmax + 1, self.channels, self.n_frames * self.channels),
+        ).astype(prec)
+
+    def call(self, coeff: Any) -> Any:
+        """Expand ``(N, D, F, C)`` coefficients to ``(N, D, F, K*C)``."""
+        xp = array_api_compat.array_namespace(coeff)
+        device = array_api_compat.device(coeff)
+        weight = xp_asarray_nodetach(xp, self.weight[...], device=device)
+        degree_index = xp_asarray_nodetach(xp, self.degree_index, device=device)
+        weight = xp.take(weight, degree_index, axis=0)
+        # einsum "ndfi,dio->ndfo" as a broadcast batched matmul:
+        # (N, D, F, i) @ (1, D, i, o) -> (N, D, F, o)
+        return xp.matmul(coeff, weight[None, ...])
+
+    def serialize(self) -> dict[str, Any]:
+        """Serialize the FrameExpand to a dict."""
+        return {
+            "@class": "FrameExpand",
+            "@version": 1,
+            "config": {
+                "lmax": self.lmax,
+                "mmax": self.mmax,
+                "coefficient_layout": self.coefficient_layout,
+                "n_frames": self.n_frames,
+                "channels": self.channels,
+                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
+                "trainable": self.trainable,
+                "seed": None,
+            },
+            "@variables": {"weight": to_numpy_array(self.weight)},
+        }
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> FrameExpand:
+        """Deserialize a FrameExpand from a dict."""
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "FrameExpand":
+            raise ValueError(f"Invalid class for FrameExpand: {data_cls}")
+        check_version_compatibility(int(data.pop("@version")), 1, 1)
+        config = data.pop("config")
+        variables = data.pop("@variables")
+        obj = cls(**config)
+        obj.weight = np.asarray(
+            variables["weight"], dtype=PRECISION_DICT[obj.precision.lower()]
+        )
+        return obj
 
 
 class BaseGridNet(NativeOP):
@@ -789,16 +619,9 @@ class BaseGridNet(NativeOP):
     Shared implementation for S2 and SO(3) grid nets.
 
     ``mode='self'`` expects one input whose last channel axis contains two
-    branches; the first half supplies the SwiGLU gates of the scalar path.
-    ``mode='cross'`` expects separate query and context inputs.
-
-    Mirrors the current pt ``BaseGridNet``: ``mode`` ('self'/'cross'),
-    ``layout`` ('ndfc'/'nfdc'/'flat'), ``residual_scale_init`` and arbitrary
-    ``n_frames`` are all supported. The S2 (``n_frames == 1``) path keeps a
-    dedicated fast branch in ``_to_grid``/``_from_grid``/``_apply_scalar_path``
-    that is byte-identical to the previous S2-only specialization. The SO(3)
-    frame machinery (``frame_expand``/``frame_contract``) is built by the
-    not-yet-ported ``SO3GridNet``; the seams here stay ``None`` for S2.
+    branches.  ``mode='cross'`` expects query and context inputs; the query side
+    is the source of attention queries and SwiGLU gates, while the context side
+    is the key/value or second product branch.
     """
 
     def __init__(
@@ -807,12 +630,12 @@ def __init__(
         projector: BaseGridProjector,
         channels: int,
         n_focus: int,
-        mode: str,
-        op_type: str,
+        mode: GridNetMode,
+        op_type: GridNetOp,
         precision: str = DEFAULT_PRECISION,
-        layout: str,
+        layout: GridNetLayout,
         mlp_bias: bool,
-        trainable: bool = True,
+        trainable: bool,
         grid_branches: int = 1,
         frame_expand: NativeOP | None = None,
         frame_contract: NativeOP | None = None,
@@ -839,10 +662,6 @@ def __init__(
         self.mlp_bias = bool(mlp_bias)
         self.trainable = bool(trainable)
         self.expanded_channels = self.n_frames * self.channels
-        # ``frame_expand``/``frame_contract`` are the SO(3) frame machinery
-        # (built only by ``SO3GridNet`` in cross mode). They stay ``None`` for
-        # S2 (``n_frames == 1``); the seam below lets a later SO(3) port plug
-        # them in without touching the shared forward.
         self.frame_expand = frame_expand
         self.frame_contract = frame_contract
         self.query_channels = (
@@ -861,14 +680,6 @@ def __init__(
             self.channels if self.frame_contract is not None else self.expanded_channels
         )
         self.frame_zero_index = int(getattr(projector, "frame_zero_index", 0))
-        self.residual_scale_init = residual_scale_init
-        if residual_scale_init is None:
-            self.residual_scale: np.ndarray | None = None
-        else:
-            prec = PRECISION_DICT[self.precision.lower()]
-            self.residual_scale = np.ones(
-                (self.n_focus, self.output_channels), dtype=prec
-            ) * float(residual_scale_init)
 
         self.scalar_act = SwiGLU()
         self.scalar_gate = FocusLinear(
@@ -902,39 +713,54 @@ def __init__(
         else:
             self.grid_op = GridProduct()
 
+        self.residual_scale_init = residual_scale_init
+        if residual_scale_init is None:
+            self.residual_scale: np.ndarray | None = None
+        else:
+            prec = PRECISION_DICT[self.precision.lower()]
+            self.residual_scale = np.ones(
+                (self.n_focus, self.output_channels), dtype=prec
+            ) * float(residual_scale_init)
+
     def call(self, query: Any, context: Any = None) -> Any:
         """Apply the configured grid net and restore the input layout."""
         xp = array_api_compat.array_namespace(query)
         input_dtype = query.dtype
-        compute_dtype = get_xp_precision(xp, self.precision)
         query_ndfc, shape_info = self._to_ndfc(query)
-        left, right, scalar_pair = self._prepare_pair(
-            query_ndfc, context, compute_dtype
+        left, right, scalar_pair = self._prepare_pair(query_ndfc, context)
+        coeff_out = self.grid_op(
+            xp.astype(left, get_xp_precision(xp, self.precision)),
+            xp.astype(right, get_xp_precision(xp, self.precision)),
+            scalar_pair,
+            to_grid=self._to_grid,
+            from_grid=self._from_grid,
         )
-        coeff_out = self._apply_grid_op(left, right, scalar_pair, compute_dtype)
         coeff_out = self._apply_scalar_path(coeff_out, scalar_pair)
         coeff_out = self._contract_frames(coeff_out)
         coeff_out = self._apply_residual_scale(coeff_out)
-        if coeff_out.dtype != input_dtype:
-            coeff_out = xp.astype(coeff_out, input_dtype)
-        return self._restore_layout(coeff_out, shape_info)
+        return self._restore_layout(xp.astype(coeff_out, input_dtype), shape_info)
 
     def _prepare_pair(
-        self, query: Any, context: Any, compute_dtype: Any
+        self,
+        query: Any,
+        context: Any,
     ) -> tuple[Any, Any, Any]:
         if self.mode == "self":
-            return self._prepare_self_pair(query, compute_dtype)
-        return self._prepare_cross_pair(query, context, compute_dtype)
+            return self._prepare_self_pair(query)
+        return self._prepare_cross_pair(query, context)
 
     def _prepare_self_pair(
-        self, query: Any, compute_dtype: Any
+        self,
+        query: Any,
     ) -> tuple[Any, Any, Any]:
         left, right = self._split_self_query(query)
-        scalar_pair = self._make_scalar_pair(left, right, compute_dtype)
+        scalar_pair = self._make_scalar_pair(left, right)
         return left, right, scalar_pair
 
     def _prepare_cross_pair(
-        self, query: Any, context: Any, compute_dtype: Any
+        self,
+        query: Any,
+        context: Any,
     ) -> tuple[Any, Any, Any]:
         if context is None:
             raise ValueError("`context` is required when `mode='cross'`")
@@ -942,24 +768,20 @@ def _prepare_cross_pair(
         self._check_last_dim(query, self.context_channels, "query")
         self._check_last_dim(context_ndfc, self.context_channels, "context")
         if self.frame_expand is None:
-            scalar_pair = self._make_scalar_pair(query, context_ndfc, compute_dtype)
+            scalar_pair = self._make_scalar_pair(query, context_ndfc)
             return query, context_ndfc, scalar_pair
-        # SO(3) frame-expansion seam (built only by a later SO3GridNet port):
-        # the scalar pair is read from the d=0 slice before expansion, then
-        # both operands are lifted to the frame-packed width.
+
         xp = array_api_compat.array_namespace(query)
-        scalar_pair = xp.concat([query[:, 0, :, :], context_ndfc[:, 0, :, :]], axis=-1)
-        if scalar_pair.dtype != compute_dtype:
-            scalar_pair = xp.astype(scalar_pair, compute_dtype)
-        # Lift operands to compute_dtype BEFORE frame expansion so FrameExpand
-        # runs in the net's precision. ``_FrameMixer.call`` casts its weights to
-        # the operand dtype, so without this an fp32 input through an fp64 grid
-        # net would expand in fp32 and only upcast afterward; pt's fp64
-        # FrameExpand weights force fp64 expansion, so this matches pt.
-        if query.dtype != compute_dtype:
-            query = xp.astype(query, compute_dtype)
-        if context_ndfc.dtype != compute_dtype:
-            context_ndfc = xp.astype(context_ndfc, compute_dtype)
+        scalar_pair = xp.astype(
+            xp.concat(
+                [
+                    query[:, 0, :, :],
+                    context_ndfc[:, 0, :, :],
+                ],
+                axis=-1,
+            ),
+            get_xp_precision(xp, self.precision),
+        )
         return (
             self.frame_expand(query),
             self.frame_expand(context_ndfc),
@@ -978,58 +800,45 @@ def _apply_residual_scale(self, coeff: Any) -> Any:
         residual_scale = xp_asarray_nodetach(
             xp, self.residual_scale[...], device=array_api_compat.device(coeff)
         )
-        if residual_scale.dtype != coeff.dtype:
-            residual_scale = xp.astype(residual_scale, coeff.dtype)
+        residual_scale = xp.astype(residual_scale, coeff.dtype)
         return coeff * xp.reshape(
-            residual_scale, (1, 1, self.n_focus, self.output_channels)
+            residual_scale,
+            (1, 1, self.n_focus, self.output_channels),
         )
 
-    def _apply_grid_op(
+    def _apply_scalar_path(
         self,
-        left: Any,
-        right: Any,
+        coeff: Any,
         scalar_pair: Any,
-        compute_dtype: Any,
     ) -> Any:
-        xp = array_api_compat.array_namespace(left)
-        if left.dtype != compute_dtype:
-            left = xp.astype(left, compute_dtype)
-        if right.dtype != compute_dtype:
-            right = xp.astype(right, compute_dtype)
-        return self.grid_op(
-            left,
-            right,
-            scalar_pair,
-            to_grid=self._to_grid,
-            from_grid=self._from_grid,
-        )
-
-    def _apply_scalar_path(self, coeff: Any, scalar_pair: Any) -> Any:
         xp = array_api_compat.array_namespace(coeff)
-        scalar_out = self.scalar_act(scalar_pair)  # (N, F, C)
-        scalar_gate = xp_sigmoid(self.scalar_gate(scalar_pair))  # (N, F, C)
-        if self.n_frames == 1:
-            # Fast S2 path (byte-identical to the previous specialization).
-            coeff = coeff * scalar_gate[:, None, :, :]
-            # gradient-safe equivalent of the pt in-place
-            # ``coeff_view[:, 0, :, 0, :].add_(scalar_out)`` (n_frames == 1)
-            head = coeff[:, :1, :, :] + scalar_out[:, None, :, :]
-            return xp.concat([head, coeff[:, 1:, :, :]], axis=1)
-        # General frame-packed path mirroring the pt
-        # ``coeff_view = coeff.reshape(N, D, F, K, C)`` followed by a gated
-        # multiply and an in-place add into ``[:, 0, :, frame_zero_index, :]``.
+        scalar_out = self.scalar_act(scalar_pair)
+        scalar_gate = xp_sigmoid(self.scalar_gate(scalar_pair))
         n_batch, coeff_dim, n_focus, _ = coeff.shape
         coeff_view = xp.reshape(
-            coeff, (n_batch, coeff_dim, n_focus, self.n_frames, self.channels)
+            coeff,
+            (
+                n_batch,
+                coeff_dim,
+                n_focus,
+                self.n_frames,
+                self.channels,
+            ),
         )
         coeff_view = coeff_view * scalar_gate[:, None, :, None, :]
-        # gradient-safe in-place add into the d=0, frame_zero_index slice
+        # torch in-place ``coeff_view[:, 0, :, frame_zero_index, :].add_(scalar_out)``:
+        # array-API has no in-place item assignment, so the d=0 slab is rebuilt by
+        # functional concat with the scalar update added to the frame_zero_index frame.
         fzi = self.frame_zero_index
-        head = coeff_view[:, :1, :, :, :]  # (N, 1, F, K, C)
-        pre = head[:, :, :, :fzi, :]
-        mid = head[:, :, :, fzi : fzi + 1, :] + scalar_out[:, None, :, None, :]
-        post = head[:, :, :, fzi + 1 :, :]
-        head = xp.concat([pre, mid, post], axis=3)
+        head = coeff_view[:, :1, :, :, :]
+        head = xp.concat(
+            [
+                head[:, :, :, :fzi, :],
+                head[:, :, :, fzi : fzi + 1, :] + scalar_out[:, None, :, None, :],
+                head[:, :, :, fzi + 1 :, :],
+            ],
+            axis=3,
+        )
         coeff_view = xp.concat([head, coeff_view[:, 1:, :, :, :]], axis=1)
         return xp.reshape(
             coeff_view, (n_batch, coeff_dim, n_focus, self.expanded_channels)
@@ -1037,171 +846,174 @@ def _apply_scalar_path(self, coeff: Any, scalar_pair: Any) -> Any:
 
     def _split_self_query(self, query: Any) -> tuple[Any, Any]:
         self._check_last_dim(query, self.query_channels, "query")
-        # torch.chunk(query, 2, dim=-1) with an even channel count
+        # torch.chunk(query, chunks=2, dim=-1) with an even channel count
         return (
             query[..., : self.expanded_channels],
             query[..., self.expanded_channels :],
         )
 
-    def _make_scalar_pair(self, left: Any, right: Any, compute_dtype: Any) -> Any:
+    def _make_scalar_pair(self, left: Any, right: Any) -> Any:
         xp = array_api_compat.array_namespace(left)
-        scalar_pair = xp.concat(
-            [
-                self._extract_scalar(left),
-                self._extract_scalar(right),
-            ],
-            axis=-1,
+        return xp.astype(
+            xp.concat(
+                [
+                    self._extract_scalar(left),
+                    self._extract_scalar(right),
+                ],
+                axis=-1,
+            ),
+            get_xp_precision(xp, self.precision),
         )
-        if scalar_pair.dtype != compute_dtype:
-            scalar_pair = xp.astype(scalar_pair, compute_dtype)
-        return scalar_pair
 
     def _extract_scalar(self, coeff: Any) -> Any:
-        # (N, D, F, K*C) -> the (l=0, m=0) scalar slice (N, F, C).
-        if self.n_frames == 1:
-            return coeff[:, 0, :, :]
         xp = array_api_compat.array_namespace(coeff)
-        n_batch, coeff_dim, n_focus, _ = coeff.shape
+        n_batch, _, n_focus, _ = coeff.shape
         coeff_view = xp.reshape(
-            coeff, (n_batch, coeff_dim, n_focus, self.n_frames, self.channels)
+            coeff,
+            (
+                n_batch,
+                coeff.shape[1],
+                n_focus,
+                self.n_frames,
+                self.channels,
+            ),
         )
         return coeff_view[:, 0, :, self.frame_zero_index, :]
 
     def _to_grid(self, coeff: Any) -> Any:
+        # The per-frame channel width is inferred so the projector also serves
+        # widened operands (e.g. a branch hidden width ``n_branches * C``).
         xp = array_api_compat.array_namespace(coeff)
-        to_grid_mat = xp_asarray_nodetach(
+        n_batch, coeff_dim, n_focus, _ = coeff.shape
+        coeff_view = xp.reshape(coeff, (n_batch, coeff_dim, n_focus, self.n_frames, -1))
+        to_grid = xp_asarray_nodetach(
             xp, self.projector.to_grid_mat[...], device=array_api_compat.device(coeff)
         )
-        if to_grid_mat.dtype != coeff.dtype:
-            to_grid_mat = xp.astype(to_grid_mat, coeff.dtype)
-        if self.n_frames == 1:
-            # einsum "gd,ndfc->ngfc" as a broadcast batched matmul. The per-point
-            # channel width is inferred so the projector also serves widened
-            # operands (e.g. a branch hidden width ``n_branches * C``).
-            n_batch, coeff_dim, n_focus, n_channels = coeff.shape
-            flat = xp.reshape(coeff, (n_batch, coeff_dim, n_focus * n_channels))
-            out = xp.matmul(to_grid_mat[None, ...], flat)  # (N, G, F*C)
-            return xp.reshape(
-                out, (n_batch, self.projector.grid_size, n_focus, n_channels)
-            )
-        # General SO(3) frame-packed path mirroring the pt
-        # ``einsum("gdk,ndfkc->ngfc", to_grid.reshape(G, D, K), coeff_view)``.
-        # ``to_grid_mat`` columns are ordered (d outer, k inner), so the operand
-        # is permuted to the matching ``(d, k)`` flattening before the matmul.
-        n_batch, coeff_dim, n_focus, last = coeff.shape
-        n_channels = last // self.n_frames
-        coeff_view = xp.reshape(
-            coeff, (n_batch, coeff_dim, n_focus, self.n_frames, n_channels)
-        )
+        to_grid = xp.astype(to_grid, coeff.dtype)
+        # einsum "gdk,ndfkc->ngfc" (with to_grid reshaped (G, D, K)) as a
+        # broadcast batched matmul: the contracted (d, k) axes are flattened
+        # (d outer, k inner) and to_grid is already stored as (G, D*K).
+        n_channels = coeff_view.shape[-1]
         coeff_dk = xp.permute_dims(coeff_view, (0, 1, 3, 2, 4))  # (N, D, K, F, C)
         coeff_flat = xp.reshape(
             coeff_dk, (n_batch, coeff_dim * self.n_frames, n_focus * n_channels)
         )
-        out = xp.matmul(to_grid_mat[None, ...], coeff_flat)  # (N, G, F*C)
+        out = xp.matmul(to_grid[None, ...], coeff_flat)  # (N, G, F*C)
         return xp.reshape(out, (n_batch, self.projector.grid_size, n_focus, n_channels))
 
     def _from_grid(self, grid: Any) -> Any:
+        # Channel width is inferred to match the (possibly widened) grid field.
         xp = array_api_compat.array_namespace(grid)
-        from_grid_mat = xp_asarray_nodetach(
+        n_batch, _, n_focus, _ = grid.shape
+        coeff_dim = self.projector.coeff_dim // self.n_frames
+        from_grid = xp_asarray_nodetach(
             xp, self.projector.from_grid_mat[...], device=array_api_compat.device(grid)
         )
-        if from_grid_mat.dtype != grid.dtype:
-            from_grid_mat = xp.astype(from_grid_mat, grid.dtype)
-        if self.n_frames == 1:
-            # einsum "dg,ngfc->ndfc" as a broadcast batched matmul. The channel
-            # width is inferred to match the (possibly widened) grid field.
-            n_batch, n_grid, n_focus, n_channels = grid.shape
-            coeff_dim = self.projector.coeff_dim
-            flat = xp.reshape(grid, (n_batch, n_grid, n_focus * n_channels))
-            out = xp.matmul(from_grid_mat[None, ...], flat)  # (N, D, F*C)
-            return xp.reshape(out, (n_batch, coeff_dim, n_focus, n_channels))
-        # General SO(3) frame-packed path mirroring the pt
-        # ``einsum("dkg,ngfc->ndfkc", from_grid.reshape(D, K, G), grid)`` then a
-        # reshape to ``(N, D, F, K*C)``. ``from_grid_mat`` rows are ordered
-        # (d outer, k inner); the matmul output is reshaped/permuted to match.
-        n_batch, n_grid, n_focus, n_channels = grid.shape
-        coeff_dim = self.projector.coeff_dim // self.n_frames
-        flat = xp.reshape(grid, (n_batch, n_grid, n_focus * n_channels))
-        out = xp.matmul(from_grid_mat[None, ...], flat)  # (N, D*K, F*C)
-        out = xp.reshape(out, (n_batch, coeff_dim, self.n_frames, n_focus, n_channels))
-        out = xp.permute_dims(out, (0, 1, 3, 2, 4))  # (N, D, F, K, C)
+        from_grid = xp.astype(from_grid, grid.dtype)
+        # einsum "dkg,ngfc->ndfkc" (with from_grid reshaped (D, K, G)) as a
+        # broadcast batched matmul, then a reshape to (N, D, F, K*C). from_grid
+        # is already stored as (D*K, G); the matmul output is reshaped/permuted.
+        n_channels = grid.shape[-1]
+        grid_flat = xp.reshape(
+            grid, (n_batch, self.projector.grid_size, n_focus * n_channels)
+        )
+        coeff = xp.matmul(from_grid[None, ...], grid_flat)  # (N, D*K, F*C)
+        coeff = xp.reshape(
+            coeff, (n_batch, coeff_dim, self.n_frames, n_focus, n_channels)
+        )
+        coeff = xp.permute_dims(coeff, (0, 1, 3, 2, 4))  # (N, D, F, K, C)
         return xp.reshape(
-            out, (n_batch, coeff_dim, n_focus, self.n_frames * n_channels)
+            coeff, (n_batch, coeff_dim, n_focus, self.n_frames * n_channels)
         )
 
     def _to_ndfc(self, value: Any) -> tuple[Any, tuple[int, ...]]:
-        shape_info = tuple(value.shape)
+        xp = array_api_compat.array_namespace(value)
         if self.layout == "ndfc":
-            return value, shape_info
+            return value, tuple(value.shape)
         if self.layout == "nfdc":
-            # (N, F, D, C) -> (N, D, F, C)
-            xp = array_api_compat.array_namespace(value)
-            return xp.permute_dims(value, (0, 2, 1, 3)), shape_info
-        # "flat": (N, D, F*k*C) -> (N, D, F, k*C)
-        xp = array_api_compat.array_namespace(value)
+            return xp.permute_dims(value, (0, 2, 1, 3)), tuple(value.shape)
         n_batch, coeff_dim, _ = value.shape
         return (
             xp.reshape(value, (n_batch, coeff_dim, self.n_focus, -1)),
-            shape_info,
+            tuple(value.shape),
         )
 
-    def _restore_layout(self, value: Any, shape_info: tuple[int, ...]) -> Any:
+    def _restore_layout(
+        self,
+        value: Any,
+        shape_info: tuple[int, ...],
+    ) -> Any:
+        xp = array_api_compat.array_namespace(value)
         if self.layout == "ndfc":
             return value
-        xp = array_api_compat.array_namespace(value)
         if self.layout == "nfdc":
             return xp.permute_dims(value, (0, 2, 1, 3))
-        # "flat": (N, D, F, k*C) -> (N, D, F*k*C)
         n_batch, coeff_dim, _ = shape_info
         return xp.reshape(value, (n_batch, coeff_dim, -1))
 
-    def _check_last_dim(self, value: Any, expected: int, name: str) -> None:
+    def _check_last_dim(
+        self,
+        value: Any,
+        expected: int,
+        name: str,
+    ) -> None:
         if value.shape[-1] != expected:
             raise ValueError(
                 f"`{name}` last dimension must be {expected}, got {value.shape[-1]}"
             )
 
+    def _variables(self) -> dict[str, Any]:
+        """Collect weights keyed by the pt ``state_dict`` key names."""
+        variables: dict[str, Any] = {
+            "scalar_gate.weight": to_numpy_array(self.scalar_gate.weight)
+        }
+        if self.mlp_bias:
+            variables["scalar_gate.bias"] = to_numpy_array(self.scalar_gate.bias)
+        if self.op_type in {"mlp", "branch"}:
+            for key, value in self.grid_op.serialize()["@variables"].items():
+                variables[f"grid_op.{key}"] = value
+        if self.frame_expand is not None:
+            variables["frame_expand.weight"] = to_numpy_array(self.frame_expand.weight)
+        if self.frame_contract is not None:
+            variables["frame_contract.weight"] = to_numpy_array(
+                self.frame_contract.weight
+            )
+        if self.residual_scale is not None:
+            variables["residual_scale"] = to_numpy_array(self.residual_scale)
+        return variables
 
-class S2GridNet(BaseGridNet):
-    """Grid net using an S2 spherical-harmonic projector (Lebedev only).
+    def _load_variables(self, variables: dict[str, Any]) -> None:
+        """Load weights keyed by the pt ``state_dict`` key names."""
+        prec = PRECISION_DICT[self.precision.lower()]
+        self.scalar_gate.weight = np.asarray(
+            variables["scalar_gate.weight"], dtype=prec
+        )
+        if self.mlp_bias:
+            self.scalar_gate.bias = np.asarray(
+                variables["scalar_gate.bias"], dtype=prec
+            )
+        if self.op_type in {"mlp", "branch"}:
+            self.grid_op._load_variables(
+                {
+                    key[len("grid_op.") :]: value
+                    for key, value in variables.items()
+                    if key.startswith("grid_op.")
+                }
+            )
+        if self.frame_expand is not None:
+            self.frame_expand.weight = np.asarray(
+                variables["frame_expand.weight"], dtype=prec
+            )
+        if self.frame_contract is not None:
+            self.frame_contract.weight = np.asarray(
+                variables["frame_contract.weight"], dtype=prec
+            )
+        if self.residual_scale is not None:
+            self.residual_scale = np.asarray(variables["residual_scale"], dtype=prec)
 
-    Parameters
-    ----------
-    lmax : int
-        Maximum spherical harmonic degree.
-    mmax : int | None
-        Maximum order kept in the coefficient layout. If None, use ``lmax``.
-    channels : int
-        Number of channels per (l, m) coefficient.
-    n_focus : int
-        Number of focus streams.
-    mode : str
-        Pairing mode; ``"self"`` or ``"cross"``.
-    op_type : str
-        Point-wise grid operation; ``"glu"``, ``"mlp"`` or ``"branch"``.
-    precision : str
-        Parameter precision.
-    layout : str
-        Tensor layout convention: ``"ndfc"``, ``"nfdc"`` or ``"flat"``
-        (``"flat"`` is cross-only).
-    grid_resolution_list : list[int] | None
-        Lebedev ``[precision, n_points]`` pair; resolved automatically if None.
-    coefficient_layout : str
-        ``"packed"`` or ``"m_major"`` coefficient ordering.
-    grid_method : str
-        S2 quadrature backend; only ``"lebedev"`` is ported.
-    grid_branches : int
-        Number of scalar-routed branches when ``op_type='branch'``.
-    residual_scale_init : float | None
-        Initial value of the per-(focus, channel) residual scale; ``None``
-        disables the residual scale.
-    mlp_bias : bool
-        Whether to use bias in the scalar gate projection.
-    trainable : bool
-        Whether parameters are trainable.
-    seed : int | list[int] | None
-        Random seed for weight initialization.
-    """
+
+class S2GridNet(BaseGridNet):
+    """Grid net using an S2 spherical-harmonic projector."""
 
     def __init__(
         self,
@@ -1210,21 +1022,17 @@ def __init__(
         mmax: int | None = None,
         channels: int,
         n_focus: int = 1,
-        mode: str,
-        op_type: str,
+        mode: GridNetMode,
+        op_type: GridNetOp,
         precision: str = DEFAULT_PRECISION,
-        layout: str,
+        layout: GridNetLayout,
         grid_resolution_list: list[int] | None = None,
         coefficient_layout: str = "packed",
-        # Deliberate divergence from pt's default ("e3nn"): the e3nn
-        # product-grid branch is not ported to dpmodel and always raises, so
-        # the only usable default here is "lebedev". Checkpoint compatibility
-        # is unaffected because serialize always records the explicit value.
-        grid_method: str = "lebedev",
+        grid_method: str = "e3nn",
         grid_branches: int = 1,
         residual_scale_init: float | None = None,
         mlp_bias: bool = False,
-        trainable: bool = True,
+        trainable: bool,
         seed: int | list[int] | None = None,
     ) -> None:
         projector = S2GridProjector(
@@ -1254,22 +1062,7 @@ def __init__(
         )
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the S2GridNet to a dict.
-
-        The pt ``S2GridNet`` has no ``serialize()``; the ``@variables`` keys
-        here match the pt ``state_dict`` key names (the projector matrices
-        are non-persistent buffers in pt and are rebuilt from the config).
-        """
-        variables = {"scalar_gate.weight": to_numpy_array(self.scalar_gate.weight)}
-        if self.mlp_bias:
-            variables["scalar_gate.bias"] = to_numpy_array(self.scalar_gate.bias)
-        if self.op_type in {"mlp", "branch"}:
-            grid_op_data = self.grid_op.serialize()["@variables"]
-            for key, value in grid_op_data.items():
-                variables[f"grid_op.{key}"] = value
-        if self.residual_scale is not None:
-            # pt state-dict key name for the (n_focus, output_channels) parameter
-            variables["residual_scale"] = to_numpy_array(self.residual_scale)
+        """Serialize the S2GridNet to a dict."""
         return {
             "@class": "S2GridNet",
             "@version": 1,
@@ -1291,7 +1084,7 @@ def serialize(self) -> dict[str, Any]:
                 "trainable": self.trainable,
                 "seed": None,
             },
-            "@variables": variables,
+            "@variables": self._variables(),
         }
 
     @classmethod
@@ -1301,110 +1094,16 @@ def deserialize(cls, data: dict[str, Any]) -> S2GridNet:
         data_cls = data.pop("@class")
         if data_cls != "S2GridNet":
             raise ValueError(f"Invalid class for S2GridNet: {data_cls}")
-        version = int(data.pop("@version"))
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(int(data.pop("@version")), 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            channels=int(config["channels"]),
-            n_focus=int(config["n_focus"]),
-            mode=str(config["mode"]),
-            op_type=str(config["op_type"]),
-            precision=str(config["precision"]),
-            layout=str(config["layout"]),
-            grid_resolution_list=config["grid_resolution_list"],
-            coefficient_layout=str(config["coefficient_layout"]),
-            grid_method=str(config["grid_method"]),
-            grid_branches=int(config["grid_branches"]),
-            residual_scale_init=config.get("residual_scale_init"),
-            mlp_bias=bool(config["mlp_bias"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
-        prec = PRECISION_DICT[obj.precision.lower()]
-        weight = np.asarray(variables["scalar_gate.weight"], dtype=prec)
-        if weight.shape != obj.scalar_gate.weight.shape:
-            raise ValueError(
-                f"scalar_gate.weight shape {weight.shape} does not match "
-                f"the expected shape {obj.scalar_gate.weight.shape}"
-            )
-        obj.scalar_gate.weight = weight
-        if obj.mlp_bias:
-            obj.scalar_gate.bias = np.asarray(
-                variables["scalar_gate.bias"], dtype=prec
-            ).reshape(obj.scalar_gate.bias.shape)
-        if obj.residual_scale is not None:
-            residual_scale = np.asarray(variables["residual_scale"], dtype=prec)
-            if residual_scale.shape != obj.residual_scale.shape:
-                raise ValueError(
-                    f"residual_scale shape {residual_scale.shape} does not match "
-                    f"the expected shape {obj.residual_scale.shape}"
-                )
-            obj.residual_scale = residual_scale
-        if obj.op_type in {"mlp", "branch"}:
-            obj.grid_op._load_variables(
-                {
-                    key[len("grid_op.") :]: value
-                    for key, value in variables.items()
-                    if key.startswith("grid_op.")
-                }
-            )
+        obj = cls(**config)
+        obj._load_variables(variables)
         return obj
 
 
 class SO3GridNet(BaseGridNet):
-    """Grid net using a Wigner-D SO(3) projector with frame indices.
-
-    dpmodel port of the current pt
-    ``deepmd.pt.model.descriptor.sezm_nn.grid_net.SO3GridNet``. Unlike
-    ``S2GridNet`` (``n_frames == 1``), the SO(3) projector packs
-    ``n_frames = 2 * kmax + 1`` Wigner-D frames along the trailing channel axis,
-    exercising the general ``n_frames > 1`` ``_to_grid``/``_from_grid`` paths of
-    ``BaseGridNet``. In ``mode='cross'`` it additionally builds the per-degree
-    :class:`FrameExpand` / :class:`FrameContract` channel mixers and plugs them
-    into the ``BaseGridNet`` ``frame_expand``/``frame_contract`` seam: the query
-    and context are expanded ``C -> n_frames * C`` before the grid product and
-    contracted ``n_frames * C -> C`` afterwards.
-
-    Parameters
-    ----------
-    lmax : int
-        Maximum spherical harmonic degree.
-    mmax : int | None
-        Maximum order kept in the coefficient layout. If None, use ``lmax``.
-    kmax : int
-        Frame band width; ``n_frames = 2 * kmax + 1`` Wigner-D frames.
-    channels : int
-        Number of channels per (l, m) coefficient (per frame).
-    n_focus : int
-        Number of focus streams.
-    mode : str
-        Pairing mode; ``"self"`` or ``"cross"``.
-    op_type : str
-        Point-wise grid operation; ``"glu"``, ``"mlp"`` or ``"branch"``.
-    precision : str
-        Parameter precision.
-    layout : str
-        Tensor layout convention: ``"ndfc"``, ``"nfdc"`` or ``"flat"``
-        (``"flat"`` is cross-only).
-    lebedev_precision : int | None
-        Lebedev algebraic precision; resolved automatically if None.
-    coefficient_layout : str
-        ``"packed"`` or ``"m_major"`` coefficient ordering.
-    grid_branches : int
-        Number of scalar-routed branches when ``op_type='branch'``.
-    residual_scale_init : float | None
-        Initial value of the per-(focus, channel) residual scale; ``None``
-        disables the residual scale.
-    mlp_bias : bool
-        Whether to use bias in the scalar gate projection.
-    trainable : bool
-        Whether parameters are trainable.
-    seed : int | list[int] | None
-        Random seed for weight initialization.
-    """
+    """Grid net using a Wigner-D SO(3) projector with frame indices."""
 
     def __init__(
         self,
@@ -1414,16 +1113,16 @@ def __init__(
         kmax: int = 1,
         channels: int,
         n_focus: int = 1,
-        mode: str,
-        op_type: str,
+        mode: GridNetMode,
+        op_type: GridNetOp,
         precision: str = DEFAULT_PRECISION,
-        layout: str,
+        layout: GridNetLayout,
         lebedev_precision: int | None = None,
         coefficient_layout: str = "packed",
         grid_branches: int = 1,
         residual_scale_init: float | None = None,
         mlp_bias: bool = False,
-        trainable: bool = True,
+        trainable: bool,
         seed: int | list[int] | None = None,
     ) -> None:
         projector = SO3GridProjector(
@@ -1439,12 +1138,9 @@ def __init__(
         self.lebedev_precision = projector.lebedev_precision
         self.n_gamma = projector.n_gamma
         self.grid_branches = int(grid_branches)
-        frame_expand: FrameExpand | None = None
-        frame_contract: FrameContract | None = None
-        if str(mode).lower() == "cross":
-            # pt builds the frame mixers with child_seed(seed, 4)/(seed, 5);
-            # ``BaseGridNet`` uses child_seed(seed, 0)/(seed, 1) for scalar_gate
-            # /grid_op, so these branches never collide.
+        frame_expand = None
+        frame_contract = None
+        if mode == "cross":
             frame_expand = FrameExpand(
                 lmax=lmax,
                 mmax=projector.mmax,
@@ -1483,48 +1179,29 @@ def __init__(
         )
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the SO3GridNet to a dict.
-
-        The pt ``SO3GridNet`` has no ``serialize()``; the ``@variables`` keys
-        here match the pt ``state_dict`` key names (``scalar_gate.weight``,
-        ``grid_op.*``, ``frame_expand.weight``, ``frame_contract.weight``,
-        ``residual_scale``) so pt state-dict fragments load directly. The
-        projector matrices are non-persistent buffers in pt and are rebuilt
-        from the nested projector config on deserialization.
-        """
-        variables = {"scalar_gate.weight": to_numpy_array(self.scalar_gate.weight)}
-        if self.mlp_bias:
-            variables["scalar_gate.bias"] = to_numpy_array(self.scalar_gate.bias)
-        if self.op_type in {"mlp", "branch"}:
-            grid_op_data = self.grid_op.serialize()["@variables"]
-            for key, value in grid_op_data.items():
-                variables[f"grid_op.{key}"] = value
-        if self.frame_expand is not None:
-            variables["frame_expand.weight"] = to_numpy_array(self.frame_expand.weight)
-        if self.frame_contract is not None:
-            variables["frame_contract.weight"] = to_numpy_array(
-                self.frame_contract.weight
-            )
-        if self.residual_scale is not None:
-            variables["residual_scale"] = to_numpy_array(self.residual_scale)
+        """Serialize the SO3GridNet to a dict."""
         return {
             "@class": "SO3GridNet",
             "@version": 1,
             "config": {
+                "lmax": self.lmax,
+                "mmax": self.projector.mmax,
+                "kmax": self.kmax,
                 "channels": self.channels,
                 "n_focus": self.n_focus,
                 "mode": self.mode,
                 "op_type": self.op_type,
                 "precision": np.dtype(PRECISION_DICT[self.precision]).name,
                 "layout": self.layout,
+                "lebedev_precision": self.lebedev_precision,
+                "coefficient_layout": self.projector.coefficient_layout,
                 "grid_branches": self.grid_branches,
                 "residual_scale_init": self.residual_scale_init,
                 "mlp_bias": self.mlp_bias,
                 "trainable": self.trainable,
                 "seed": None,
-                "projector": self.projector.serialize(),
             },
-            "@variables": variables,
+            "@variables": self._variables(),
         }
 
     @classmethod
@@ -1534,80 +1211,9 @@ def deserialize(cls, data: dict[str, Any]) -> SO3GridNet:
         data_cls = data.pop("@class")
         if data_cls != "SO3GridNet":
             raise ValueError(f"Invalid class for SO3GridNet: {data_cls}")
-        version = int(data.pop("@version"))
-        check_version_compatibility(version, 1, 1)
+        check_version_compatibility(int(data.pop("@version")), 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        projector_data = config["projector"]
-        projector_cls = projector_data.get("@class")
-        if projector_cls != "SO3GridProjector":
-            raise ValueError(
-                f"Invalid nested projector class for SO3GridNet: {projector_cls}"
-            )
-        if "@version" not in projector_data:
-            raise ValueError("nested SO3GridProjector payload is missing '@version'")
-        check_version_compatibility(int(projector_data["@version"]), 1, 1)
-        projector_config = projector_data["config"]
-        obj = cls(
-            lmax=int(projector_config["lmax"]),
-            mmax=int(projector_config["mmax"]),
-            kmax=int(projector_config["kmax"]),
-            channels=int(config["channels"]),
-            n_focus=int(config["n_focus"]),
-            mode=str(config["mode"]),
-            op_type=str(config["op_type"]),
-            precision=str(config["precision"]),
-            layout=str(config["layout"]),
-            lebedev_precision=int(projector_config["lebedev_precision"]),
-            coefficient_layout=str(projector_config["coefficient_layout"]),
-            grid_branches=int(config["grid_branches"]),
-            residual_scale_init=config.get("residual_scale_init"),
-            mlp_bias=bool(config["mlp_bias"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
-        prec = PRECISION_DICT[obj.precision.lower()]
-        weight = np.asarray(variables["scalar_gate.weight"], dtype=prec)
-        if weight.shape != obj.scalar_gate.weight.shape:
-            raise ValueError(
-                f"scalar_gate.weight shape {weight.shape} does not match "
-                f"the expected shape {obj.scalar_gate.weight.shape}"
-            )
-        obj.scalar_gate.weight = weight
-        if obj.mlp_bias:
-            obj.scalar_gate.bias = np.asarray(
-                variables["scalar_gate.bias"], dtype=prec
-            ).reshape(obj.scalar_gate.bias.shape)
-        if obj.residual_scale is not None:
-            residual_scale = np.asarray(variables["residual_scale"], dtype=prec)
-            if residual_scale.shape != obj.residual_scale.shape:
-                raise ValueError(
-                    f"residual_scale shape {residual_scale.shape} does not match "
-                    f"the expected shape {obj.residual_scale.shape}"
-                )
-            obj.residual_scale = residual_scale
-        if obj.frame_expand is not None:
-            expand_weight = np.asarray(variables["frame_expand.weight"], dtype=prec)
-            if expand_weight.shape != obj.frame_expand.weight.shape:
-                raise ValueError(
-                    f"frame_expand.weight shape {expand_weight.shape} does not "
-                    f"match the expected shape {obj.frame_expand.weight.shape}"
-                )
-            obj.frame_expand.weight = expand_weight
-        if obj.frame_contract is not None:
-            contract_weight = np.asarray(variables["frame_contract.weight"], dtype=prec)
-            if contract_weight.shape != obj.frame_contract.weight.shape:
-                raise ValueError(
-                    f"frame_contract.weight shape {contract_weight.shape} does "
-                    f"not match the expected shape {obj.frame_contract.weight.shape}"
-                )
-            obj.frame_contract.weight = contract_weight
-        if obj.op_type in {"mlp", "branch"}:
-            obj.grid_op._load_variables(
-                {
-                    key[len("grid_op.") :]: value
-                    for key, value in variables.items()
-                    if key.startswith("grid_op.")
-                }
-            )
+        obj = cls(**config)
+        obj._load_variables(variables)
         return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/indexing.py b/deepmd/dpmodel/descriptor/dpa4_nn/indexing.py
index 70ee724214..161c3c0bb1 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/indexing.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/indexing.py
@@ -3,14 +3,10 @@
 SO(3) packed-index and projection helpers for DPA4/SeZM.
 
 This module defines the packed `(l, m)` indexing helpers and the projection
-utilities used by the DPA4 equivariant operators. It is the dpmodel port of
-``deepmd.pt.model.descriptor.sezm_nn.indexing``.
+utilities used by the SeZM equivariant operators.
 
-The index-table builders run at module-init time on static index data and are
-implemented in plain numpy by design (not array-API); they return ``np.int64``
-arrays. The torch-specific ``device``/``dtype`` keyword parameters of the pt
-versions are dropped for those builders. Only ``project_D_to_m`` and
-``project_Dt_from_m`` operate on runtime tensors and are array-API compatible.
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.indexing``.
 """
 
 from __future__ import (
@@ -59,9 +55,6 @@ def map_degree_idx(lmax: int) -> np.ndarray:
     For each spherical harmonic coefficient position in the packed tensor,
     returns the corresponding angular momentum quantum number l.
 
-    The torch version's ``device`` parameter is dropped: the output is a static
-    numpy table.
-
     Examples
     --------
     For lmax=2, the packed layout has D=9 positions:
@@ -79,11 +72,14 @@ def map_degree_idx(lmax: int) -> np.ndarray:
     Returns
     -------
     np.ndarray
-        ``np.int64`` array with shape (D,), where D=(lmax+1)^2.
+        Integer array with shape (D,), where D=(lmax+1)^2.
         Each element is the l value for that position.
     """
     lmax = int(lmax)
-    counts = np.array([2 * degree + 1 for degree in range(lmax + 1)], dtype=np.int64)
+    counts = np.array(
+        [2 * degree + 1 for degree in range(lmax + 1)],
+        dtype=np.int64,
+    )
     return np.repeat(np.arange(lmax + 1, dtype=np.int64), counts)
 
 
@@ -95,9 +91,6 @@ def build_gie_zonal_index(lmax: int) -> tuple[np.ndarray, np.ndarray, np.ndarray
     coefficient in the node representation. They select the local ``m=0`` column
     of the matching degree from ``Dt_full`` or an equivalent zonal coupling table.
 
-    The torch version's ``device`` parameter is dropped: the output is a static
-    numpy table.
-
     Parameters
     ----------
     lmax
@@ -109,7 +102,6 @@ def build_gie_zonal_index(lmax: int) -> tuple[np.ndarray, np.ndarray, np.ndarray
         ``(node_row_index, node_zonal_m0_col_index, node_radial_l_index)``.
         The first two index packed SO(3) rows/columns; the last one indexes
         radial features with degree slots ``l=1..lmax`` stored as ``0..lmax-1``.
-        All are ``np.int64`` arrays.
     """
     lmax_i = int(lmax)
     ebed_dim = get_so3_dim_of_lmax(lmax_i)
@@ -136,8 +128,6 @@ def project_D_to_m(
     """
     Row-project block-diagonal Wigner-D to the m-major truncated layout.
 
-    This function operates on runtime tensors and is array-API compatible.
-
     Parameters
     ----------
     D_full
@@ -193,8 +183,6 @@ def project_Dt_from_m(
     """
     Column-project block-diagonal Wigner-D^T for inverse rotation.
 
-    This function operates on runtime tensors and is array-API compatible.
-
     Parameters
     ----------
     Dt_full
@@ -274,9 +262,6 @@ def build_l_major_index(lmax: int, mmax: int) -> np.ndarray:
     - l = 0..lmax
     - within each l, m = -min(mmax, l) .. +min(mmax, l)
 
-    The torch version's ``device`` parameter is dropped: the output is a static
-    numpy table.
-
     Parameters
     ----------
     lmax
@@ -287,9 +272,9 @@ def build_l_major_index(lmax: int, mmax: int) -> np.ndarray:
     Returns
     -------
     np.ndarray
-        ``np.int64`` array of indices with shape (D_m_trunc,), selecting
-        coefficients from the full packed layout with D=(lmax+1)^2, where
-        D_m_trunc is the number of coefficients kept under ``|m| <= min(mmax, l)``.
+        Long array of indices with shape (D_m_trunc,), selecting coefficients
+        from the full packed layout with D=(lmax+1)^2, where D_m_trunc is
+        the number of coefficients kept under ``|m| <= min(mmax, l)``.
 
     Examples
     --------
@@ -312,7 +297,7 @@ def build_l_major_index(lmax: int, mmax: int) -> np.ndarray:
         m_keep = min(mmax_i, degree)
         for m in range(-m_keep, m_keep + 1):
             indices.append(so3_packed_index(degree, m))
-    return np.asarray(indices, dtype=np.int64)
+    return np.array(indices, dtype=np.int64)
 
 
 def build_m_major_index(lmax: int, mmax: int) -> np.ndarray:
@@ -326,9 +311,6 @@ def build_m_major_index(lmax: int, mmax: int) -> np.ndarray:
         - negative part: l = m..lmax, coefficient (l, -m)
         - positive part: l = m..lmax, coefficient (l, +m)
 
-    The torch version's ``device`` parameter is dropped: the output is a static
-    numpy table.
-
     Parameters
     ----------
     lmax
@@ -339,9 +321,9 @@ def build_m_major_index(lmax: int, mmax: int) -> np.ndarray:
     Returns
     -------
     np.ndarray
-        ``np.int64`` array of indices with shape (D_m_trunc,), selecting
-        coefficients from the full packed layout with D=(lmax+1)^2, where
-        D_m_trunc is the number of coefficients kept under ``|m| <= min(mmax, l)``.
+        Long array of indices with shape (D_m_trunc,), selecting coefficients
+        from the full packed layout with D=(lmax+1)^2, where D_m_trunc is
+        the number of coefficients kept under ``|m| <= min(mmax, l)``.
 
     Examples
     --------
@@ -372,16 +354,13 @@ def build_m_major_index(lmax: int, mmax: int) -> np.ndarray:
         for degree in range(m, lmax_i + 1):
             indices.append(so3_packed_index(degree, m))
 
-    return np.asarray(indices, dtype=np.int64)
+    return np.array(indices, dtype=np.int64)
 
 
 def build_m_major_l_index(lmax: int, mmax: int) -> np.ndarray:
     """
     Build degree (l) index aligned with `build_m_major_index`.
 
-    The torch version's ``device`` parameter is dropped: the output is a static
-    numpy table.
-
     Parameters
     ----------
     lmax
@@ -392,8 +371,8 @@ def build_m_major_l_index(lmax: int, mmax: int) -> np.ndarray:
     Returns
     -------
     np.ndarray
-        ``np.int64`` array of degrees with shape (D_m_trunc,). Entry i is the
-        degree l for the i-th coefficient in the m-major layout.
+        Long array of degrees with shape (D_m_trunc,). Entry i is the degree
+        l for the i-th coefficient in the m-major layout.
 
     Examples
     --------
@@ -424,15 +403,13 @@ def build_m_major_l_index(lmax: int, mmax: int) -> np.ndarray:
         for degree in range(m, lmax_i + 1):
             degrees.append(degree)
 
-    return np.asarray(degrees, dtype=np.int64)
+    return np.array(degrees, dtype=np.int64)
 
 
 def build_rotate_inv_rescale(
     lmax: int,
     mmax: int,
     degree_index: np.ndarray,
-    *,
-    dtype: Any = np.float64,
 ) -> np.ndarray:
     """
     Build reduced-layout inverse-rotation rescale factors.
@@ -442,10 +419,6 @@ def build_rotate_inv_rescale(
     degrees by ``sqrt((2*l+1)/(2*mmax+1))`` so the reduced representation matches
     the amplitude expected by the full SO(3) basis.
 
-    The torch version's ``device`` parameter is dropped: the output is a static
-    numpy table. ``dtype`` is kept (as a numpy dtype) since the floating-point
-    precision of the rescale vector is meaningful.
-
     Parameters
     ----------
     lmax
@@ -455,8 +428,6 @@ def build_rotate_inv_rescale(
     degree_index
         Degree index aligned with the reduced coefficient layout, typically
         returned by ``build_m_major_l_index``.
-    dtype
-        Floating-point numpy dtype for the returned array.
 
     Returns
     -------
@@ -474,13 +445,13 @@ def build_rotate_inv_rescale(
         raise ValueError("`mmax` must be <= `lmax`")
 
     degrees = np.asarray(degree_index, dtype=np.int64)
-    rescale = np.ones(degrees.shape[0], dtype=dtype)
+    rescale = np.ones(degrees.shape[0], dtype=np.float64)
     if mmax_i == lmax_i:
         return rescale
 
     mask = degrees > mmax_i
     if mask.any():
         denom = float(2 * mmax_i + 1)
-        degree_values = degrees[mask].astype(dtype)
+        degree_values = degrees[mask].astype(np.float64)
         rescale[mask] = np.sqrt((2.0 * degree_values + 1.0) / denom)
     return rescale
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/lora.py b/deepmd/dpmodel/descriptor/dpa4_nn/lora.py
new file mode 100644
index 0000000000..a4600b7dbe
--- /dev/null
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/lora.py
@@ -0,0 +1,991 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""LoRA low-rank fine-tuning support for DPA4/SeZM.
+
+This module adds two things:
+
+* ``LoRASO3`` and ``LoRASO2`` subclasses that wrap the corresponding base
+  equivariant linear operators (``SO3Linear`` / ``SO2Linear``).  Each one
+  freezes the pre-trained weights and registers rank-``R`` adapter
+  parameters ``A``/``B`` whose shapes share the base's batch layout
+  (per-``l`` for SO(3), per-``|m|``-group for SO(2)).  The LoRA delta is
+  folded into the *effective* weight before the single large einsum that
+  already exists in the base module; forward FLOPs are therefore identical
+  to the base, and the overhead comes only from an ``O(R)`` weight-side
+  matmul that does not depend on the number of edges or nodes.
+
+* ``apply_lora_to_sezm``, ``merge_lora_into_base`` and a few helpers that
+  drive the fine-tune policy (which submodules stay trainable, which ones
+  remain frozen) and the merged-checkpoint export used by
+  ``Trainer.save_model_merged``.
+
+Naming convention: the LoRA parameter names -- ``A_by_l``, ``B_by_l``,
+``A_m0``, ``B_m0``, ``A_m``, ``B_m`` -- intentionally do **not** start with
+``adam_`` / ``adamw_`` and do not contain ``bias``.  ``HybridMuon.get_adam_route``
+therefore classifies them as ``muon`` and, because the tensors have the
+same rank structure as the corresponding base weights, the slice-mode
+matrix view gives per-``l`` / per-``|m|``-group Newton-Schulz updates that
+match the base training recipe.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.lora``.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import math
+from copy import (
+    deepcopy,
+)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+import array_api_compat
+import numpy as np
+
+from deepmd.dpmodel import (
+    DEFAULT_PRECISION,
+    PRECISION_DICT,
+    NativeOP,
+)
+from deepmd.dpmodel.array_api import (
+    xp_asarray_nodetach,
+)
+from deepmd.dpmodel.common import (
+    to_numpy_array,
+)
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
+
+from .activation import (
+    GatedActivation,
+)
+from .so2 import (
+    SO2Linear,
+)
+from .so3 import (
+    SO3Linear,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Iterator,
+    )
+
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
+# ---------------------------------------------------------------------------
+# LoRA adapter modules
+# ---------------------------------------------------------------------------
+
+
+class LoRASO3(SO3Linear):
+    """
+    Per-l ELoRA adapter for ``SO3Linear``.
+
+    The pre-trained weight ``self.weight`` (``(lmax+1, C_in, F*C_out)``) is
+    frozen.  Two new 3D parameters ``A_by_l`` (``(lmax+1, rank, C_in)``) and
+    ``B_by_l`` (``(lmax+1, F*C_out, rank)``) share the same ``lmax+1`` batch
+    axis as the base so that ``muon_mode="slice"`` updates every ``l``-block
+    independently.  SO(3) equivariance is preserved because the per-``l``
+    delta only rotates within each ``l``-block (no cross-``l`` mixing).
+
+    Parameters
+    ----------
+    lmax, in_channels, out_channels, n_focus, precision, mlp_bias, trainable, seed
+        Forwarded to ``SO3Linear`` to build the frozen base weight.
+    lora_rank
+        LoRA rank.  Must satisfy ``lora_rank >= 1``.
+    lora_alpha
+        Scaling numerator; the effective scaling is ``lora_alpha / lora_rank``.
+        ``None`` defaults to ``lora_alpha = lora_rank`` (scaling ``1.0``).
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        in_channels: int,
+        out_channels: int,
+        n_focus: int = 1,
+        precision: str = DEFAULT_PRECISION,
+        mlp_bias: bool = False,
+        trainable: bool = False,
+        seed: int | list[int] | None = None,
+        lora_rank: int,
+        lora_alpha: float | None = None,
+    ) -> None:
+        if lora_rank < 1:
+            raise ValueError(f"LoRASO3 requires rank >= 1, got {lora_rank}")
+        # Build a same-shape SO3Linear base; the pre-trained weight is restored
+        # by ``deserialize`` afterwards.
+        super().__init__(
+            lmax=lmax,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            n_focus=n_focus,
+            precision=precision,
+            mlp_bias=mlp_bias,
+            trainable=False,
+            seed=seed,
+        )
+        self.trainable = bool(trainable)
+        prec = PRECISION_DICT[self.precision.lower()]
+
+        self.lora_rank = int(lora_rank)
+        alpha_value = float(lora_alpha) if lora_alpha is not None else float(lora_rank)
+        self.lora_alpha = alpha_value
+        self.scaling = alpha_value / float(lora_rank)
+        self.lora_scaling = np.array(self.scaling, dtype=prec)
+
+        num_l = self.lmax + 1
+        rng = np.random.default_rng(seed)
+        self.A_by_l = rng.normal(
+            0.0,
+            1.0 / math.sqrt(self.lora_rank),
+            size=(num_l, self.lora_rank, self.in_channels),
+        ).astype(prec)
+        # B is zero-initialised so that the initial forward is an exact
+        # identity to the base module; training backprop updates B first
+        # (gradA is zero while B is zero), which is the standard LoRA
+        # two-step unlock pattern and is compatible with Newton-Schulz on
+        # rectangular matrices.
+        self.B_by_l = np.zeros(
+            (num_l, self.n_focus * self.out_channels, self.lora_rank), dtype=prec
+        )
+
+    def _compute_delta_weight(self, xp: Any, device: Any) -> Array:
+        """Return ``ΔW`` with shape ``(lmax+1, C_in, F*C_out)``."""
+        B_by_l = xp_asarray_nodetach(xp, self.B_by_l[...], device=device)
+        A_by_l = xp_asarray_nodetach(xp, self.A_by_l[...], device=device)
+        # einsum "lor,lri->lio" as a per-l batched matmul (B @ A) then transpose:
+        # (L, F*Cout, R) @ (L, R, Cin) -> (L, F*Cout, Cin) -> (L, Cin, F*Cout)
+        return xp.permute_dims(xp.matmul(B_by_l, A_by_l), (0, 2, 1)) * self.scaling
+
+    def call(self, x: Array) -> Array:
+        """
+        Parameters
+        ----------
+        x
+            Input features with shape ``(N, D, F, C_in)`` where ``D=(lmax+1)^2``.
+
+        Returns
+        -------
+        Array
+            Output features with shape ``(N, D, F, C_out)``.
+        """
+        xp = array_api_compat.array_namespace(x)
+        device = array_api_compat.device(x)
+        delta_w = self._compute_delta_weight(xp, device)
+        weight = xp.reshape(
+            xp_asarray_nodetach(xp, self.weight[...], device=device) + delta_w,
+            (self.lmax + 1, self.in_channels, self.n_focus, self.out_channels),
+        )
+        expand_index = xp_asarray_nodetach(xp, self.expand_index, device=device)
+        weight_expanded = xp.take(weight, expand_index, axis=0)
+        # einsum "ndfi,difo->ndfo" as a broadcast batched matmul:
+        # (N, D, F, 1, Cin) @ (1, D, F, Cin, Cout) -> (N, D, F, 1, Cout)
+        weight_expanded = xp.permute_dims(weight_expanded, (0, 2, 1, 3))
+        out = xp.matmul(x[:, :, :, None, :], weight_expanded[None, ...])[..., 0, :]
+        if self.mlp_bias:
+            bias = xp.reshape(
+                xp_asarray_nodetach(xp, self.bias[...], device=device),
+                (self.n_focus, self.out_channels),
+            )
+            out = xp.concat(
+                [out[:, :1, :, :] + bias[None, None, ...], out[:, 1:, :, :]], axis=1
+            )
+        return out
+
+    def merge_into_base(self) -> SO3Linear:
+        """Build a plain ``SO3Linear`` whose weight has absorbed the LoRA delta."""
+        base = SO3Linear(
+            lmax=self.lmax,
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            n_focus=self.n_focus,
+            precision=self.precision,
+            mlp_bias=self.mlp_bias,
+            trainable=True,
+            seed=None,
+            init_std=0.0,
+        )
+        xp = array_api_compat.array_namespace(self.B_by_l)
+        device = array_api_compat.device(self.B_by_l)
+        base.weight = to_numpy_array(
+            self.weight + self._compute_delta_weight(xp, device)
+        )
+        if self.bias is not None:
+            base.bias = to_numpy_array(self.bias)
+        return base
+
+    def serialize(self) -> dict[str, Any]:
+        """Serialize the LoRASO3 to a dict."""
+        data = super().serialize()
+        data["@class"] = "LoRASO3"
+        data["config"]["lora_rank"] = self.lora_rank
+        data["config"]["lora_alpha"] = self.lora_alpha
+        data["@variables"]["A_by_l"] = to_numpy_array(self.A_by_l)
+        data["@variables"]["B_by_l"] = to_numpy_array(self.B_by_l)
+        return data
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> LoRASO3:
+        """Deserialize a LoRASO3 from a dict."""
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "LoRASO3":
+            raise ValueError(f"Invalid class for LoRASO3: {data_cls}")
+        version = int(data.pop("@version"))
+        check_version_compatibility(version, 1, 1)
+        config = data.pop("config")
+        variables = data.pop("@variables")
+        obj = cls(**config)
+        prec = PRECISION_DICT[obj.precision.lower()]
+        obj.expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
+        obj.weight = np.asarray(variables["weight"], dtype=prec)
+        if obj.mlp_bias:
+            obj.bias = np.asarray(variables["bias"], dtype=prec)
+        obj.A_by_l = np.asarray(variables["A_by_l"], dtype=prec)
+        obj.B_by_l = np.asarray(variables["B_by_l"], dtype=prec)
+        return obj
+
+
+class LoRASO2(SO2Linear):
+    """
+    Per-``|m|``-group LoRA adapter for ``SO2Linear``.
+
+    ``weight_m0`` (``(num_in_m0, F*num_out_m0)``) and each
+    ``weight_m[i]`` (``(num_in_m, F*2*num_out_m)``) get an independent 2D
+    LoRA pair ``A``/``B``.  SO(2) equivariance is preserved because the
+    ``|m|>0`` 2x2 complex block ``[[W_u, -W_v], [W_v, W_u]]`` stays intact
+    when ``ΔW_m`` is absorbed into the concatenated ``[W_u | W_v]`` layout
+    before ``_build_so2_weight`` splits it (the shared input basis ``A``
+    splits naturally into ``ΔW_u = B_u A`` and ``ΔW_v = B_v A``).
+
+    The base ``call`` logic is inherited unchanged; only ``_build_so2_weight``
+    is overridden to fold the LoRA delta into each base block prior to
+    assembling the block-diagonal weight.  The ``ΔW_m`` construction does not
+    depend on the edge count ``E``, so the forward FLOPs remain identical to
+    the base.
+
+    Parameters
+    ----------
+    lmax, mmax, in_channels, out_channels, n_focus, precision, mlp_bias, trainable, seed
+        Forwarded to ``SO2Linear`` to build the frozen base weights.
+    lora_rank
+        LoRA rank.
+    lora_alpha
+        Scaling numerator; scaling is ``lora_alpha / lora_rank``.  ``None``
+        defaults to ``lora_alpha = lora_rank`` (scaling ``1.0``).
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        mmax: int | None = None,
+        in_channels: int,
+        out_channels: int,
+        n_focus: int = 1,
+        precision: str = DEFAULT_PRECISION,
+        mlp_bias: bool = False,
+        seed: int | list[int] | None = None,
+        trainable: bool = False,
+        lora_rank: int,
+        lora_alpha: float | None = None,
+    ) -> None:
+        if lora_rank < 1:
+            raise ValueError(f"LoRASO2 requires rank >= 1, got {lora_rank}")
+        super().__init__(
+            lmax=lmax,
+            mmax=mmax,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            n_focus=n_focus,
+            precision=precision,
+            mlp_bias=mlp_bias,
+            seed=seed,
+            trainable=False,
+        )
+        self.trainable = bool(trainable)
+        prec = PRECISION_DICT[self.precision.lower()]
+
+        self.lora_rank = int(lora_rank)
+        alpha_value = float(lora_alpha) if lora_alpha is not None else float(lora_rank)
+        self.lora_alpha = alpha_value
+        self.scaling = alpha_value / float(lora_rank)
+        self.lora_scaling = np.array(self.scaling, dtype=prec)
+
+        rng = np.random.default_rng(seed)
+        num_in_m0 = (self.lmax + 1) * self.in_channels
+        num_out_m0_per_focus = (self.lmax + 1) * self.out_channels
+        focus_num_out_m0 = self.n_focus * num_out_m0_per_focus
+        self.A_m0 = rng.normal(
+            0.0, 1.0 / math.sqrt(self.lora_rank), size=(self.lora_rank, num_in_m0)
+        ).astype(prec)
+        self.B_m0 = np.zeros((focus_num_out_m0, self.lora_rank), dtype=prec)
+
+        self.A_m: list[np.ndarray] = []
+        self.B_m: list[np.ndarray] = []
+        for w in self.weight_m:
+            num_in, focus_two_num_out = w.shape
+            a_m = rng.normal(
+                0.0, 1.0 / math.sqrt(self.lora_rank), size=(self.lora_rank, num_in)
+            ).astype(prec)
+            b_m = np.zeros((focus_two_num_out, self.lora_rank), dtype=prec)
+            self.A_m.append(a_m)
+            self.B_m.append(b_m)
+
+    def _compute_delta_m0(self, xp: Any, device: Any) -> Array:
+        """Return ``ΔW_m0`` with shape ``(num_in_m0, F*num_out_m0)``."""
+        A_m0 = xp_asarray_nodetach(xp, self.A_m0[...], device=device)
+        B_m0 = xp_asarray_nodetach(xp, self.B_m0[...], device=device)
+        # einsum "ri,or->io" as a matmul (B @ A) then transpose:
+        # (F*num_out_m0, R) @ (R, num_in_m0) -> (F*num_out_m0, num_in_m0)
+        #   -> (num_in_m0, F*num_out_m0)
+        return xp.permute_dims(xp.matmul(B_m0, A_m0), (1, 0)) * self.scaling
+
+    def _compute_delta_m(self, m_idx: int, xp: Any, device: Any) -> Array:
+        """Return ``ΔW_m[m_idx]`` with the same shape as ``weight_m[m_idx]``."""
+        A_m = xp_asarray_nodetach(xp, self.A_m[m_idx][...], device=device)
+        B_m = xp_asarray_nodetach(xp, self.B_m[m_idx][...], device=device)
+        return xp.permute_dims(xp.matmul(B_m, A_m), (1, 0)) * self.scaling
+
+    def _build_so2_weight(self, xp: Any, device: Any) -> Array:
+        """Assemble the block-diagonal weight with LoRA delta folded in."""
+        out_total = self.reduced_dim * self.out_channels
+        num_in_m0 = (self.lmax + 1) * self.in_channels
+        num_out_m0 = (self.lmax + 1) * self.out_channels
+
+        # m=0 block: fold ΔW_m0 into the base weight before the view.
+        weight_m0 = xp.reshape(
+            xp_asarray_nodetach(xp, self.weight_m0[...], device=device)
+            + self._compute_delta_m0(xp, device),
+            (num_in_m0, self.n_focus, num_out_m0),
+        )
+        row_blocks = [
+            xp.concat(
+                [
+                    weight_m0,
+                    xp.zeros(
+                        (self._m0_in, self.n_focus, out_total - self._m0_out),
+                        dtype=weight_m0.dtype,
+                        device=device,
+                    ),
+                ],
+                axis=2,
+            )
+        ]
+
+        # |m|>0 blocks: same 2x2 coupling assembly as the base, but with
+        # ΔW_m folded into the concatenated [W_u | W_v] layout first.
+        for m_idx, w in enumerate(self.weight_m):
+            ni0, ni1, pi0, pi1, no0, no1, po0, po1 = self._block_slices[m_idx]
+            ib = ni1 - ni0
+            ob = no1 - no0
+            w = xp.reshape(
+                xp_asarray_nodetach(xp, w[...], device=device)
+                + self._compute_delta_m(m_idx, xp, device),
+                (ib, self.n_focus, 2 * ob),
+            )
+            w_u = w[:, :, :ob]
+            w_v = w[:, :, ob:]
+            left_pad = xp.zeros((ib, self.n_focus, no0), dtype=w.dtype, device=device)
+            right_pad = xp.zeros(
+                (ib, self.n_focus, out_total - po1), dtype=w.dtype, device=device
+            )
+            neg_row = xp.concat([left_pad, w_u, w_v, right_pad], axis=2)
+            pos_row = xp.concat([left_pad, -w_v, w_u, right_pad], axis=2)
+            row_blocks.append(neg_row)
+            row_blocks.append(pos_row)
+        return xp.concat(row_blocks, axis=0)
+
+    def merge_into_base(self) -> SO2Linear:
+        """Build a plain ``SO2Linear`` whose weights have absorbed every LoRA delta."""
+        base = SO2Linear(
+            lmax=self.lmax,
+            mmax=self.mmax,
+            in_channels=self.in_channels,
+            out_channels=self.out_channels,
+            n_focus=self.n_focus,
+            precision=self.precision,
+            mlp_bias=self.mlp_bias,
+            seed=None,
+            trainable=True,
+        )
+        xp = array_api_compat.array_namespace(self.weight_m0)
+        device = array_api_compat.device(self.weight_m0)
+        base.weight_m0 = to_numpy_array(
+            self.weight_m0 + self._compute_delta_m0(xp, device)
+        )
+        if self.bias0 is not None:
+            base.bias0 = to_numpy_array(self.bias0)
+        for m_idx, w in enumerate(self.weight_m):
+            base.weight_m[m_idx] = to_numpy_array(
+                w + self._compute_delta_m(m_idx, xp, device)
+            )
+        return base
+
+    def serialize(self) -> dict[str, Any]:
+        data = super().serialize()
+        data["@class"] = "LoRASO2"
+        data["config"]["lora_rank"] = self.lora_rank
+        data["config"]["lora_alpha"] = self.lora_alpha
+        variables = data["@variables"]
+        variables["A_m0"] = to_numpy_array(self.A_m0)
+        variables["B_m0"] = to_numpy_array(self.B_m0)
+        for i, (a, b) in enumerate(zip(self.A_m, self.B_m, strict=True)):
+            variables[f"A_m.{i}"] = to_numpy_array(a)
+            variables[f"B_m.{i}"] = to_numpy_array(b)
+        return data
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> LoRASO2:
+        data = data.copy()
+        data_cls = data.pop("@class")
+        if data_cls != "LoRASO2":
+            raise ValueError(f"Invalid class for LoRASO2: {data_cls}")
+        version = int(data.pop("@version"))
+        check_version_compatibility(version, 1, 1)
+        config = data.pop("config")
+        variables = data.pop("@variables")
+        obj = cls(**config)
+        prec = PRECISION_DICT[obj.precision.lower()]
+        obj.m0_idx = np.asarray(variables["m0_idx"], dtype=np.int64)
+        obj.pos_indices = np.asarray(variables["pos_indices"], dtype=np.int64)
+        obj.neg_indices = np.asarray(variables["neg_indices"], dtype=np.int64)
+        obj.weight_m0 = np.asarray(variables["weight_m0"], dtype=prec)
+        if obj.mlp_bias:
+            obj.bias0 = np.asarray(variables["bias0"], dtype=prec)
+        obj.weight_m = [
+            np.asarray(variables[f"weight_m.{i}"], dtype=prec)
+            for i in range(len(obj.weight_m))
+        ]
+        obj.A_m0 = np.asarray(variables["A_m0"], dtype=prec)
+        obj.B_m0 = np.asarray(variables["B_m0"], dtype=prec)
+        obj.A_m = [
+            np.asarray(variables[f"A_m.{i}"], dtype=prec) for i in range(len(obj.A_m))
+        ]
+        obj.B_m = [
+            np.asarray(variables[f"B_m.{i}"], dtype=prec) for i in range(len(obj.B_m))
+        ]
+        return obj
+
+
+# ---------------------------------------------------------------------------
+# Fine-tune policy: freeze / unfreeze rules
+# ---------------------------------------------------------------------------
+
+# Leaf parameter names that stay trainable during LoRA fine-tune.  These are small
+# scalar / per-l scales / attention gating weights whose full-rank update costs
+# are negligible but directly absorb the domain shift of the downstream dataset.
+_UNFREEZE_LEAF_NAMES: frozenset[str] = frozenset(
+    {
+        "adam_scale",
+        "adam_so2_layer_scales",
+        "adam_ffn_layer_scales",
+        "film_scale_strength_log",
+        "film_shift_strength_log",
+        "adamw_attn_logit_w",
+        "adamw_attn_z_bias_raw",
+        "adamw_attn_gate_w",
+        "adamw_focus_compete_w",
+        "adamw_pseudo_query",
+        "focus_compete_bias",
+        # LoRA adapter deltas: pt makes them trainable ``nn.Parameter`` at
+        # construction, but the dpmodel tracks trainability per module, so the
+        # owning ``LoRASO3``/``LoRASO2`` must be marked trainable here for its
+        # low-rank delta to receive gradients (the frozen base is restored by
+        # the backend, e.g. pt_expt ``_LORA_FROZEN_BASE``).
+        "A_by_l",
+        "B_by_l",
+        "A_m0",
+        "B_m0",
+        "A_m",
+        "B_m",
+    }
+)
+
+# Leaf names that stay frozen (override any unfreeze rule above).  The backbone
+# pre-training has already converged on these quantities for all-element
+# datasets; downstream fine-tuning should keep them fixed.
+_OVERRIDE_FREEZE_LEAF_NAMES: frozenset[str] = frozenset(
+    {
+        "adam_type_embedding",
+        "adam_freqs",
+    }
+)
+
+# Submodule paths (rooted at the SeZMModel) that get fully unfrozen.
+_UNFREEZE_SUBMODULE_PATHS: tuple[str, ...] = (
+    "atomic_model.fitting_net",
+    "atomic_model.dens_fitting_net",
+    "atomic_model.descriptor.radial_embedding",
+    "atomic_model.descriptor.env_seed_embedding",
+    "atomic_model.descriptor.film_scale_norm",
+    "atomic_model.descriptor.film_shift_norm",
+    "atomic_model.descriptor.final_full_attn_res",
+    "atomic_model.descriptor.final_block_attn_res",
+)
+
+# Per-interaction-block submodule paths that get fully unfrozen.  The
+# descriptor stores the block list at ``atomic_model.descriptor.blocks``.
+_UNFREEZE_PER_BLOCK_SUBPATHS: tuple[str, ...] = (
+    "full_attn_res_so2",
+    "full_attn_res_ffns",
+    "block_attn_res_so2",
+    "block_attn_res_ffns",
+    "so2_conv.attn_q_proj",
+    "so2_conv.attn_k_proj",
+    "so2_conv.attn_qk_norm",
+    "so2_conv.attn_output_gate_norm",
+    "so2_conv.focus_compete_norm",
+    "so2_conv.radial_hidden_proj",
+    "so2_conv.so2_layer_attn_res",
+)
+
+_BLOCKS_PATH: str = "atomic_model.descriptor.blocks"
+
+
+# ---------------------------------------------------------------------------
+# NativeOP tree traversal
+# ---------------------------------------------------------------------------
+# Children of a ``NativeOP`` are stored as plain attributes and as
+# ``list``/``tuple`` of ``NativeOP`` (the equivalent of ``nn.ModuleList``).
+# The helpers below walk that object graph to enumerate modules, parameters
+# and a flat weight dictionary.
+
+
+def _iter_named_modules(
+    root: NativeOP, prefix: str = "", memo: set[int] | None = None
+) -> Iterator[tuple[str, NativeOP]]:
+    """Yield ``(dotted_name, module)`` for *root* and every nested ``NativeOP``.
+
+    ``root`` is yielded first under *prefix*, then the walk descends into every
+    attribute value that is a ``NativeOP`` and into every ``NativeOP`` element
+    of a ``list``/``tuple``, building dotted paths (``attr`` and ``attr.{i}``).
+    A shared-module memo de-duplicates repeated references.
+    """
+    if memo is None:
+        memo = set()
+    if id(root) in memo:
+        return
+    memo.add(id(root))
+    yield prefix, root
+    for attr, value in vars(root).items():
+        if isinstance(value, NativeOP):
+            child = f"{prefix}.{attr}" if prefix else attr
+            yield from _iter_named_modules(value, child, memo)
+        elif isinstance(value, (list, tuple)):
+            for i, item in enumerate(value):
+                if isinstance(item, NativeOP):
+                    child = f"{prefix}.{attr}.{i}" if prefix else f"{attr}.{i}"
+                    yield from _iter_named_modules(item, child, memo)
+
+
+def _iter_named_parameters(
+    root: NativeOP,
+) -> Iterator[tuple[str, NativeOP, np.ndarray]]:
+    """Yield ``(dotted_name, owner, array)`` for every numpy-array parameter.
+
+    A dpmodel "parameter" is a ``numpy`` array stored as a module attribute (or
+    a ``numpy`` element of a ``list``/``tuple`` attribute, the equivalent of an
+    ``nn.ParameterList``).  ``owner`` is the module holding the array; because
+    the dpmodel tracks trainability per module (``module.trainable``) rather
+    than per tensor, callers toggle ``owner.trainable`` where the PyTorch code
+    toggles ``param.requires_grad``.
+    """
+    for mod_name, mod in _iter_named_modules(root):
+        base = mod_name + "." if mod_name else ""
+        for attr, value in vars(mod).items():
+            if isinstance(value, np.ndarray):
+                yield base + attr, mod, value
+            elif isinstance(value, (list, tuple)):
+                for i, item in enumerate(value):
+                    if isinstance(item, np.ndarray):
+                        yield f"{base}{attr}.{i}", mod, item
+
+
+def _module_state_dict(root: NativeOP) -> dict[str, np.ndarray]:
+    """Flat dotted ``{name: array}`` dict over the whole module tree."""
+    return {name: value for name, _owner, value in _iter_named_parameters(root)}
+
+
+def _leaf_name(param_name: str) -> str:
+    """Return the trailing non-numeric segment of a parameter name.
+
+    ``nn.ParameterList`` children show up as ``foo.0``, ``foo.1``, ...;
+    ``get_adam_route`` strips those numeric indices before routing, so this
+    helper keeps the policy in sync.
+    """
+    parts = param_name.split(".")
+    i = len(parts) - 1
+    while i > 0 and parts[i].isdigit():
+        i -= 1
+    return parts[i]
+
+
+def _get_submodule_or_none(root: NativeOP, path: str) -> Any:
+    if not path:
+        return root
+    obj: Any = root
+    for part in path.split("."):
+        if part.isdigit() and isinstance(obj, (list, tuple)):
+            index = int(part)
+            obj = obj[index] if index < len(obj) else None
+        else:
+            obj = getattr(obj, part, None)
+        if obj is None:
+            return None
+    return obj
+
+
+def _clear_sezm_compile_cache(model: NativeOP) -> None:
+    """No-op retained for parity with the PyTorch backend.
+
+    In PyTorch, LoRA injection or merge replaces submodules and therefore
+    invalidates any ``torch.compile`` / inductor callable captured on the
+    module graph, which must be cleared before the next forward.  The dpmodel
+    (array-API) backend compiles nothing, so there is no cache to clear and
+    this function intentionally does nothing.
+    """
+    return
+
+
+def _swap_submodule(parent: Any, attr: str, new_module: NativeOP) -> None:
+    """Replace ``parent.attr`` with ``new_module``.
+
+    Numeric attribute names address ``list``/``tuple`` children (the dpmodel
+    analogue of ``nn.ModuleList`` / ``nn.ParameterList`` elements) and are
+    assigned by index; every other name is a plain attribute assignment.
+    """
+    if attr.isdigit() and isinstance(parent, (list, tuple)):
+        parent[int(attr)] = new_module
+    else:
+        setattr(parent, attr, new_module)
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def has_lora(module: NativeOP) -> bool:
+    """Return ``True`` iff any submodule is a LoRA adapter."""
+    return any(
+        isinstance(m, (LoRASO3, LoRASO2)) for _name, m in _iter_named_modules(module)
+    )
+
+
+def apply_lora_to_sezm(
+    model: NativeOP,
+    *,
+    rank: int,
+    alpha: float | None = None,
+) -> NativeOP:
+    """
+    Inject LoRA adapters into every ``SO3Linear`` / ``SO2Linear`` of a SeZM
+    model and apply the SeZM fine-tune freeze/unfreeze policy in place.
+
+    This function is idempotent-safe: the ``type(mod) is SO3Linear`` (exact
+    type) test prevents re-wrapping a LoRASO3 that is already present.
+
+    Parameters
+    ----------
+    model
+        A ``SeZMModel`` instance (or any ``NativeOP`` containing SeZM
+        ``SO3Linear`` / ``SO2Linear`` submodules).
+    rank
+        LoRA rank applied uniformly to every adapter.
+    alpha
+        LoRA scaling numerator; scaling is ``alpha / rank``.  ``None``
+        defaults to ``alpha = rank`` (scaling ``1.0``).
+
+    Returns
+    -------
+    NativeOP
+        The same ``model`` after injection (returned for chaining).
+    """
+    # === Step 1. Freeze all parameters ===
+    for _name, mod in _iter_named_modules(model):
+        mod.trainable = False
+
+    # === Step 2. Replace SO3Linear / SO2Linear with LoRA subclasses ===
+    # Snapshot named_modules() first so the later in-place replacement does
+    # not invalidate the iterator.  ``type(...) is ...`` is deliberate: it
+    # matches only the exact base class, skipping any pre-existing LoRA
+    # adapter so apply_lora_to_sezm remains idempotent.
+    replacements: list[tuple[Any, str, NativeOP]] = []
+    for name, mod in list(_iter_named_modules(model)):
+        if type(mod) is SO3Linear:
+            parent_name, _, attr = name.rpartition(".")
+            parent = (
+                _get_submodule_or_none(model, parent_name) if parent_name else model
+            )
+            new_mod = LoRASO3(
+                **mod.serialize()["config"], lora_rank=rank, lora_alpha=alpha
+            )
+            new_mod.weight = mod.weight
+            new_mod.bias = mod.bias
+            replacements.append((parent, attr, new_mod))
+        elif type(mod) is SO2Linear:
+            parent_name, _, attr = name.rpartition(".")
+            parent = (
+                _get_submodule_or_none(model, parent_name) if parent_name else model
+            )
+            new_mod = LoRASO2(
+                **mod.serialize()["config"], lora_rank=rank, lora_alpha=alpha
+            )
+            new_mod.weight_m0 = mod.weight_m0
+            new_mod.bias0 = mod.bias0
+            new_mod.weight_m = list(mod.weight_m)
+            replacements.append((parent, attr, new_mod))
+    for parent, attr, new_mod in replacements:
+        _swap_submodule(parent, attr, new_mod)
+
+    # === Step 3. Unfreeze whole submodules (descriptor-level and per-block) ===
+    for path in _UNFREEZE_SUBMODULE_PATHS:
+        sub = _get_submodule_or_none(model, path)
+        if sub is None:
+            continue
+        for _name, mod in _iter_named_modules(sub):
+            mod.trainable = True
+
+    blocks = _get_submodule_or_none(model, _BLOCKS_PATH)
+    if blocks is not None:
+        for block in blocks:
+            for subpath in _UNFREEZE_PER_BLOCK_SUBPATHS:
+                sub = _get_submodule_or_none(block, subpath)
+                if sub is None:
+                    continue
+                for _name, mod in _iter_named_modules(sub):
+                    mod.trainable = True
+
+    # === Step 4. Unfreeze small parameters by leaf name ===
+    # Any name ending in a LoRA-listed leaf or containing ``bias`` becomes
+    # trainable.  The ``"bias" in leaf`` rule deliberately also re-enables the
+    # base biases that ``LoRASO3.__init__`` / ``LoRASO2.__init__`` had frozen
+    # (``SO3Linear.bias``, ``SO2Linear.bias0``); keeping those trainable lets
+    # the LoRA-preserved offsets absorb the downstream mean shift alongside
+    # the low-rank ``ΔW``.  The same rule also unfreezes norm biases
+    # (``EquivariantRMSNorm.bias``, ``ReducedEquivariantRMSNorm.bias0``)
+    # anywhere in the model -- tiny parameter counts, large domain-shift
+    # headroom.  ``adam_scale`` is listed similarly: every RMSNorm scale in
+    # the backbone (per-block ``pre/post_so2_norm``, ``pre/post_ffn_norms``,
+    # ``so2_inter_norms``, etc.) becomes trainable, again at negligible cost.
+    for name, owner, _value in _iter_named_parameters(model):
+        leaf = _leaf_name(name)
+        if leaf in _UNFREEZE_LEAF_NAMES or "bias" in leaf:
+            owner.trainable = True
+
+    # === Step 5. Override-freeze converged parameters by leaf name ===
+    # Must run after steps 3/4 because earlier whole-module unfreezes may
+    # have turned them back on (e.g. ``adam_type_embedding`` inside the
+    # unfrozen ``env_seed_embedding``).
+    for name, owner, _value in _iter_named_parameters(model):
+        leaf = _leaf_name(name)
+        if leaf in _OVERRIDE_FREEZE_LEAF_NAMES:
+            owner.trainable = False
+
+    # === Step 6. Override-freeze every GatedActivation submodule ===
+    # Stable gate patterns; avoids turning on gate_linear.bias via the
+    # step-4 "bias" rule.
+    for _name, mod in _iter_named_modules(model):
+        if isinstance(mod, GatedActivation):
+            for _sub_name, sub_mod in _iter_named_modules(mod):
+                sub_mod.trainable = False
+
+    return model
+
+
+def fold_lora_state_dict_keys(state_dict: dict[str, np.ndarray], prefix: str) -> None:
+    """Fold LoRA adapter keys into base weight keys in *state_dict* (in-place).
+
+    Scans for SO3-style ``A_by_l``/``B_by_l`` pairs and SO2-style
+    ``A_m0``/``B_m0``/``A_m.*``/``B_m.*`` groups under *prefix*.  For each
+    pair whose corresponding base weight key also exists, the delta
+    ``einsum(B, A) * scaling`` is added to the weight and the adapter keys
+    are popped.  ``lora_scaling`` is read from *state_dict* when present;
+    otherwise ``1.0`` is assumed (the default when ``alpha == rank``).
+
+    Called by ``DescrptSeZM._load_from_state_dict`` so that a LoRA-trained
+    checkpoint can be loaded into a plain (non-LoRA) descriptor transparently.
+
+    Parameters
+    ----------
+    state_dict
+        Flat state dict to mutate in place.
+    prefix
+        Key prefix that scopes the scan (e.g. ``"model.Default.atomic_model.descriptor."``).
+    """
+    # === SO3: fold A_by_l / B_by_l into weight ===
+    so3_prefixes = [
+        k[: -len("A_by_l")]
+        for k in list(state_dict)
+        if k.startswith(prefix) and k.endswith(".A_by_l")
+    ]
+    for sp in so3_prefixes:
+        a_key, b_key, w_key = sp + "A_by_l", sp + "B_by_l", sp + "weight"
+        if b_key not in state_dict or w_key not in state_dict:
+            continue
+        a = state_dict.pop(a_key)
+        b = state_dict.pop(b_key)
+        scaling_tensor = state_dict.pop(sp + "lora_scaling", None)
+        scaling = float(scaling_tensor) if scaling_tensor is not None else 1.0
+        state_dict[w_key] = (
+            state_dict[w_key] + np.transpose(np.matmul(b, a), (0, 2, 1)) * scaling
+        )
+
+    # === SO2: fold A_m0 / B_m0 and A_m.* / B_m.* into weight_m0 / weight_m.* ===
+    so2_prefixes = [
+        k[: -len("A_m0")]
+        for k in list(state_dict)
+        if k.startswith(prefix) and k.endswith(".A_m0")
+    ]
+    for sp in so2_prefixes:
+        a0_key, b0_key, w0_key = sp + "A_m0", sp + "B_m0", sp + "weight_m0"
+        if b0_key not in state_dict or w0_key not in state_dict:
+            continue
+        scaling_tensor = state_dict.pop(sp + "lora_scaling", None)
+        scaling = float(scaling_tensor) if scaling_tensor is not None else 1.0
+        a0 = state_dict.pop(a0_key)
+        b0 = state_dict.pop(b0_key)
+        state_dict[w0_key] = (
+            state_dict[w0_key] + np.transpose(np.matmul(b0, a0), (1, 0)) * scaling
+        )
+        m_idx = 0
+        while True:
+            a_key = sp + f"A_m.{m_idx}"
+            b_key = sp + f"B_m.{m_idx}"
+            w_key = sp + f"weight_m.{m_idx}"
+            if a_key not in state_dict:
+                break
+            a_m = state_dict.pop(a_key)
+            b_m = state_dict.pop(b_key)
+            state_dict[w_key] = (
+                state_dict[w_key] + np.transpose(np.matmul(b_m, a_m), (1, 0)) * scaling
+            )
+            m_idx += 1
+
+
+def build_merged_state_dict(
+    module: NativeOP,
+    state_dict: dict[str, np.ndarray] | None = None,
+    *,
+    prefix: str = "",
+) -> dict[str, np.ndarray]:
+    """
+    Produce a plain (LoRA-free) state dict from a LoRA-augmented module.
+
+    Walks ``module.named_modules()`` and, for every ``LoRASO3`` /
+    ``LoRASO2`` submodule, folds ``ΔW = BA·scaling`` into the base weight
+    key and removes the ``A``/``B`` keys.  The returned dict has the same
+    key set as a same-topology SeZM that has never been LoRA-wrapped, and
+    is suitable for loading into a plain SeZM model with ``strict=True``.
+
+    Non-destructive: when ``state_dict`` is ``None`` a deep copy of
+    ``module.state_dict()`` is taken; when the caller provides a
+    ``state_dict`` it is assumed to already be a detached copy (e.g. the
+    full-gathered state dict from FSDP2) and is *mutated in place* for
+    efficiency.
+
+    Parameters
+    ----------
+    module
+        The LoRA-augmented module tree.  Only used for structural
+        information (LoRA submodule prefixes, ``scaling``, ``weight_m``
+        length); its parameters are not modified.
+    state_dict
+        Optional pre-collected state dict (e.g. gathered from FSDP2).  If
+        ``None``, ``deepcopy(module.state_dict())`` is used.
+    prefix
+        Prefix to prepend to every LoRA submodule name when looking keys
+        up in ``state_dict``.  Use this when the caller has state keyed
+        under an outer wrapper (for example ``"model.Default."``).
+
+    Returns
+    -------
+    dict
+        Flat state dict with LoRA adapters folded into base weights.
+    """
+    state = deepcopy(_module_state_dict(module)) if state_dict is None else state_dict
+    for name, mod in _iter_named_modules(module):
+        key_prefix = prefix + name + "." if name else prefix
+        if isinstance(mod, LoRASO3):
+            a = state.pop(key_prefix + "A_by_l")
+            b = state.pop(key_prefix + "B_by_l")
+            state.pop(key_prefix + "lora_scaling", None)
+            weight_key = key_prefix + "weight"
+            delta = np.transpose(np.matmul(b, a), (0, 2, 1)) * mod.scaling
+            state[weight_key] = state[weight_key] + delta
+        elif isinstance(mod, LoRASO2):
+            a_m0 = state.pop(key_prefix + "A_m0")
+            b_m0 = state.pop(key_prefix + "B_m0")
+            state.pop(key_prefix + "lora_scaling", None)
+            w_m0_key = key_prefix + "weight_m0"
+            state[w_m0_key] = (
+                state[w_m0_key]
+                + np.transpose(np.matmul(b_m0, a_m0), (1, 0)) * mod.scaling
+            )
+            for m_idx in range(len(mod.weight_m)):
+                a_i = state.pop(key_prefix + f"A_m.{m_idx}")
+                b_i = state.pop(key_prefix + f"B_m.{m_idx}")
+                w_i_key = key_prefix + f"weight_m.{m_idx}"
+                state[w_i_key] = (
+                    state[w_i_key]
+                    + np.transpose(np.matmul(b_i, a_i), (1, 0)) * mod.scaling
+                )
+    return state
+
+
+def strip_lora_from_extra_state(extra_state: dict[str, Any]) -> dict[str, Any]:
+    """
+    Drop any ``lora`` entry from ``_extra_state["model_params"]``.
+
+    Handles both single-task (``model_params`` is the model config) and
+    multi-task (``model_params["model_dict"][<branch>]`` is each branch's
+    config).  Returns a deep-copied dict; the input is not mutated.
+    """
+    out = deepcopy(extra_state)
+    model_params = out.get("model_params")
+    if not isinstance(model_params, dict):
+        return out
+    model_params.pop("lora", None)
+    model_dict = model_params.get("model_dict")
+    if isinstance(model_dict, dict):
+        for branch_cfg in model_dict.values():
+            if isinstance(branch_cfg, dict):
+                branch_cfg.pop("lora", None)
+    return out
+
+
+def merge_lora_into_base(model: NativeOP) -> NativeOP:
+    """
+    Destructively replace every ``LoRASO3`` / ``LoRASO2`` with its merged
+    plain base module.
+
+    After this call the model no longer contains LoRA submodules: the
+    optimizer, EMA state, and any compiled callables that reference the old
+    submodules become invalid.  Prefer :func:`build_merged_state_dict` for
+    non-destructive checkpoint export during or after training; this function
+    is primarily useful in tests and offline scripts.
+    """
+    replacements: list[tuple[Any, str, NativeOP]] = []
+    for name, mod in list(_iter_named_modules(model)):
+        if isinstance(mod, (LoRASO3, LoRASO2)):
+            parent_name, _, attr = name.rpartition(".")
+            parent = (
+                _get_submodule_or_none(model, parent_name) if parent_name else model
+            )
+            replacements.append((parent, attr, mod.merge_into_base()))
+    for parent, attr, new_mod in replacements:
+        _swap_submodule(parent, attr, new_mod)
+    _clear_sezm_compile_cache(model)
+    return model
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/norm.py b/deepmd/dpmodel/descriptor/dpa4_nn/norm.py
index 02cc4755dc..2424ac1e1b 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/norm.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/norm.py
@@ -2,14 +2,11 @@
 """
 Normalization layers for the DPA4/SeZM descriptor.
 
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.norm``.
-All four pt norm classes are ported: ``RMSNorm`` (used by ``radial.RadialMLP``),
-``EquivariantRMSNorm`` (used by ``block``), ``ReducedEquivariantRMSNorm`` and
-``ScalarRMSNorm`` (used by ``so2``).
-
-Serialization contract: the ``@variables`` keys of each class match the
-``state_dict`` key names of its pt counterpart, so pt ``serialize()`` output
-deserializes directly into the dpmodel classes (and vice versa).
+This module defines the packed-layout, reduced-layout, generic, and scalar
+RMS normalization layers used throughout SeZM.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.norm``.
 """
 
 from __future__ import (
@@ -32,6 +29,7 @@
     xp_asarray_nodetach,
 )
 from deepmd.dpmodel.common import (
+    get_xp_precision,
     to_numpy_array,
 )
 from deepmd.utils.version import (
@@ -48,20 +46,20 @@ class RMSNorm(NativeOP):
     Generic RMSNorm on tensors with shape `(..., C)`.
 
     This is the plain channel-wise RMS normalization used for non-equivariant
-    branches whose last axis stores feature channels. A learnable affine scale
-    is applied on the channel axis only, while all leading axes are treated as
-    batch dimensions.
+    branches whose last axis stores feature channels. A learnable affine scale is
+    applied on the channel axis only, while all leading axes are treated as batch
+    dimensions.
 
     Parameters
     ----------
-    channels : int
+    channels
         Feature dimension of the last axis.
-    eps : float
+    eps
         Small epsilon for numerical stability.
-    precision : str
-        Parameter and computation precision. Caller should pass a compute
-        precision (fp32+) for numerical stability.
-    trainable : bool
+    precision
+        Parameter and computation precision. Caller should pass compute precision
+        (fp32+) for numerical stability.
+    trainable
         Whether parameters are trainable.
     """
 
@@ -74,20 +72,19 @@ def __init__(
         trainable: bool = True,
     ) -> None:
         self.channels = int(channels)
-        self.eps = float(eps)
         self.precision = precision
+        self.eps = float(eps)
         self.trainable = bool(trainable)
         prec = PRECISION_DICT[self.precision.lower()]
+
         # adam_ prefix routes this to Adam (no weight decay) in HybridMuon.
         self.adam_scale = np.ones((self.channels,), dtype=prec)
 
     def call(self, x: Any) -> Any:
         """
-        Apply RMS normalization.
-
         Parameters
         ----------
-        x : Array
+        x
             Input array with shape `(..., C)`.
 
         Returns
@@ -96,20 +93,18 @@ def call(self, x: Any) -> Any:
             Normalized array with shape `(..., C)`, same dtype as input.
         """
         xp = array_api_compat.array_namespace(x)
-        scale = xp_asarray_nodetach(
-            xp, self.adam_scale[...], device=array_api_compat.device(x)
-        )
+        device = array_api_compat.device(x)
         in_dtype = x.dtype
-        if in_dtype != scale.dtype:
-            x = xp.astype(x, scale.dtype)
+        x = xp.astype(x, get_xp_precision(xp, self.precision))
         inv_rms = 1.0 / xp.sqrt(xp.mean(x * x, axis=-1, keepdims=True) + self.eps)
-        out = x * inv_rms * scale
-        if out.dtype != in_dtype:
-            out = xp.astype(out, in_dtype)
-        return out
+        scale = xp.reshape(
+            xp_asarray_nodetach(xp, self.adam_scale[...], device=device),
+            (1,) * (x.ndim - 1) + (self.channels,),
+        )
+        x = x * inv_rms * scale
+        return xp.astype(x, in_dtype)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the RMSNorm to a dict."""
         return {
             "@class": "RMSNorm",
             "@version": 1,
@@ -124,7 +119,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> RMSNorm:
-        """Deserialize an RMSNorm from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "RMSNorm":
@@ -133,20 +127,9 @@ def deserialize(cls, data: dict[str, Any]) -> RMSNorm:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            channels=int(config["channels"]),
-            eps=float(config["eps"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-        )
+        obj = cls(**config)
         prec = PRECISION_DICT[obj.precision.lower()]
-        adam_scale = np.asarray(variables["adam_scale"], dtype=prec).reshape(-1)
-        if adam_scale.shape != obj.adam_scale.shape:
-            raise ValueError(
-                f"adam_scale shape {adam_scale.shape} does not match "
-                f"channels {obj.channels}"
-            )
-        obj.adam_scale = adam_scale
+        obj.adam_scale = np.asarray(variables["adam_scale"], dtype=prec)
         return obj
 
 
@@ -165,18 +148,19 @@ class EquivariantRMSNorm(NativeOP):
 
     Parameters
     ----------
-    lmax : int
+    lmax
         Maximum spherical harmonic degree.
-    channels : int
+    channels
         Channels per `(l, m)` coefficient in each focus stream.
-    n_focus : int
+    n_focus
         Number of focus streams. Affine parameters are independent per focus.
-    eps : float
+    eps
         Small epsilon for numerical stability.
-    precision : str
-        Parameter and computation precision. Caller should pass a compute
-        precision (fp32+) for numerical stability.
-    trainable : bool
+    precision
+        Parameter and computation precision. Caller should pass compute precision
+        (fp32+) for numerical stability and handle input/output conversion at
+        boundaries.
+    trainable
         Whether parameters are trainable.
     """
 
@@ -193,14 +177,14 @@ def __init__(
         self.lmax = int(lmax)
         self.channels = int(channels)
         self.n_focus = int(n_focus)
-        self.eps = float(eps)
         self.precision = precision
+        self.eps = float(eps)
         self.trainable = bool(trainable)
         prec = PRECISION_DICT[self.precision.lower()]
 
         # === Step 1. Learnable Parameters ===
         # Store affine scales in degree-major layout (L, F, C). This matches the
-        # packed output layout after degree expansion.
+        # packed output layout after degree expansion
         # adam_ prefix routes this to Adam (no weight decay) in HybridMuon.
         self.adam_scale = np.ones(
             (self.lmax + 1, self.n_focus, self.channels), dtype=prec
@@ -213,8 +197,10 @@ def __init__(
 
         # Pre-fuse degree balancing and channel averaging into a single weight:
         #   w_d = 1 / ((2l+1) * (lmax+1) * C)
-        # so that the shared RMS statistic is a single weighted sum without
-        # allocating an intermediate (N, D, F, C) buffer beyond x^2 itself.
+        # so that
+        #   mean_variance = sum(x^2 * balance_weight, axis=(1, 3))
+        # directly computes the shared RMS statistic without allocating an
+        # intermediate (N, D, F, C) buffer beyond x^2 itself.
         weights_list = []
         scale = 1.0 / ((self.lmax + 1) * self.channels)
         for l in range(self.lmax + 1):
@@ -224,11 +210,9 @@ def __init__(
 
     def call(self, x: Any) -> Any:
         """
-        Apply degree-balanced equivariant RMS normalization.
-
         Parameters
         ----------
-        x : Array
+        x
             Features with shape `(N, D, F, C)` where `D = (lmax + 1)^2`.
 
         Returns
@@ -238,14 +222,8 @@ def call(self, x: Any) -> Any:
         """
         xp = array_api_compat.array_namespace(x)
         device = array_api_compat.device(x)
-        scale = xp_asarray_nodetach(xp, self.adam_scale[...], device=device)
-        bias = xp_asarray_nodetach(xp, self.bias[...], device=device)
-        balance_weight = xp_asarray_nodetach(
-            xp, self.balance_weight, device=array_api_compat.device(x)
-        )
         in_dtype = x.dtype
-        if in_dtype != scale.dtype:
-            x = xp.astype(x, scale.dtype)
+        x = xp.astype(x, get_xp_precision(xp, self.precision))
         x0 = x[:, :1, :, :]  # (N, 1, F, C)
         xt = x[:, 1:, :, :]  # (N, D-1, F, C)
 
@@ -253,6 +231,7 @@ def call(self, x: Any) -> Any:
         x0 = x0 - xp.mean(x0, axis=-1, keepdims=True)
 
         # === Step 2. Compute a shared degree-balanced RMS ===
+        balance_weight = xp_asarray_nodetach(xp, self.balance_weight, device=device)
         mean_variance = xp.sum(x0 * x0, axis=(1, 3)) * balance_weight[0]
         if self.lmax > 0:
             mean_variance = mean_variance + xp.sum(
@@ -266,26 +245,26 @@ def call(self, x: Any) -> Any:
             xt = xt * inv_rms
 
         # === Step 3. Apply per-degree affine parameters ===
-        expand_index = xp_asarray_nodetach(
-            xp, self.expand_index, device=array_api_compat.device(x)
-        )
-        expanded_scale = xp.take(scale, expand_index, axis=0)
+        adam_scale = xp_asarray_nodetach(xp, self.adam_scale[...], device=device)
+        expand_index = xp_asarray_nodetach(xp, self.expand_index, device=device)
+        expanded_scale = xp.take(adam_scale, expand_index, axis=0)
         expanded_scale = expanded_scale[None, ...]  # (1, D, F, C)
         x0 = x0 * expanded_scale[:, :1, :, :]
         if self.lmax > 0:
             xt = xt * expanded_scale[:, 1:, :, :]
 
         # === Step 4. Add scalar bias and restore layout ===
-        bias0 = xp.reshape(bias, (1, 1, self.n_focus, -1))  # (1, 1, F, C)
+        bias0 = xp.reshape(
+            xp_asarray_nodetach(xp, self.bias[...], device=device),
+            (1, 1, self.n_focus, -1),
+        )  # (1, 1, F, C)
         x0 = x0 + bias0
 
         out = x0 if self.lmax == 0 else xp.concat([x0, xt], axis=1)
-        if out.dtype != in_dtype:
-            out = xp.astype(out, in_dtype)
+        out = xp.astype(out, in_dtype)
         return out
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the EquivariantRMSNorm to a dict."""
         return {
             "@class": "EquivariantRMSNorm",
             "@version": 1,
@@ -307,7 +286,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> EquivariantRMSNorm:
-        """Deserialize an EquivariantRMSNorm from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "EquivariantRMSNorm":
@@ -316,26 +294,12 @@ def deserialize(cls, data: dict[str, Any]) -> EquivariantRMSNorm:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            lmax=int(config["lmax"]),
-            channels=int(config["channels"]),
-            n_focus=int(config["n_focus"]),
-            eps=float(config["eps"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-        )
+        obj = cls(**config)
         prec = PRECISION_DICT[obj.precision.lower()]
-        expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
-        if not np.array_equal(expand_index, to_numpy_array(obj.expand_index)):
-            raise ValueError("expand_index does not match the lmax-derived table")
-        for name in ("adam_scale", "bias", "balance_weight"):
-            value = np.asarray(variables[name], dtype=prec)
-            if value.shape != getattr(obj, name).shape:
-                raise ValueError(
-                    f"{name} shape {value.shape} does not match "
-                    f"the expected shape {getattr(obj, name).shape}"
-                )
-            setattr(obj, name, value)
+        obj.adam_scale = np.asarray(variables["adam_scale"], dtype=prec)
+        obj.bias = np.asarray(variables["bias"], dtype=prec)
+        obj.expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
+        obj.balance_weight = np.asarray(variables["balance_weight"], dtype=prec)
         return obj
 
 
@@ -355,23 +319,23 @@ class ReducedEquivariantRMSNorm(NativeOP):
 
     Parameters
     ----------
-    lmax : int
+    lmax
         Maximum spherical harmonic degree.
-    mmax : int
+    mmax
         Maximum order kept in the truncated layout.
-    channels : int
+    channels
         Number of channels per retained coefficient.
-    degree_index_m : np.ndarray
+    degree_index_m
         Degree index per coefficient in m-major truncated layout, with shape
         `(D_m_trunc,)`.
-    n_focus : int
+    n_focus
         Number of focus streams.
-    eps : float
+    eps
         Epsilon for numerical stability.
-    precision : str
-        Parameter and computation precision. Caller should pass a compute
-        precision (fp32+) for numerical stability.
-    trainable : bool
+    precision
+        Parameter and computation precision. Caller should pass compute precision
+        (fp32+) for numerical stability.
+    trainable
         Whether parameters are trainable.
     """
 
@@ -389,10 +353,6 @@ def __init__(
     ) -> None:
         self.lmax = int(lmax)
         self.mmax = int(mmax)
-        if self.mmax < 0:
-            raise ValueError("`mmax` must be non-negative")
-        if self.mmax > self.lmax:
-            raise ValueError("`mmax` must be <= `lmax`")
         self.channels = int(channels)
         self.n_focus = int(n_focus)
         self.eps = float(eps)
@@ -427,11 +387,9 @@ def __init__(
 
     def call(self, x: Any) -> Any:
         """
-        Apply degree-balanced reduced-layout RMS normalization.
-
         Parameters
         ----------
-        x : Array
+        x
             Input array with shape (E, F, D_m_trunc, C).
 
         Returns
@@ -442,15 +400,8 @@ def call(self, x: Any) -> Any:
         """
         xp = array_api_compat.array_namespace(x)
         device = array_api_compat.device(x)
-        scale = xp_asarray_nodetach(xp, self.adam_scale[...], device=device)
-        bias0_w = xp_asarray_nodetach(xp, self.bias0[...], device=device)
-        balance_weight = xp_asarray_nodetach(
-            xp, self.balance_weight, device=array_api_compat.device(x)
-        )
         in_dtype = x.dtype
-        if in_dtype != scale.dtype:
-            x = xp.astype(x, scale.dtype)
-        has_xt = self.degree_index_m.size > 1
+        x = xp.astype(x, get_xp_precision(xp, self.precision))
         x0 = x[:, :, :1, :]  # (E, F, 1, C)
         xt = x[:, :, 1:, :]  # (E, F, D_m_trunc-1, C)
 
@@ -458,8 +409,9 @@ def call(self, x: Any) -> Any:
         x0 = x0 - xp.mean(x0, axis=-1, keepdims=True)
 
         # === Step 2. Compute a shared degree-balanced RMS ===
+        balance_weight = xp_asarray_nodetach(xp, self.balance_weight, device=device)
         mean_variance = xp.sum(x0 * x0, axis=(2, 3)) * balance_weight[0]
-        if has_xt:
+        if self.degree_index_m.size > 1:
             mean_variance = mean_variance + xp.sum(
                 (xt * xt) * balance_weight[1:][None, None, :, None], axis=(2, 3)
             )
@@ -467,30 +419,30 @@ def call(self, x: Any) -> Any:
         inv_rms = inv_rms[:, :, None, None]  # (E, F, 1, 1)
 
         x0 = x0 * inv_rms
-        if has_xt:
+        if self.degree_index_m.size > 1:
             xt = xt * inv_rms
 
         # === Step 3. Apply per-degree affine parameters ===
-        degree_index_m = xp_asarray_nodetach(
-            xp, self.degree_index_m, device=array_api_compat.device(x)
-        )
-        expanded_scale = xp.take(scale, degree_index_m, axis=1)
+        adam_scale = xp_asarray_nodetach(xp, self.adam_scale[...], device=device)
+        degree_index_m = xp_asarray_nodetach(xp, self.degree_index_m, device=device)
+        expanded_scale = xp.take(adam_scale, degree_index_m, axis=1)
         expanded_scale = expanded_scale[None, ...]  # (1, F, D_m_trunc, C)
         x0 = x0 * expanded_scale[:, :, :1, :]
-        if has_xt:
+        if self.degree_index_m.size > 1:
             xt = xt * expanded_scale[:, :, 1:, :]
 
         # === Step 4. Add scalar bias and restore layout ===
-        bias0 = xp.reshape(bias0_w, (1, self.n_focus, 1, -1))  # (1, F, 1, C)
+        bias0 = xp.reshape(
+            xp_asarray_nodetach(xp, self.bias0[...], device=device),
+            (1, self.n_focus, 1, -1),
+        )  # (1, F, 1, C)
         x0 = x0 + bias0
 
-        out = xp.concat([x0, xt], axis=2) if has_xt else x0
-        if out.dtype != in_dtype:
-            out = xp.astype(out, in_dtype)
+        out = x0 if self.degree_index_m.size == 1 else xp.concat([x0, xt], axis=2)
+        out = xp.astype(out, in_dtype)
         return out
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the ReducedEquivariantRMSNorm to a dict."""
         return {
             "@class": "ReducedEquivariantRMSNorm",
             "@version": 1,
@@ -514,7 +466,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> ReducedEquivariantRMSNorm:
-        """Deserialize a ReducedEquivariantRMSNorm from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "ReducedEquivariantRMSNorm":
@@ -523,28 +474,12 @@ def deserialize(cls, data: dict[str, Any]) -> ReducedEquivariantRMSNorm:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            channels=int(config["channels"]),
-            degree_index_m=np.asarray(config["degree_index_m"], dtype=np.int64),
-            n_focus=int(config["n_focus"]),
-            eps=float(config["eps"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-        )
+        obj = cls(**config)
         prec = PRECISION_DICT[obj.precision.lower()]
-        degree_index_m = np.asarray(variables["degree_index_m"], dtype=np.int64)
-        if not np.array_equal(degree_index_m, to_numpy_array(obj.degree_index_m)):
-            raise ValueError("degree_index_m variable does not match the config")
-        for name in ("balance_weight", "adam_scale", "bias0"):
-            value = np.asarray(variables[name], dtype=prec)
-            if value.shape != getattr(obj, name).shape:
-                raise ValueError(
-                    f"{name} shape {value.shape} does not match "
-                    f"the expected shape {getattr(obj, name).shape}"
-                )
-            setattr(obj, name, value)
+        obj.degree_index_m = np.asarray(variables["degree_index_m"], dtype=np.int64)
+        obj.balance_weight = np.asarray(variables["balance_weight"], dtype=prec)
+        obj.adam_scale = np.asarray(variables["adam_scale"], dtype=prec)
+        obj.bias0 = np.asarray(variables["bias0"], dtype=prec)
         return obj
 
 
@@ -559,16 +494,16 @@ class ScalarRMSNorm(NativeOP):
 
     Parameters
     ----------
-    channels : int
+    channels
         Feature dimension of the last axis.
-    n_focus : int
+    n_focus
         Number of focus streams.
-    eps : float
+    eps
         Small epsilon for numerical stability.
-    precision : str
-        Parameter and computation precision. Caller should pass a compute
-        precision (fp32+) for numerical stability.
-    trainable : bool
+    precision
+        Parameter and computation precision. Caller should pass compute precision
+        (fp32+) for numerical stability.
+    trainable
         Whether parameters are trainable.
     """
 
@@ -583,20 +518,19 @@ def __init__(
     ) -> None:
         self.channels = int(channels)
         self.n_focus = int(n_focus)
-        self.eps = float(eps)
         self.precision = precision
+        self.eps = float(eps)
         self.trainable = bool(trainable)
         prec = PRECISION_DICT[self.precision.lower()]
+
         # adam_ prefix routes this to Adam (no weight decay) in HybridMuon.
         self.adam_scale = np.ones((self.n_focus, self.channels), dtype=prec)
 
     def call(self, x: Any) -> Any:
         """
-        Apply per-focus RMS normalization.
-
         Parameters
         ----------
-        x : Array
+        x
             Input array with shape (B, F, C) or (B, C) when `n_focus=1`.
 
         Returns
@@ -605,25 +539,22 @@ def call(self, x: Any) -> Any:
             Normalized array with the same shape as input and same dtype.
         """
         xp = array_api_compat.array_namespace(x)
-        scale = xp_asarray_nodetach(
-            xp, self.adam_scale[...], device=array_api_compat.device(x)
-        )
+        device = array_api_compat.device(x)
         in_dtype = x.dtype
-        if in_dtype != scale.dtype:
-            x = xp.astype(x, scale.dtype)
+        x = xp.astype(x, get_xp_precision(xp, self.precision))
+
+        if x.ndim == 2:
+            inv_rms = 1.0 / xp.sqrt(xp.mean(x * x, axis=-1, keepdims=True) + self.eps)
+            x = x * inv_rms
+            x = x * xp_asarray_nodetach(xp, self.adam_scale[...], device=device)[0]
+            return xp.astype(x, in_dtype)
 
         inv_rms = 1.0 / xp.sqrt(xp.mean(x * x, axis=-1, keepdims=True) + self.eps)
         x = x * inv_rms
-        if x.ndim == 2:
-            x = x * scale[0, :]
-        else:
-            x = x * scale[None, ...]
-        if x.dtype != in_dtype:
-            x = xp.astype(x, in_dtype)
-        return x
+        x = x * xp_asarray_nodetach(xp, self.adam_scale[...], device=device)[None, ...]
+        return xp.astype(x, in_dtype)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the ScalarRMSNorm to a dict."""
         return {
             "@class": "ScalarRMSNorm",
             "@version": 1,
@@ -639,7 +570,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> ScalarRMSNorm:
-        """Deserialize a ScalarRMSNorm from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "ScalarRMSNorm":
@@ -648,15 +578,7 @@ def deserialize(cls, data: dict[str, Any]) -> ScalarRMSNorm:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            channels=int(config["channels"]),
-            n_focus=int(config["n_focus"]),
-            eps=float(config["eps"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-        )
+        obj = cls(**config)
         prec = PRECISION_DICT[obj.precision.lower()]
-        adam_scale = np.asarray(variables["adam_scale"], dtype=prec)
-        adam_scale = adam_scale.reshape(obj.adam_scale.shape)
-        obj.adam_scale = adam_scale
+        obj.adam_scale = np.asarray(variables["adam_scale"], dtype=prec)
         return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/projection.py b/deepmd/dpmodel/descriptor/dpa4_nn/projection.py
index c487e16773..7c332449cb 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/projection.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/projection.py
@@ -1,37 +1,14 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-S2 grid projection helpers for DPA4/SeZM function-space nonlinearities.
-
-This module is the dpmodel port of
-``deepmd.pt.model.descriptor.sezm_nn.projection``, restricted to the Lebedev
-S2 quadrature path used by the core DPA4 configuration
-(``lebedev_quadrature=True``). The projectors only handle basis transforms:
-a projector maps coefficient tensors to a fixed quadrature grid, and maps
-grid fields back to coefficients with the matching quadrature rule.
-
-Ported names: ``BaseGridProjector``, ``S2GridProjector`` (Lebedev branch),
-``resolve_s2_grid_resolution`` (as-is, both methods — pure arithmetic), and
-``_normalize_s2_grid_resolution``.
-
-SO(3) Wigner-D grid machinery (consumed by ``SO3GridNet`` in
-``grid_net.py``, which backs the ``node_wise_so3``, ``message_node_so3``,
-and ``ffn_so3_grid`` paths): ``resolve_so3_grid``, ``_build_so3_frame_set``,
-and ``SO3GridProjector`` are ported here. The SO(3) projection matrices are
-assembled at init time with pure numpy via the Wigner-D quadrature
-(``WignerDCalculator``) over a Lebedev x gamma rotation grid, matching the pt
-float64 buffers to machine precision.
-
-Not ported (guarded): the e3nn product-grid branch of ``S2GridProjector``
-(``grid_method="e3nn"``, i.e. ``lebedev_quadrature=False``) raises
-``NotImplementedError`` at construction. Only the Lebedev path reproduces
-to-grid/from-grid roundtrip identities at machine precision.
-
-The Lebedev projection matrices are assembled at init time with pure numpy:
-``load_lebedev_rule`` replaces the pt Lebedev loader (same packaged data) and
-``real_spherical_harmonics`` exactly replaces the e3nn call
-``spherical_harmonics(list(range(lmax+1)), points, normalize=True,
-normalization="norm")``, so the buffers match the pt float64 buffers to
-machine precision.
+Grid projection helpers for DPA4/SeZM function-space nonlinearities.
+
+The projectors in this module only handle basis transforms.  They do not apply
+channel mixing or nonlinearities.  A projector maps coefficient tensors to a
+fixed quadrature grid, and maps grid fields back to coefficients with the
+matching quadrature rule.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.projection``.
 """
 
 from __future__ import (
@@ -85,9 +62,10 @@ class BaseGridProjector(NativeOP):
     Subclasses build ``to_grid_mat`` with shape ``(G, J)`` and
     ``from_grid_mat`` with shape ``(J, G)``, where ``G`` is the number of grid
     samples and ``J`` is the flattened coefficient axis consumed by the grid
-    net. For ordinary S2 projections, ``J`` is the SO(3) feature coefficient
-    axis: ``D = (lmax + 1)^2`` in packed layout, or the retained ``D_m`` axis
-    in m-major layout.
+    net.  For ordinary S2 projections, ``J`` is the SO(3) feature coefficient
+    axis: ``D = (lmax + 1)^2`` in packed layout, or the retained ``D_m`` axis in
+    m-major layout.  For SO(3) frame projections, ``J = D * n_frames`` with
+    frame index packed inside each coefficient row.
     """
 
     def __init__(
@@ -123,8 +101,8 @@ def __init__(
         if self.grid_size != int(from_grid_mat.shape[1]):
             raise ValueError("Projection matrix grid axes `G` do not match")
         prec = PRECISION_DICT[self.precision.lower()]
-        self.to_grid_mat = np.ascontiguousarray(to_grid_mat).astype(prec)
-        self.from_grid_mat = np.ascontiguousarray(from_grid_mat).astype(prec)
+        self.to_grid_mat = np.ascontiguousarray(to_grid_mat, dtype=prec)
+        self.from_grid_mat = np.ascontiguousarray(from_grid_mat, dtype=prec)
 
     def call(self, *args: Any, **kwargs: Any) -> Any:
         """Projectors expose ``to_grid``/``from_grid``; there is no forward."""
@@ -138,8 +116,7 @@ def to_grid(self, embedding: Any) -> Any:
         to_grid_mat = xp_asarray_nodetach(
             xp, self.to_grid_mat[...], device=array_api_compat.device(embedding)
         )
-        if to_grid_mat.dtype != embedding.dtype:
-            to_grid_mat = xp.astype(to_grid_mat, embedding.dtype)
+        to_grid_mat = xp.astype(to_grid_mat, embedding.dtype)
         # einsum "gj,njc->ngc" as a broadcast batched matmul
         return xp.matmul(to_grid_mat[None, ...], embedding)
 
@@ -149,8 +126,7 @@ def from_grid(self, grid: Any) -> Any:
         from_grid_mat = xp_asarray_nodetach(
             xp, self.from_grid_mat[...], device=array_api_compat.device(grid)
         )
-        if from_grid_mat.dtype != grid.dtype:
-            from_grid_mat = xp.astype(from_grid_mat, grid.dtype)
+        from_grid_mat = xp.astype(from_grid_mat, grid.dtype)
         # einsum "jg,ngc->njc" as a broadcast batched matmul
         return xp.matmul(from_grid_mat[None, ...], grid)
 
@@ -172,7 +148,7 @@ def _build_projection_mats(
 
 class S2GridProjector(BaseGridProjector):
     """
-    Project SO(3) coefficients to/from a flattened S2 grid (Lebedev only).
+    Project SO(3) coefficients to/from a flattened S2 grid.
 
     Parameters
     ----------
@@ -183,15 +159,16 @@ class S2GridProjector(BaseGridProjector):
     precision
         Buffer precision used by the projection matrices.
     grid_resolution_list
-        Two-element resolution list ``[precision, n_points]`` for
-        ``grid_method='lebedev'``. If None, resolved automatically.
+        Two-element resolution list. For ``grid_method='e3nn'`` it is
+        ``[R_phi, R_theta]`` and is converted to the ``e3nn``
+        ``(lat, long) = (R_theta, R_phi)`` ordering. For
+        ``grid_method='lebedev'`` it is ``[precision, n_points]``.
     coefficient_layout
         Coefficient ordering expected by the caller:
         - ``"packed"``: packed ``(l, m)`` order, optionally truncated by ``mmax``.
         - ``"m_major"``: reduced m-major order used inside ``SO2Convolution``.
     grid_method
-        S2 quadrature backend. Must be ``"e3nn"`` or ``"lebedev"``; only
-        ``"lebedev"`` (``lebedev_quadrature=True``) is ported to dpmodel.
+        S2 quadrature backend. Must be ``"e3nn"`` or ``"lebedev"``.
     """
 
     def __init__(
@@ -209,11 +186,6 @@ def __init__(
         self.grid_method = str(grid_method).lower()
         if self.grid_method not in {"e3nn", "lebedev"}:
             raise ValueError("`grid_method` must be either 'e3nn' or 'lebedev'")
-        if self.grid_method == "e3nn":
-            raise NotImplementedError(
-                "grid_method='e3nn' (lebedev_quadrature=False) is not ported "
-                "to dpmodel; use lebedev_quadrature=True"
-            )
 
         self.grid_resolution_list = _normalize_s2_grid_resolution(
             lmax_i,
@@ -221,9 +193,14 @@ def __init__(
             grid_resolution_list,
             method=self.grid_method,
         )
-        self.phi_resolution = 0
-        self.theta_resolution = 0
-        self.lebedev_precision, self.lebedev_npoints = self.grid_resolution_list
+        if self.grid_method == "e3nn":
+            self.phi_resolution, self.theta_resolution = self.grid_resolution_list
+            self.lebedev_precision = 0
+            self.lebedev_npoints = 0
+        else:
+            self.phi_resolution = 0
+            self.theta_resolution = 0
+            self.lebedev_precision, self.lebedev_npoints = self.grid_resolution_list
 
         super().__init__(
             lmax=lmax_i,
@@ -233,6 +210,17 @@ def __init__(
             coefficient_layout=coefficient_layout,
         )
 
+    def _rescale_truncated_orders(self, mat: np.ndarray) -> None:
+        if self.lmax == self.mmax:
+            return
+        for degree in range(self.lmax + 1):
+            if degree <= self.mmax:
+                continue
+            start_idx = degree * degree
+            length = 2 * degree + 1
+            rescale = math.sqrt(length / float(2 * self.mmax + 1))
+            mat[:, :, start_idx : start_idx + length] *= rescale
+
     def _rescale_truncated_matrix(self, mat: np.ndarray) -> None:
         if self.lmax == self.mmax:
             return
@@ -247,6 +235,86 @@ def _rescale_truncated_matrix(self, mat: np.ndarray) -> None:
     def _build_projection_mats(
         self,
         coeff_index: np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        if self.grid_method == "lebedev":
+            return self._build_lebedev_projection_mats(coeff_index)
+        return self._build_e3nn_projection_mats(coeff_index)
+
+    def _build_e3nn_projection_mats(
+        self,
+        coeff_index: np.ndarray,
+    ) -> tuple[np.ndarray, np.ndarray]:
+        # Under the component normalization, the e3nn ``ToS2Grid``/``FromS2Grid``
+        # product-grid buffers evaluate the real spherical harmonics on the
+        # ``(beta, alpha)`` tensor-product grid: sampling
+        # ``real_spherical_harmonics`` on those grid points reproduces
+        # ``einsum("mbi,am->bai", ToS2Grid.shb, ToS2Grid.sha)``, and synthesis of
+        # the from-grid matrix multiplies in the e3nn beta quadrature weights.
+        # This keeps the e3nn and Lebedev S2 backends drop-in replacements for
+        # the same grid net.
+        res_beta = int(self.theta_resolution)
+        res_alpha = int(self.phi_resolution)
+        betas = (np.arange(res_beta, dtype=np.float64) + 0.5) / res_beta * math.pi
+        alphas = np.arange(res_alpha, dtype=np.float64) / res_alpha * (2.0 * math.pi)
+        beta_grid, alpha_grid = np.meshgrid(betas, alphas, indexing="ij")
+        grid_points = np.stack(
+            [
+                np.sin(beta_grid) * np.sin(alpha_grid),
+                np.cos(beta_grid),
+                np.sin(beta_grid) * np.cos(alpha_grid),
+            ],
+            axis=-1,
+        )
+        harmonics = real_spherical_harmonics(grid_points, self.lmax)
+        scale = math.sqrt(float(self.lmax + 1))
+        degree_factors = np.asarray(
+            [
+                float(2 * degree + 1)
+                for degree in range(self.lmax + 1)
+                for _ in range(2 * degree + 1)
+            ],
+            dtype=np.float64,
+        )
+        # e3nn beta quadrature weights (``FromS2Grid._quadrature_weights``),
+        # one weight per beta row, scaled by ``res_beta**2 / res_alpha``.
+        half = res_beta // 2
+        order = np.arange(half, dtype=np.float64)
+        beta_index = np.arange(2 * half, dtype=np.float64)
+        quad_inner = np.sum(
+            np.sin(
+                (2.0 * beta_index[:, None] + 1.0)
+                * (2.0 * order[None, :] + 1.0)
+                * math.pi
+                / (4.0 * half)
+            )
+            / (2.0 * order[None, :] + 1.0),
+            axis=1,
+        )
+        quad_weight = (
+            (2.0 / half)
+            * np.sin(math.pi * (2.0 * beta_index + 1.0) / (4.0 * half))
+            * quad_inner
+        )
+        quad_weight /= 2.0 * (2 * half) ** 2
+        quad_weight = quad_weight * (res_beta**2 / res_alpha)
+        to_grid_mat = harmonics / scale
+        from_grid_mat = harmonics * (
+            quad_weight[:, None, None] * scale * degree_factors[None, None, :]
+        )
+        self._rescale_truncated_orders(to_grid_mat)
+        self._rescale_truncated_orders(from_grid_mat)
+
+        to_grid_mat = np.reshape(to_grid_mat, (res_beta * res_alpha, -1))[
+            :, coeff_index
+        ]
+        from_grid_mat = np.reshape(from_grid_mat, (res_beta * res_alpha, -1)).T[
+            coeff_index, :
+        ]
+        return to_grid_mat, from_grid_mat
+
+    def _build_lebedev_projection_mats(
+        self,
+        coeff_index: np.ndarray,
     ) -> tuple[np.ndarray, np.ndarray]:
         points, weights = load_lebedev_rule(self.lebedev_precision)
         points = np.asarray(points, dtype=np.float64)
@@ -276,7 +344,6 @@ def _build_projection_mats(
         return to_grid_mat, from_grid_mat
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the S2GridProjector to a dict (pt-compatible format)."""
         return {
             "@class": "S2GridProjector",
             "@version": 1,
@@ -293,7 +360,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> S2GridProjector:
-        """Deserialize an S2GridProjector from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "S2GridProjector":
@@ -302,14 +368,7 @@ def deserialize(cls, data: dict[str, Any]) -> S2GridProjector:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         data.pop("@variables", None)
-        return cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            precision=str(config["precision"]),
-            grid_resolution_list=config["grid_resolution_list"],
-            coefficient_layout=str(config["coefficient_layout"]),
-            grid_method=str(config["grid_method"]),
-        )
+        return cls(**config)
 
 
 class SO3GridProjector(BaseGridProjector):
@@ -320,23 +379,6 @@ class SO3GridProjector(BaseGridProjector):
     ``(l, m)`` order outside and the configured frame set inside each row.  A
     frame index outside ``[-l, l]`` is kept as a zero column/row.  This keeps the
     tensor layout regular while preserving the exact per-degree frame support.
-
-    Parameters
-    ----------
-    lmax
-        Maximum spherical harmonic degree.
-    mmax
-        Maximum order kept in the coefficient layout. If None, use ``lmax``.
-    kmax
-        Frame-index half-width; the frame set is ``{0, -1, 1, ..., -kmax, kmax}``.
-    precision
-        Buffer precision used by the projection matrices.
-    lebedev_precision
-        Explicit Lebedev rule precision. If None, resolved automatically.
-    coefficient_layout
-        Coefficient ordering expected by the caller:
-        - ``"packed"``: packed ``(l, m)`` order, optionally truncated by ``mmax``.
-        - ``"m_major"``: reduced m-major order used inside ``SO2Convolution``.
     """
 
     def __init__(
@@ -368,7 +410,6 @@ def __init__(
             n_frames=len(self.frame_set),
             coefficient_layout=coefficient_layout,
         )
-        # plain numpy int64 attribute (becomes a torch buffer in pt_expt later)
         self.frame_values = np.asarray(self.frame_set, dtype=np.int64)
 
     def _build_projection_mats(
@@ -382,24 +423,20 @@ def _build_projection_mats(
             2.0 * math.pi / float(self.n_gamma)
         )
         edge_quaternion = build_edge_quaternion(points, eps=1e-14)
-        # torch ``repeat_interleave(n_gamma, dim=0)`` -> numpy ``repeat`` on axis 0
         edge_quaternion = np.repeat(edge_quaternion, self.n_gamma, axis=0)
-        # torch ``repeat(n_points, 1)`` (tile) -> numpy ``tile``
         gamma_quaternion = np.tile(quaternion_z_rotation(gamma), (points.shape[0], 1))
         grid_quaternion = quaternion_multiply(gamma_quaternion, edge_quaternion)
-        # WignerDCalculator.__call__ returns ``(D_full, Dt_full)``; take D_full.
-        wigner_grid = WignerDCalculator(self.lmax, precision="float64")(
+        wigner_grid, _ = WignerDCalculator(self.lmax, precision="float64")(
             grid_quaternion
-        )[0]
+        )
         # ``build_edge_quaternion`` follows SeZM's global-to-local convention.
-        # The transpose below stores the local m=0 column in the same layout as
-        # ``WignerDCalculator.forward_zonal`` and extends it to k != 0.
+        # The transpose below stores the local m=0 column in the same layout
+        # as ``WignerDCalculator.forward_zonal`` and extends it to k != 0.
         wigner_grid = np.ascontiguousarray(np.swapaxes(wigner_grid, -1, -2))
         haar_weight = np.repeat(weights, self.n_gamma) / float(self.n_gamma)
 
         grid_size = int(grid_quaternion.shape[0])
-        n_frames = len(self.frame_set)
-        coeff_dim = int(coeff_index.shape[0]) * n_frames
+        coeff_dim = int(coeff_index.shape[0] * len(self.frame_set))
         to_grid_mat = np.zeros((grid_size, coeff_dim), dtype=np.float64)
         from_grid_mat = np.zeros((coeff_dim, grid_size), dtype=np.float64)
 
@@ -407,14 +444,12 @@ def _build_projection_mats(
             degree_factor = float(2 * degree + 1)
             for m_order in range(-degree, degree + 1):
                 packed_idx = so3_packed_index(degree, m_order)
-                # init-time numpy control-flow index; replaces pt's
-                # ``(coeff_index == packed_idx).nonzero(as_tuple=False)``
                 coeff_positions = np.argwhere(coeff_index == packed_idx)
                 if coeff_positions.size == 0:
                     continue
                 coeff_pos = int(coeff_positions[0, 0])
                 for frame_pos, frame_order in enumerate(self.frame_set):
-                    flat_idx = coeff_pos * n_frames + frame_pos
+                    flat_idx = coeff_pos * len(self.frame_set) + frame_pos
                     if abs(frame_order) > degree:
                         continue
                     row = so3_packed_index(degree, m_order)
@@ -425,7 +460,6 @@ def _build_projection_mats(
         return to_grid_mat, from_grid_mat
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the SO3GridProjector to a dict (pt-compatible format)."""
         return {
             "@class": "SO3GridProjector",
             "@version": 1,
@@ -442,7 +476,6 @@ def serialize(self) -> dict[str, Any]:
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> SO3GridProjector:
-        """Deserialize an SO3GridProjector from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "SO3GridProjector":
@@ -451,14 +484,7 @@ def deserialize(cls, data: dict[str, Any]) -> SO3GridProjector:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         data.pop("@variables", None)
-        return cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            kmax=int(config["kmax"]),
-            precision=str(config["precision"]),
-            lebedev_precision=int(config["lebedev_precision"]),
-            coefficient_layout=str(config["coefficient_layout"]),
-        )
+        return cls(**config)
 
 
 def resolve_s2_grid_resolution(
@@ -496,40 +522,6 @@ def resolve_s2_grid_resolution(
     return [phi_resolution, theta_resolution]
 
 
-def _normalize_s2_grid_resolution(
-    lmax: int,
-    mmax: int,
-    grid_resolution_list: list[int] | None,
-    *,
-    method: str,
-) -> list[int]:
-    """Resolve default grids or validate already-resolved low-level grids."""
-    method = str(method).lower()
-    if grid_resolution_list is None:
-        return resolve_s2_grid_resolution(lmax, mmax, method=method)
-    if method == "lebedev":
-        if len(grid_resolution_list) != 2:
-            raise ValueError(
-                "Lebedev `grid_resolution_list` must be [precision, n_points]"
-            )
-        precision = int(grid_resolution_list[0])
-        n_points = int(grid_resolution_list[1])
-        expected_n_points = LEBEDEV_PRECISION_TO_NPOINTS.get(precision)
-        if expected_n_points != n_points:
-            raise ValueError(
-                "Lebedev `grid_resolution_list` must match a packaged "
-                f"[precision, n_points] pair; got [{precision}, {n_points}]"
-            )
-        return [precision, n_points]
-
-    if len(grid_resolution_list) != 2:
-        raise ValueError("`grid_resolution_list` must contain two integers")
-    resolution = [int(grid_resolution_list[0]), int(grid_resolution_list[1])]
-    if resolution[0] < 1 or resolution[1] < 1:
-        raise ValueError("grid resolutions must be positive")
-    return resolution
-
-
 def resolve_so3_grid(
     lmax: int,
     *,
@@ -574,6 +566,40 @@ def resolve_so3_grid(
     return int(lebedev_precision), int(lebedev_npoints), int(n_gamma)
 
 
+def _normalize_s2_grid_resolution(
+    lmax: int,
+    mmax: int,
+    grid_resolution_list: list[int] | None,
+    *,
+    method: str,
+) -> list[int]:
+    """Resolve default grids or validate already-resolved low-level grids."""
+    method = str(method).lower()
+    if grid_resolution_list is None:
+        return resolve_s2_grid_resolution(lmax, mmax, method=method)
+    if method == "lebedev":
+        if len(grid_resolution_list) != 2:
+            raise ValueError(
+                "Lebedev `grid_resolution_list` must be [precision, n_points]"
+            )
+        precision = int(grid_resolution_list[0])
+        n_points = int(grid_resolution_list[1])
+        expected_n_points = LEBEDEV_PRECISION_TO_NPOINTS.get(precision)
+        if expected_n_points != n_points:
+            raise ValueError(
+                "Lebedev `grid_resolution_list` must match a packaged "
+                f"[precision, n_points] pair; got [{precision}, {n_points}]"
+            )
+        return [precision, n_points]
+
+    if len(grid_resolution_list) != 2:
+        raise ValueError("`grid_resolution_list` must contain two integers")
+    resolution = [int(grid_resolution_list[0]), int(grid_resolution_list[1])]
+    if resolution[0] < 1 or resolution[1] < 1:
+        raise ValueError("grid resolutions must be positive")
+    return resolution
+
+
 def _build_so3_frame_set(kmax: int) -> list[int]:
     """Build the symmetric frame-index set with zero first."""
     kmax_i = int(kmax)
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/radial.py b/deepmd/dpmodel/descriptor/dpa4_nn/radial.py
index 94e506fd0b..3766588de9 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/radial.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/radial.py
@@ -2,14 +2,11 @@
 """
 Radial building blocks for the DPA4/SeZM descriptor.
 
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.radial``.
-It defines the cutoff envelope, radial basis, and radial multilayer perceptron
-used by the DPA4 descriptor. ``InnerClamp`` and ``BridgingSwitch`` are ported
-by later tasks together with the modules that consume them.
-
-Serialization contract: the ``@variables`` keys of each class match the
-``state_dict`` key names of its pt counterpart, so pt ``serialize()`` output
-deserializes directly into the dpmodel classes (and vice versa).
+This module defines the cutoff envelope, inner-distance clamp, radial basis,
+and radial multilayer perceptron used by SeZM.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.radial``.
 """
 
 from __future__ import (
@@ -66,8 +63,6 @@ class RadialMLP(NativeOP):
         Floating point precision for the linear layers.
     trainable : bool
         Whether the parameters are trainable.
-    seed : int | list[int] | None
-        Random seed for the layer initialization.
 
     Architecture
     ------------
@@ -94,35 +89,36 @@ def __init__(
     ) -> None:
         if len(mlp_layers) < 2:
             raise ValueError("`mlp_layers` must have at least 2 elements")
-        self.mlp_layers = [int(d) for d in mlp_layers]
+        self.mlp_layers = list(mlp_layers)
         self.activation_function = str(activation_function)
         self.precision = precision
         self.trainable = bool(trainable)
 
-        n_layers = len(self.mlp_layers)
-        self.layers: list[NativeLayer] = []
-        self.norms: list[RMSNorm] = []
+        modules: list = []
+        n_layers = len(mlp_layers)
         for i in range(n_layers - 1):
-            self.layers.append(
-                NativeLayer(
-                    self.mlp_layers[i],
-                    self.mlp_layers[i + 1],
-                    bias=False,
-                    activation_function=None,
-                    precision=self.precision,
-                    seed=child_seed(seed, i),
-                    trainable=self.trainable,
-                )
+            linear = NativeLayer(
+                mlp_layers[i],
+                mlp_layers[i + 1],
+                bias=False,
+                activation_function=None,
+                precision=self.precision,
+                seed=child_seed(seed, i),
+                trainable=trainable,
             )
+            modules.append(linear)
             # Last layer: no RMSNorm/activation
             if i < n_layers - 2:
-                self.norms.append(
+                modules.append(
                     RMSNorm(
-                        channels=self.mlp_layers[i + 1],
+                        channels=mlp_layers[i + 1],
                         precision=self.precision,
-                        trainable=self.trainable,
+                        trainable=trainable,
                     )
                 )
+                modules.append(get_activation_fn(self.activation_function))
+
+        self.net = modules
 
     def call(self, x: Any) -> Any:
         """
@@ -138,29 +134,18 @@ def call(self, x: Any) -> Any:
         Array
             Output array with shape (..., mlp_layers[-1]).
         """
-        n_hidden = len(self.norms)
-        for i, layer in enumerate(self.layers):
-            x = layer.call(x)
-            if i < n_hidden:
-                x = self.norms[i].call(x)
-                fn = get_activation_fn(self.activation_function)
-                x = fn(x)
+        for layer in self.net:
+            x = layer(x)
         return x
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the RadialMLP to a dict.
-
-        The ``@variables`` keys follow the pt ``net.state_dict()`` naming:
-        ``{3*i}.matrix`` for the i-th linear layer and ``{3*i+1}.adam_scale``
-        for the i-th RMSNorm (activation modules are parameter-free).
-        """
+        """Serialize the RadialMLP to a dict."""
         variables: dict[str, np.ndarray] = {}
-        for i, layer in enumerate(self.layers):
-            variables[f"{3 * i}.matrix"] = to_numpy_array(layer.w)
-            if i < len(self.norms):
-                variables[f"{3 * i + 1}.adam_scale"] = to_numpy_array(
-                    self.norms[i].adam_scale
-                )
+        for idx, layer in enumerate(self.net):
+            if isinstance(layer, NativeLayer):
+                variables[f"{idx}.matrix"] = to_numpy_array(layer.w)
+            elif isinstance(layer, RMSNorm):
+                variables[f"{idx}.adam_scale"] = to_numpy_array(layer.adam_scale)
         return {
             "@class": "RadialMLP",
             "@version": 1,
@@ -181,37 +166,16 @@ def deserialize(cls, data: dict[str, Any]) -> RadialMLP:
         version = int(data.pop("@version"))
         check_version_compatibility(version, 1, 1)
         variables = data.pop("@variables")
-        precision = str(data.pop("dtype"))
-        obj = cls(
-            data.pop("mlp_layers"),
-            activation_function=str(data.pop("activation_function")),
-            precision=precision,
-            trainable=bool(data.pop("trainable")),
-        )
-        prec = PRECISION_DICT[precision.lower()]
-        expected_keys = {f"{3 * i}.matrix" for i in range(len(obj.layers))} | {
-            f"{3 * i + 1}.adam_scale" for i in range(len(obj.norms))
-        }
-        if set(variables) != expected_keys:
-            raise ValueError(
-                f"variable keys {sorted(variables)} do not match the expected "
-                f"keys {sorted(expected_keys)}"
-            )
+        data["precision"] = data.pop("dtype")
+        obj = cls(**data)
+        prec = PRECISION_DICT[obj.precision.lower()]
         for key, value in variables.items():
-            idx_s, _, name = key.partition(".")
-            idx = int(idx_s)
-            value = np.asarray(value, dtype=prec)
+            idx, _, name = key.partition(".")
+            layer = obj.net[int(idx)]
             if name == "matrix":
-                layer = obj.layers[idx // 3]
-                if value.shape != layer.w.shape:
-                    raise ValueError(
-                        f"shape of {key} {value.shape} does not match "
-                        f"the layer shape {layer.w.shape}"
-                    )
-                layer.w = value
+                layer.w = np.asarray(value, dtype=prec)
             else:
-                norm = obj.norms[idx // 3]
-                norm.adam_scale = value.reshape(norm.adam_scale.shape)
+                layer.adam_scale = np.asarray(value, dtype=prec)
         return obj
 
 
@@ -253,9 +217,21 @@ class C3CutoffEnvelope(NativeOP):
         Cutoff radius in Å.
     exponent : int, optional
         Polynomial exponent (p), must be positive. Default is 5.
-    precision : str
-        Floating point precision label (kept for config parity with pt; the
-        computation follows the input dtype).
+
+    Attributes
+    ----------
+    rcut : float
+        Cutoff radius in Å.
+    p : float
+        Polynomial exponent.
+    a : float
+        Quadratic coefficient for x^p term.
+    b : float
+        Linear coefficient for x^(p+1) term.
+    c : float
+        Quadratic coefficient for x^(p+2) term.
+    d : float
+        Cubic coefficient for x^(p+3) term.
     """
 
     def __init__(
@@ -287,33 +263,160 @@ def call(self, dst: Any) -> Any:
         env_val = 1 + d_scaled**self.p * poly
         return env_val * xp.astype(d_scaled < 1.0, dst.dtype)
 
-    def serialize(self) -> dict[str, Any]:
-        """Serialize the C3CutoffEnvelope to a dict (config only, no state)."""
-        return {
-            "@class": "C3CutoffEnvelope",
-            "@version": 1,
-            "config": {
-                "rcut": self.rcut,
-                "exponent": self.p,
-                "precision": np.dtype(PRECISION_DICT[self.precision]).name,
-            },
-        }
 
-    @classmethod
-    def deserialize(cls, data: dict[str, Any]) -> C3CutoffEnvelope:
-        """Deserialize a C3CutoffEnvelope from a dict."""
-        data = data.copy()
-        data_cls = data.pop("@class")
-        if data_cls != "C3CutoffEnvelope":
-            raise ValueError(f"Invalid class for C3CutoffEnvelope: {data_cls}")
-        version = int(data.pop("@version"))
-        check_version_compatibility(version, 1, 1)
-        config = data.pop("config")
-        return cls(
-            rcut=float(config["rcut"]),
-            exponent=int(config["exponent"]),
-            precision=str(config["precision"]),
+class InnerClamp(NativeOP):
+    """
+    C3-continuous inner distance clamping for zone bridging.
+
+    Applies a septic Hermite polynomial transition that freezes distances
+    below ``r_inner`` to the constant ``r_inner``, then smoothly transitions
+    back to identity at ``r_outer``::
+
+        r̃(r) = r_inner                                    if r <= r_inner
+        r̃(r) = r_inner + (r_outer - r_inner) * h(t)       if r_inner < r < r_outer
+        r̃(r) = r                                          if r >= r_outer
+
+        h(t) = 20t^4 - 45t^5 + 36t^6 - 10t^7,  t = (r - r_inner) / (r_outer - r_inner)
+
+    Boundary conditions:
+    ``h(0)=0``, ``h(1)=1``, ``h'(0)=0``, ``h'(1)=1``,
+    ``h''(0)=0``, ``h''(1)=0``, ``h'''(0)=0``, ``h'''(1)=0``.
+    This ensures C3 continuity: ``dr̃/dr = 0`` at r_inner (frozen zone) and
+    ``dr̃/dr = 1`` at r_outer (identity zone), with matched second and third
+    derivatives at both boundaries.
+
+    Parameters
+    ----------
+    r_inner : float
+        Freeze radius in Å. Distances below this are clamped to ``r_inner``.
+    r_outer : float
+        Outer boundary of the transition zone in Å. Above this, ``r̃ = r``.
+
+    Raises
+    ------
+    ValueError
+        If ``r_inner >= r_outer`` or either is non-positive.
+    """
+
+    def __init__(self, r_inner: float, r_outer: float) -> None:
+        if r_inner <= 0 or r_outer <= 0:
+            raise ValueError("r_inner and r_outer must be positive")
+        if r_inner >= r_outer:
+            raise ValueError(f"r_inner ({r_inner}) must be < r_outer ({r_outer})")
+        self.r_inner = float(r_inner)
+        self.r_outer = float(r_outer)
+
+    def call(self, r: Any) -> Any:
+        """
+        Apply inner distance clamping.
+
+        Parameters
+        ----------
+        r : Array
+            Pair distances with shape (...) or (..., 1) in Å.
+
+        Returns
+        -------
+        Array
+            Clamped distances r̃ with the same shape as input.
+        """
+        xp = array_api_compat.array_namespace(r)
+        t = xp.clip(
+            (r - self.r_inner) / (self.r_outer - self.r_inner), min=0.0, max=1.0
         )
+        t2 = t * t
+        t4 = t2 * t2
+        # h(t) = 20t^4 - 45t^5 + 36t^6 - 10t^7
+        # Satisfies:
+        #   h(0)=0, h(1)=1
+        #   h'(0)=0, h'(1)=1
+        #   h''(0)=0, h''(1)=0
+        #   h'''(0)=0, h'''(1)=0
+        h = t4 * (20.0 + t * (-45.0 + t * (36.0 - 10.0 * t)))
+        interpolated = self.r_inner + (self.r_outer - self.r_inner) * h
+        # Identity zone: r >= r_outer returns r directly.
+        # Both branches have matching first three derivatives at r_outer,
+        # so xp.where preserves C3 continuity here.
+        return xp.where(r >= self.r_outer, r, interpolated)
+
+
+class BridgingSwitch(NativeOP):
+    r"""
+    C3-continuous switching amplitude for the SeZM bridging zone.
+
+    ``BridgingSwitch`` returns a per-edge scalar amplitude in ``[0, 1]``
+    that measures how far an edge sits outside the frozen zone. It is
+    the elementary piece the Source Freeze Propagation Gate (SFPG)
+    aggregates into a per-node "non-frozen confidence" via a product
+    over each source node's outgoing edges::
+
+        w(r) = 0                                             if r <= r_inner  (frozen)
+        w(r) = h((r - r_inner) / (r_outer - r_inner))        if r_inner < r < r_outer  (transition)
+        w(r) = 1                                             if r >= r_outer  (normal)
+
+        h(t) = 35 t^4 - 84 t^5 + 70 t^6 - 20 t^7
+
+    Boundary conditions at ``t=0`` and ``t=1``::
+
+        h(0)   = h'(0)   = h''(0)   = h'''(0)   = 0
+        h(1)=1, h'(1)    = h''(1)   = h'''(1)   = 0
+
+    The vanishing first three derivatives at both endpoints give
+    ``w \in C^3(\mathbb{R}_{\ge 0})`` with zero slope/curvature at
+    ``r_inner`` and ``r_outer``, so forces (first derivatives) and the
+    force derivatives consumed by second-order training stay continuous
+    across both zone boundaries.
+
+    The surrounding infrastructure (``compute_edge_src_gate``) owns the
+    per-node product reduction and broadcast; this module only encodes
+    the scalar amplitude shape.
+
+    Parameters
+    ----------
+    r_inner : float
+        Inner radius in Å. At or below this distance ``w = 0``.
+    r_outer : float
+        Outer radius in Å. At or above this distance ``w = 1``.
+
+    Raises
+    ------
+    ValueError
+        If ``r_inner <= 0``, ``r_outer <= 0``, or ``r_inner >= r_outer``.
+    """
+
+    def __init__(self, r_inner: float, r_outer: float) -> None:
+        if r_inner <= 0 or r_outer <= 0:
+            raise ValueError("r_inner and r_outer must be positive")
+        if r_inner >= r_outer:
+            raise ValueError(f"r_inner ({r_inner}) must be < r_outer ({r_outer})")
+        self.r_inner = float(r_inner)
+        self.r_outer = float(r_outer)
+
+    def call(self, r: Any) -> Any:
+        """
+        Evaluate the C3 switching amplitude.
+
+        Parameters
+        ----------
+        r : Array
+            Pair distances with shape (...) or (..., 1) in Å.
+
+        Returns
+        -------
+        Array
+            Switching amplitudes in ``[0, 1]`` with the same shape as input.
+        """
+        xp = array_api_compat.array_namespace(r)
+        t = xp.clip(
+            (r - self.r_inner) / (self.r_outer - self.r_inner), min=0.0, max=1.0
+        )
+        t2 = t * t
+        t4 = t2 * t2
+        # h(t) = 35 t^4 - 84 t^5 + 70 t^6 - 20 t^7  (Horner form).
+        # Degree-7 smootherstep: the unique polynomial of this degree that
+        # hits ``w(r_inner)=0, w(r_outer)=1`` together with C3 flatness at
+        # both radii.
+        return t4 * (35.0 + t * (-84.0 + t * (70.0 - 20.0 * t)))
 
 
 class RadialBasis(NativeOP):
@@ -325,15 +428,14 @@ class RadialBasis(NativeOP):
 
     Notes
     -----
-    The Bessel basis uses the normalized sinc function for numerical
-    stability::
+    The Bessel basis uses PyTorch's sinc function for numerical stability::
 
         phi_n(r) = w_n * sinc(w_n * r / π)
 
-    where ``sinc(z) = sin(π*z) / (π*z)`` with ``sinc(0) = 1`` (same convention
-    as ``torch.sinc`` and ``np.sinc``). This is mathematically equivalent to
-    the standard form ``sin(w_n * r) / r``, but sinc handles the r->0 limit,
-    providing continuous gradients without explicit epsilon clamping.
+    where ``torch.sinc(z) = sin(π*z) / (π*z)``. This is mathematically
+    equivalent to the standard form ``sin(w_n * r) / r``, but sinc handles
+    the r->0 limit via Taylor expansion, providing continuous gradients
+    without explicit epsilon clamping.
 
     The ``r -> 0`` limit is finite::
 
@@ -350,10 +452,10 @@ class RadialBasis(NativeOP):
     ----------
     rcut : float
         Cutoff radius in Å.
-    basis_type : str, optional
-        Radial basis type. Supported values are ``"bessel"`` and ``"gaussian"``.
     n_radial : int
         Number of basis functions.
+    basis_type : str, optional
+        Radial basis type. Supported values are ``"bessel"`` and ``"gaussian"``.
     precision : str
         Floating-point precision for the radial basis frequencies and outputs.
     exponent : int, optional
@@ -380,6 +482,7 @@ def __init__(
         self.precision = precision
         self.exponent = int(exponent)
         prec = PRECISION_DICT[self.precision.lower()]
+        self.pi_tensor = math.pi
 
         # Frequencies: n*π/rcut, n=1..n_radial
         # Shape: (1, n_radial), stored as a trainable array.
@@ -404,30 +507,28 @@ def call(self, r: Any) -> Any:
         Parameters
         ----------
         r : Array
-            Pair distances with shape (N, 1) in Å, where N is the number of
-            pairs.
+            Pair distances with shape (N, 1) in Å, where N is the number of pairs.
 
         Returns
         -------
         Array
-            Radial basis multiplied by C^3 cutoff envelope with shape
-            (N, n_rbf). The output is smoothly truncated to zero at r = rcut.
+            Radial basis multiplied by C^3 cutoff envelope with shape (N, n_rbf).
+            The output is smoothly truncated to zero at r = rcut.
         """
         xp = array_api_compat.array_namespace(r)
         freqs = xp_asarray_nodetach(
-            xp, self.adam_freqs, device=array_api_compat.device(r)
+            xp, self.adam_freqs[...], device=array_api_compat.device(r)
         )
         # === Step 1. Radial basis ===
         # Shape: (N, 1) * (1, n_radial) -> (N, n_radial)
         if self.basis_type == "bessel":
             # phi_n(r) = w_n * sinc(w_n * r / π)
             x = r * freqs  # (N, n_rbf)
-            # normalized sinc, mirroring torch.sinc(x / π):
-            # sinc(z) = sin(π*z) / (π*z), with sinc(0) = 1.
-            # The zero branch is selected through a safe denominator so that
-            # gradients stay finite at r = 0.
-            z = x / math.pi
-            pz = math.pi * z
+            # torch.sinc(z) = sin(π z) / (π z) with sinc(0) = 1. The array API
+            # has no sinc, so evaluate it directly with a guarded denominator so
+            # the r -> 0 limit and its gradient stay finite.
+            z = x / self.pi_tensor
+            pz = self.pi_tensor * z
             zero = z == 0.0
             safe_pz = xp.where(zero, xp.ones_like(pz), pz)
             sinc = xp.where(zero, xp.ones_like(pz), xp.sin(safe_pz) / safe_pz)
@@ -437,7 +538,7 @@ def call(self, r: Any) -> Any:
             raw = xp.exp(dr * dr * self.gaussian_coeff)  # (N, n_rbf)
 
         # === Step 2. Apply C^3 envelope for smooth cutoff ===
-        envelope = self.envelope.call(r)  # (N, 1)
+        envelope = self.envelope(r)  # (N, 1)
         return raw * envelope
 
     def serialize(self) -> dict[str, Any]:
@@ -469,18 +570,12 @@ def deserialize(cls, data: dict[str, Any]) -> RadialBasis:
         precision = str(config["precision"])
         obj = cls(
             rcut=float(config["rcut"]),
-            basis_type=str(config.get("basis_type", "bessel")),
             n_radial=int(config["n_radial"]),
-            precision=precision,
+            basis_type=str(config.get("basis_type", "bessel")),
             exponent=int(config.get("exponent", 7)),
+            precision=precision,
         )
         if variables is not None:
             prec = PRECISION_DICT[precision.lower()]
-            adam_freqs = np.asarray(variables["adam_freqs"], dtype=prec)
-            if adam_freqs.shape != obj.adam_freqs.shape:
-                raise ValueError(
-                    f"adam_freqs shape {adam_freqs.shape} does not match "
-                    f"the expected shape {obj.adam_freqs.shape}"
-                )
-            obj.adam_freqs = adam_freqs
+            obj.adam_freqs = np.asarray(variables["adam_freqs"], dtype=prec)
         return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/so2.py b/deepmd/dpmodel/descriptor/dpa4_nn/so2.py
index 97774155cf..6173d77e45 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/so2.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/so2.py
@@ -2,31 +2,11 @@
 """
 SO(2)-equivariant message-passing layers for DPA4/SeZM.
 
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.so2``.
-It defines the reduced-layout SO(2) linear operator, the edge-conditioned
-radial degree mixer, and the edge convolution used inside SeZM interaction
-blocks.
-
-Padded-edge adaptation
-----------------------
-The pt ``SO2Convolution`` consumes a flat *sparse* edge list and aggregates
-per destination node with ``index_add_``. The dpmodel port uses the padded,
-frame-explicit edge layout documented in ``edge_cache.EdgeCache``
-(``E = nf * nloc * nnei`` with invalid slots marked by ``edge_mask == 0``),
-so every destination aggregation becomes a masked sum over the ``nnei`` axis
-and the destination-wise softmax becomes a masked softmax over ``nnei``
-(see ``attention.segment_envelope_gated_softmax``). Per-edge math (the
-SO(2) linear application, the Wigner rotations via the ``D_to_m``
-projections, and the radial modulation) is identical to pt, just evaluated
-over the padded edge axis.
-
-Branches guarded with ``NotImplementedError`` (flags unused by the core DPA4
-config): ``so2_attn_res != "none"``, ``layer_scale``, ``s2_activation``,
-``atten_f_mix``, ``atten_v_proj``, ``atten_o_proj``.
-
-The cross-mode SO(3)/S2 grid products (``node_wise_s2``/``node_wise_so3`` and
-``message_node_s2``/``message_node_so3``) are ported and wired into the
-convolution, mirroring the pt ``SO2Convolution`` forward placement.
+This module defines the reduced-layout SO(2) linear operator and the
+edge convolution used inside SeZM interaction blocks.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.so2``.
 """
 
 from __future__ import (
@@ -48,12 +28,17 @@
     NativeOP,
 )
 from deepmd.dpmodel.array_api import (
+    xp_add_at,
     xp_asarray_nodetach,
     xp_sigmoid,
 )
 from deepmd.dpmodel.common import (
+    get_xp_precision,
     to_numpy_array,
 )
+from deepmd.dpmodel.utils.network import (
+    Identity,
+)
 from deepmd.dpmodel.utils.seed import (
     child_seed,
 )
@@ -67,6 +52,13 @@
 from .attention import (
     segment_envelope_gated_softmax,
 )
+from .attn_res import (
+    DepthAttnRes,
+)
+from .cartesian import (
+    EdgeCartesianTensorProduct,
+    NodeCartesianTensorProduct,
+)
 from .grid_net import (
     S2GridNet,
     SO3GridNet,
@@ -94,46 +86,20 @@
 )
 from .utils import (
     ATTN_RES_MODES,
+    get_promoted_dtype,
     init_trunc_normal_fan_in_out,
 )
 
 if TYPE_CHECKING:
+    from deepmd.dpmodel.array_api import (
+        Array,
+    )
+
     from .edge_cache import (
         EdgeCache,
     )
 
 
-def _compute_precision(precision: str) -> str:
-    """Promote fp16/bf16 to fp32 (dpmodel analog of pt ``get_promoted_dtype``)."""
-    name = np.dtype(PRECISION_DICT[precision.lower()]).name
-    if "float16" in name:  # matches float16 and bfloat16
-        return "float32"
-    return precision
-
-
-def _check_shape_assign(obj: Any, attr: str, value: Any, dtype: Any, key: str) -> None:
-    """Assign ``value`` (cast to ``dtype``) to ``obj.attr`` with a shape check."""
-    expected = getattr(obj, attr)
-    arr = np.asarray(value, dtype=dtype)
-    if arr.shape != expected.shape:
-        raise ValueError(
-            f"{key} shape {arr.shape} does not match the expected shape "
-            f"{expected.shape}"
-        )
-    setattr(obj, attr, arr)
-
-
-def _check_index_table(expected: np.ndarray, value: Any, key: str) -> None:
-    """Validate that a serialized integer index table matches the rebuilt one."""
-    arr = np.asarray(value, dtype=np.int64)
-    # ``expected`` is a rebuilt buffer that may be a (possibly CUDA) torch
-    # tensor in the pt_expt backend; ``np.asarray`` raises on CUDA tensors and
-    # ``np.array_equal`` would silently swallow that into ``False``. Convert via
-    # ``to_numpy_array`` (dlpack-through-CPU fallback) before comparing.
-    if not np.array_equal(arr.reshape(-1), to_numpy_array(expected).reshape(-1)):
-        raise ValueError(f"{key} does not match the table derived from the config")
-
-
 class SO2Linear(NativeOP):
     """
     SO(2)-equivariant linear mixing in the edge-aligned local frame.
@@ -145,15 +111,16 @@ class SO2Linear(NativeOP):
         [  m=0: l=0..lmax  |  m=1: (l,-1) then (l,+1)  |  ...  |  m=mmax: ... ]
          |___ lmax+1 ____|   |_______ 2*(lmax) ________|
 
-    Each |m| group is contiguous, enabling per-group block matmuls.
+    Each |m| group is contiguous, enabling a single block-diagonal matmul.
 
     Block-diagonal weight structure
     -------------------------------
-    The conceptual full weight matrix is block-diagonal over |m| groups::
+    The full weight matrix W has shape ``(F, D_m_trunc*Cout, D_m_trunc*Cin)``
+    and is block-diagonal over |m| groups::
 
         W = diag[W_m0, B_m1, B_m2, ..., B_mmax]
 
-    - ``W_m0``: unconstrained ``(num_l*Cin, num_l*Cout)`` block for m=0.
+    - ``W_m0``: unconstrained ``(num_l*Cout, num_l*Cin)`` block for m=0.
       Cross-l mixing is allowed since m=0 coefficients are real scalars.
 
     - ``B_m`` (|m|>0): SO(2)-constrained 2x2 block coupling (-m, +m) pairs::
@@ -163,12 +130,12 @@ class SO2Linear(NativeOP):
 
       This structure is the real-valued form of complex multiplication
       ``(u + iv)(a + ib) = (ua - vb) + i(va + ub)``, which guarantees
-      SO(2) equivariance.
+      SO(2) equivariance: rotating the input by angle phi around z
+      rotates the output by the same angle.
 
-    Unlike pt (which assembles the dense block-diagonal matrix and applies a
-    single ``einsum``), the dpmodel forward contracts the diagonal blocks
-    directly with slicing + matmul + concat, which is array-API friendly and
-    numerically equivalent (the off-block entries are exact zeros).
+    The weight is assembled once per forward (training) or cached (eval)
+    by ``_build_so2_weight()``, then applied via a single batched matmul
+    over all focus streams: ``einsum("efi,foi->efo")``.
 
     Parameters
     ----------
@@ -182,7 +149,7 @@ class SO2Linear(NativeOP):
         Number of output channels per (l, m) coefficient.
     n_focus
         Number of independent focus streams. Each stream has its own
-        weight matrices.
+        weight matrices; the batched matmul vectorizes over all streams.
     precision
         Parameter precision.
     mlp_bias
@@ -203,8 +170,8 @@ def __init__(
         n_focus: int = 1,
         precision: str = DEFAULT_PRECISION,
         mlp_bias: bool = False,
-        seed: int | list[int] | None = None,
-        trainable: bool = True,
+        seed: int | list[int] | None,
+        trainable: bool,
     ) -> None:
         self.lmax = int(lmax)
         self.mmax = int(self.lmax if mmax is None else mmax)
@@ -217,7 +184,6 @@ def __init__(
         self.n_focus = int(n_focus)
         self.precision = precision
         self.mlp_bias = bool(mlp_bias)
-        self.trainable = bool(trainable)
         prec = PRECISION_DICT[self.precision.lower()]
 
         # === Step 1. Build m-major coefficient layout ===
@@ -243,12 +209,10 @@ def __init__(
             num_l = self.lmax - m + 1
             neg_start = offset
             pos_start = offset + num_l
-            neg_indices_list.append(
-                np.arange(neg_start, neg_start + num_l, dtype=np.int64)
-            )
-            pos_indices_list.append(
-                np.arange(pos_start, pos_start + num_l, dtype=np.int64)
-            )
+            neg_idx = np.arange(neg_start, neg_start + num_l, dtype=np.int64)
+            pos_idx = np.arange(pos_start, pos_start + num_l, dtype=np.int64)
+            neg_indices_list.append(neg_idx)
+            pos_indices_list.append(pos_idx)
             m_ranges.append((neg_start, pos_start, num_l))
             offset += 2 * num_l
 
@@ -257,10 +221,11 @@ def __init__(
         if len(pos_indices_list) > 0:
             self.pos_indices = np.concatenate(pos_indices_list)
             self.neg_indices = np.concatenate(neg_indices_list)
+            self._m_ranges = m_ranges
         else:
             self.pos_indices = np.empty(0, dtype=np.int64)
             self.neg_indices = np.empty(0, dtype=np.int64)
-        self._m_ranges = m_ranges
+            self._m_ranges = []
 
         # === Step 2. Learnable weight parameters ===
         # weight_m0: folded (num_l*Cin, F*num_l*Cout) storage — (in, out) convention.
@@ -275,19 +240,19 @@ def __init__(
             init_trunc_normal_fan_in_out(
                 weight_m0_view[:, focus_idx, :], child_seed(seed, 1000 + focus_idx)
             )
-        self.weight_m0 = weight_m0
+        self.weight_m0 = weight_m0.astype(prec)
         if self.mlp_bias:
             self.bias0: np.ndarray | None = np.zeros(
-                (self.n_focus * self.out_channels,), dtype=prec
+                self.n_focus * self.out_channels, dtype=prec
             )
         else:
             self.bias0 = None
 
-        # weight_m[i]: folded (num_l*Cin, F*2*num_l*Cout) storage — (in, out)
-        #   convention. Runtime view: (num_l*Cin, F, 2*num_l*Cout).
+        # weight_m[i]: folded (num_l*Cin, F*2*num_l*Cout) storage — (in, out) convention.
+        #   Runtime view: (num_l*Cin, F, 2*num_l*Cout).
         #   The factor of 2 comes from storing W_u and W_v concatenated along the
-        #   output axis. Scaling by 1/sqrt(2) compensates for the doubled
-        #   parameter count.
+        #   output axis. _build_so2_weight() splits them and fills the 2x2 block.
+        #   Scaling by 1/sqrt(2) compensates for the doubled parameter count.
         self.weight_m: list[np.ndarray] = []
         for m in range(1, self.mmax + 1):
             num_l = self.lmax - m + 1
@@ -302,13 +267,16 @@ def __init__(
                 )
             # Apply scaling for SO(2) equivariance
             weight *= 1.0 / math.sqrt(2.0)
-            self.weight_m.append(weight)
+            self.weight_m.append(weight.astype(prec))
+
+        self.trainable = bool(trainable)
 
-        # === Step 3. Precompute flattened slice ranges for the block matmuls ===
+        # === Step 3. Precompute flattened slice ranges for _build_so2_weight ===
         # Each |m|>0 group occupies two sub-blocks (neg, pos) in the flattened
-        # coefficient*channel axis.
-        # Tuple layout: (neg_i0, neg_i1, pos_i0, pos_i1,   <- input ranges
-        #                neg_o0, neg_o1, pos_o0, pos_o1)   <- output ranges
+        # weight matrix. Pre-computing the row/col ranges avoids repeated
+        # arithmetic in the hot path.
+        # Tuple layout: (neg_i0, neg_i1, pos_i0, pos_i1,   <- input row ranges
+        #                neg_o0, neg_o1, pos_o0, pos_o1)   <- output col ranges
         self._m0_in = (self.lmax + 1) * self.in_channels
         self._m0_out = (self.lmax + 1) * self.out_channels
         self._block_slices: list[tuple[int, int, int, int, int, int, int, int]] = []
@@ -328,20 +296,12 @@ def __init__(
                 )
             )
 
-    @staticmethod
-    def _focus_matmul(xp: Any, x: Any, w: Any) -> Any:
-        """Per-focus matmul: einsum("efi,fio->efo") via broadcast batched matmul.
-
-        Parameters
-        ----------
-        x
-            Input with shape (E, F, in_blk).
-        w
-            Weight with shape (F, in_blk, out_blk).
-        """
-        return xp.matmul(x[:, :, None, :], w[None, ...])[..., 0, :]
+        # The assembled SO(2) weight is block-diagonal over |m| groups; the
+        # forward contracts only the diagonal blocks (see _block_diagonal_matmul).
+        # Each |m| group occupies a contiguous (in, out) block on the diagonal.
+        self._block_diag_slices = self._build_block_diag_slices()
 
-    def call(self, x: Any) -> Any:
+    def call(self, x: Array) -> Array:
         """
         Parameters
         ----------
@@ -352,27 +312,98 @@ def call(self, x: Any) -> Any:
         Returns
         -------
         Array
-            Output with shape (E, F, D_m_trunc, Cout).
+            Output with shape (E, F, D_m_trunc, Cout), where Cout is output channels.
         """
         xp = array_api_compat.array_namespace(x)
+        device = array_api_compat.device(x)
         # === Step 1. Flatten coefficient + channel axes for matmul ===
         # (E, F, D_m, Cin) -> (E, F, D_m*Cin)
         n_edge = x.shape[0]
         in_dim_total = self.reduced_dim * self.in_channels
         x_flat = xp.reshape(x, (n_edge, self.n_focus, in_dim_total))
 
-        # === Step 2. Contract the diagonal |m| blocks ===
-        # m=0 block: unconstrained (num_l*Cin, num_l*Cout) per focus.
-        num_m0 = self.lmax + 1
-        device = array_api_compat.device(x)
+        # === Step 2. Get block-diagonal weight ===
+        weight = self._build_so2_weight(xp, device)
+
+        # === Step 3. Block-diagonal matmul over focus streams + reshape back ===
+        out_flat = self._block_diagonal_matmul(x_flat, weight)
+        out = xp.reshape(
+            out_flat, (n_edge, self.n_focus, self.reduced_dim, self.out_channels)
+        )
+
+        # === Step 4. Bias on l=0 scalar index ===
+        if self.mlp_bias:
+            bias0 = xp.reshape(
+                xp_asarray_nodetach(xp, self.bias0[...], device=device),
+                (self.n_focus, self.out_channels),
+            )
+            out = xp.concat(
+                [out[:, :, :1, :] + bias0[None, :, None, :], out[:, :, 1:, :]], axis=2
+            )
+        return out
+
+    def _build_block_diag_slices(self) -> list[tuple[int, int, int, int]]:
+        """Return the ``(in_start, in_end, out_start, out_end)`` diagonal blocks.
+
+        One entry per ``|m|`` group in m-major order: ``m = 0`` spans
+        ``lmax + 1`` coefficients and each ``|m| > 0`` spans ``2 * (lmax - m + 1)``
+        coefficients (negative and positive orders).
+        """
+        group_sizes = [self.lmax + 1] + [
+            2 * (self.lmax - m + 1) for m in range(1, self.mmax + 1)
+        ]
+        slices: list[tuple[int, int, int, int]] = []
+        in_off = out_off = 0
+        for num in group_sizes:
+            in_width = num * self.in_channels
+            out_width = num * self.out_channels
+            slices.append((in_off, in_off + in_width, out_off, out_off + out_width))
+            in_off += in_width
+            out_off += out_width
+        return slices
+
+    def _build_so2_weight(self, xp: Any, device: Any) -> Array:
+        """
+        Assemble the per-focus block-diagonal SO(2) weight matrix.
+
+        The flattened weight has shape ``(D_m*Cin, F, D_m*Cout)`` (in, out)
+        where both axes follow the same m-major coefficient ordering.
+        Off-diagonal blocks (cross-|m|) are zero, enforcing SO(2) equivariance.
+
+        Returns
+        -------
+        Array
+            Weight with shape (D_m*Cin, F, D_m*Cout).
+        """
+        in_total = self.reduced_dim * self.in_channels
+        out_total = self.reduced_dim * self.out_channels
+        num_in_m0 = (self.lmax + 1) * self.in_channels
+        num_out_m0 = (self.lmax + 1) * self.out_channels
         weight_m0 = xp.reshape(
             xp_asarray_nodetach(xp, self.weight_m0[...], device=device),
-            (num_m0 * self.in_channels, self.n_focus, num_m0 * self.out_channels),
+            (num_in_m0, self.n_focus, num_out_m0),
         )
-        weight_m0 = xp.permute_dims(weight_m0, (1, 0, 2))  # (F, in, out)
-        out_blocks = [self._focus_matmul(xp, x_flat[:, :, : self._m0_in], weight_m0)]
 
-        # |m|>0 blocks: real-valued complex multiplication on (-m, +m) pairs.
+        # m=0 block: (Cin_blk, F, Cout_blk) — (in, out) convention. The m=0 input
+        # rows carry the m=0 output block followed by zero pads spanning the
+        # |m|>0 output columns.
+        row_blocks = [
+            xp.concat(
+                [
+                    weight_m0,
+                    xp.zeros(
+                        (self._m0_in, self.n_focus, out_total - self._m0_out),
+                        dtype=weight_m0.dtype,
+                        device=device,
+                    ),
+                ],
+                axis=2,
+            )
+        ]
+
+        # |m|>0 blocks: fill the 2x2 SO(2) coupling structure.
+        # For each |m|, the learnable param w has shape (in_blk, F, 2*out_blk)
+        # which is split into W_u and W_v along the output axis.
         for m_idx, w in enumerate(self.weight_m):
             ni0, ni1, pi0, pi1, no0, no1, po0, po1 = self._block_slices[m_idx]
             ib = ni1 - ni0  # in_block size
@@ -381,39 +412,61 @@ def call(self, x: Any) -> Any:
                 xp_asarray_nodetach(xp, w[...], device=device),
                 (ib, self.n_focus, 2 * ob),
             )
-            w = xp.permute_dims(w, (1, 0, 2))  # (F, in_blk, 2*out_blk)
-            w_u = w[:, :, :ob]  # (F, in_blk, out_blk)
-            w_v = w[:, :, ob:]  # (F, in_blk, out_blk)
-            x_neg = x_flat[:, :, ni0:ni1]
-            x_pos = x_flat[:, :, pi0:pi1]
-            # 2x2 coupling: neg_out = x_neg @ W_u - x_pos @ W_v
-            #               pos_out = x_neg @ W_v + x_pos @ W_u
-            out_blocks.append(
-                self._focus_matmul(xp, x_neg, w_u) - self._focus_matmul(xp, x_pos, w_v)
-            )
-            out_blocks.append(
-                self._focus_matmul(xp, x_neg, w_v) + self._focus_matmul(xp, x_pos, w_u)
+            w_u = w[:, :, :ob]  # (in_blk, F, out_blk)
+            w_v = w[:, :, ob:]  # (in_blk, F, out_blk)
+            # Fill the 2x2 coupling:
+            #   Row = input (neg/pos), Col = output (neg/pos).
+            #   [ W_u^T, -W_v^T ]^T  =>  row=neg_in: W_u to neg_out, W_v to pos_out
+            #   [ W_v^T,  W_u^T ]^T  =>  row=pos_in: -W_v to neg_out, W_u to pos_out
+            # neg_out and pos_out are contiguous (no1 == po0); each input row band
+            # is built by concatenating [left pad, two coupling sub-blocks, right pad].
+            left_pad = xp.zeros((ib, self.n_focus, no0), dtype=w.dtype, device=device)
+            right_pad = xp.zeros(
+                (ib, self.n_focus, out_total - po1), dtype=w.dtype, device=device
             )
+            neg_row = xp.concat([left_pad, w_u, w_v, right_pad], axis=2)
+            pos_row = xp.concat([left_pad, -w_v, w_u, right_pad], axis=2)
+            row_blocks.append(neg_row)  # neg_in -> [neg_out, pos_out]
+            row_blocks.append(pos_row)  # pos_in -> [neg_out, pos_out]
+        return xp.concat(row_blocks, axis=0)
 
-        out_flat = (
-            xp.concat(out_blocks, axis=-1) if len(out_blocks) > 1 else out_blocks[0]
-        )
-        out = xp.reshape(
-            out_flat, (n_edge, self.n_focus, self.reduced_dim, self.out_channels)
-        )
+    def _block_diagonal_matmul(self, x_flat: Array, weight: Array) -> Array:
+        """Contract only the diagonal ``|m|`` blocks of the assembled weight.
 
-        # === Step 3. Bias on l=0 scalar index ===
-        if self.mlp_bias:
-            bias0 = xp.reshape(
-                xp_asarray_nodetach(xp, self.bias0[...], device=device),
-                (self.n_focus, self.out_channels),
+        ``weight`` is block-diagonal over ``|m|`` (cross-``|m|`` blocks are
+        exactly zero), so concatenating the per-group matmuls reproduces the
+        dense ``einsum`` over the full ``(D_m*Cin, D_m*Cout)`` matrix while
+        skipping the structural zeros. The result is fp32-equivalent to the
+        dense path up to the matmul reduction order.
+
+        Parameters
+        ----------
+        x_flat : Array
+            Flattened input with shape ``(E, F, D_m*Cin)``.
+        weight : Array
+            Assembled block-diagonal weight with shape ``(D_m*Cin, F, D_m*Cout)``.
+
+        Returns
+        -------
+        Array
+            Flattened output with shape ``(E, F, D_m*Cout)``.
+        """
+        xp = array_api_compat.array_namespace(x_flat)
+        blocks = [
+            # einsum("efi,ifo->efo"): a per-focus matmul batched over the focus
+            # axis, contracting the input coefficient/channel index i.
+            xp.permute_dims(
+                xp.matmul(
+                    xp.permute_dims(x_flat[:, :, in0:in1], (1, 0, 2)),
+                    xp.permute_dims(weight[in0:in1, :, out0:out1], (1, 0, 2)),
+                ),
+                (1, 0, 2),
             )
-            out0 = out[:, :, :1, :] + bias0[None, :, None, :]
-            out = xp.concat([out0, out[:, :, 1:, :]], axis=2)
-        return out
+            for in0, in1, out0, out1 in self._block_diag_slices
+        ]
+        return xp.concat(blocks, axis=-1)
 
-    def _variables(self) -> dict[str, np.ndarray]:
-        """Variables keyed by the pt ``state_dict`` key names."""
+    def serialize(self) -> dict[str, Any]:
         variables = {
             "m0_idx": to_numpy_array(self.m0_idx),
             "pos_indices": to_numpy_array(self.pos_indices),
@@ -422,40 +475,8 @@ def _variables(self) -> dict[str, np.ndarray]:
         }
         if self.mlp_bias:
             variables["bias0"] = to_numpy_array(self.bias0)
-        for m_idx, w in enumerate(self.weight_m):
-            variables[f"weight_m.{m_idx}"] = to_numpy_array(w)
-        return variables
-
-    def _load_variables(self, variables: dict[str, Any]) -> None:
-        """Load variables keyed by the pt ``state_dict`` key names."""
-        prec = PRECISION_DICT[self.precision.lower()]
-        _check_index_table(self.m0_idx, variables["m0_idx"], "m0_idx")
-        _check_index_table(self.pos_indices, variables["pos_indices"], "pos_indices")
-        _check_index_table(self.neg_indices, variables["neg_indices"], "neg_indices")
-        _check_shape_assign(
-            self, "weight_m0", variables["weight_m0"], prec, "weight_m0"
-        )
-        if self.mlp_bias:
-            self.bias0 = np.asarray(variables["bias0"], dtype=prec).reshape(
-                self.bias0.shape
-            )
-        # Rebuild the list and assign the whole attribute (rather than
-        # item-assignment) so that pt_expt, which converts the list to a
-        # torch ParameterList, can re-convert the new value cleanly.
-        new_weight_m = []
-        for m_idx in range(len(self.weight_m)):
-            key = f"weight_m.{m_idx}"
-            value = np.asarray(variables[key], dtype=prec)
-            if value.shape != tuple(self.weight_m[m_idx].shape):
-                raise ValueError(
-                    f"{key} shape {value.shape} does not match the expected "
-                    f"shape {tuple(self.weight_m[m_idx].shape)}"
-                )
-            new_weight_m.append(value)
-        self.weight_m = new_weight_m
-
-    def serialize(self) -> dict[str, Any]:
-        """Serialize the SO2Linear to a dict (pt-compatible format)."""
+        for i, w in enumerate(self.weight_m):
+            variables[f"weight_m.{i}"] = to_numpy_array(w)
         return {
             "@class": "SO2Linear",
             "@version": 1,
@@ -470,12 +491,11 @@ def serialize(self) -> dict[str, Any]:
                 "trainable": self.trainable,
                 "seed": None,
             },
-            "@variables": self._variables(),
+            "@variables": variables,
         }
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> SO2Linear:
-        """Deserialize an SO2Linear from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "SO2Linear":
@@ -484,18 +504,18 @@ def deserialize(cls, data: dict[str, Any]) -> SO2Linear:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            in_channels=int(config["in_channels"]),
-            out_channels=int(config["out_channels"]),
-            n_focus=int(config["n_focus"]),
-            precision=str(config["precision"]),
-            mlp_bias=bool(config["mlp_bias"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
-        )
-        obj._load_variables(variables)
+        obj = cls(**config)
+        prec = PRECISION_DICT[obj.precision.lower()]
+        obj.m0_idx = np.asarray(variables["m0_idx"], dtype=np.int64)
+        obj.pos_indices = np.asarray(variables["pos_indices"], dtype=np.int64)
+        obj.neg_indices = np.asarray(variables["neg_indices"], dtype=np.int64)
+        obj.weight_m0 = np.asarray(variables["weight_m0"], dtype=prec)
+        if obj.mlp_bias:
+            obj.bias0 = np.asarray(variables["bias0"], dtype=prec)
+        obj.weight_m = [
+            np.asarray(variables[f"weight_m.{i}"], dtype=prec)
+            for i in range(len(obj.weight_m))
+        ]
         return obj
 
 
@@ -513,10 +533,6 @@ class DynamicRadialDegreeMixer(NativeOP):
 
     `mode="degree"` shares W across channels. `mode="degree_channel"` gives each
     channel its own W, optionally with a low-rank channel factorization.
-
-    The pt ``index_copy_`` scatter of the compact kernel into the dense
-    ``(D_m, D_m)`` layout is replaced by a precomputed gather index + mask
-    (functionally identical, array-API friendly).
     """
 
     def __init__(
@@ -528,8 +544,8 @@ def __init__(
         mode: str,
         rank: int = 0,
         precision: str = DEFAULT_PRECISION,
-        seed: int | list[int] | None = None,
-        trainable: bool = True,
+        seed: int | list[int] | None,
+        trainable: bool,
     ) -> None:
         self.lmax = int(lmax)
         self.mmax = int(self.lmax if mmax is None else mmax)
@@ -547,7 +563,6 @@ def __init__(
         if self.rank < 0:
             raise ValueError("`rank` must be non-negative")
         self.precision = precision
-        self.trainable = bool(trainable)
         prec = PRECISION_DICT[self.precision.lower()]
 
         # m-major reduced layout: m=0 block followed by (-m, +m) blocks.
@@ -567,29 +582,19 @@ def __init__(
 
         weight = np.empty((self.input_dim, self.proj_out_dim), dtype=prec)
         init_trunc_normal_fan_in_out(weight, child_seed(seed, 0))
-        self.weight = weight
+        self.weight = weight.astype(prec)
 
         if self.mode == "degree_channel" and self.rank > 0:
             channel_basis = np.empty((self.rank, self.channels), dtype=prec)
             init_trunc_normal_fan_in_out(channel_basis, child_seed(seed, 1))
-            self.channel_basis: np.ndarray | None = channel_basis
+            self.channel_basis: np.ndarray | None = channel_basis.astype(prec)
         else:
             self.channel_basis = None
 
         compact_idx, dense_idx = self._build_dense_scatter_indices()
         self.kernel_compact_index = compact_idx
         self.kernel_dense_index = dense_idx
-        # Gather-form of pt's index_copy_ scatter:
-        #   dense[:, dense_idx[j]] = compact[:, compact_idx[j]]
-        # becomes
-        #   dense = take(compact, gather_index, axis=1) * scatter_mask
-        dense_size = self.reduced_dim * self.reduced_dim
-        gather_index = np.zeros(dense_size, dtype=np.int64)
-        scatter_mask = np.zeros(dense_size, dtype=prec)
-        gather_index[dense_idx] = compact_idx
-        scatter_mask[dense_idx] = 1.0
-        self._dense_gather_index = gather_index
-        self._dense_scatter_mask = scatter_mask
+        self.trainable = bool(trainable)
 
     def _build_dense_scatter_indices(self) -> tuple[np.ndarray, np.ndarray]:
         compact_indices: list[int] = []
@@ -602,7 +607,7 @@ def append_block(start_in: int, start_out: int, num_l: int) -> None:
                 for l_out in range(num_l):
                     compact_indices.append(compact_offset + l_in * num_l + l_out)
                     # Store dense kernels in matmul layout (out, in) so forward
-                    # can use a batched matmul without transposing.
+                    # can call bmm/einsum without transposing the degree matrix.
                     dense_indices.append(
                         (start_out + l_out) * reduced_dim + start_in + l_in
                     )
@@ -625,33 +630,85 @@ def append_block(start_in: int, start_out: int, num_l: int) -> None:
             offset += 2 * num_l
 
         return (
-            np.asarray(compact_indices, dtype=np.int64),
-            np.asarray(dense_indices, dtype=np.int64),
+            np.array(compact_indices, dtype=np.int64),
+            np.array(dense_indices, dtype=np.int64),
         )
 
-    def _project_radial(self, xp: Any, radial_feat: Any) -> Any:
+    def _project_radial(self, radial_feat: Array) -> Array:
+        xp = array_api_compat.array_namespace(radial_feat)
+        device = array_api_compat.device(radial_feat)
         radial_m0 = xp.reshape(
             radial_feat[:, : self.lmax + 1, :],
             (radial_feat.shape[0], self.input_dim),
         )
-        weight = xp_asarray_nodetach(
-            xp, self.weight[...], device=array_api_compat.device(radial_feat)
-        )
+        weight = xp_asarray_nodetach(xp, self.weight[...], device=device)
         return xp.matmul(radial_m0, weight)
 
-    def _scatter_dense(self, xp: Any, compact: Any, device: Any) -> Any:
-        """Scatter the compact per-block kernel into the dense (D_m*D_m, ...) layout."""
-        gather_index = xp_asarray_nodetach(xp, self._dense_gather_index, device=device)
-        scatter_mask = xp.astype(
-            xp_asarray_nodetach(xp, self._dense_scatter_mask, device=device),
-            compact.dtype,
+    def _scatter_degree_kernel(self, compact: Array) -> Array:
+        xp = array_api_compat.array_namespace(compact)
+        device = array_api_compat.device(compact)
+        n_edge = compact.shape[0]
+        compact_index = xp_asarray_nodetach(
+            xp, self.kernel_compact_index[...], device=device
+        )
+        dense_index = xp_asarray_nodetach(
+            xp, self.kernel_dense_index[...], device=device
+        )
+        source = xp.take(compact, compact_index, axis=1)
+        dense = xp.zeros(
+            (self.reduced_dim * self.reduced_dim, n_edge),
+            dtype=compact.dtype,
+            device=device,
+        )
+        dense = xp_add_at(dense, dense_index, xp.permute_dims(source, (1, 0)))
+        dense = xp.permute_dims(dense, (1, 0))
+        return xp.reshape(dense, (n_edge, self.reduced_dim, self.reduced_dim))
+
+    def _scatter_rank_kernel(self, compact: Array) -> Array:
+        xp = array_api_compat.array_namespace(compact)
+        device = array_api_compat.device(compact)
+        n_edge = compact.shape[0]
+        compact_index = xp_asarray_nodetach(
+            xp, self.kernel_compact_index[...], device=device
+        )
+        dense_index = xp_asarray_nodetach(
+            xp, self.kernel_dense_index[...], device=device
+        )
+        source = xp.take(compact, compact_index, axis=1)
+        dense = xp.zeros(
+            (self.reduced_dim * self.reduced_dim, n_edge, self.rank),
+            dtype=compact.dtype,
+            device=device,
+        )
+        dense = xp_add_at(dense, dense_index, xp.permute_dims(source, (1, 0, 2)))
+        dense = xp.permute_dims(dense, (1, 0, 2))
+        return xp.reshape(
+            dense, (n_edge, self.reduced_dim, self.reduced_dim, self.rank)
+        )
+
+    def _scatter_channel_kernel(self, compact: Array) -> Array:
+        xp = array_api_compat.array_namespace(compact)
+        device = array_api_compat.device(compact)
+        n_edge = compact.shape[0]
+        compact_index = xp_asarray_nodetach(
+            xp, self.kernel_compact_index[...], device=device
+        )
+        dense_index = xp_asarray_nodetach(
+            xp, self.kernel_dense_index[...], device=device
+        )
+        source = xp.take(compact, compact_index, axis=1)
+        dense = xp.zeros(
+            (self.reduced_dim * self.reduced_dim, n_edge, self.channels),
+            dtype=compact.dtype,
+            device=device,
+        )
+        dense = xp_add_at(dense, dense_index, xp.permute_dims(source, (1, 0, 2)))
+        dense = xp.permute_dims(dense, (1, 0, 2))
+        return xp.reshape(
+            dense, (n_edge, self.reduced_dim, self.reduced_dim, self.channels)
         )
-        dense = xp.take(compact, gather_index, axis=1)
-        if compact.ndim == 2:
-            return dense * scatter_mask[None, :]
-        return dense * scatter_mask[None, :, None]
 
-    def call(self, x_local: Any, radial_feat: Any) -> Any:
+    def call(self, x_local: Array, radial_feat: Array) -> Array:
         """
         Parameters
         ----------
@@ -660,84 +717,75 @@ def call(self, x_local: Any, radial_feat: Any) -> Any:
         radial_feat
             Invariant radial/type features with shape (E, D_m, C_wide).
         """
+        xp = array_api_compat.array_namespace(x_local)
         if x_local.shape != radial_feat.shape:
             raise ValueError("`x_local` and `radial_feat` must have the same shape")
         if x_local.shape[1] != self.reduced_dim or x_local.shape[2] != self.channels:
             raise ValueError("Input shape is incompatible with this mixer")
 
-        xp = array_api_compat.array_namespace(x_local)
-        device = array_api_compat.device(x_local)
-        n_edge = x_local.shape[0]
-        kernel_flat = self._project_radial(xp, radial_feat)
+        kernel_flat = self._project_radial(radial_feat)
         if self.mode == "degree":
-            kernel = xp.reshape(
-                self._scatter_dense(xp, kernel_flat, device),
-                (n_edge, self.reduced_dim, self.reduced_dim),
-            )
+            kernel = self._scatter_degree_kernel(kernel_flat)
             return xp.matmul(kernel, x_local)
 
         if self.rank > 0:
             compact = xp.reshape(
-                kernel_flat, (n_edge, self.degree_kernel_size, self.rank)
-            )
-            kernel = xp.reshape(
-                self._scatter_dense(xp, compact, device),
-                (n_edge, self.reduced_dim, self.reduced_dim, self.rank),
-            )
-            # einsum "eoir,eic->eorc" as a broadcast batched matmul:
-            # (E, o, r, i) @ (E, 1, i, c) -> (E, o, r, c)
-            kernel = xp.permute_dims(kernel, (0, 1, 3, 2))
-            mixed = xp.matmul(kernel, x_local[:, None, :, :])
-            channel_basis = xp.reshape(
-                xp_asarray_nodetach(xp, self.channel_basis[...], device=device),
-                (1, 1, self.rank, self.channels),
+                kernel_flat, (x_local.shape[0], self.degree_kernel_size, self.rank)
             )
-            return xp.sum(mixed * channel_basis, axis=2)
+            return self._mix_rank_compact(compact, x_local)
 
         compact = xp.reshape(
-            kernel_flat, (n_edge, self.degree_kernel_size, self.channels)
+            kernel_flat, (x_local.shape[0], self.degree_kernel_size, self.channels)
         )
-        kernel = xp.reshape(
-            self._scatter_dense(xp, compact, device),
-            (n_edge, self.reduced_dim, self.reduced_dim, self.channels),
-        )
-        # einsum "eoic,eic->eoc"
+        kernel = self._scatter_channel_kernel(compact)
+        # einsum("eoic,eic->eoc"): contract l_in i per channel c (no channel mix).
         return xp.sum(kernel * x_local[:, None, :, :], axis=2)
 
-    def _variables(self) -> dict[str, np.ndarray]:
-        """Variables keyed by the pt ``state_dict`` key names."""
-        variables = {"weight": to_numpy_array(self.weight)}
-        if self.channel_basis is not None:
-            variables["channel_basis"] = to_numpy_array(self.channel_basis)
-        variables["kernel_compact_index"] = to_numpy_array(self.kernel_compact_index)
-        variables["kernel_dense_index"] = to_numpy_array(self.kernel_dense_index)
-        return variables
+    def _mix_rank_compact(self, compact: Array, x_local: Array) -> Array:
+        """
+        Mix the reduced features by the low-rank dynamic degree kernel.
 
-    def _load_variables(self, variables: dict[str, Any]) -> None:
-        """Load variables keyed by the pt ``state_dict`` key names."""
-        prec = PRECISION_DICT[self.precision.lower()]
-        _check_index_table(
-            self.kernel_compact_index,
-            variables["kernel_compact_index"],
-            "kernel_compact_index",
+        Parameters
+        ----------
+        compact : Array
+            Projected per-edge degree kernels with shape
+            (E, degree_kernel_size, R).
+        x_local : Array
+            Edge-local reduced features with shape (E, D_m, C).
+
+        Returns
+        -------
+        Array
+            Mixed features with shape (E, D_m, C).
+        """
+        xp = array_api_compat.array_namespace(compact)
+        device = array_api_compat.device(compact)
+        kernel = self._scatter_rank_kernel(compact)
+        # einsum("eoir,eic->eorc"): contract l_in i, batched over (l_out, rank)
+        # via a single matmul, then weight the rank channels by channel_basis.
+        kernel_or = xp.reshape(
+            xp.permute_dims(kernel, (0, 1, 3, 2)),
+            (x_local.shape[0], self.reduced_dim * self.rank, self.reduced_dim),
         )
-        _check_index_table(
-            self.kernel_dense_index,
-            variables["kernel_dense_index"],
-            "kernel_dense_index",
+        mixed = xp.matmul(kernel_or, x_local)
+        mixed = xp.reshape(
+            mixed,
+            (x_local.shape[0], self.reduced_dim, self.rank, self.channels),
         )
-        _check_shape_assign(self, "weight", variables["weight"], prec, "weight")
-        if self.channel_basis is not None:
-            _check_shape_assign(
-                self, "channel_basis", variables["channel_basis"], prec, "channel_basis"
-            )
+        channel_basis = xp.reshape(
+            xp_asarray_nodetach(xp, self.channel_basis[...], device=device),
+            (1, 1, self.rank, self.channels),
+        )
+        return xp.sum(mixed * channel_basis, axis=2)
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the DynamicRadialDegreeMixer to a dict.
-
-        The pt class has no ``serialize()``; the ``@variables`` keys here
-        match the pt ``state_dict`` key names.
-        """
+        variables = {
+            "weight": to_numpy_array(self.weight),
+            "kernel_compact_index": to_numpy_array(self.kernel_compact_index),
+            "kernel_dense_index": to_numpy_array(self.kernel_dense_index),
+        }
+        if self.channel_basis is not None:
+            variables["channel_basis"] = to_numpy_array(self.channel_basis)
         return {
             "@class": "DynamicRadialDegreeMixer",
             "@version": 1,
@@ -751,12 +799,11 @@ def serialize(self) -> dict[str, Any]:
                 "trainable": self.trainable,
                 "seed": None,
             },
-            "@variables": self._variables(),
+            "@variables": variables,
         }
 
     @classmethod
     def deserialize(cls, data: dict[str, Any]) -> DynamicRadialDegreeMixer:
-        """Deserialize a DynamicRadialDegreeMixer from a dict."""
         data = data.copy()
         data_cls = data.pop("@class")
         if data_cls != "DynamicRadialDegreeMixer":
@@ -765,20 +812,67 @@ def deserialize(cls, data: dict[str, Any]) -> DynamicRadialDegreeMixer:
         check_version_compatibility(version, 1, 1)
         config = data.pop("config")
         variables = data.pop("@variables")
-        obj = cls(
-            lmax=int(config["lmax"]),
-            mmax=int(config["mmax"]),
-            channels=int(config["channels"]),
-            mode=str(config["mode"]),
-            rank=int(config["rank"]),
-            precision=str(config["precision"]),
-            trainable=bool(config["trainable"]),
-            seed=config.get("seed"),
+        obj = cls(**config)
+        prec = PRECISION_DICT[obj.precision.lower()]
+        obj.weight = np.asarray(variables["weight"], dtype=prec)
+        obj.kernel_compact_index = np.asarray(
+            variables["kernel_compact_index"], dtype=np.int64
         )
-        obj._load_variables(variables)
+        obj.kernel_dense_index = np.asarray(
+            variables["kernel_dense_index"], dtype=np.int64
+        )
+        if obj.channel_basis is not None:
+            obj.channel_basis = np.asarray(variables["channel_basis"], dtype=prec)
         return obj
 
 
+def _parse_node_cartesian(spec: str) -> tuple[bool, bool, int]:
+    """
+    Parse the ``node_cartesian`` configuration string.
+
+    Grammar: ``"<mode>:<layers>"`` where ``mode`` is ``"default"`` (the one-sided
+    product ``Y N``) or ``"parity"`` (the symmetrized product ``Y N + N Y``), and
+    ``layers`` is a non-negative integer. A bare mode defaults to one layer; a
+    bare integer uses the default mode. ``"none"``, an empty string, or any zero
+    layer count disables the per-node product.
+
+    Parameters
+    ----------
+    spec : str
+        The configuration string.
+
+    Returns
+    -------
+    tuple[bool, bool, int]
+        ``(enabled, symmetric, n_layers)``.
+
+    Raises
+    ------
+    ValueError
+        If the mode is not ``"default"`` or ``"parity"``, or the layer count is
+        negative.
+    """
+    text = str(spec).strip().lower()
+    if text in ("", "none"):
+        return False, False, 0
+    if ":" in text:
+        mode, _, num = text.partition(":")
+        mode = mode.strip() or "default"
+        layers = int(num.strip())
+    elif text.isdigit():
+        mode, layers = "default", int(text)
+    else:
+        mode, layers = text, 1
+    if mode not in ("default", "parity"):
+        raise ValueError(
+            "`node_cartesian` mode must be 'default' or 'parity', got "
+            f"'{mode}' (expected '<mode>:<layers>', 'none', or a layer count)"
+        )
+    if layers < 0:
+        raise ValueError("`node_cartesian` layer count must be non-negative")
+    return layers > 0, mode == "parity", layers
+
+
 class SO2Convolution(NativeOP):
     """
     SO(2)-equivariant edge convolution with cached geometry and rotations.
@@ -790,18 +884,140 @@ class SO2Convolution(NativeOP):
     1. `pre_focus_mix`: project node features `(N, D, C)` to the SO(2) hidden width.
     2. rotate global -> local reduced basis with cached `D_to_m`.
     3. radial modulation in reduced layout.
-    4. `so2_layers` stacked local mixers:
-       `inter_norm -> SO2Linear -> non_linearity -> residual`.
+    4. `mixing_layers` stacked local mixers:
+       `inter_norm -> SO2Linear -> non_linearity -> residual(+LayerScale)`.
     5. rotate local -> global with cached `Dt_from_m`.
-    6. edge aggregation (plain envelope masked sum or envelope-aware masked
-       softmax attention with output-side head gate); see the module
-       docstring for the padded-edge adaptation.
+    6. edge aggregation (plain envelope scatter or envelope-aware grouped
+       softmax attention with exact envelope-gated competition and
+       output-side head gate).
     7. `post_focus_mix`: project aggregated hidden messages back to `(N, D, C)`.
 
-    See the pt ``SO2Convolution`` docstring for the full parameter
-    documentation; this port keeps the same constructor parameters with
-    ``dtype`` replaced by ``precision``. Flags unused by the core DPA4 config
-    raise ``NotImplementedError`` (listed in the module docstring).
+    Equivariance is preserved because both `pre_focus_mix` and `post_focus_mix`
+    only mix the channel axis for each `(l, m)` coefficient and never mix
+    coefficient indices across `(l, m)`.
+
+    Parameters
+    ----------
+    lmax
+        Maximum degree.
+    mmax
+        Maximum SO(2) order (|m|). If None, defaults to lmax.
+    kmax
+        Maximum Wigner-D frame order (|k|) used by SO(3) grid branches.
+    channels
+        Number of channels per (l, m) coefficient.
+    n_focus
+        Number of focus streams inside the SO(2) branch.
+    focus_dim
+        Hidden width per focus stream inside SO(2).
+        ``focus_dim=0`` means using ``channels``.
+    focus_compete
+        If True, apply cross-focus softmax competition in SO(2) local layout.
+        Competition logits are constructed only from l=0 scalar channels and the
+        resulting invariant weights are broadcast to all (l, m) components.
+    so2_norm
+        If True, apply intermediate ReducedEquivariantRMSNorm as pre-norm before
+        each SO(2) mixing layer. The last SO(2) layer always uses Identity.
+    mixing_layers
+        Number of learnable mixing layers in the per-edge message core (SO2Linear
+        layers for the SO(2) path, or refinement layers for ``edge_cartesian``).
+        ``0`` applies only the edge-condition modulation: the rotation-free
+        per-degree radial scaling for the SO(2) path, or a single ``x @ T_e`` for
+        ``edge_cartesian``.
+    so2_attn_res
+        Depth-wise attention residual mode across the internal SO(2) layer
+        history. Must be one of ``"none"``, ``"independent"``, or
+        ``"dependent"``. The same scalar weights are broadcast to the full
+        reduced equivariant tensor.
+    layer_scale
+        If True, apply per-layer learnable LayerScale (per-focus-channel,
+        init 1e-3) on each SO(2) residual branch.
+    n_atten_head
+        Number of attention heads used during aggregation.
+        - 0: plain envelope-weighted scatter-sum.
+        - >0: envelope-gated grouped softmax attention with output-side head
+          gates. Attention uses ``w**2 * exp(logit)`` in the numerator and
+          ``zeta + sum(w**2 * exp(logit))`` in the denominator.
+    atten_f_mix
+        If True, merge the internal focus streams into one attention stream
+        after rotate-back. Attention heads then split the full hidden width
+        ``n_focus * focus_dim`` instead of each focus stream independently.
+    atten_v_proj
+        If True, apply an explicit degree-aware value projection before
+        attention aggregation.
+    atten_o_proj
+        If True, apply an explicit degree-aware output projection after the
+        output-side attention gate.
+    s2_activation
+        If True, replace each intermediate reduced-layout gate with S2-grid
+        SwiGLU. Intermediate ``SO2Linear`` layers then output ``2 * focus_dim``
+        channels before the activation folds them back to ``focus_dim``.
+    node_wise_grid_mlp
+        If True, select the polynomial grid MLP operation for the node-wise
+        source-destination grid product.
+    node_wise_grid_branch
+        Number of scalar-routed polynomial product branches for the node-wise
+        grid product. ``0`` disables branch mixing; positive values take
+        precedence over ``node_wise_grid_mlp``.
+    message_node_grid_mlp
+        If True, select the polynomial grid MLP operation for the message-node
+        grid product.
+    message_node_grid_branch
+        Number of scalar-routed polynomial product branches for the
+        message-node grid product. ``0`` disables branch mixing; positive
+        values take precedence over ``message_node_grid_mlp``.
+    node_wise_s2
+        If True, add an edge-local S2 product branch between radial-fused source
+        features and destination features in the same edge frame.
+    node_wise_so3
+        If True, use the corresponding edge-local SO(3) Wigner-D grid branch.
+    message_node_s2
+        If True, add a packed-layout S2 product branch between the aggregated
+        hidden message and the destination node features before ``post_focus_mix``.
+    message_node_so3
+        If True, use the corresponding post-aggregation SO(3) Wigner-D grid
+        branch.
+    lebedev_quadrature
+        If True, use Lebedev quadrature for the S2 projector.
+    activation_function
+        Activation function for the gated activation path when
+        ``s2_activation=False``.
+    mlp_bias
+        Whether to use bias in SO2Linear (l=0 bias) and GatedActivation
+        (gate linear bias).
+    radial_so2_mode
+        Dynamic radial degree mixer mode. ``"none"`` applies elementwise
+        radial modulation, ``"degree"`` applies a channel-shared dynamic
+        cross-degree kernel, and ``"degree_channel"`` applies a
+        per-channel dynamic cross-degree kernel.
+    radial_so2_rank
+        Low-rank channel factorization rank for ``radial_so2_mode="degree_channel"``.
+        ``0`` uses the full per-channel dynamic degree kernel.
+    edge_cartesian
+        If True, replace the rotate-to-local / ``SO2Linear`` stack / rotate-back
+        core with the per-edge global-frame Cartesian rank-2 tensor product.
+        Requires ``lmax`` in ``{1, 2}`` and is incompatible with the S2/SO(3)
+        grid product branches. The dynamic radial degree mixer is bypassed
+        because the radial edge condition is carried by the Cartesian edge tensor
+        instead.
+    node_cartesian
+        Per-node global-frame Cartesian rank-2 tensor product applied to the
+        aggregated message, coupling it with the destination node feature after
+        the optional message-node grid product and before ``post_focus_mix``. The
+        Cartesian analog of the message-node grid product. Configured by a string
+        ``"<mode>:<layers>"`` where ``mode`` is ``"default"`` (one-sided product)
+        or ``"parity"`` (symmetrized product), and ``layers`` is the stack depth;
+        a bare integer ``N`` is shorthand for ``"default:N"``, and ``"none"`` (or
+        ``0``) disables it. Requires ``lmax`` in ``{1, 2}`` and is orthogonal to
+        ``edge_cartesian``.
+    eps
+        Small epsilon for normalization modules.
+    precision
+        Parameter precision.
+    seed
+        Random seed for weight initialization.
+    trainable
+        Whether parameters are trainable.
     """
 
     def __init__(
@@ -815,7 +1031,7 @@ def __init__(
         focus_dim: int = 0,
         focus_compete: bool = True,
         so2_norm: bool = False,
-        so2_layers: int = 4,
+        mixing_layers: int = 4,
         so2_attn_res: str = "none",
         layer_scale: bool = False,
         n_atten_head: int = 1,
@@ -836,10 +1052,12 @@ def __init__(
         mlp_bias: bool = False,
         radial_so2_mode: str = "none",
         radial_so2_rank: int = 0,
+        edge_cartesian: bool = False,
+        node_cartesian: str | int = "none",
         eps: float = 1e-7,
         precision: str = DEFAULT_PRECISION,
-        seed: int | list[int] | None = None,
-        trainable: bool = True,
+        seed: int | list[int] | None,
+        trainable: bool,
     ) -> None:
         self.lmax = int(lmax)
         self.mmax = int(self.lmax if mmax is None else mmax)
@@ -864,38 +1082,21 @@ def __init__(
         self.focus_softmax_tau = 1.0
         self.focus_label_smoothing = 0.02
         self.so2_norm = bool(so2_norm)
-        self.so2_layers = int(so2_layers)
-        if self.so2_layers < 1:
-            raise ValueError("`so2_layers` must be >= 1")
+        self.mixing_layers = int(mixing_layers)
+        if self.mixing_layers < 0:
+            raise ValueError("`mixing_layers` must be >= 0")
         self.so2_attn_res_mode = str(so2_attn_res).lower()
         if self.so2_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
                 "`so2_attn_res` must be one of 'none', 'independent', or 'dependent'"
             )
-        if self.so2_attn_res_mode != "none":
-            raise NotImplementedError(
-                "so2_attn_res != 'none' (DepthAttnRes) is not ported to dpmodel"
-            )
+        self.use_so2_attn_res = self.so2_attn_res_mode != "none"
         self.layer_scale = bool(layer_scale)
-        if self.layer_scale:
-            raise NotImplementedError("layer_scale=True is not ported to dpmodel")
         self.n_atten_head = int(n_atten_head)
-        if self.n_atten_head < 0:
-            raise ValueError("`n_atten_head` must be non-negative")
         self.atten_f_mix = bool(atten_f_mix)
-        if self.atten_f_mix:
-            raise NotImplementedError("atten_f_mix=True is not ported to dpmodel")
         self.use_atten_v_proj = bool(atten_v_proj)
-        if self.use_atten_v_proj:
-            raise NotImplementedError("atten_v_proj=True is not ported to dpmodel")
         self.use_atten_o_proj = bool(atten_o_proj)
-        if self.use_atten_o_proj:
-            raise NotImplementedError("atten_o_proj=True is not ported to dpmodel")
         self.s2_activation = bool(s2_activation)
-        if self.s2_activation:
-            raise NotImplementedError(
-                "s2_activation=True (so2_s2_activation) is not ported to dpmodel"
-            )
         self.node_wise_grid_mlp = bool(node_wise_grid_mlp)
         self.node_wise_grid_branch = int(node_wise_grid_branch)
         self.message_node_grid_mlp = bool(message_node_grid_mlp)
@@ -918,17 +1119,22 @@ def __init__(
             self.lmax,
             method=self.s2_grid_method,
         )
-        # Mirror pt: the e3nn product-grid branch squares the max resolution.
-        # dpmodel only ports the Lebedev backend (the e3nn S2GridNet raises),
-        # so this just preserves the config-recorded resolution for parity.
         self.s2_full_grid_resolution = (
             [max(base_full_grid_resolution), max(base_full_grid_resolution)]
             if self.s2_grid_method == "e3nn"
             else base_full_grid_resolution
         )
         self.activation_function = str(activation_function)
-        self.attn_n_focus = self.n_focus
-        self.attn_focus_dim = self.so2_focus_dim
+        if self.n_atten_head < 0:
+            raise ValueError("`n_atten_head` must be non-negative")
+        self.attn_n_focus = (
+            1 if self.atten_f_mix and self.n_atten_head > 0 else self.n_focus
+        )
+        self.attn_focus_dim = (
+            self.hidden_channels
+            if self.atten_f_mix and self.n_atten_head > 0
+            else self.so2_focus_dim
+        )
         if self.n_atten_head > 0 and self.attn_focus_dim % self.n_atten_head != 0:
             raise ValueError(
                 "`n_atten_head` must divide the attention width "
@@ -948,105 +1154,97 @@ def __init__(
         self.radial_so2_rank = int(radial_so2_rank)
         if self.radial_so2_rank < 0:
             raise ValueError("`radial_so2_rank` must be non-negative")
+        self.edge_cartesian = bool(edge_cartesian)
+        self.node_cartesian = str(node_cartesian)
+        (
+            self._node_cartesian_enabled,
+            self._node_cartesian_symmetric,
+            self._node_cartesian_layers,
+        ) = _parse_node_cartesian(self.node_cartesian)
+        if self.edge_cartesian:
+            if self.lmax not in (1, 2):
+                raise ValueError("`edge_cartesian` requires lmax in {1, 2}")
+            if (
+                self.node_wise_s2
+                or self.node_wise_so3
+                or self.message_node_s2
+                or self.message_node_so3
+            ):
+                raise ValueError(
+                    "`edge_cartesian` is incompatible with the S2/SO(3) grid "
+                    "product branches"
+                )
+        if self._node_cartesian_enabled and self.lmax not in (1, 2):
+            raise ValueError("`node_cartesian` requires lmax in {1, 2}")
         self.eps = float(eps)
         self.ebed_dim_full = get_so3_dim_of_lmax(self.lmax)
         self.precision = precision
-        self.compute_precision = _compute_precision(precision)
-        self.trainable = bool(trainable)
-        prec = PRECISION_DICT[self.precision.lower()]
-
-        # === Step 1. Precompute coefficient indices for m-major reduced layout ===
-        self.coeff_index_m = build_m_major_index(self.lmax, self.mmax)
-        self.degree_index_m = build_m_major_l_index(self.lmax, self.mmax)
-        degree_index_full = map_degree_idx(self.lmax)
-        self.rotate_inv_rescale_full = build_rotate_inv_rescale(
-            self.lmax,
-            self.mmax,
-            degree_index_full,
-            dtype=prec,
-        )
-        self.reduced_dim = int(self.coeff_index_m.shape[0])
+        self.compute_precision = np.dtype(
+            get_promoted_dtype(PRECISION_DICT[precision])
+        ).name
 
-        # === Step 2. Split deterministic seeds at the module top-level ===
+        # === Step 1. Split deterministic seeds at the module top-level ===
         seed_so2_stack = child_seed(seed, 0)
         seed_non_linearities = child_seed(seed, 1)
         seed_so3_pre = child_seed(seed, 2)
         seed_so3_post = child_seed(seed, 3)
         seed_gate = child_seed(seed, 4)
+        seed_depth_attn = child_seed(seed, 5)
         seed_radial_hidden = child_seed(seed, 6)
         seed_radial_degree = child_seed(seed, 7)
         seed_node_wise_s2 = child_seed(seed, 8)
         seed_message_node_s2 = child_seed(seed, 9)
+        seed_node_cartesian = child_seed(seed, 10)
 
-        # === Step 3. Multiple SO2Linear layers ===
-        # (s2_activation is guarded above, so out_channels == so2_focus_dim.)
-        self.so2_linears = [
-            SO2Linear(
+        # === Step 2. Edge mixing core: SO(2) rotation stack or Cartesian product ===
+        if self.edge_cartesian:
+            self.edge_cartesian_tp = EdgeCartesianTensorProduct(
                 lmax=self.lmax,
-                mmax=self.mmax,
-                in_channels=self.so2_focus_dim,
-                out_channels=self.so2_focus_dim,
+                focus_dim=self.so2_focus_dim,
                 n_focus=self.n_focus,
-                precision=self.precision,
+                n_layers=self.mixing_layers,
+                activation_function=self.activation_function,
                 mlp_bias=self.mlp_bias,
-                seed=child_seed(seed_so2_stack, i),
+                eps=self.eps,
+                precision=self.precision,
+                seed=seed_so2_stack,
                 trainable=trainable,
             )
-            for i in range(self.so2_layers)
-        ]
-
-        # === Step 4. Intermediate norms (Optional) ===
-        # pt appends nn.Identity() entries; dpmodel uses None for Identity.
-        inter_norms: list[ReducedEquivariantRMSNorm | None] = []
-        if self.so2_norm:
-            for _ in range(max(0, self.so2_layers - 1)):
-                inter_norms.append(
-                    ReducedEquivariantRMSNorm(
-                        lmax=self.lmax,
-                        mmax=self.mmax,
-                        channels=self.so2_focus_dim,
-                        degree_index_m=self.degree_index_m,
-                        n_focus=self.n_focus,
-                        precision=self.compute_precision,
-                        trainable=trainable,
-                    )
-                )
         else:
-            for _ in range(max(0, self.so2_layers - 1)):
-                inter_norms.append(None)
-        inter_norms.append(None)
-        self.so2_inter_norms = inter_norms
+            self._build_so2_mixing(
+                seed_so2_stack=seed_so2_stack,
+                seed_non_linearities=seed_non_linearities,
+                seed_depth_attn=seed_depth_attn,
+                trainable=trainable,
+            )
 
-        # === Step 5. Intermediate non-linearity ===
-        # pt appends nn.Identity() as the last entry; dpmodel uses None.
-        non_linearities: list[GatedActivation | None] = []
-        for i in range(max(0, self.so2_layers - 1)):
-            non_linearities.append(
-                GatedActivation(
-                    lmax=self.lmax,
-                    mmax=self.mmax,
-                    channels=self.so2_focus_dim,
-                    n_focus=self.n_focus,
-                    precision=self.compute_precision,
-                    activation_function=self.activation_function,
-                    mlp_bias=self.mlp_bias,
-                    layout="nfdc",
-                    trainable=trainable,
-                    seed=child_seed(seed_non_linearities, i),
-                )
+        # === Step 2b. Optional per-node Cartesian mixing on the aggregated message ===
+        self.node_cartesian_tp: NodeCartesianTensorProduct | None = None
+        if self._node_cartesian_enabled:
+            self.node_cartesian_tp = NodeCartesianTensorProduct(
+                lmax=self.lmax,
+                focus_dim=self.so2_focus_dim,
+                n_focus=self.n_focus,
+                n_layers=self._node_cartesian_layers,
+                symmetric=self._node_cartesian_symmetric,
+                activation_function=self.activation_function,
+                mlp_bias=self.mlp_bias,
+                precision=self.precision,
+                seed=seed_node_cartesian,
+                trainable=trainable,
             )
-        non_linearities.append(None)
-        self.non_linearities = non_linearities
 
         # === Step 7. Optional attention projections (n_atten_head > 0) ===
         self.attn_qk_norm: ScalarRMSNorm | None = None
         self.attn_q_proj: FocusLinear | None = None
         self.attn_k_proj: FocusLinear | None = None
+        self.attn_focus_mix: SO3Linear | None = None
+        self.attn_v_proj: SO3Linear | None = None
+        self.attn_o_proj: SO3Linear | None = None
         self.adamw_attn_logit_w: np.ndarray | None = None
         self.adamw_attn_z_bias_raw: np.ndarray | None = None
         self.attn_output_gate_norm: ScalarRMSNorm | None = None
         self.adamw_attn_gate_w: np.ndarray | None = None
-        cprec = PRECISION_DICT[self.compute_precision.lower()]
         if self.n_atten_head > 0:
             self.attn_qk_norm = ScalarRMSNorm(
                 channels=self.attn_focus_dim,
@@ -1073,15 +1271,53 @@ def __init__(
                 seed=child_seed(seed_gate, 1),
                 trainable=trainable,
             )
-            rng = np.random.default_rng(child_seed(seed_gate, 2))
-            self.adamw_attn_logit_w = rng.normal(
-                0.0,
-                0.01,
-                size=(self.attn_focus_dim, self.attn_n_focus, self.n_atten_head),
-            ).astype(cprec)
+            if self.atten_f_mix:
+                self.attn_focus_mix = SO3Linear(
+                    lmax=self.lmax,
+                    in_channels=self.hidden_channels,
+                    out_channels=self.hidden_channels,
+                    n_focus=1,
+                    precision=self.compute_precision,
+                    mlp_bias=False,
+                    seed=child_seed(seed_gate, 19),
+                    trainable=trainable,
+                )
+            if self.use_atten_v_proj:
+                self.attn_v_proj = SO3Linear(
+                    lmax=self.lmax,
+                    in_channels=self.attn_focus_dim,
+                    out_channels=self.attn_focus_dim,
+                    n_focus=self.attn_n_focus,
+                    precision=self.compute_precision,
+                    mlp_bias=False,
+                    seed=child_seed(seed_gate, 20),
+                    trainable=trainable,
+                )
+            if self.use_atten_o_proj:
+                self.attn_o_proj = SO3Linear(
+                    lmax=self.lmax,
+                    in_channels=self.attn_focus_dim,
+                    out_channels=self.attn_focus_dim,
+                    n_focus=self.attn_n_focus,
+                    precision=self.compute_precision,
+                    mlp_bias=False,
+                    seed=child_seed(seed_gate, 21),
+                    trainable=trainable,
+                )
+            self.adamw_attn_logit_w = (
+                np.random.default_rng(child_seed(seed_gate, 2))
+                .normal(
+                    0.0,
+                    0.01,
+                    size=(self.attn_focus_dim, self.attn_n_focus, self.n_atten_head),
+                )
+                .astype(PRECISION_DICT[self.compute_precision])
+            )
             # softplus(0.5413) ~= 1.0 provides balanced initial competition.
             self.adamw_attn_z_bias_raw = np.full(
-                (self.attn_n_focus, self.n_atten_head), 0.5413, dtype=cprec
+                (self.attn_n_focus, self.n_atten_head),
+                0.5413,
+                dtype=PRECISION_DICT[self.compute_precision],
             )
             self.attn_output_gate_norm = ScalarRMSNorm(
                 channels=self.attn_focus_dim,
@@ -1090,12 +1326,15 @@ def __init__(
                 precision=self.compute_precision,
                 trainable=trainable,
             )
-            rng = np.random.default_rng(child_seed(seed_gate, 3))
-            self.adamw_attn_gate_w = rng.normal(
-                0.0,
-                0.01,
-                size=(self.attn_focus_dim, self.attn_n_focus, self.n_atten_head),
-            ).astype(cprec)
+            self.adamw_attn_gate_w = (
+                np.random.default_rng(child_seed(seed_gate, 3))
+                .normal(
+                    0.0,
+                    0.01,
+                    size=(self.attn_focus_dim, self.attn_n_focus, self.n_atten_head),
+                )
+                .astype(PRECISION_DICT[self.compute_precision])
+            )
 
         # === Step 7.5. Optional cross-focus competition ===
         self.focus_compete_norm: ScalarRMSNorm | None = None
@@ -1109,12 +1348,20 @@ def __init__(
                 precision=self.compute_precision,
                 trainable=trainable,
             )
-            rng = np.random.default_rng(child_seed(seed_gate, 4))
-            self.adamw_focus_compete_w = rng.normal(
-                0.0, 0.01, size=(self.so2_focus_dim, self.n_focus)
-            ).astype(cprec)
+            self.adamw_focus_compete_w = (
+                np.random.default_rng(child_seed(seed_gate, 4))
+                .normal(
+                    0.0,
+                    0.01,
+                    size=(self.so2_focus_dim, self.n_focus),
+                )
+                .astype(PRECISION_DICT[self.compute_precision])
+            )
             if self.mlp_bias:
-                self.focus_compete_bias = np.zeros((self.n_focus,), dtype=cprec)
+                self.focus_compete_bias = np.zeros(
+                    self.n_focus,
+                    dtype=PRECISION_DICT[self.compute_precision],
+                )
 
         # === Step 8. Optional radial hidden projection ===
         self.radial_hidden_proj: ChannelLinear | None = None
@@ -1128,7 +1375,7 @@ def __init__(
                 trainable=trainable,
             )
         self.radial_degree_mixer: DynamicRadialDegreeMixer | None = None
-        if self.radial_so2_mode != "none":
+        if not self.edge_cartesian and self.radial_so2_mode != "none":
             self.radial_degree_mixer = DynamicRadialDegreeMixer(
                 lmax=self.lmax,
                 mmax=self.mmax,
@@ -1139,11 +1386,6 @@ def __init__(
                 seed=seed_radial_degree,
                 trainable=trainable,
             )
-
-        # === Step 8.5. Optional cross-mode grid products ===
-        # ``op_type`` selection mirrors pt: ``branch`` (count > 0) takes
-        # precedence over ``mlp``, else ``glu``. When both ``*_s2`` and
-        # ``*_so3`` are set the SO(3) branch wins (per the argcheck doc).
         node_wise_op = (
             "branch"
             if self.node_wise_grid_branch > 0
@@ -1241,7 +1483,7 @@ def __init__(
             in_channels=self.channels,
             out_channels=self.hidden_channels,
             n_focus=1,
-            precision=self.precision,
+            precision=precision,
             mlp_bias=self.mlp_bias,
             trainable=trainable,
             seed=seed_so3_pre,
@@ -1253,30 +1495,35 @@ def __init__(
             in_channels=self.hidden_channels,
             out_channels=self.channels,
             n_focus=1,
-            precision=self.precision,
+            precision=precision,
             mlp_bias=self.mlp_bias,
             trainable=trainable,
             seed=seed_so3_post,
             init_std=0.0,
         )
 
+        # === Step 11. Edge-frame requirement for the SO(2) message ===
+        self.needs_local_frame = (not self.edge_cartesian) and (
+            self.mixing_layers > 0
+            or self.radial_so2_mode != "none"
+            or self.node_wise_grid_product is not None
+        )
+        self.trainable = bool(trainable)
+
     def call(
         self,
-        x: Any,
+        x: Array,
         edge_cache: EdgeCache,
-        radial_feat: Any,
-    ) -> Any:
+        radial_feat: Array,
+    ) -> Array:
         """
         Parameters
         ----------
         x
             Node features with shape (N, D, C), where D=(lmax+1)^2 is the
-            SO(3) coefficient dimension and N = nf * nloc is the local node
-            axis.
+            SO(3) coefficient dimension.
         edge_cache
-            Precomputed edge cache in the padded-edge layout
-            (``E = N * nnei``; see ``edge_cache.EdgeCache``). Must be
-            compatible with this block's lmax.
+            Precomputed edge cache. Must be compatible with this block's lmax.
         radial_feat
             Per-edge radial features with shape (E, lmax+1, C), already fused
             with edge type features.
@@ -1290,261 +1537,136 @@ def call(
         device = array_api_compat.device(x)
         src, dst = edge_cache.src, edge_cache.dst
         n_node = x.shape[0]
-        # Keep ``n_edge``/``n_node`` symbolic (no ``int()``): they are the
-        # products ``nf*nloc*nnei`` / ``nf*nloc``. Casting to a Python int
-        # specializes them to the trace-time sample shape (breaking
-        # torch.export with a dynamic ``nloc`` dim); the ``Mod`` check stays
-        # statically known and the ``(n_node, nnei, ...)`` reshape below
-        # recovers the layout symbolically.
         n_edge = src.shape[0]
-        if n_node <= 0 or n_edge % n_node != 0:
-            raise ValueError(
-                "padded-edge layout requires E to be a multiple of N; "
-                f"got E={n_edge}, N={n_node}"
-            )
-        nnei = n_edge // n_node
-        # Validity mask for the padded-edge layout (1 on real edges).
-        edge_mask = edge_cache.edge_mask
-        if edge_mask is not None:
-            mask_f = xp.astype(xp.reshape(edge_mask, (n_edge,)), x.dtype)
-        else:
-            mask_f = xp.ones((n_edge,), dtype=x.dtype, device=device)
 
         # === Step 1. Pre-focus channel mixing on full width ===
         # (N, D, C_wide), C_wide = F * Cf
         x = self.pre_focus_mix(x[:, :, None, :])[:, :, 0, :]
 
-        # === Step 2. Rotate to edge-aligned local frame ===
-        D_full = edge_cache.D_full
-        D_m_prime = project_D_to_m(
-            D_full=D_full,
-            coeff_index_m=self.coeff_index_m,
-            ebed_dim_full=self.ebed_dim_full,
-            cache=edge_cache.D_to_m_cache,
-            key_lmax=self.lmax,
-            key_mmax=self.mmax,
-        )
-        src_idx = xp.astype(xp.reshape(src, (n_edge,)), xp.int64)
-        x_src = xp.take(x, src_idx, axis=0)  # (E, D, C_wide)
-        x_local = xp.matmul(D_m_prime, x_src)  # (E, D_m, C_wide)
-        # pt rotates the *destination* node into the same edge frame for the
-        # node-wise cross-mode grid product (raw, before radial modulation).
-        x_dst_local: Any = None
-        if self.node_wise_grid_product is not None:
-            dst_idx_nw = xp.astype(xp.reshape(dst, (n_edge,)), xp.int64)
-            x_dst = xp.take(x, dst_idx_nw, axis=0)  # (E, D, C_wide)
-            x_dst_local = xp.matmul(D_m_prime, x_dst)  # (E, D_m, C_wide)
-
-        # === Step 3. Select radial/type features for reduced layout ===
-        degree_index_m = xp_asarray_nodetach(xp, self.degree_index_m, device=device)
-        rad_feat = xp.take(radial_feat, degree_index_m, axis=1)  # (E, D_m, C)
-        if self.radial_hidden_proj is not None:
-            rad_feat = self.radial_hidden_proj(rad_feat)
-        if self.radial_degree_mixer is None:
-            x_local = x_local * rad_feat
+        # === Step 2. Edge message: Cartesian product, SO(2) mixing, or the
+        # rotation-free radial message when no local-frame operation is needed ===
+        if self.edge_cartesian:
+            x_message, rad_feat = self.cartesian_message(x, edge_cache, radial_feat)
+        elif self.needs_local_frame:
+            x_message, rad_feat = self.so2_message(x, edge_cache, radial_feat)
         else:
-            x_local = self.radial_degree_mixer(x_local, rad_feat)
-        # pt Step 3: edge-local cross-mode grid product between the
-        # radial-fused source (query) and the raw destination (context),
-        # added as a residual in the reduced m-major layout (E, D_m, C_wide).
-        if self.node_wise_grid_product is not None:
-            x_local = x_local + self.node_wise_grid_product(x_local, x_dst_local)
-        rad_feat_l0_focus = xp.reshape(
-            rad_feat[:, 0, :], (n_edge, self.n_focus, self.so2_focus_dim)
-        )  # (E, F, Cf)
-
-        # === Step 4. Convert to SO(2) internal focus layout ===
-        focus_gate_src: Any = None
-        x_local = xp.permute_dims(
-            xp.reshape(
-                x_local, (n_edge, self.reduced_dim, self.n_focus, self.so2_focus_dim)
-            ),
-            (0, 2, 1, 3),
-        )  # (E, F, D_m, Cf)
-        if self.focus_compete and self.n_focus > 1:
-            focus_gate_src = x_local[:, :, 0, :]
-
-        # === Step 5. Multi-layer SO(2) mixing (pre-norm + residual) ===
-        def apply_bias_correction(
-            x_local: Any,
-            so2_linear: SO2Linear,
-            layer_idx: int,
-        ) -> Any:
-            if layer_idx != 0 or so2_linear.bias0 is None:
-                return x_local
-            bias0 = xp.reshape(
-                xp_asarray_nodetach(xp, so2_linear.bias0[...], device=device),
-                (1, self.n_focus, so2_linear.out_channels),
-            )
-            if so2_linear.out_channels == self.so2_focus_dim:
-                radial_factor = rad_feat_l0_focus
-            else:
-                raise RuntimeError(
-                    "Unexpected SO2Linear output width in bias correction"
-                )
-            edge_env = xp.reshape(
-                xp.astype(edge_cache.edge_env, x_local.dtype), (n_edge, 1, 1)
-            )
-            bias_correction = bias0 * (radial_factor * edge_env - 1.0)
-            x0 = x_local[:, :, :1, :] + bias_correction[:, :, None, :]
-            return xp.concat([x0, x_local[:, :, 1:, :]], axis=2)
-
-        for layer_idx, (so2_linear, inter_norm, non_linear) in enumerate(
-            zip(
-                self.so2_linears,
-                self.so2_inter_norms,
-                self.non_linearities,
-                strict=True,
-            )
-        ):
-            residual = x_local
-            if inter_norm is not None:
-                x_local = inter_norm(x_local)
-            x_local = so2_linear(x_local)
-            x_local = apply_bias_correction(x_local, so2_linear, layer_idx)
-
-            if non_linear is not None:
-                x_local = non_linear(x_local)
-
-            x_local = residual + x_local
+            x_message, rad_feat = self.radial_message(x, edge_cache, radial_feat)
 
-        # === Step 6. Cross-focus softmax competition ===
-        if self.focus_compete and self.n_focus > 1:
-            compete_w = xp_asarray_nodetach(
-                xp, self.adamw_focus_compete_w[...], device=device
-            )
-            gate_in = xp.astype(focus_gate_src, compete_w.dtype)
-            gate_normed = self.focus_compete_norm(gate_in)  # (E, F, Cf)
-            # einsum "efi,if->ef"
-            focus_logits = xp.sum(
-                gate_normed * xp.permute_dims(compete_w, (1, 0))[None, ...],
-                axis=-1,
-            )
-            if self.mlp_bias:
-                focus_logits = (
-                    focus_logits
-                    + xp_asarray_nodetach(
-                        xp, self.focus_compete_bias[...], device=device
-                    )[None, :]
-                )
-            focus_logits = focus_logits / self.focus_softmax_tau
-            logits_max = xp.max(focus_logits, axis=1, keepdims=True)
-            exp_logits = xp.exp(focus_logits - logits_max)
-            alpha = exp_logits / xp.sum(exp_logits, axis=1, keepdims=True)
-            alpha = xp.astype(alpha, x_local.dtype)
-            alpha = alpha * (1.0 - self.focus_label_smoothing) + (
-                self.focus_label_smoothing / float(self.n_focus)
-            )
-            x_local = x_local * alpha[:, :, None, None]
-
-        # === Step 7. Rotate back to global frame ===
-        Dt_full = edge_cache.Dt_full
-        # Restore reduced global layout (E, D_m, C_wide) for inverse rotation.
-        x_local = xp.reshape(
-            xp.permute_dims(x_local, (0, 2, 1, 3)),
-            (n_edge, self.reduced_dim, self.hidden_channels),
-        )
-        Dt_from_m = project_Dt_from_m(
-            Dt_full=Dt_full,
-            coeff_index_m=self.coeff_index_m,
-            ebed_dim_full=self.ebed_dim_full,
-            cache=edge_cache.Dt_from_m_cache,
-            key_lmax=self.lmax,
-            key_mmax=self.mmax,
-        )
-        x_message = xp.matmul(Dt_from_m, x_local)  # (E, D, C_wide)
-        # Reduced layouts keep only 2*mmax+1 orders for l>mmax. Applying the
-        # inverse-rotation degree rescale after the global lift restores the
-        # full-basis amplitude expected by the block output contract.
-        rescale = xp.astype(
-            xp_asarray_nodetach(xp, self.rotate_inv_rescale_full, device=device),
-            x_message.dtype,
-        )
-        x_message = x_message * xp.reshape(rescale, (1, -1, 1))
+        # === Step 3. Optional focus mixing for the attention stream ===
+        if self.attn_focus_mix is not None:
+            x_message = self.attn_focus_mix(x_message[:, :, None, :])[:, :, 0, :]
 
-        # === Step 8. Aggregate with optional head-wise gating ===
+        # === Step 4. Aggregate with optional head-wise gating ===
         # Source Freeze Propagation Gate: broadcast the per-edge scalar
         # eta[src] to the edge message before destination aggregation.
+        # ``edge_src_gate`` is ``None`` outside bridging mode, in which
+        # case this branch disappears and the baseline / attention paths
+        # run unchanged.
         edge_src_gate = edge_cache.edge_src_gate
         if self.n_atten_head == 0:
-            # Baseline path: envelope-weighted masked sum -> degree norm.
-            edge_weight = xp.astype(edge_cache.edge_env, x_message.dtype)  # (E, 1)
-            edge_weight = xp.reshape(edge_weight, (n_edge, 1))
+            # Baseline path: fused envelope-weighted scatter add -> degree norm.
+            # Folding edge_src_gate into the scalar envelope keeps the
+            # op count unchanged.
+            edge_weight = edge_cache.edge_env  # (E, 1)
             if edge_src_gate is not None:
-                edge_weight = edge_weight * xp.astype(
-                    xp.reshape(edge_src_gate, (n_edge, 1)), edge_weight.dtype
-                )
-            x_message = x_message * edge_weight[:, :, None]
-            # pt: out.index_add_(0, dst, x_message) — padded-edge masked sum
-            # over the nnei axis (dst is slot-implicit).
-            x_message = x_message * mask_f[:, None, None]
-            out = xp.sum(
-                xp.reshape(
-                    x_message,
-                    (n_node, nnei, self.ebed_dim_full, self.hidden_channels),
-                ),
-                axis=1,
+                edge_weight = edge_weight * xp.astype(edge_src_gate, edge_weight.dtype)
+            x_message = x_message * edge_weight[..., None]
+            out = xp.zeros(
+                x.shape,
+                dtype=get_xp_precision(xp, self.compute_precision),
+                device=device,
+            )
+            out = xp_add_at(
+                out,
+                dst,
+                xp.astype(x_message, get_xp_precision(xp, self.compute_precision)),
+            )
+            out = out * xp.astype(
+                edge_cache.inv_sqrt_deg, get_xp_precision(xp, self.compute_precision)
             )
-            inv_sqrt_deg = xp.astype(edge_cache.inv_sqrt_deg, out.dtype)
-            out = out * inv_sqrt_deg  # (N, D, C_wide)
+            out = xp.astype(out, get_xp_precision(xp, self.precision))  # (N, D, C_wide)
         else:
-            # === Step 8.1. Build attention logits from scalar channels ===
-            qk_w = xp_asarray_nodetach(xp, self.attn_q_proj.weight[...], device=device)
+            # === Step 4.1. Build attention logits from scalar channels ===
+            compute_dtype = get_xp_precision(xp, self.compute_precision)
             x_l0_node = xp.reshape(
                 x[:, 0, :], (n_node, self.attn_n_focus, self.attn_focus_dim)
             )  # (N, Fa, Ca)
-            x_l0_node = xp.astype(x_l0_node, qk_w.dtype)
-            qk_input = self.attn_qk_norm(x_l0_node)
+            qk_input = self.attn_qk_norm(xp.astype(x_l0_node, compute_dtype))
             q_node = self.attn_q_proj(qk_input)  # (N, Fa, Ca)
             k_node = self.attn_k_proj(qk_input)  # (N, Fa, Ca)
-            dst_idx = xp.astype(xp.reshape(dst, (n_edge,)), xp.int64)
             q_edge = xp.reshape(
-                xp.take(q_node, dst_idx, axis=0),
+                xp.take(q_node, dst, axis=0),
                 (n_edge, self.attn_n_focus, self.n_atten_head, self.head_dim),
             )  # (E, Fa, H, Ch), Ca = H * Ch
             k_edge = xp.reshape(
-                xp.take(k_node, src_idx, axis=0),
+                xp.take(k_node, src, axis=0),
                 (n_edge, self.attn_n_focus, self.n_atten_head, self.head_dim),
             )  # (E, Fa, H, Ch)
             radial_l0 = xp.reshape(
                 rad_feat[:, 0, :], (n_edge, self.attn_n_focus, self.attn_focus_dim)
             )  # (E, Fa, Ca)
-            radial_l0 = xp.astype(radial_l0, qk_w.dtype)
-            # einsum "efi,ifo->efo" as a broadcast batched matmul.
-            logit_w = xp.permute_dims(
-                xp_asarray_nodetach(xp, self.adamw_attn_logit_w[...], device=device),
+            # "efi,ifo->efo": per-focus contraction over the input channel,
+            # expressed as a batched matmul over the focus axis.
+            radial_bias = xp.permute_dims(
+                xp.matmul(
+                    xp.permute_dims(xp.astype(radial_l0, compute_dtype), (1, 0, 2)),
+                    xp.permute_dims(
+                        xp_asarray_nodetach(
+                            xp, self.adamw_attn_logit_w[...], device=device
+                        ),
+                        (1, 0, 2),
+                    ),
+                ),
                 (1, 0, 2),
-            )  # (Fa, Ca, H)
-            radial_bias = xp.matmul(radial_l0[:, :, None, :], logit_w[None, ...])[
-                ..., 0, :
-            ]  # (E, Fa, H)
-            attn_logits = xp.sum(q_edge * k_edge, axis=-1) * (self.head_dim**-0.5)
+            )  # (E, F, H)
+            attn_logits: Array = xp.sum(q_edge * k_edge, axis=-1) * (
+                self.head_dim**-0.5
+            )
             attn_logits = attn_logits + radial_bias
 
-            # === Step 8.2. Destination-wise stable envelope-gated softmax ===
-            # pt: scatter-based segment softmax keyed by dst — padded-edge
-            # masked softmax over the nnei axis. ``src_weight=edge_src_gate``
-            # folds SFPG into both the numerator and the denominator so a
-            # muted source drops out of the normalization entirely.
+            # === Step 4.2. Destination-wise stable envelope-gated softmax ===
+            # ``src_weight=edge_src_gate`` folds SFPG into both the
+            # numerator and the denominator of the softmax. A muted
+            # source (``eta_src = 0``) therefore drops out of the
+            # destination's attention normalization entirely, which
+            # is required for the attention path to honor the
+            # frozen-zone invariance: a post-multiplication on
+            # ``attn_alpha`` alone would still leave the muted
+            # source leaking through the shared denominator.
             attn_alpha = segment_envelope_gated_softmax(
                 logits=attn_logits,
-                edge_env=xp.astype(edge_cache.edge_env, attn_logits.dtype),
+                edge_env=xp.astype(edge_cache.edge_env, compute_dtype),
+                dst=dst,
                 n_nodes=n_node,
                 z_bias_raw=xp_asarray_nodetach(
-                    xp, self.adamw_attn_z_bias_raw, device=device
+                    xp, self.adamw_attn_z_bias_raw[...], device=device
                 ),
                 eps=self.eps,
                 src_weight=(
                     None
                     if edge_src_gate is None
-                    else xp.astype(edge_src_gate, attn_logits.dtype)
+                    else xp.astype(edge_src_gate, compute_dtype)
                 ),
-                edge_mask=mask_f,
+                edge_mask=edge_cache.edge_mask,
             )  # (E, F, H)
 
-            # === Step 8.3. Value projection and head-wise aggregation ===
+            # === Step 4.3. Value projection and head-wise aggregation ===
+            value_focus = xp.astype(
+                xp.reshape(
+                    x_message,
+                    (
+                        n_edge,
+                        self.ebed_dim_full,
+                        self.attn_n_focus,
+                        self.attn_focus_dim,
+                    ),
+                ),
+                compute_dtype,
+            )  # (E, D, Fa, Ca)
+            if self.attn_v_proj is not None:
+                value_focus = self.attn_v_proj(value_focus)
             value_heads = xp.reshape(
-                xp.astype(x_message, qk_w.dtype),
+                value_focus,
                 (
                     n_edge,
                     self.ebed_dim_full,
@@ -1556,324 +1678,791 @@ def apply_bias_correction(
             weighted_value = value_heads * xp.reshape(
                 attn_alpha, (n_edge, 1, self.attn_n_focus, self.n_atten_head, 1)
             )
-            # pt: out_heads.index_add_(0, dst, weighted_value) — padded-edge
-            # masked sum over the nnei axis (dst is slot-implicit).
-            weighted_value = (
-                weighted_value
-                * xp.astype(mask_f, weighted_value.dtype)[:, None, None, None, None]
-            )
-            out_heads = xp.sum(
-                xp.reshape(
-                    weighted_value,
-                    (
-                        n_node,
-                        nnei,
-                        self.ebed_dim_full,
-                        self.attn_n_focus,
-                        self.n_atten_head,
-                        self.head_dim,
-                    ),
+            out_heads = xp.zeros(
+                (
+                    n_node,
+                    self.ebed_dim_full,
+                    self.attn_n_focus,
+                    self.n_atten_head,
+                    self.head_dim,
                 ),
-                axis=1,
+                dtype=compute_dtype,
+                device=device,
             )  # (N, D, Fa, H, Ch)
+            out_heads = xp_add_at(out_heads, dst, weighted_value)
 
-            # === Step 8.4. Output-side head gate ===
-            gate_w = xp.permute_dims(
-                xp_asarray_nodetach(xp, self.adamw_attn_gate_w[...], device=device),
-                (1, 0, 2),
-            )  # (Fa, Ca, H)
-            gate_in = self.attn_output_gate_norm(x_l0_node)
+            # === Step 4.4. Output-side head gate ===
+            # "nfi,ifo->nfo": per-focus contraction over the input channel,
+            # expressed as a batched matmul over the focus axis.
             attn_output_gate = xp_sigmoid(
-                xp.matmul(gate_in[:, :, None, :], gate_w[None, ...])[..., 0, :]
+                xp.permute_dims(
+                    xp.matmul(
+                        xp.permute_dims(
+                            self.attn_output_gate_norm(
+                                xp.astype(x_l0_node, compute_dtype)
+                            ),
+                            (1, 0, 2),
+                        ),
+                        xp.permute_dims(
+                            xp_asarray_nodetach(
+                                xp, self.adamw_attn_gate_w[...], device=device
+                            ),
+                            (1, 0, 2),
+                        ),
+                    ),
+                    (1, 0, 2),
+                )
             )  # (N, F, H)
             out_heads = out_heads * xp.reshape(
-                attn_output_gate,
-                (n_node, 1, self.attn_n_focus, self.n_atten_head, 1),
+                attn_output_gate, (n_node, 1, self.attn_n_focus, self.n_atten_head, 1)
             )  # (N, D, Fa, H, Ch)
 
-            # === Step 8.5. Merge heads ===
+            # === Step 4.5. Output projection and merge heads ===
+            out_focus = xp.reshape(
+                out_heads,
+                (
+                    n_node,
+                    self.ebed_dim_full,
+                    self.attn_n_focus,
+                    self.attn_focus_dim,
+                ),
+            )  # (N, D, Fa, Ca)
+            if self.attn_o_proj is not None:
+                out_focus = self.attn_o_proj(out_focus)
             out = xp.astype(
                 xp.reshape(
-                    out_heads, (n_node, self.ebed_dim_full, self.hidden_channels)
+                    out_focus, (n_node, self.ebed_dim_full, self.hidden_channels)
                 ),
-                x.dtype,
+                get_xp_precision(xp, self.precision),
             )  # (N, D, C_wide)
 
-        # === Step 9. Optional message-node grid product ===
-        # pt: post-aggregation packed-layout cross-mode product between the
-        # aggregated message (query) and the pre-focus-mixed node features
-        # (context), added as a residual before the final channel mixing.
+        # === Step 5. Optional message-node grid product ===
         if self.message_node_grid_product is not None:
             out = out + self.message_node_grid_product(out, x)
 
-        # === Step 10. Final channel mixing ===
+        # === Step 6. Optional per-node Cartesian tensor-product mixing ===
+        # Couples the aggregated message with the destination node feature ``x``,
+        # the Cartesian analog of the message-node grid product.
+        if self.node_cartesian_tp is not None:
+            out = self.node_cartesian_tp(out, x)
+
+        # === Step 7. Final channel mixing ===
         out = self.post_focus_mix(out[:, :, None, :])[:, :, 0, :]
         return out  # (N, D, C)
 
-    def _variables(self) -> dict[str, np.ndarray]:
-        """Variables keyed by the pt ``state_dict`` key names."""
-        variables: dict[str, np.ndarray] = {}
-        for i, so2_linear in enumerate(self.so2_linears):
-            for key, value in so2_linear._variables().items():
-                variables[f"so2_linears.{i}.{key}"] = value
-        for i, inter_norm in enumerate(self.so2_inter_norms):
-            if inter_norm is not None:
-                for key, value in inter_norm.serialize()["@variables"].items():
-                    variables[f"so2_inter_norms.{i}.{key}"] = value
-        for i, non_linear in enumerate(self.non_linearities):
-            if non_linear is not None:
-                for key, value in non_linear.serialize()["@variables"].items():
-                    variables[f"non_linearities.{i}.{key}"] = value
-        if self.n_atten_head > 0:
-            variables["adamw_attn_logit_w"] = to_numpy_array(self.adamw_attn_logit_w)
-            variables["adamw_attn_z_bias_raw"] = to_numpy_array(
-                self.adamw_attn_z_bias_raw
-            )
-            variables["adamw_attn_gate_w"] = to_numpy_array(self.adamw_attn_gate_w)
-            variables["attn_qk_norm.adam_scale"] = to_numpy_array(
-                self.attn_qk_norm.adam_scale
+    def radial_message(
+        self,
+        x: Array,
+        edge_cache: EdgeCache,
+        radial_feat: Array,
+    ) -> tuple[Array, Array]:
+        """
+        Build edge messages by rotation-free per-degree radial scaling.
+
+        Used when no local-frame operation is required (``mixing_layers == 0``,
+        ``radial_so2_mode == "none"``, and no node-wise grid product). Per-degree
+        scalar radial scaling commutes with rotation, so the edge-aligned frame
+        is unnecessary and the message reduces to a source gather, an elementwise
+        per-degree scale, and the optional cross-focus competition.
+
+        Parameters
+        ----------
+        x : Array
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeCache
+            Precomputed edge cache.
+        radial_feat : Array
+            Per-edge radial features with shape (E, lmax+1, C).
+
+        Returns
+        -------
+        tuple[Array, Array]
+            ``(x_message, rad_feat)`` with shapes (E, D, C_wide) and
+            (E, lmax+1, C_wide). The ``l=0`` slice of ``rad_feat`` is consumed by
+            the attention aggregation.
+        """
+        xp = array_api_compat.array_namespace(x)
+        device = array_api_compat.device(x)
+        src = edge_cache.src
+        n_edge = src.shape[0]
+
+        rad_feat = radial_feat  # (E, lmax+1, C)
+        if self.radial_hidden_proj is not None:
+            rad_feat = self.radial_hidden_proj(rad_feat)  # (E, lmax+1, C_wide)
+
+        # Broadcast each degree's radial weight over its 2l+1 orders and scale the
+        # gathered source feature in the global frame.
+        x_src = xp.take(x, src, axis=0)  # (E, D, C_wide)
+        rad_packed = xp.take(
+            rad_feat,
+            xp_asarray_nodetach(xp, self.degree_index_full[...], device=device),
+            axis=1,
+        )  # (E, D, C_wide)
+        x_message = x_src * rad_packed
+
+        # === Cross-focus softmax competition ===
+        # Gate on the radial-fused source l=0 scalar, matching the SO(2) path.
+        if self.focus_compete and self.n_focus > 1:
+            focus_gate_src = xp.reshape(
+                x_src[:, 0, :] * rad_feat[:, 0, :],
+                (n_edge, self.n_focus, self.so2_focus_dim),
+            )  # (E, F, Cf)
+            alpha = self._focus_alpha(focus_gate_src)
+            x_message = xp.reshape(
+                xp.reshape(
+                    x_message,
+                    (n_edge, self.ebed_dim_full, self.n_focus, self.so2_focus_dim),
+                )
+                * xp.astype(alpha, x_message.dtype)[:, None, :, None],
+                (n_edge, self.ebed_dim_full, self.hidden_channels),
             )
-            variables["attn_q_proj.weight"] = to_numpy_array(self.attn_q_proj.weight)
-            variables["attn_k_proj.weight"] = to_numpy_array(self.attn_k_proj.weight)
-            variables["attn_output_gate_norm.adam_scale"] = to_numpy_array(
-                self.attn_output_gate_norm.adam_scale
+        return x_message, rad_feat
+
+    def so2_message(
+        self,
+        x: Array,
+        edge_cache: EdgeCache,
+        radial_feat: Array,
+    ) -> tuple[Array, Array]:
+        """
+        Build edge messages by rotate-to-local, SO(2) mixing, and rotate-back.
+
+        Parameters
+        ----------
+        x : Array
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeCache
+            Precomputed edge cache.
+        radial_feat : Array
+            Per-edge radial features with shape (E, lmax+1, C).
+
+        Returns
+        -------
+        tuple[Array, Array]
+            ``(x_message, rad_feat)`` with shapes (E, D, C_wide) and
+            (E, D_m, C_wide). The ``l=0`` slice of ``rad_feat`` is consumed by
+            the attention aggregation.
+        """
+        xp = array_api_compat.array_namespace(x)
+        device = array_api_compat.device(x)
+        src = edge_cache.src
+        n_edge = src.shape[0]
+
+        # === Step 1. Rotate to edge-aligned local frame ===
+        x_local, x_dst_local = self._rotate_to_local(x, edge_cache)
+
+        # === Step 2. Select radial/type features for reduced layout ===
+        rad_feat = xp.take(
+            radial_feat,
+            xp_asarray_nodetach(xp, self.degree_index_m[...], device=device),
+            axis=1,
+        )  # (E, D_m, C)
+        if self.radial_hidden_proj is not None:
+            rad_feat = self.radial_hidden_proj(rad_feat)
+        if self.radial_degree_mixer is None:
+            x_local = x_local * rad_feat
+        else:
+            x_local = self.radial_degree_mixer(x_local, rad_feat)
+        if self.node_wise_grid_product is not None:
+            x_local = x_local + self.node_wise_grid_product(
+                x_local,
+                x_dst_local,
             )
-        if self.focus_compete_norm is not None:
-            variables["adamw_focus_compete_w"] = to_numpy_array(
-                self.adamw_focus_compete_w
+        rad_feat_l0_focus = xp.reshape(
+            rad_feat[:, 0, :], (n_edge, self.n_focus, self.so2_focus_dim)
+        )  # (E, F, Cf)
+
+        # === Step 3. Convert to SO(2) internal focus layout ===
+        focus_gate_src: Array | None = None
+        x_local = xp.permute_dims(
+            xp.reshape(
+                x_local, (n_edge, self.reduced_dim, self.n_focus, self.so2_focus_dim)
+            ),
+            (0, 2, 1, 3),
+        )  # (E, F, D_m, Cf), strided
+        if self.focus_compete and self.n_focus > 1:
+            focus_gate_src = x_local[:, :, 0, :]
+
+        # === Step 4. Multi-layer SO(2) mixing (pre-norm + residual) ===
+
+        def so2_l0_extractor(v: Array) -> Array:
+            """Extract scalar features from SO(2) reduced layout."""
+            return xp.reshape(v[:, :, 0, :], (v.shape[0], self.hidden_channels))
+
+        def apply_bias_correction(
+            x_local: Array,
+            so2_linear: SO2Linear,
+            layer_idx: int,
+        ) -> Array:
+            if layer_idx != 0 or so2_linear.bias0 is None:
+                return x_local
+            bias0 = xp.reshape(
+                xp_asarray_nodetach(xp, so2_linear.bias0[...], device=device),
+                (self.n_focus, so2_linear.out_channels),
+            )[None, ...]
+            if so2_linear.out_channels == self.so2_focus_dim:
+                radial_factor = rad_feat_l0_focus
+            elif so2_linear.out_channels == 2 * self.so2_focus_dim:
+                radial_factor = xp.concat(
+                    [rad_feat_l0_focus, rad_feat_l0_focus], axis=-1
+                )
+            else:
+                raise RuntimeError(
+                    "Unexpected SO2Linear output width in bias correction"
+                )
+            bias_correction = bias0 * (
+                radial_factor * xp.reshape(edge_cache.edge_env, (-1, 1, 1)) - 1.0
             )
-            variables["focus_compete_norm.adam_scale"] = to_numpy_array(
-                self.focus_compete_norm.adam_scale
+            x_local = xp.concat(
+                [
+                    x_local[:, :, :1, :] + bias_correction[:, :, None, :],
+                    x_local[:, :, 1:, :],
+                ],
+                axis=2,
             )
-            if self.mlp_bias:
-                variables["focus_compete_bias"] = to_numpy_array(
-                    self.focus_compete_bias
+            return x_local
+
+        if self.use_so2_attn_res:
+            so2_depth_sources = [x_local]
+            for layer_idx, (so2_linear, inter_norm, non_linear) in enumerate(
+                zip(
+                    self.so2_linears,
+                    self.so2_inter_norms,
+                    self.non_linearities,
+                    strict=True,
+                )
+            ):
+                x_local: Array = self.so2_layer_attn_res[layer_idx](
+                    sources=so2_depth_sources,
+                    scalar_extractor=so2_l0_extractor,
+                    current_x=x_local,
+                )
+                residual = x_local
+                x_local = inter_norm(x_local)
+                x_local = so2_linear(x_local)
+                x_local = apply_bias_correction(x_local, so2_linear, layer_idx)
+
+                x_local = non_linear(x_local)
+
+                if self.layer_scale:
+                    scale: Array = xp.reshape(
+                        xp_asarray_nodetach(
+                            xp,
+                            self.adam_so2_layer_scales[layer_idx][...],
+                            device=device,
+                        ),
+                        (1, self.n_focus, 1, self.so2_focus_dim),
+                    )
+                    x_local = residual + scale * x_local
+                else:
+                    x_local = residual + x_local
+                so2_depth_sources.append(x_local - residual)
+        else:
+            for layer_idx, (so2_linear, inter_norm, non_linear) in enumerate(
+                zip(
+                    self.so2_linears,
+                    self.so2_inter_norms,
+                    self.non_linearities,
+                    strict=True,
                 )
+            ):
+                residual = x_local
+                x_local = inter_norm(x_local)
+                x_local = so2_linear(x_local)
+                x_local = apply_bias_correction(x_local, so2_linear, layer_idx)
+
+                x_local = non_linear(x_local)
+
+                if self.layer_scale:
+                    scale = xp.reshape(
+                        xp_asarray_nodetach(
+                            xp,
+                            self.adam_so2_layer_scales[layer_idx][...],
+                            device=device,
+                        ),
+                        (1, self.n_focus, 1, self.so2_focus_dim),
+                    )
+                    x_local = residual + scale * x_local
+                else:
+                    x_local = residual + x_local
+
+        # === Step 5. Cross-focus softmax competition ===
+        if self.focus_compete and self.n_focus > 1:
+            alpha = self._focus_alpha(focus_gate_src)
+            x_local = x_local * xp.astype(alpha, x_local.dtype)[..., None, None]
+
+        # === Step 6. Rotate back to global frame ===
+        x_message = self._rotate_back(x_local, edge_cache, n_edge)
+        # Reduced layouts keep only 2*mmax+1 orders for l>mmax. Applying the
+        # inverse-rotation degree rescale after the global lift restores the
+        # full-basis amplitude expected by the block output contract.
+        x_message = x_message * xp.reshape(
+            xp_asarray_nodetach(xp, self.rotate_inv_rescale_full[...], device=device),
+            (1, -1, 1),
+        )
+        return x_message, rad_feat
+
+    def _rotate_to_local(
+        self, x: Array, edge_cache: EdgeCache
+    ) -> tuple[Array, Array | None]:
+        """
+        Rotate node features into the edge-aligned reduced local frame.
+
+        Parameters
+        ----------
+        x : Array
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeCache
+            Precomputed edge cache.
+
+        Returns
+        -------
+        tuple[Array, Array | None]
+            ``(x_local, x_dst_local)`` with shapes (E, D_m, C_wide). The
+            destination-node projection is built only for the node-wise grid
+            product and is ``None`` otherwise.
+        """
+        xp = array_api_compat.array_namespace(x)
+        D_full = edge_cache.D_full
+        D_m_prime = project_D_to_m(
+            D_full=D_full,
+            coeff_index_m=self.coeff_index_m,
+            ebed_dim_full=self.ebed_dim_full,
+            cache=edge_cache.D_to_m_cache,
+            key_lmax=self.lmax,
+            key_mmax=self.mmax,
+        )
+        x_src = xp.take(x, edge_cache.src, axis=0)  # (E, D, C_wide)
+        x_local = xp.matmul(D_m_prime, x_src)  # (E, D_m, C_wide)
+        x_dst_local: Array | None = None
+        if self.node_wise_grid_product is not None:
+            x_dst = xp.take(x, edge_cache.dst, axis=0)  # (E, D, C_wide)
+            x_dst_local = xp.matmul(D_m_prime, x_dst)  # (E, D_m, C_wide)
+        return x_local, x_dst_local
+
+    def _rotate_back(self, x_local: Array, edge_cache: EdgeCache, n_edge: int) -> Array:
+        """
+        Rotate the SO(2) focus-layout features back to the global frame.
+
+        Parameters
+        ----------
+        x_local : Array
+            Local features with shape (E, F, D_m, Cf) in the SO(2) focus layout
+            produced by the SO(2) mixing layers.
+        edge_cache : EdgeCache
+            Precomputed edge cache.
+        n_edge : int
+            Number of edges E.
+
+        Returns
+        -------
+        Array
+            Global-frame message with shape (E, D, C_wide), before the
+            inverse-rotation degree rescale.
+        """
+        xp = array_api_compat.array_namespace(x_local)
+        Dt_full = edge_cache.Dt_full
+        # Restore reduced global layout (E, D_m, C_wide) for inverse rotation.
+        x_local = xp.reshape(
+            xp.permute_dims(x_local, (0, 2, 1, 3)),
+            (n_edge, self.reduced_dim, self.hidden_channels),
+        )
+        Dt_from_m = project_Dt_from_m(
+            Dt_full=Dt_full,
+            coeff_index_m=self.coeff_index_m,
+            ebed_dim_full=self.ebed_dim_full,
+            cache=edge_cache.Dt_from_m_cache,
+            key_lmax=self.lmax,
+            key_mmax=self.mmax,
+        )
+        return xp.matmul(Dt_from_m, x_local)  # (E, D, C_wide)
+
+    def cartesian_message(
+        self,
+        x: Array,
+        edge_cache: EdgeCache,
+        radial_feat: Array,
+    ) -> tuple[Array, Array]:
+        """
+        Build edge messages via the global-frame Cartesian rank-2 tensor product.
+
+        Parameters
+        ----------
+        x : Array
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeCache
+            Precomputed edge cache.
+        radial_feat : Array
+            Per-edge radial features with shape (E, lmax+1, C).
+
+        Returns
+        -------
+        tuple[Array, Array]
+            ``(x_message, rad_feat)`` with shapes (E, D, C_wide) and
+            (E, lmax+1, C_wide). The ``l=0`` slice of ``rad_feat`` is consumed by
+            the attention aggregation.
+        """
+        xp = array_api_compat.array_namespace(x)
+        src = edge_cache.src
+        n_edge = src.shape[0]
+
+        # === Step 1. Per-degree radial weights projected to the hidden width ===
+        rad_feat = radial_feat  # (E, lmax+1, C)
         if self.radial_hidden_proj is not None:
-            variables["radial_hidden_proj.weight"] = to_numpy_array(
-                self.radial_hidden_proj.weight
+            rad_feat = self.radial_hidden_proj(rad_feat)  # (E, lmax+1, C_wide)
+
+        # === Step 2. Global-frame Cartesian tensor product ===
+        x_src = xp.take(x, src, axis=0)  # (E, D, C_wide)
+        x_message = self.edge_cartesian_tp(
+            x_src, edge_cache.edge_vec, rad_feat
+        )  # (E, D, C_wide)
+
+        # === Step 3. Cross-focus softmax competition ===
+        # Gate on the radial-fused source l=0 scalar, matching the SO(2) path,
+        # whose competition reads the pre-mixing input (its l=0 equals the
+        # rotation-invariant source l=0 times the l=0 radial weight).
+        if self.focus_compete and self.n_focus > 1:
+            focus_gate_src = xp.reshape(
+                x_src[:, 0, :] * rad_feat[:, 0, :],
+                (n_edge, self.n_focus, self.so2_focus_dim),
+            )  # (E, F, Cf)
+            alpha = self._focus_alpha(focus_gate_src)
+            x_message = xp.reshape(
+                xp.reshape(
+                    x_message,
+                    (n_edge, self.ebed_dim_full, self.n_focus, self.so2_focus_dim),
+                )
+                * xp.astype(alpha, x_message.dtype)[:, None, :, None],
+                (n_edge, self.ebed_dim_full, self.hidden_channels),
             )
-        if self.radial_degree_mixer is not None:
-            for key, value in self.radial_degree_mixer._variables().items():
-                variables[f"radial_degree_mixer.{key}"] = value
-        # Cross-mode grid products: nest each net's @variables under the pt
-        # state_dict attribute name so ``deserialize(pt.serialize())`` matches.
-        for name, grid in (
-            ("node_wise_grid_product", self.node_wise_grid_product),
-            ("message_node_grid_product", self.message_node_grid_product),
-        ):
-            if grid is not None:
-                for key, value in grid.serialize()["@variables"].items():
-                    variables[f"{name}.{key}"] = value
-        for name, mix in (
-            ("pre_focus_mix", self.pre_focus_mix),
-            ("post_focus_mix", self.post_focus_mix),
-        ):
-            for key, value in mix.serialize()["@variables"].items():
-                variables[f"{name}.{key}"] = value
-        variables["coeff_index_m"] = to_numpy_array(self.coeff_index_m)
-        variables["degree_index_m"] = to_numpy_array(self.degree_index_m)
-        variables["rotate_inv_rescale_full"] = to_numpy_array(
-            self.rotate_inv_rescale_full
+        return x_message, rad_feat
+
+    def _focus_alpha(self, focus_gate_src: Array) -> Array:
+        """
+        Compute per-focus softmax competition weights from l=0 scalars.
+
+        Parameters
+        ----------
+        focus_gate_src : Array
+            Per-edge l=0 scalar features with shape (E, F, Cf).
+
+        Returns
+        -------
+        Array
+            Label-smoothed competition weights with shape (E, F), in the compute
+            dtype.
+        """
+        xp = array_api_compat.array_namespace(focus_gate_src)
+        device = array_api_compat.device(focus_gate_src)
+        focus_logits = xp.sum(
+            self.focus_compete_norm(
+                xp.astype(focus_gate_src, get_xp_precision(xp, self.compute_precision))
+            )
+            * xp.permute_dims(
+                xp_asarray_nodetach(xp, self.adamw_focus_compete_w[...], device=device),
+                (1, 0),
+            )[None, :, :],
+            axis=2,
+        )
+        if self.mlp_bias:
+            focus_logits = (
+                focus_logits
+                + xp_asarray_nodetach(xp, self.focus_compete_bias[...], device=device)[
+                    None, :
+                ]
+            )
+        focus_logits = focus_logits / self.focus_softmax_tau
+        alpha = xp.exp(focus_logits - xp.max(focus_logits, axis=1, keepdims=True))
+        alpha = alpha / xp.sum(alpha, axis=1, keepdims=True)
+        return alpha * (1.0 - self.focus_label_smoothing) + (
+            self.focus_label_smoothing / float(self.n_focus)
         )
-        return variables
 
-    def _load_variables(self, variables: dict[str, Any]) -> None:
-        """Load variables keyed by the pt ``state_dict`` key names."""
-        variables = dict(variables)
-        prec = PRECISION_DICT[self.precision.lower()]
-        cprec = PRECISION_DICT[self.compute_precision.lower()]
-
-        def pop(key: str) -> Any:
-            try:
-                return variables.pop(key)
-            except KeyError:
-                raise KeyError(f"Missing variable: {key}") from None
-
-        def sub_vars(prefix: str) -> dict[str, Any]:
-            full = f"{prefix}."
-            out = {
-                key[len(full) :]: value
-                for key, value in variables.items()
-                if key.startswith(full)
-            }
-            for key in list(variables):
-                if key.startswith(full):
-                    del variables[key]
-            if not out:
-                raise KeyError(f"Missing variables with prefix: {full}")
-            return out
-
-        # Top-level index buffers: validate against the config-derived tables.
-        _check_index_table(self.coeff_index_m, pop("coeff_index_m"), "coeff_index_m")
-        _check_index_table(self.degree_index_m, pop("degree_index_m"), "degree_index_m")
-        _check_shape_assign(
-            self,
-            "rotate_inv_rescale_full",
-            pop("rotate_inv_rescale_full"),
-            prec,
-            "rotate_inv_rescale_full",
+    def _build_so2_mixing(
+        self,
+        *,
+        seed_so2_stack: int | list[int] | None,
+        seed_non_linearities: int | list[int] | None,
+        seed_depth_attn: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        """
+        Build the SO(2) rotation-frame mixing stack.
+
+        Populates the m-major reduced-layout buffers, the multi-layer
+        ``SO2Linear`` stack, its intermediate norms and nonlinearities, the
+        optional depth-wise attention residuals, and the optional per-layer
+        LayerScale. These are the SO(2)-only tensors; they are skipped entirely
+        when ``edge_cartesian`` is True.
+
+        Parameters
+        ----------
+        seed_so2_stack : int | list[int] | None
+            Seed for the ``SO2Linear`` layers.
+        seed_non_linearities : int | list[int] | None
+            Seed for the intermediate nonlinearities.
+        seed_depth_attn : int | list[int] | None
+            Seed for the depth-wise attention residuals.
+        trainable : bool
+            Whether parameters are trainable.
+        """
+        # === Step 1. Precompute coefficient indices for m-major reduced layout ===
+        coeff_index_m = build_m_major_index(self.lmax, self.mmax)
+        degree_index_m = build_m_major_l_index(self.lmax, self.mmax)
+        degree_index_full = map_degree_idx(self.lmax)
+        rotate_inv_rescale_full = build_rotate_inv_rescale(
+            lmax=self.lmax,
+            mmax=self.mmax,
+            degree_index=degree_index_full,
         )
+        self.coeff_index_m = coeff_index_m
+        self.degree_index_m = degree_index_m
+        # Packed (l, m) -> l index, used by the rotation-free radial message to
+        # broadcast each degree's radial weight over its orders.
+        self.degree_index_full = degree_index_full
+        self.rotate_inv_rescale_full = rotate_inv_rescale_full
+        self.reduced_dim = int(coeff_index_m.size)
 
-        for i, so2_linear in enumerate(self.so2_linears):
-            so2_linear._load_variables(sub_vars(f"so2_linears.{i}"))
-        for i, inter_norm in enumerate(self.so2_inter_norms):
-            if inter_norm is not None:
-                sv = sub_vars(f"so2_inter_norms.{i}")
-                _check_index_table(
-                    inter_norm.degree_index_m,
-                    sv["degree_index_m"],
-                    f"so2_inter_norms.{i}.degree_index_m",
-                )
-                for name in ("balance_weight", "adam_scale", "bias0"):
-                    _check_shape_assign(
-                        inter_norm,
-                        name,
-                        sv[name],
-                        cprec,
-                        f"so2_inter_norms.{i}.{name}",
+        # === Step 3. Multiple SO2Linear layers ===
+        self.so2_linears = [
+            SO2Linear(
+                lmax=self.lmax,
+                mmax=self.mmax,
+                in_channels=self.so2_focus_dim,
+                out_channels=(
+                    2 * self.so2_focus_dim
+                    if self.s2_activation and i < self.mixing_layers - 1
+                    else self.so2_focus_dim
+                ),
+                n_focus=self.n_focus,
+                precision=self.precision,
+                mlp_bias=self.mlp_bias,
+                seed=child_seed(seed_so2_stack, i),
+                trainable=trainable,
+            )
+            for i in range(self.mixing_layers)
+        ]
+
+        # === Step 4. Intermediate norms (the last layer always uses Identity) ===
+        inter_norms: list[NativeOP] = []
+        for i in range(self.mixing_layers):
+            if self.so2_norm and i < self.mixing_layers - 1:
+                inter_norms.append(
+                    ReducedEquivariantRMSNorm(
+                        lmax=self.lmax,
+                        mmax=self.mmax,
+                        channels=self.so2_focus_dim,
+                        degree_index_m=self.degree_index_m,
+                        n_focus=self.n_focus,
+                        precision=self.compute_precision,
+                        trainable=trainable,
                     )
-        for i, non_linear in enumerate(self.non_linearities):
-            if non_linear is not None:
-                sv = sub_vars(f"non_linearities.{i}")
-                _check_index_table(
-                    non_linear.expand_index,
-                    sv["expand_index"],
-                    f"non_linearities.{i}.expand_index",
                 )
-                _check_shape_assign(
-                    non_linear.gate_linear,
-                    "weight",
-                    sv["gate_linear.weight"],
-                    cprec,
-                    f"non_linearities.{i}.gate_linear.weight",
+            else:
+                inter_norms.append(Identity())
+        self.so2_inter_norms = inter_norms
+
+        # === Step 5. Intermediate non-linearity (the last layer stays linear) ===
+        non_linearities: list[NativeOP] = []
+        for i in range(self.mixing_layers):
+            if i >= self.mixing_layers - 1:
+                non_linearities.append(Identity())
+            elif self.s2_activation:
+                non_linearities.append(
+                    S2GridNet(
+                        lmax=self.lmax,
+                        mmax=self.mmax,
+                        channels=self.so2_focus_dim,
+                        n_focus=self.n_focus,
+                        mode="self",
+                        op_type="glu",
+                        precision=self.compute_precision,
+                        layout="nfdc",
+                        grid_resolution_list=self.s2_grid_resolution,
+                        coefficient_layout="m_major",
+                        grid_method=self.s2_grid_method,
+                        mlp_bias=self.mlp_bias,
+                        trainable=trainable,
+                        seed=child_seed(seed_non_linearities, i),
+                    )
                 )
-                if self.mlp_bias:
-                    _check_shape_assign(
-                        non_linear.gate_linear,
-                        "bias",
-                        sv["gate_linear.bias"],
-                        cprec,
-                        f"non_linearities.{i}.gate_linear.bias",
+            else:
+                non_linearities.append(
+                    GatedActivation(
+                        lmax=self.lmax,
+                        mmax=self.mmax,
+                        channels=self.so2_focus_dim,
+                        n_focus=self.n_focus,
+                        precision=self.compute_precision,
+                        activation_function=self.activation_function,
+                        mlp_bias=self.mlp_bias,
+                        layout="nfdc",
+                        trainable=trainable,
+                        seed=child_seed(seed_non_linearities, i),
                     )
-        if self.n_atten_head > 0:
-            for name in (
-                "adamw_attn_logit_w",
-                "adamw_attn_z_bias_raw",
-                "adamw_attn_gate_w",
-            ):
-                _check_shape_assign(self, name, pop(name), cprec, name)
-            _check_shape_assign(
-                self.attn_qk_norm,
-                "adam_scale",
-                pop("attn_qk_norm.adam_scale"),
-                cprec,
-                "attn_qk_norm.adam_scale",
-            )
-            _check_shape_assign(
-                self.attn_q_proj,
-                "weight",
-                pop("attn_q_proj.weight"),
-                cprec,
-                "attn_q_proj.weight",
-            )
-            _check_shape_assign(
-                self.attn_k_proj,
-                "weight",
-                pop("attn_k_proj.weight"),
-                cprec,
-                "attn_k_proj.weight",
-            )
-            _check_shape_assign(
-                self.attn_output_gate_norm,
-                "adam_scale",
-                pop("attn_output_gate_norm.adam_scale"),
-                cprec,
-                "attn_output_gate_norm.adam_scale",
-            )
-        if self.focus_compete_norm is not None:
-            _check_shape_assign(
-                self,
-                "adamw_focus_compete_w",
-                pop("adamw_focus_compete_w"),
-                cprec,
-                "adamw_focus_compete_w",
-            )
-            _check_shape_assign(
-                self.focus_compete_norm,
-                "adam_scale",
-                pop("focus_compete_norm.adam_scale"),
-                cprec,
-                "focus_compete_norm.adam_scale",
-            )
-            if self.mlp_bias:
-                _check_shape_assign(
-                    self,
-                    "focus_compete_bias",
-                    pop("focus_compete_bias"),
-                    cprec,
-                    "focus_compete_bias",
                 )
-        if self.radial_hidden_proj is not None:
-            _check_shape_assign(
-                self.radial_hidden_proj,
-                "weight",
-                pop("radial_hidden_proj.weight"),
-                prec,
-                "radial_hidden_proj.weight",
-            )
-        if self.radial_degree_mixer is not None:
-            self.radial_degree_mixer._load_variables(sub_vars("radial_degree_mixer"))
-
-        # Grid products have no ``_load_variables``; reuse their config (from a
-        # fresh ``serialize()``) plus the loaded @variables and re-deserialize
-        # in place. This exercises the full grid-net serialize round-trip.
-        def _grid_product_vars(prefix: str, template: dict) -> dict:
-            # Reject schema drift: the loaded @variables keys must exactly match
-            # the fresh template's, so an unexpected ``<prefix>.*`` key fails the
-            # conversion loudly instead of being silently dropped.
-            loaded = sub_vars(prefix)
-            expected = set(template["@variables"])
-            if set(loaded) != expected:
-                raise ValueError(
-                    f"{prefix} @variables keys {sorted(loaded)} do not match "
-                    f"the expected keys {sorted(expected)}"
+        self.non_linearities = non_linearities
+
+        # === Step 6. Optional depth-wise attention residuals across SO(2) layers ===
+        if self.use_so2_attn_res:
+            self.so2_layer_attn_res: list[DepthAttnRes] | None = [
+                DepthAttnRes(
+                    channels=self.hidden_channels,
+                    input_dependent=self.so2_attn_res_mode == "dependent",
+                    eps=self.eps,
+                    bias=self.mlp_bias,
+                    precision=self.compute_precision,
+                    trainable=trainable,
+                    seed=child_seed(seed_depth_attn, i),
                 )
-            return loaded
+                for i in range(self.mixing_layers)
+            ]
+        else:
+            self.so2_layer_attn_res = None
 
+        # === Step 7. Optional per-layer LayerScale for SO(2) residual branches ===
+        if self.layer_scale:
+            self.adam_so2_layer_scales = [
+                np.ones(
+                    (self.n_focus, self.so2_focus_dim),
+                    dtype=PRECISION_DICT[self.precision.lower()],
+                )
+                * 1e-3
+                for _ in range(self.mixing_layers)
+            ]
+        else:
+            self.adam_so2_layer_scales = None
+
+    def _sub_modules(self) -> list[tuple[str, NativeOP]]:
+        """Single equivariant sub-modules keyed by their pt ``state_dict`` prefixes."""
+        modules: list[tuple[str, NativeOP]] = []
+        if self.edge_cartesian:
+            modules.append(("edge_cartesian_tp", self.edge_cartesian_tp))
+        if self.node_cartesian_tp is not None:
+            modules.append(("node_cartesian_tp", self.node_cartesian_tp))
+        if self.attn_qk_norm is not None:
+            modules.append(("attn_qk_norm", self.attn_qk_norm))
+            modules.append(("attn_q_proj", self.attn_q_proj))
+            modules.append(("attn_k_proj", self.attn_k_proj))
+            if self.attn_focus_mix is not None:
+                modules.append(("attn_focus_mix", self.attn_focus_mix))
+            if self.attn_v_proj is not None:
+                modules.append(("attn_v_proj", self.attn_v_proj))
+            if self.attn_o_proj is not None:
+                modules.append(("attn_o_proj", self.attn_o_proj))
+            modules.append(("attn_output_gate_norm", self.attn_output_gate_norm))
+        if self.focus_compete_norm is not None:
+            modules.append(("focus_compete_norm", self.focus_compete_norm))
+        if self.radial_hidden_proj is not None:
+            modules.append(("radial_hidden_proj", self.radial_hidden_proj))
+        if self.radial_degree_mixer is not None:
+            modules.append(("radial_degree_mixer", self.radial_degree_mixer))
         if self.node_wise_grid_product is not None:
-            template = self.node_wise_grid_product.serialize()
-            template["@variables"] = _grid_product_vars(
-                "node_wise_grid_product", template
-            )
-            self.node_wise_grid_product = type(self.node_wise_grid_product).deserialize(
-                template
-            )
+            modules.append(("node_wise_grid_product", self.node_wise_grid_product))
         if self.message_node_grid_product is not None:
-            template = self.message_node_grid_product.serialize()
-            template["@variables"] = _grid_product_vars(
-                "message_node_grid_product", template
+            modules.append(
+                ("message_node_grid_product", self.message_node_grid_product)
             )
-            self.message_node_grid_product = type(
-                self.message_node_grid_product
-            ).deserialize(template)
-        for name, mix in (
-            ("pre_focus_mix", self.pre_focus_mix),
-            ("post_focus_mix", self.post_focus_mix),
+        modules.append(("pre_focus_mix", self.pre_focus_mix))
+        modules.append(("post_focus_mix", self.post_focus_mix))
+        return modules
+
+    def _variables(self) -> dict[str, Any]:
+        """Variables keyed by the pt ``state_dict`` key names."""
+        variables: dict[str, Any] = {}
+        # === Single equivariant sub-modules ===
+        for prefix, sub in self._sub_modules():
+            for key, value in sub.serialize().get("@variables", {}).items():
+                variables[f"{prefix}.{key}"] = value
+        # === SO(2) mixing stack (absent under the Cartesian edge core) ===
+        if not self.edge_cartesian:
+            for attr in (
+                "so2_linears",
+                "so2_inter_norms",
+                "non_linearities",
+                "so2_layer_attn_res",
+            ):
+                sub_list = getattr(self, attr)
+                if sub_list is None:
+                    continue
+                for i, sub in enumerate(sub_list):
+                    for key, value in sub.serialize().get("@variables", {}).items():
+                        variables[f"{attr}.{i}.{key}"] = value
+            if self.adam_so2_layer_scales is not None:
+                for i, value in enumerate(self.adam_so2_layer_scales):
+                    variables[f"adam_so2_layer_scales.{i}"] = to_numpy_array(value)
+        # === Raw attention and cross-focus competition parameters ===
+        for name in (
+            "adamw_attn_logit_w",
+            "adamw_attn_z_bias_raw",
+            "adamw_attn_gate_w",
+            "adamw_focus_compete_w",
+            "focus_compete_bias",
         ):
-            sv = sub_vars(name)
-            _check_index_table(
-                mix.expand_index, sv["expand_index"], f"{name}.expand_index"
-            )
-            _check_shape_assign(mix, "weight", sv["weight"], prec, f"{name}.weight")
-            if self.mlp_bias:
-                _check_shape_assign(mix, "bias", sv["bias"], prec, f"{name}.bias")
+            value = getattr(self, name)
+            if value is not None:
+                variables[name] = to_numpy_array(value)
+        return variables
 
-        if variables:
-            raise KeyError(f"Unknown variables: {sorted(variables)}")
+    def _load_variables(self, variables: dict[str, Any]) -> None:
+        """Load variables keyed by the pt ``state_dict`` key names."""
+        compute_prec = PRECISION_DICT[self.compute_precision]
+        prec = PRECISION_DICT[self.precision.lower()]
+        # === Single equivariant sub-modules ===
+        for attr, sub in self._sub_modules():
+            prefix = f"{attr}."
+            sub_variables = {
+                key[len(prefix) :]: value
+                for key, value in variables.items()
+                if key.startswith(prefix)
+            }
+            data = sub.serialize()
+            data["@variables"] = sub_variables
+            setattr(self, attr, type(sub).deserialize(data))
+        # === SO(2) mixing stack (absent under the Cartesian edge core) ===
+        if not self.edge_cartesian:
+            for attr in (
+                "so2_linears",
+                "so2_inter_norms",
+                "non_linearities",
+                "so2_layer_attn_res",
+            ):
+                sub_list = getattr(self, attr)
+                if sub_list is None:
+                    continue
+                for i, sub in enumerate(sub_list):
+                    prefix = f"{attr}.{i}."
+                    sub_variables = {
+                        key[len(prefix) :]: value
+                        for key, value in variables.items()
+                        if key.startswith(prefix)
+                    }
+                    data = sub.serialize()
+                    data["@variables"] = sub_variables
+                    sub_list[i] = type(sub).deserialize(data)
+            if self.adam_so2_layer_scales is not None:
+                # Rebuild the per-layer scales locally and assign the list once.
+                # Under pt_expt ``adam_so2_layer_scales`` is a ParameterList whose
+                # elements reject direct numpy assignment; reassigning the whole
+                # list lets the backend rebuild the container cleanly while the
+                # numeric values stay identical.
+                self.adam_so2_layer_scales = [
+                    np.asarray(variables[f"adam_so2_layer_scales.{i}"], dtype=prec)
+                    for i in range(len(self.adam_so2_layer_scales))
+                ]
+        # === Raw attention and cross-focus competition parameters ===
+        for name in (
+            "adamw_attn_logit_w",
+            "adamw_attn_z_bias_raw",
+            "adamw_attn_gate_w",
+            "adamw_focus_compete_w",
+            "focus_compete_bias",
+        ):
+            if name in variables:
+                setattr(self, name, np.asarray(variables[name], dtype=compute_prec))
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the SO2Convolution to a dict (pt-compatible format)."""
+        """Serialize the SO2Convolution to a dict."""
         return {
             "@class": "SO2Convolution",
             "@version": 1,
@@ -1886,7 +2475,7 @@ def serialize(self) -> dict[str, Any]:
                 "focus_dim": self.focus_dim,
                 "focus_compete": self.focus_compete,
                 "so2_norm": self.so2_norm,
-                "so2_layers": self.so2_layers,
+                "mixing_layers": self.mixing_layers,
                 "so2_attn_res": self.so2_attn_res_mode,
                 "layer_scale": self.layer_scale,
                 "n_atten_head": self.n_atten_head,
@@ -1907,6 +2496,8 @@ def serialize(self) -> dict[str, Any]:
                 "mlp_bias": self.mlp_bias,
                 "radial_so2_mode": self.radial_so2_mode,
                 "radial_so2_rank": self.radial_so2_rank,
+                "edge_cartesian": self.edge_cartesian,
+                "node_cartesian": self.node_cartesian,
                 "eps": self.eps,
                 "precision": np.dtype(PRECISION_DICT[self.precision]).name,
                 "trainable": self.trainable,
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/so3.py b/deepmd/dpmodel/descriptor/dpa4_nn/so3.py
index 6a4b36aa4d..8ca2dbc855 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/so3.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/so3.py
@@ -2,24 +2,11 @@
 """
 SO(3)-equivariant linear layers for DPA4/SeZM.
 
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.so3``.
-It defines the channel-only and focus-aware linear maps used by the DPA4
-SO(3) feature transformations. All three pt classes are ported:
-``FocusLinear`` (used by ``so2``, ``grid_net``, ``activation``),
-``ChannelLinear`` (used by ``so2``, ``grid_net``), and ``SO3Linear``
-(used by ``so2``, ``ffn``).
-
-Serialization contract: ``SO3Linear`` mirrors the pt ``serialize()`` format
-exactly (same config and ``@variables`` keys), so pt ``serialize()`` output
-deserializes directly. The pt ``FocusLinear`` and ``ChannelLinear`` define no
-``serialize()`` (they only appear nested inside larger modules' state_dicts);
-their dpmodel ``serialize()``/``deserialize()`` use ``@variables`` keys equal
-to the pt ``state_dict`` key names (``weight``, ``bias``) so that pt
-state-dict fragments load directly.
-
-Weight initialization is distribution-equivalent to the pt version (drawn
-from ``np.random.default_rng`` instead of the torch generator stream), the
-same convention as ``utils.init_trunc_normal_fan_in_out``.
+This module defines the channel-only and focus-aware linear maps used by SeZM
+SO(3) feature transformations.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.so3``.
 """
 
 from __future__ import (
@@ -74,21 +61,21 @@ class FocusLinear(NativeOP):
 
     Parameters
     ----------
-    in_channels : int
+    in_channels
         Input feature dimension.
-    out_channels : int
+    out_channels
         Output feature dimension.
-    n_focus : int
+    n_focus
         Number of focus streams.
-    precision : str
+    precision
         Parameter precision.
-    bias : bool
+    bias
         Whether to use bias.
-    trainable : bool
+    trainable
         Whether parameters are trainable.
-    seed : int | list[int] | None
+    seed
         Random seed for initialization.
-    init_std : float | None
+    init_std
         If given, use normal(0, init_std) instead of default uniform init.
         Useful for gate projections where small initial logits are desired.
     """
@@ -129,11 +116,9 @@ def __init__(
 
     def call(self, x: Any) -> Any:
         """
-        Apply the per-focus linear projection.
-
         Parameters
         ----------
-        x : Array
+        x
             Input array with shape (B, F, Cin).
 
         Returns
@@ -159,11 +144,7 @@ def call(self, x: Any) -> Any:
         return out
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the FocusLinear to a dict.
-
-        The pt ``FocusLinear`` has no ``serialize()``; the ``@variables`` keys
-        here match the pt ``state_dict`` key names (``weight``, ``bias``).
-        """
+        """Serialize the FocusLinear to a dict."""
         variables = {"weight": to_numpy_array(self.weight)}
         if self.use_bias:
             variables["bias"] = to_numpy_array(self.bias)
@@ -203,15 +184,9 @@ def deserialize(cls, data: dict[str, Any]) -> FocusLinear:
             seed=config.get("seed"),
         )
         prec = PRECISION_DICT[obj.precision.lower()]
-        weight = np.asarray(variables["weight"], dtype=prec)
-        if weight.shape != obj.weight.shape:
-            raise ValueError(
-                f"weight shape {weight.shape} does not match "
-                f"the expected shape {obj.weight.shape}"
-            )
-        obj.weight = weight
+        obj.weight = np.asarray(variables["weight"], dtype=prec)
         if obj.use_bias:
-            obj.bias = np.asarray(variables["bias"], dtype=prec).reshape(obj.bias.shape)
+            obj.bias = np.asarray(variables["bias"], dtype=prec)
         return obj
 
 
@@ -228,19 +203,19 @@ class ChannelLinear(NativeOP):
 
     Parameters
     ----------
-    in_channels : int
+    in_channels
         Input feature dimension.
-    out_channels : int
+    out_channels
         Output feature dimension.
-    precision : str
+    precision
         Parameter precision.
-    bias : bool
+    bias
         Whether to use bias.
-    trainable : bool
+    trainable
         Whether parameters are trainable.
-    seed : int | list[int] | None
+    seed
         Random seed for initialization.
-    init_std : float | None
+    init_std
         If given, use normal(0, init_std) instead of default uniform init.
         Useful for gate projections where small initial logits are desired.
     """
@@ -277,11 +252,9 @@ def __init__(
 
     def call(self, x: Any) -> Any:
         """
-        Apply the channel-only linear projection.
-
         Parameters
         ----------
-        x : Array
+        x
             Input array with shape ``(..., C_in)``.
 
         Returns
@@ -298,11 +271,7 @@ def call(self, x: Any) -> Any:
         return out
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the ChannelLinear to a dict.
-
-        The pt ``ChannelLinear`` has no ``serialize()``; the ``@variables``
-        keys here match the pt ``state_dict`` key names (``weight``, ``bias``).
-        """
+        """Serialize the ChannelLinear to a dict."""
         variables = {"weight": to_numpy_array(self.weight)}
         if self.use_bias:
             variables["bias"] = to_numpy_array(self.bias)
@@ -340,15 +309,9 @@ def deserialize(cls, data: dict[str, Any]) -> ChannelLinear:
             seed=config.get("seed"),
         )
         prec = PRECISION_DICT[obj.precision.lower()]
-        weight = np.asarray(variables["weight"], dtype=prec)
-        if weight.shape != obj.weight.shape:
-            raise ValueError(
-                f"weight shape {weight.shape} does not match "
-                f"the expected shape {obj.weight.shape}"
-            )
-        obj.weight = weight
+        obj.weight = np.asarray(variables["weight"], dtype=prec)
         if obj.use_bias:
-            obj.bias = np.asarray(variables["bias"], dtype=prec).reshape(obj.bias.shape)
+            obj.bias = np.asarray(variables["bias"], dtype=prec)
         return obj
 
 
@@ -356,8 +319,9 @@ class SO3Linear(NativeOP):
     """
     Focus-aware degree-wise linear self-interaction.
 
-    The key insight is that weights are shared across all ``m`` components
-    within each ``l`` block.
+    This vectorized implementation avoids Python loops by using ``torch.einsum``
+    and ``index_select``. The key insight is that weights are shared across all
+    ``m`` components within each ``l`` block.
 
     Notes
     -----
@@ -365,28 +329,29 @@ class SO3Linear(NativeOP):
     - Bias storage: ``(F*C_out,)``, only applied to ``l=0`` scalar components.
     - Runtime view restores weights to ``(lmax+1, C_in, F, C_out)`` via reshape.
     - ``expand_index`` maps each packed ``(l,m)`` position to its ``l`` value.
-    - The pt einsum ``ndfi,difo->ndfo`` is expressed as a broadcast batched
-      matmul, which keeps the whole multi-focus path vectorized.
+    - Einsum ``ndfi,difo->ndfo`` keeps the whole multi-focus path vectorized.
+    - In HybridMuon slice mode, each ``(C_in, F*C_out)`` slice gets independent
+      NS update with stable rectangular scaling.
 
     Parameters
     ----------
-    lmax : int
+    lmax
         Maximum spherical harmonic degree.
-    in_channels : int
+    in_channels
         Number of input channels per (l, m) coefficient.
-    out_channels : int
+    out_channels
         Number of output channels per (l, m) coefficient.
-    n_focus : int
+    n_focus
         Number of focus streams.
-    precision : str
+    precision
         Parameter precision.
-    mlp_bias : bool
+    mlp_bias
         Whether to use bias for l=0 (scalar) components.
-    trainable : bool
+    trainable
         Whether parameters are trainable.
-    seed : int | list[int] | None
+    seed
         Random seed for weight initialization.
-    init_std : float | None
+    init_std
         If given, use normal(0, init_std) for all weights instead of default
         trunc-normal fan-in/fan-out init. Use 0.0 for zero initialization.
     """
@@ -448,11 +413,9 @@ def __init__(
 
     def call(self, x: Any) -> Any:
         """
-        Apply the degree-wise linear self-interaction.
-
         Parameters
         ----------
-        x : Array
+        x
             Input features with shape (N, D, F, C_in) where D=(lmax+1)^2.
 
         Returns
@@ -473,7 +436,7 @@ def call(self, x: Any) -> Any:
         expand_index = xp_asarray_nodetach(
             xp, self.expand_index, device=array_api_compat.device(x)
         )
-        weight_expanded = xp.take(weight, expand_index, axis=0)
+        weight_expanded = xp.take(weight, expand_index, axis=0)  # (D, Cin, F, Cout)
 
         # === Step 2. Per-focus, per-degree channel mixing ===
         # einsum "ndfi,difo->ndfo" as a broadcast batched matmul:
@@ -489,13 +452,14 @@ def call(self, x: Any) -> Any:
                 xp, self.bias[...], device=array_api_compat.device(x)
             )
             bias = xp.reshape(bias, (self.n_focus, self.out_channels))
-            out0 = out[:, :1, :, :] + bias[None, None, ...]
-            out = xp.concat([out0, out[:, 1:, :, :]], axis=1) if self.lmax > 0 else out0
+            out = xp.concat(
+                [out[:, :1, :, :] + bias[None, None, ...], out[:, 1:, :, :]], axis=1
+            )
 
         return out
 
     def serialize(self) -> dict[str, Any]:
-        """Serialize the SO3Linear to a dict (pt-compatible format)."""
+        """Serialize the SO3Linear to a dict."""
         variables = {"weight": to_numpy_array(self.weight)}
         if self.mlp_bias:
             variables["bias"] = to_numpy_array(self.bias)
@@ -538,16 +502,8 @@ def deserialize(cls, data: dict[str, Any]) -> SO3Linear:
             seed=config.get("seed"),
         )
         prec = PRECISION_DICT[obj.precision.lower()]
-        expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
-        if not np.array_equal(expand_index, to_numpy_array(obj.expand_index)):
-            raise ValueError("expand_index does not match the lmax-derived table")
-        weight = np.asarray(variables["weight"], dtype=prec)
-        if weight.shape != obj.weight.shape:
-            raise ValueError(
-                f"weight shape {weight.shape} does not match "
-                f"the expected shape {obj.weight.shape}"
-            )
-        obj.weight = weight
+        obj.expand_index = np.asarray(variables["expand_index"], dtype=np.int64)
+        obj.weight = np.asarray(variables["weight"], dtype=prec)
         if obj.mlp_bias:
-            obj.bias = np.asarray(variables["bias"], dtype=prec).reshape(obj.bias.shape)
+            obj.bias = np.asarray(variables["bias"], dtype=prec)
         return obj
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/utils.py b/deepmd/dpmodel/descriptor/dpa4_nn/utils.py
index 5a016f6e94..04d3a20de7 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/utils.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/utils.py
@@ -2,25 +2,11 @@
 """
 Utility helpers for the DPA4/SeZM descriptor package.
 
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.utils``.
-It provides the small numeric helpers shared across the DPA4 descriptor
-implementation.
+This module provides small numerical helpers and dtype conversion utilities
+shared across the SeZM descriptor implementation.
 
-Init-time helpers (``init_trunc_normal_fan_in_out``) operate on static numpy
-data and are plain numpy by design (not array-API). ``safe_norm`` operates on
-runtime tensors and is array-API compatible.
-
-Helpers from the pt version intentionally NOT ported:
-
-- ``nvtx_range``: CUDA profiling, torch-only.
-- ``use_triton_infer``: Triton inference kernels, torch-only.
-- ``safe_numpy_to_tensor``: numpy -> torch conversion glue; dpmodel code uses
-  ``xp.asarray`` directly.
-- ``np_safe``: torch -> numpy conversion glue; dpmodel code uses
-  ``deepmd.dpmodel.common.to_numpy_array`` instead.
-
-``get_promoted_dtype`` IS ported (numpy equivalent) because core modules use it
-to pick a stable computation/storage dtype.
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.utils``.
 """
 
 from __future__ import (
@@ -48,14 +34,10 @@ def init_trunc_normal_fan_in_out(
     Uses Xavier-like variance scaling: std = scale / sqrt(fan_in + fan_out).
     Truncation at +/-3*std prevents extreme outliers.
 
-    NumPy equivalent of the pt version: the weight is filled in place from a
-    ``np.random.default_rng(seed)`` stream (distribution-equivalent to the
-    torch version, not RNG-stream-identical).
-
     Parameters
     ----------
     weight : np.ndarray
-        Weight array with shape (out_features, in_features), modified in place.
+        Weight array with shape (out_features, in_features).
     seed : int | list[int] | None
         Random seed for reproducibility.
     scale : float, default=1.0
@@ -68,7 +50,7 @@ def init_trunc_normal_fan_in_out(
     fan_out, fan_in = weight.shape
     std = float(scale) / math.sqrt(fan_in + fan_out)
     rng = np.random.default_rng(seed)
-    # rejection sampling: exact truncated normal on [-3*std, 3*std]
+    # Rejection sampling reproduces the truncated normal on [-3*std, 3*std].
     values = rng.normal(0.0, std, size=weight.shape)
     out_of_bounds = np.abs(values) > 3.0 * std
     while out_of_bounds.any():
@@ -83,8 +65,7 @@ def safe_norm(x: Any, eps: float = 1e-7) -> Any:
     """
     Compute vector norm with smooth epsilon regularization.
 
-    Uses float32 for computation when input is fp16/bf16. This function
-    operates on runtime tensors and is array-API compatible.
+    Uses float32 for computation when input is fp16/bf16.
 
     Parameters
     ----------
@@ -100,11 +81,12 @@ def safe_norm(x: Any, eps: float = 1e-7) -> Any:
     """
     xp = array_api_compat.array_namespace(x)
     in_dtype = x.dtype
-    # matches "float16" and "bfloat16" dtype names across namespaces
+    # ``str(dtype)`` matches both "float16" and "bfloat16" across namespaces.
     promote = "float16" in str(in_dtype)
     if promote:
         x = xp.astype(x, xp.float32)
-    norm = xp.sqrt(xp.sum(x * x, axis=-1, keepdims=True) + float(eps) * float(eps))
+    eps_sq = float(eps) * float(eps)
+    norm = xp.sqrt(xp.sum(x * x, axis=-1, keepdims=True) + eps_sq)
     if promote:
         norm = xp.astype(norm, in_dtype)
     return norm
@@ -116,9 +98,6 @@ def get_promoted_dtype(dtype: Any) -> Any:
 
     For bf16/fp16, use float32 to ensure numerical stability
     in computation and storage compatibility.
-
-    NumPy equivalent of the pt version; accepts a numpy dtype (including
-    ``ml_dtypes.bfloat16``) and returns a numpy dtype.
     """
     name = getattr(dtype, "name", None) or str(dtype)
     if "float16" in name:  # matches float16 and bfloat16
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/wignerd.py b/deepmd/dpmodel/descriptor/dpa4_nn/wignerd.py
index c63f4b3eb6..42f99e2275 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/wignerd.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/wignerd.py
@@ -1,33 +1,12 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 """
-Quaternion-based Wigner-D and edge-frame utilities for the DPA4/SeZM descriptor.
-
-This module is the dpmodel port of ``deepmd.pt.model.descriptor.sezm_nn.wignerd``.
-It defines the quaternion helpers and the Wigner-D evaluator used to construct
-edge-aligned SO(3) rotation blocks.
-
-Port notes
-----------
-- The pt reference evaluates the ``l=2..10`` blocks with monomial kernels whose
-  coefficients are solved at init time by ``torch.linalg.lstsq`` against the
-  generic closed-form quaternion polynomial path (seeded ``torch.randn`` fit
-  points). That fit is a performance optimization and is not bit-reproducible
-  without torch. The dpmodel port instead evaluates the generic closed-form
-  path (the very reference the pt kernels are fitted to) for every ``l >= 2``.
-  Outputs agree with pt within the fp64 round-off of the pt fit (validated by
-  the parity tests at ``rtol=1e-12, atol=1e-14``).
-- All coefficient tables are plain numpy arrays computed at ``__init__`` time;
-  ``call`` is pure array-API. The block-diagonal matrices are assembled
-  functionally with a precomputed ``xp.take`` gather index (no ``__setitem__``
-  on traced values), so the path is safe for later torch.export
-  functionalization.
-- Random-gamma gauge randomization is NOT part of this module in pt either:
-  it lives in ``edge_cache`` and only consumes the deterministic helpers
-  ``quaternion_z_rotation`` / ``quaternion_multiply`` ported here.
-
-Serialization contract: pt ``WignerDCalculator.serialize()`` emits only
-``{"@class", "@version"}`` (all buffers are derived constants rebuilt from
-``lmax``/``dtype`` by the parent). The dpmodel port mirrors that contract.
+Quaternion-based Wigner-D and edge-frame utilities for DPA4/SeZM.
+
+This module defines the quaternion helpers and Wigner-D evaluator used to
+construct edge-aligned SO(3) rotation blocks in SeZM.
+
+This module is the dpmodel (array-API) port of
+``deepmd.pt.model.descriptor.sezm_nn.wignerd``.
 """
 
 from __future__ import (
@@ -35,8 +14,12 @@
 )
 
 import math
+from itertools import (
+    permutations,
+)
 from typing import (
     Any,
+    ClassVar,
 )
 
 import array_api_compat
@@ -48,6 +31,7 @@
 )
 from deepmd.dpmodel.array_api import (
     xp_asarray_nodetach,
+    xp_take_along_axis,
 )
 from deepmd.dpmodel.common import (
     get_xp_precision,
@@ -56,15 +40,163 @@
     check_version_compatibility,
 )
 
-from .utils import (
-    safe_norm,
-)
+
+class CaseCoefficients:
+    """
+    Polynomial tables for one magnitude-ordered branch of the quaternion Wigner path.
+
+    The generic Wigner-D evaluation factors each matrix element into:
+    - a phase term carried by the arguments of ``Ra`` and ``Rb``;
+    - a real magnitude term evaluated by Horner recursion.
+
+    The magnitude formula has two numerically stable branches, depending on whether
+    ``|Ra| >= |Rb|`` or the opposite. Each branch stores the branch-specific Horner
+    coefficients and the powers of ``|Ra|`` / ``|Rb|`` that sit outside the Horner
+    polynomial.
+    """
+
+    def __init__(
+        self,
+        *,
+        coeff: np.ndarray,
+        horner: np.ndarray,
+        poly_len: np.ndarray,
+        ra_exp: np.ndarray,
+        rb_exp: np.ndarray,
+        sign: np.ndarray,
+    ) -> None:
+        self.coeff = coeff
+        self.horner = horner
+        self.poly_len = poly_len
+        self.ra_exp = ra_exp
+        self.rb_exp = rb_exp
+        self.sign = sign
+
+
+class WignerPolynomialCoefficients:
+    """
+    Precomputed coefficient tables for the generic quaternion Wigner evaluator.
+
+    Only one half of each real block is stored explicitly. The remaining entries are
+    reconstructed from the exact symmetry
+
+    ``D^l_{-m',-m} = (-1)^(m' - m) * conj(D^l_{m',m})``.
+
+    This keeps the runtime path branch-free with respect to ``(l, m', m)`` while
+    preserving the exact packed ``(l, m)`` layout used everywhere else in SeZM.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmin: int,
+        lmax: int,
+        size: int,
+        max_poly_len: int,
+        n_primary: int,
+        n_derived: int,
+        primary_row: np.ndarray,
+        primary_col: np.ndarray,
+        case1: CaseCoefficients,
+        case2: CaseCoefficients,
+        mp_plus_m: np.ndarray,
+        m_minus_mp: np.ndarray,
+        diagonal_mask: np.ndarray,
+        anti_diagonal_mask: np.ndarray,
+        special_2m: np.ndarray,
+        anti_diag_sign: np.ndarray,
+        derived_row: np.ndarray,
+        derived_col: np.ndarray,
+        derived_primary_idx: np.ndarray,
+        derived_sign: np.ndarray,
+    ) -> None:
+        self.lmin = int(lmin)
+        self.lmax = int(lmax)
+        self.size = int(size)
+        self.max_poly_len = int(max_poly_len)
+        self.n_primary = int(n_primary)
+        self.n_derived = int(n_derived)
+
+        self.primary_row = primary_row
+        self.primary_col = primary_col
+        self.case1 = case1
+        self.case2 = case2
+        self.mp_plus_m = mp_plus_m
+        self.m_minus_mp = m_minus_mp
+        self.diagonal_mask = diagonal_mask
+        self.anti_diagonal_mask = anti_diagonal_mask
+        self.special_2m = special_2m
+        self.anti_diag_sign = anti_diag_sign
+        self.derived_row = derived_row
+        self.derived_col = derived_col
+        self.derived_primary_idx = derived_primary_idx
+        self.derived_sign = derived_sign
+
+
+class WignerSmallOrderCoefficients:
+    """
+    Precomputed low-order quaternion polynomial kernels in the SeZM packed basis.
+
+    Only kernels required by the owning ``WignerDCalculator`` are registered:
+
+    - ``C_l2`` stores the degree-4 tensor-contraction coefficients.
+    - ``C_l3`` .. ``C_l10`` store flattened monomial coefficient matrices.
+    - ``C_combined_l3l4`` lifts the ``l=3`` basis to degree 8 and stacks it with
+      ``l=4`` so both blocks can be produced by one matrix multiply.
+    - ``C_combined_l5l6`` applies the same degree-12 stacking for ``l=5,6``.
+    - ``C_combined_l7l8`` applies the same degree-16 stacking for ``l=7,8``.
+    - ``C_combined_l9l10`` applies the same degree-20 stacking for ``l=9,10``.
+    - ``exp_l3`` .. ``exp_l10`` store the monomial exponent tables used by the
+      runtime gather/prod path.
+    """
+
+    _EXTRA_KERNELS_BY_LMAX: ClassVar[tuple[tuple[int, tuple[str, ...]], ...]] = (
+        (3, ("C_l3", "exp_l3")),
+        (4, ("C_l4", "C_combined_l3l4", "exp_l4")),
+        (5, ("C_l5", "exp_l5")),
+        (6, ("C_l6", "C_combined_l5l6", "exp_l6")),
+        (7, ("C_l7", "exp_l7")),
+        (8, ("C_l8", "C_combined_l7l8", "exp_l8")),
+        (9, ("C_l9", "exp_l9")),
+        (10, ("C_l10", "C_combined_l9l10", "exp_l10")),
+    )
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        kernels: dict[str, np.ndarray],
+    ) -> None:
+        for name in self.required_kernel_names(lmax):
+            setattr(self, name, kernels[name])
+
+    @classmethod
+    def required_kernel_names(cls, lmax: int) -> tuple[str, ...]:
+        """Return low-order kernel names required for ``lmax``."""
+        names = ["C_l2"]
+        for threshold, extra_names in cls._EXTRA_KERNELS_BY_LMAX:
+            if lmax >= threshold:
+                names.extend(extra_names)
+        return tuple(names)
+
+
+def _safe_norm_nd(x: Any, eps: float = 1e-7) -> Any:
+    """Compute an ``L2`` norm with smooth epsilon regularization."""
+    xp = array_api_compat.array_namespace(x)
+    in_dtype = x.dtype
+    # ``str(dtype)`` matches both "float16" and "bfloat16" across namespaces.
+    promote = "float16" in str(in_dtype)
+    if promote:
+        x = xp.astype(x, xp.float32)
+    norm = xp.sqrt(xp.sum(x * x, axis=-1, keepdims=True) + eps * eps)
+    if promote:
+        norm = xp.astype(norm, in_dtype)
+    return norm
 
 
 def quaternion_normalize(q: Any, eps: float = 1e-7) -> Any:
     """Normalize quaternions with a differentiable epsilon floor."""
-    # safe_norm is the array-API port of pt's _safe_norm_nd (same formula)
-    return q / safe_norm(q, eps)
+    return q / _safe_norm_nd(q, eps)
 
 
 def quaternion_multiply(q1: Any, q2: Any) -> Any:
@@ -87,9 +219,9 @@ def quaternion_to_rotation_matrix(q: Any) -> Any:
     """
     Convert unit quaternions to 3x3 rotation matrices.
 
-    The returned matrix is the active rotation represented by ``q``. In SeZM
-    this is the global->local edge rotation, so multiplying the edge direction
-    by this matrix sends it to local ``+Z``.
+    The returned matrix is the active rotation represented by ``q``. In SeZM this is
+    the global->local edge rotation, so multiplying the edge direction by this matrix
+    sends it to local ``+Z``.
     """
     xp = array_api_compat.array_namespace(q)
     w, x, y, z = q[..., 0], q[..., 1], q[..., 2], q[..., 3]
@@ -148,12 +280,9 @@ def _smooth_step_cinf(x: Any) -> Any:
     """
     Smooth ``C^inf`` step on ``[0, 1]``.
 
-    This function equals exactly 0 and 1 at the endpoints, and transitions with
-    all derivatives vanishing there. It is used only to blend the two valid
-    quaternion charts; the geometric constraint itself is still enforced by the
-    charts. The interior denominator ``left + right`` is bounded below by
-    ``exp(-2)`` on the clamped domain, so the dead branches of the ``where``
-    never divide by zero (gradient-safe).
+    This function equals exactly 0 and 1 at the endpoints, and transitions with all
+    derivatives vanishing there. It is used only to blend the two valid quaternion
+    charts; the geometric constraint itself is still enforced by the charts.
     """
     xp = array_api_compat.array_namespace(x)
     x_clamped = xp.clip(x, min=0.0, max=1.0)
@@ -178,9 +307,9 @@ def quaternion_nlerp(
     """
     Normalized linear interpolation on the shortest quaternion arc.
 
-    ``q`` and ``-q`` represent the same spatial rotation. Aligning signs before
-    the interpolation guarantees that the blended chart stays on the shorter
-    great-circle segment in ``S^3``.
+    ``q`` and ``-q`` represent the same spatial rotation. Aligning signs before the
+    interpolation guarantees that the blended chart stays on the shorter great-circle
+    segment in ``S^3``.
     """
     xp = array_api_compat.array_namespace(q0, q1, weight)
     dot = xp.sum(q0 * q1, axis=-1, keepdims=True)
@@ -189,7 +318,10 @@ def quaternion_nlerp(
     return quaternion_normalize(blended, eps)
 
 
-def _build_edge_quaternion_chart_pos_z(edge_unit: Any, eps: float) -> Any:
+def _build_edge_quaternion_chart_pos_z(
+    edge_unit: Any,
+    eps: float,
+) -> Any:
     """Quaternion chart that is exact away from the ``-Z`` pole."""
     xp = array_api_compat.array_namespace(edge_unit)
     x = edge_unit[..., 0]
@@ -199,7 +331,10 @@ def _build_edge_quaternion_chart_pos_z(edge_unit: Any, eps: float) -> Any:
     return quaternion_normalize(q, eps)
 
 
-def _build_edge_quaternion_chart_neg_z(edge_unit: Any, eps: float) -> Any:
+def _build_edge_quaternion_chart_neg_z(
+    edge_unit: Any,
+    eps: float,
+) -> Any:
     """Quaternion chart that is exact away from the ``+Z`` pole."""
     xp = array_api_compat.array_namespace(edge_unit)
     x = edge_unit[..., 0]
@@ -218,16 +353,16 @@ def build_edge_quaternion(
     """
     Build stable edge quaternions for the SeZM local ``+Z`` convention.
 
-    The returned quaternion represents the global->local edge rotation, so
-    applying its rotation matrix to the unit edge direction yields exactly
-    ``(0, 0, 1)``. Two exact quaternion charts are used:
+    The returned quaternion represents the global->local edge rotation, so applying its
+    rotation matrix to the unit edge direction yields exactly ``(0, 0, 1)``. Two exact
+    quaternion charts are used:
 
     - a ``+Z`` chart that is regular everywhere except the antipodal ``-Z`` pole;
     - a ``-Z`` chart that is regular everywhere except the antipodal ``+Z`` pole.
 
-    Both charts encode the same edge-aligned local frame. A smooth ``C^inf``
-    blend in the overlap region removes the hard pole switch while keeping the
-    represented rotation on the correct quaternion branch.
+    Both charts encode the same edge-aligned local frame. A smooth ``C^inf`` blend in
+    the overlap region removes the hard pole switch while keeping the represented
+    rotation on the correct quaternion branch.
 
     Parameters
     ----------
@@ -246,7 +381,7 @@ def build_edge_quaternion(
     """
     xp = array_api_compat.array_namespace(edge_vec)
     if edge_len is None:
-        edge_len = safe_norm(edge_vec, eps)
+        edge_len = _safe_norm_nd(edge_vec, eps)
     else:
         edge_len = xp.sqrt(edge_len * edge_len + eps * eps)
     edge_unit = edge_vec / edge_len
@@ -256,359 +391,28 @@ def build_edge_quaternion(
     return quaternion_nlerp(q_neg, q_pos, blend, eps=eps)
 
 
-def _factorial_table(n: int) -> np.ndarray:
-    """Return ``[0!, 1!, ..., n!]`` in fp64 (iterative, matching pt bit-exactly)."""
-    table = np.zeros(n + 1, dtype=np.float64)
-    table[0] = 1.0
-    for i in range(1, n + 1):
-        table[i] = table[i - 1] * i
-    return table
-
-
-def _binomial(n: int, k: int, factorial: np.ndarray) -> float:
-    """Evaluate ``C(n, k)`` from a precomputed factorial table."""
-    if k < 0 or k > n:
-        return 0.0
-    return float(factorial[n] / (factorial[k] * factorial[n - k]))
-
-
-class _CaseTables:
-    """
-    Plain numpy tables for one magnitude-ordered branch of the quaternion Wigner path.
-
-    Mirrors pt ``CaseCoefficients`` (init-time constants only).
-    """
-
-    def __init__(self, n_primary: int, max_poly_len: int) -> None:
-        self.coeff = np.zeros(n_primary, dtype=np.float64)
-        self.horner = np.zeros((n_primary, max_poly_len), dtype=np.float64)
-        self.poly_len = np.zeros(n_primary, dtype=np.int64)
-        self.ra_exp = np.zeros(n_primary, dtype=np.float64)
-        self.rb_exp = np.zeros(n_primary, dtype=np.float64)
-        self.sign = np.zeros(n_primary, dtype=np.float64)
-        # filled by _finalize_case_tables
-        self.valid_mask: np.ndarray | None = None
-        self.horner_step_mask: np.ndarray | None = None
-        self.signed_coeff: np.ndarray | None = None
-
-
-def _compute_case_coefficients(
-    case: _CaseTables,
-    idx: int,
-    ell: int,
-    mp: int,
-    m: int,
-    sqrt_factor: float,
-    factorial: np.ndarray,
-    *,
-    is_case1: bool,
-) -> None:
-    """
-    Fill one Horner branch for a fixed ``(ell, mp, m)`` entry.
-
-    The closed-form quaternion Wigner formula is reorganized so that only the
-    ratio ``-(|Rb|/|Ra|)^2`` or ``-(|Ra|/|Rb|)^2`` enters the Horner chain.
-    """
-    if is_case1:
-        rho_min = max(0, mp - m)
-        rho_max = min(ell + mp, ell - m)
-    else:
-        rho_min = max(0, -(mp + m))
-        rho_max = min(ell - m, ell - mp)
-
-    if rho_min > rho_max:
-        return
-
-    if is_case1:
-        binom1 = _binomial(ell + mp, rho_min, factorial)
-        binom2 = _binomial(ell - mp, ell - m - rho_min, factorial)
-    else:
-        binom1 = _binomial(ell + mp, ell - m - rho_min, factorial)
-        binom2 = _binomial(ell - mp, rho_min, factorial)
-    case.coeff[idx] = sqrt_factor * binom1 * binom2
-
-    poly_len = rho_max - rho_min + 1
-    case.poly_len[idx] = poly_len
-    for i, rho in enumerate(range(rho_max, rho_min, -1)):
-        if is_case1:
-            n1 = ell + mp - rho + 1
-            n2 = ell - m - rho + 1
-            d1 = rho
-            d2 = m - mp + rho
-        else:
-            n1 = ell - m - rho + 1
-            n2 = ell - mp - rho + 1
-            d1 = rho
-            d2 = mp + m + rho
-        if d1 != 0 and d2 != 0:
-            case.horner[idx, i] = (n1 * n2) / (d1 * d2)
-
-    if is_case1:
-        case.ra_exp[idx] = 2 * ell + mp - m - 2 * rho_min
-        case.rb_exp[idx] = m - mp + 2 * rho_min
-        case.sign[idx] = (-1) ** rho_min
-    else:
-        case.ra_exp[idx] = mp + m + 2 * rho_min
-        case.rb_exp[idx] = 2 * ell - mp - m - 2 * rho_min
-        case.sign[idx] = ((-1) ** (ell - m)) * ((-1) ** rho_min)
-
-
-def _finalize_case_tables(case: _CaseTables, max_poly_len: int) -> None:
-    """Attach runtime-ready masks and fused coefficients for one Horner branch."""
-    step_count = np.clip(case.poly_len - 1, 0, None)
-    if max_poly_len > 1:
-        horner_step_mask = (
-            np.arange(max_poly_len - 1, dtype=np.int64)[None, :] < step_count[:, None]
-        )
-    else:
-        horner_step_mask = np.zeros((case.poly_len.shape[0], 0), dtype=np.bool_)
-    case.valid_mask = case.poly_len > 0
-    case.horner_step_mask = horner_step_mask
-    case.signed_coeff = case.sign * case.coeff
-
-
-class _PolyTables:
-    """
-    Precomputed coefficient tables for the generic quaternion Wigner evaluator.
-
-    Mirrors pt ``WignerPolynomialCoefficients`` (init-time numpy constants only).
-    Only one half of each real block is stored explicitly. The remaining
-    entries are reconstructed from the exact symmetry
-    ``D^l_{-m',-m} = (-1)^(m' - m) * conj(D^l_{m',m})``.
-    """
-
-    def __init__(self, lmin: int, lmax: int) -> None:
-        if lmin < 0:
-            raise ValueError("`lmin` must be non-negative")
-        if lmax < lmin:
-            raise ValueError("`lmax` must be >= `lmin`")
-
-        factorial = _factorial_table(2 * lmax + 1)
-        n_total = sum((2 * ell + 1) ** 2 for ell in range(lmin, lmax + 1))
-        n_primary = sum(
-            1
-            for ell in range(lmin, lmax + 1)
-            for mp in range(-ell, ell + 1)
-            for m in range(-ell, ell + 1)
-            if mp + m > 0 or (mp + m == 0 and mp >= 0)
-        )
-        n_derived = n_total - n_primary
-        max_poly_len = lmax + 1
-        size = (lmax + 1) ** 2 - lmin * lmin
-
-        self.lmin = lmin
-        self.lmax = lmax
-        self.size = size
-        self.max_poly_len = max_poly_len
-        self.n_primary = n_primary
-        self.n_derived = n_derived
-
-        self.primary_row = np.zeros(n_primary, dtype=np.int64)
-        self.primary_col = np.zeros(n_primary, dtype=np.int64)
-        self.mp_plus_m = np.zeros(n_primary, dtype=np.float64)
-        self.m_minus_mp = np.zeros(n_primary, dtype=np.float64)
-        self.diagonal_mask = np.zeros(n_primary, dtype=np.bool_)
-        self.anti_diagonal_mask = np.zeros(n_primary, dtype=np.bool_)
-        self.special_2m = np.zeros(n_primary, dtype=np.float64)
-        self.anti_diag_sign = np.zeros(n_primary, dtype=np.float64)
-        self.case1 = _CaseTables(n_primary, max_poly_len)
-        self.case2 = _CaseTables(n_primary, max_poly_len)
-        self.derived_row = np.zeros(n_derived, dtype=np.int64)
-        self.derived_col = np.zeros(n_derived, dtype=np.int64)
-        self.derived_primary_idx = np.zeros(n_derived, dtype=np.int64)
-        self.derived_sign = np.zeros(n_derived, dtype=np.float64)
-
-        primary_map: dict[tuple[int, int], int] = {}
-        primary_idx = 0
-        block_start = 0
-        for ell in range(lmin, lmax + 1):
-            block_size = 2 * ell + 1
-            for mp_local in range(block_size):
-                mp = mp_local - ell
-                for m_local in range(block_size):
-                    m = m_local - ell
-                    row = block_start + mp_local
-                    col = block_start + m_local
-                    is_primary = (mp + m > 0) or (mp + m == 0 and mp >= 0)
-                    if not is_primary:
-                        continue
-
-                    primary_map[(row, col)] = primary_idx
-                    self.primary_row[primary_idx] = row
-                    self.primary_col[primary_idx] = col
-                    self.mp_plus_m[primary_idx] = mp + m
-                    self.m_minus_mp[primary_idx] = m - mp
-                    self.diagonal_mask[primary_idx] = mp == m
-                    self.anti_diagonal_mask[primary_idx] = mp == -m
-                    self.special_2m[primary_idx] = 2 * m
-                    self.anti_diag_sign[primary_idx] = (-1) ** (ell - m)
-
-                    sqrt_factor = math.sqrt(
-                        float(factorial[ell + m] * factorial[ell - m])
-                        / float(factorial[ell + mp] * factorial[ell - mp])
-                    )
-                    _compute_case_coefficients(
-                        self.case1,
-                        primary_idx,
-                        ell,
-                        mp,
-                        m,
-                        sqrt_factor,
-                        factorial,
-                        is_case1=True,
-                    )
-                    _compute_case_coefficients(
-                        self.case2,
-                        primary_idx,
-                        ell,
-                        mp,
-                        m,
-                        sqrt_factor,
-                        factorial,
-                        is_case1=False,
-                    )
-                    primary_idx += 1
-            block_start += block_size
-
-        derived_idx = 0
-        block_start = 0
-        for ell in range(lmin, lmax + 1):
-            block_size = 2 * ell + 1
-            for mp_local in range(block_size):
-                mp = mp_local - ell
-                for m_local in range(block_size):
-                    m = m_local - ell
-                    row = block_start + mp_local
-                    col = block_start + m_local
-                    is_primary = (mp + m > 0) or (mp + m == 0 and mp >= 0)
-                    if is_primary:
-                        continue
-
-                    self.derived_row[derived_idx] = row
-                    self.derived_col[derived_idx] = col
-                    self.derived_primary_idx[derived_idx] = primary_map[
-                        (block_start + (-mp + ell), block_start + (-m + ell))
-                    ]
-                    self.derived_sign[derived_idx] = (-1) ** (mp - m)
-                    derived_idx += 1
-            block_start += block_size
-
-        _finalize_case_tables(self.case1, max_poly_len)
-        _finalize_case_tables(self.case2, max_poly_len)
-
-        # Functional scatter replacement: gather index mapping each flat
-        # (row, col) of the packed (size, size) matrix to its source slot in
-        # ``concat([primary, derived, zero_slot])``. Entries outside the
-        # diagonal blocks point at the trailing zero slot.
-        flat_to_src = np.full(size * size, n_primary + n_derived, dtype=np.int64)
-        flat_to_src[self.primary_row * size + self.primary_col] = np.arange(
-            n_primary, dtype=np.int64
-        )
-        flat_to_src[self.derived_row * size + self.derived_col] = n_primary + np.arange(
-            n_derived, dtype=np.int64
-        )
-        self.flat_gather_idx = flat_to_src
-
-
-def _build_complex_to_real_sh_block(ell: int) -> np.ndarray:
-    """
-    Build the complex-to-real basis transform for one ``ell`` block.
-
-    The packed real basis follows the SeZM convention ``m = -ell, ..., +ell``
-    inside each block. This unitary transform defines the real tesseral basis
-    used by the packed ``D_full`` layout.
-    """
-    size = 2 * ell + 1
-    inv_sqrt2 = 1.0 / math.sqrt(2.0)
-    U = np.zeros((size, size), dtype=np.complex128)
-    for m in range(-ell, ell + 1):
-        row = m + ell
-        if m == 0:
-            U[row, ell] = 1.0
-        elif m > 0:
-            U[row, m + ell] = inv_sqrt2
-            U[row, -m + ell] = ((-1) ** m) * inv_sqrt2
-        else:
-            U[row, -m + ell] = -1j * inv_sqrt2
-            U[row, m + ell] = ((-1) ** m) * 1j * inv_sqrt2
-    return U
-
-
-def _assemble_block_diagonal_real_basis(
-    lmin: int, lmax: int
-) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-    """Assemble per-``ell`` real-basis blocks into one block-diagonal transform."""
-    size = sum(2 * ell + 1 for ell in range(lmin, lmax + 1))
-    U_re_full = np.zeros((size, size), dtype=np.float64)
-    U_im_full = np.zeros((size, size), dtype=np.float64)
-    offset = 0
-    for ell in range(lmin, lmax + 1):
-        U = _build_complex_to_real_sh_block(ell)
-        block_size = 2 * ell + 1
-        block_end = offset + block_size
-        U_re_full[offset:block_end, offset:block_end] = U.real
-        U_im_full[offset:block_end, offset:block_end] = U.imag
-        offset = block_end
-    return (
-        U_re_full,
-        U_im_full,
-        np.ascontiguousarray(U_re_full.T),
-        np.ascontiguousarray(U_im_full.T),
-    )
-
-
-def _vectorized_horner(
-    xp: Any,
-    ratio: Any,
-    horner_coeffs: Any,
-    horner_step_mask: Any,
-) -> Any:
-    """Evaluate many varying-length Horner chains in one batched loop."""
-    n_batch = ratio.shape[0]
-    n_elements = horner_coeffs.shape[0]
-    result = xp.ones(
-        (n_batch, n_elements),
-        dtype=ratio.dtype,
-        device=array_api_compat.device(ratio),
-    )
-    if horner_step_mask.shape[1] == 0:
-        return result
-    ratio = ratio[:, None]
-    for i in range(horner_step_mask.shape[1]):
-        new_result = 1.0 + result * (ratio * horner_coeffs[None, :, i])
-        result = xp.where(horner_step_mask[None, :, i], new_result, result)
-    return result
-
-
 class WignerDCalculator(NativeOP):
     """
     Quaternion-driven Wigner-D blocks for the SeZM packed real spherical basis.
 
-    Input quaternions represent the global->local edge rotation that sends the
-    edge direction to local ``+Z``. The returned block-diagonal matrix keeps
-    the packed SeZM real spherical-harmonics layout, so downstream code
-    consumes ``D_full`` and ``Dt_full`` directly.
+    Input quaternions represent the global->local edge rotation that sends the edge
+    direction to local ``+Z``. The returned block-diagonal matrix keeps the packed
+    SeZM real spherical-harmonics layout, so downstream code continues to consume
+    ``D_full`` and ``Dt_full`` directly.
 
     Runtime structure:
-
     - ``l=0``: scalar identity block;
-    - ``l=1``: direct quaternion -> Cartesian rotation -> real ``l=1`` block;
-    - ``l>=2``: generic quaternion polynomial path with precomputed coefficient
-      tables (the pt reference path that the pt monomial kernels are fitted to;
-      see the module docstring).
-
-    Parameters
-    ----------
-    lmax : int
-        Maximum spherical-harmonics degree.
-    eps : float
-        Numerical floor used in quaternion normalization.
-    precision : str
-        Working floating-point precision of the returned blocks. The internal
-        polynomial algebra is evaluated in fp64 (as in pt) before the result is
-        cast back.
+    - ``l=1``: direct quaternion -> Cartesian rotation -> real l=1 block;
+    - ``l=2``: dedicated degree-4 quaternion tensor contraction;
+    - ``l=3,4``: dedicated quaternion monomial kernels;
+    - ``l=5,6``: dedicated quaternion monomial kernels;
+    - ``l=7,8``: dedicated quaternion monomial kernels;
+    - ``l=9,10``: dedicated quaternion monomial kernels;
+    - ``l>=11``: generic quaternion polynomial path with precomputed coefficient tables.
     """
 
+    _SMALL_ORDER_CACHE_CPU_FP64: ClassVar[dict[str, np.ndarray] | None] = None
+
     def __init__(
         self,
         lmax: int,
@@ -622,41 +426,54 @@ def __init__(
         self.precision = precision
         self.eps = float(eps)
         self.dim_full = (self.lmax + 1) ** 2
+        self.poly_lmin = 11
+        self.poly_offset = self.poly_lmin * self.poly_lmin
 
-        # l=1 block constants: permutation [1, 2, 0] is applied structurally in
-        # _compute_l1_block; the sign pattern is a plain numpy constant.
+        self.l1_perm = np.array([1, 2, 0], dtype=np.int64)
         l1_sign = np.array([-1.0, -1.0, 1.0], dtype=np.float64)
         self.l1_sign_outer = np.outer(l1_sign, l1_sign)
 
         if self.lmax >= 2:
-            self.poly_tables = _PolyTables(lmin=2, lmax=self.lmax)
-            (
-                self.poly_u_re,
-                self.poly_u_im,
-                self.poly_u_re_t,
-                self.poly_u_im_t,
-            ) = _assemble_block_diagonal_real_basis(2, self.lmax)
-
-        # Functional block-diagonal assembly: gather index mapping each flat
-        # (row, col) of D_full to its source slot in the concatenated value
-        # vector [l0 ones (1), l1 block (9), packed l>=2 block (size^2),
-        # trailing zero slot]. No __setitem__ on traced values is needed.
-        n_l1 = 9 if self.lmax >= 1 else 0
-        n_packed = (self.dim_full - 4) ** 2 if self.lmax >= 2 else 0
-        zero_slot = 1 + n_l1 + n_packed
-        full_idx = np.full(self.dim_full * self.dim_full, zero_slot, dtype=np.int64)
-        full_idx[0] = 0  # D_full[:, 0, 0] = 1
-        if self.lmax >= 1:
-            for i in range(3):
-                for j in range(3):
-                    full_idx[(1 + i) * self.dim_full + (1 + j)] = 1 + 3 * i + j
-        if self.lmax >= 2:
-            packed_size = self.dim_full - 4
-            for i in range(packed_size):
-                for j in range(packed_size):
-                    full_idx[(4 + i) * self.dim_full + (4 + j)] = (
-                        1 + n_l1 + packed_size * i + j
-                    )
+            self.small_order_kernels = self._build_small_order_kernels(lmax=self.lmax)
+
+        if self.lmax >= self.poly_lmin:
+            self.poly_coeffs = self._precompute_wigner_coefficients(
+                self.lmax,
+                lmin=self.poly_lmin,
+            )
+            blocks = self._precompute_real_basis_blocks(
+                lmin=self.poly_lmin,
+                lmax=self.lmax,
+            )
+            U_re, U_im, U_re_t, U_im_t = self._assemble_block_diagonal_real_basis(
+                blocks
+            )
+            self.poly_u_re = U_re
+            self.poly_u_im = U_im
+            self.poly_u_re_t = U_re_t
+            self.poly_u_im_t = U_im_t
+
+        # Functional block-diagonal assembly: precompute a gather index mapping each
+        # flat ``(row, col)`` of ``D_full`` to its slot in the concatenated per-degree
+        # block values (the trailing slot holds the off-block zero). Degree ``l``
+        # occupies rows/cols ``[l**2, (l+1)**2)`` regardless of which kernel produced
+        # it, so the index is fully determined by ``lmax``. This replaces pt's
+        # in-place block assignment with a path safe for torch.export.
+        dim = self.dim_full
+        small_lmax = min(self.lmax, 10)
+        segments = [(ell * ell, 2 * ell + 1) for ell in range(small_lmax + 1)]
+        if self.lmax >= self.poly_lmin:
+            segments.append((self.poly_offset, dim - self.poly_offset))
+        n_values = sum(block_dim * block_dim for _, block_dim in segments)
+        full_idx = np.full(dim * dim, n_values, dtype=np.int64)
+        base = 0
+        for offset, block_dim in segments:
+            local = np.arange(block_dim, dtype=np.int64)
+            rows = offset + local[:, None]
+            cols = offset + local[None, :]
+            src = base + (local[:, None] * block_dim + local[None, :])
+            full_idx[(rows * dim + cols).reshape(-1)] = src.reshape(-1)
+            base += block_dim * block_dim
         self.full_gather_idx = full_idx
 
     def call(self, edge_quaternion: Any) -> tuple[Any, Any]:
@@ -665,9 +482,9 @@ def call(self, edge_quaternion: Any) -> tuple[Any, Any]:
 
         Parameters
         ----------
-        edge_quaternion : Array
-            Unit quaternions with shape ``(E, 4)`` representing the
-            global->local edge rotation.
+        edge_quaternion
+            Unit quaternions with shape ``(E, 4)`` representing the global->local
+            edge rotation.
 
         Returns
         -------
@@ -677,23 +494,81 @@ def call(self, edge_quaternion: Any) -> tuple[Any, Any]:
         xp = array_api_compat.array_namespace(edge_quaternion)
         dtype = get_xp_precision(xp, self.precision)
         device = array_api_compat.device(edge_quaternion)
-        q = quaternion_normalize(
+        edge_quaternion = quaternion_normalize(
             xp.astype(edge_quaternion, dtype),
             eps=self.eps,
         )
-        n_edge = q.shape[0]
+        n_edge = edge_quaternion.shape[0]
 
-        segments = [xp.ones((n_edge, 1), dtype=dtype, device=device)]
+        blocks = [xp.ones((n_edge, 1, 1), dtype=dtype, device=device)]
         if self.lmax >= 1:
-            segments.append(
-                xp.reshape(self._compute_l1_block(q, xp, dtype, device), (n_edge, 9))
-            )
+            blocks.append(self._compute_l1_block(edge_quaternion))
+
         if self.lmax >= 2:
-            packed = self._compute_packed_blocks(q, xp, dtype, device)
-            packed_size = self.dim_full - 4
-            segments.append(xp.reshape(packed, (n_edge, packed_size * packed_size)))
-        segments.append(xp.zeros((n_edge, 1), dtype=dtype, device=device))
-        values = xp.concat(segments, axis=1)
+            blocks.append(self._compute_l2_block(edge_quaternion))
+
+        if self.lmax >= 3:
+            if self.lmax >= 4:
+                D_l3, D_l4 = self._compute_l3l4_blocks(edge_quaternion)
+                blocks.append(D_l3)
+                blocks.append(D_l4)
+            else:
+                blocks.append(self._compute_l3_block(edge_quaternion))
+
+        if self.lmax >= 5:
+            if self.lmax >= 6:
+                D_l5, D_l6 = self._compute_l5l6_blocks(edge_quaternion)
+                blocks.append(D_l5)
+                blocks.append(D_l6)
+            else:
+                blocks.append(self._compute_l5_block(edge_quaternion))
+
+        if self.lmax >= 7:
+            if self.lmax >= 8:
+                D_l7, D_l8 = self._compute_l7l8_blocks(edge_quaternion)
+                blocks.append(D_l7)
+                blocks.append(D_l8)
+            else:
+                blocks.append(self._compute_l7_block(edge_quaternion))
+
+        if self.lmax >= 9:
+            if self.lmax >= 10:
+                D_l9, D_l10 = self._compute_l9l10_blocks(edge_quaternion)
+                blocks.append(D_l9)
+                blocks.append(D_l10)
+            else:
+                blocks.append(self._compute_l9_block(edge_quaternion))
+
+        if self.lmax >= self.poly_lmin:
+            ra_re, ra_im, rb_re, rb_im = self._quaternion_to_ra_rb_real(edge_quaternion)
+            D_re, D_im = self._wigner_d_matrix_realpair(
+                ra_re,
+                ra_im,
+                rb_re,
+                rb_im,
+                self.poly_coeffs,
+                dtype=dtype,
+            )
+            D_poly = self._wigner_d_pair_to_real(
+                D_re,
+                D_im,
+                (
+                    self.poly_u_re,
+                    self.poly_u_im,
+                    self.poly_u_re_t,
+                    self.poly_u_im_t,
+                ),
+                lmax=self.lmax,
+                lmin=self.poly_lmin,
+            )
+            blocks.append(D_poly)
+
+        # Gather the per-degree blocks into the dense block-diagonal layout.
+        values = xp.concat(
+            [xp.reshape(b, (n_edge, b.shape[-1] * b.shape[-1])) for b in blocks]
+            + [xp.zeros((n_edge, 1), dtype=dtype, device=device)],
+            axis=1,
+        )
         idx = xp_asarray_nodetach(xp, self.full_gather_idx, device=device)
         D_full = xp.reshape(
             xp.take(values, idx, axis=1),
@@ -702,28 +577,33 @@ def call(self, edge_quaternion: Any) -> tuple[Any, Any]:
         Dt_full = xp.matrix_transpose(D_full)
         return D_full, Dt_full
 
-    def forward_zonal(self, edge_quaternion: Any, lmin: int = 1) -> Any:
+    def forward_zonal(
+        self,
+        edge_quaternion: Any,
+        lmin: int = 1,
+    ) -> Any:
         """
         Build local ``m=0`` to global coupling for GIE.
 
         The returned layout matches the packed node rows for degrees
         ``lmin..lmax``: each degree contributes ``2l+1`` values in packed
         ``m=-l..l`` order. These values are equivalent to gathering
-        ``Dt_full[:, row(l, m), col(l, 0)]`` from :meth:`call` over the same
-        degree range.
+        ``Dt_full[:, row(l, m), col(l, 0)]`` from :meth:`call` over the
+        same degree range.
 
         Parameters
         ----------
-        edge_quaternion : Array
-            Unit quaternions with shape ``(E, 4)`` representing the
-            global->local edge rotation.
-        lmin : int
+        edge_quaternion
+            Unit quaternions with shape ``(E, 4)`` representing the global->local
+            edge rotation.
+        lmin
             First degree to return.
 
         Returns
         -------
         Array
-            Zonal coupling with shape ``(E, (lmax + 1) ** 2 - lmin ** 2)``.
+            Zonal coupling with shape
+            ``(E, (lmax + 1) ** 2 - lmin ** 2)``.
         """
         lmin = int(lmin)
         if lmin < 1:
@@ -734,103 +614,1076 @@ def forward_zonal(self, edge_quaternion: Any, lmin: int = 1) -> Any:
         n_edge = edge_quaternion.shape[0]
         if self.lmax < lmin:
             return xp.zeros((n_edge, 0), dtype=dtype, device=device)
-        q = quaternion_normalize(
+        edge_quaternion = quaternion_normalize(
             xp.astype(edge_quaternion, dtype),
             eps=self.eps,
         )
 
-        zonal_blocks = []
+        zonal_blocks: list[Any] = []
         if lmin <= 1 <= self.lmax:
-            zonal_blocks.append(self._compute_l1_block(q, xp, dtype, device)[:, 1, :])
-        if self.lmax >= 2:
-            packed = self._compute_packed_blocks(q, xp, dtype, device)
+            zonal_blocks.append(self._compute_l1_block(edge_quaternion)[:, 1, :])
+
+        if lmin <= 2 <= self.lmax:
+            zonal_blocks.append(self._compute_l2_block(edge_quaternion)[:, 2, :])
+
+        if self.lmax >= 3 and lmin <= 4:
+            if self.lmax >= 4:
+                D_l3, D_l4 = self._compute_l3l4_blocks(edge_quaternion)
+                if lmin <= 3:
+                    zonal_blocks.append(D_l3[:, 3, :])
+                zonal_blocks.append(D_l4[:, 4, :])
+            else:
+                zonal_blocks.append(self._compute_l3_block(edge_quaternion)[:, 3, :])
+
+        if self.lmax >= 5 and lmin <= 6:
+            if self.lmax >= 6:
+                D_l5, D_l6 = self._compute_l5l6_blocks(edge_quaternion)
+                if lmin <= 5:
+                    zonal_blocks.append(D_l5[:, 5, :])
+                zonal_blocks.append(D_l6[:, 6, :])
+            else:
+                zonal_blocks.append(self._compute_l5_block(edge_quaternion)[:, 5, :])
+
+        if self.lmax >= 7 and lmin <= 8:
+            if self.lmax >= 8:
+                D_l7, D_l8 = self._compute_l7l8_blocks(edge_quaternion)
+                if lmin <= 7:
+                    zonal_blocks.append(D_l7[:, 7, :])
+                zonal_blocks.append(D_l8[:, 8, :])
+            else:
+                zonal_blocks.append(self._compute_l7_block(edge_quaternion)[:, 7, :])
+
+        if self.lmax >= 9 and lmin <= 10:
+            if self.lmax >= 10:
+                D_l9, D_l10 = self._compute_l9l10_blocks(edge_quaternion)
+                if lmin <= 9:
+                    zonal_blocks.append(D_l9[:, 9, :])
+                zonal_blocks.append(D_l10[:, 10, :])
+            else:
+                zonal_blocks.append(self._compute_l9_block(edge_quaternion)[:, 9, :])
+
+        if self.lmax >= self.poly_lmin and lmin <= self.lmax:
+            ra_re, ra_im, rb_re, rb_im = self._quaternion_to_ra_rb_real(edge_quaternion)
+            D_re, D_im = self._wigner_d_matrix_realpair(
+                ra_re,
+                ra_im,
+                rb_re,
+                rb_im,
+                self.poly_coeffs,
+                dtype=dtype,
+            )
+            D_poly = self._wigner_d_pair_to_real(
+                D_re,
+                D_im,
+                (
+                    self.poly_u_re,
+                    self.poly_u_im,
+                    self.poly_u_re_t,
+                    self.poly_u_im_t,
+                ),
+                lmax=self.lmax,
+                lmin=self.poly_lmin,
+            )
+            poly_lmin = max(lmin, self.poly_lmin)
             offset = 0
-            for degree in range(2, self.lmax + 1):
+            for degree in range(self.poly_lmin, self.lmax + 1):
                 block_size = 2 * degree + 1
                 block_end = offset + block_size
-                if degree >= lmin:
-                    zonal_blocks.append(packed[:, offset + degree, offset:block_end])
+                if degree >= poly_lmin:
+                    zonal_blocks.append(D_poly[:, offset + degree, offset:block_end])
                 offset = block_end
+
         return xp.concat(zonal_blocks, axis=1)
 
-    def _compute_l1_block(self, q: Any, xp: Any, dtype: Any, device: Any) -> Any:
-        """Compute the vector block directly from the Cartesian rotation matrix."""
-        rot = quaternion_to_rotation_matrix(q)
-        # row/column permutation [1, 2, 0], applied structurally (no gather)
-        rot = xp.stack([rot[..., 1, :], rot[..., 2, :], rot[..., 0, :]], axis=-2)
-        rot = xp.stack([rot[..., 1], rot[..., 2], rot[..., 0]], axis=-1)
-        sign = xp_asarray_nodetach(xp, self.l1_sign_outer, dtype=dtype, device=device)
-        return rot * sign
-
-    def _compute_packed_blocks(self, q: Any, xp: Any, dtype: Any, device: Any) -> Any:
-        """Evaluate the packed real Wigner blocks for ``l = 2..lmax``."""
-        # Cayley-Klein pair: Ra = w - i z, Rb = y - i x (SeZM convention)
-        ra_re = q[..., 0]
-        ra_im = -q[..., 3]
-        rb_re = q[..., 2]
-        rb_im = -q[..., 1]
-        D_re, D_im = self._wigner_d_matrix_realpair(
-            ra_re, ra_im, rb_re, rb_im, xp, dtype, device
-        )
-        u_re = xp_asarray_nodetach(xp, self.poly_u_re, dtype=dtype, device=device)
-        u_im = xp_asarray_nodetach(xp, self.poly_u_im, dtype=dtype, device=device)
-        u_re_t = xp_asarray_nodetach(xp, self.poly_u_re_t, dtype=dtype, device=device)
-        u_im_t = xp_asarray_nodetach(xp, self.poly_u_im_t, dtype=dtype, device=device)
-        temp_re = xp.matmul(D_re, u_re_t) + xp.matmul(D_im, u_im_t)
-        temp_im = xp.matmul(D_im, u_re_t) - xp.matmul(D_re, u_im_t)
-        return xp.matmul(u_re, temp_re) - xp.matmul(u_im, temp_im)
+    @classmethod
+    def _get_small_order_cache_cpu_fp64(cls, lmax: int) -> dict[str, np.ndarray]:
+        """Generate the required low-order kernel coefficients on CPU fp64."""
+        target_lmax = min(max(int(lmax), 2), 10)
+        if cls._SMALL_ORDER_CACHE_CPU_FP64 is None:
+            cls._SMALL_ORDER_CACHE_CPU_FP64 = {}
+        cache = cls._SMALL_ORDER_CACHE_CPU_FP64
+        required_names = WignerSmallOrderCoefficients.required_kernel_names(target_lmax)
+        if any(name not in cache for name in required_names):
+            cache.update(cls._generate_small_order_cache_cpu_fp64(target_lmax))
+        return cache
 
-    def _wigner_d_matrix_realpair(
-        self,
-        ra_re: Any,
-        ra_im: Any,
-        rb_re: Any,
-        rb_im: Any,
-        xp: Any,
-        out_dtype: Any,
-        device: Any,
-    ) -> tuple[Any, Any]:
+    @classmethod
+    def _build_small_order_kernels(
+        cls,
+        *,
+        lmax: int,
+    ) -> WignerSmallOrderCoefficients:
+        """Instantiate the specialized ``l=2..10`` kernels on the requested device/dtype."""
+        cache = cls._get_small_order_cache_cpu_fp64(lmax)
+        kernels = {}
+        for name in WignerSmallOrderCoefficients.required_kernel_names(lmax):
+            kernels[name] = cache[name]
+        return WignerSmallOrderCoefficients(
+            lmax=lmax,
+            kernels=kernels,
+        )
+
+    @classmethod
+    def _generate_small_order_cache_cpu_fp64(cls, lmax: int) -> dict[str, np.ndarray]:
         """
-        Evaluate the complex Wigner blocks in real/imaginary form.
+        Generate the low-order kernel coefficients from the generic SeZM reference path.
 
-        The runtime path uses only real arithmetic. The complex phase is
-        represented by two real tensors, while the polynomial and magnitude
-        algebra is evaluated in fp64 before the result is cast back to the
-        requested output dtype. All denominators are eps-floored before any
-        division (gradient-safe masked-denominator idiom, as in pt).
+        The coefficients are exact module constants. They are solved once in fp64 on CPU,
+        validated against the generic quaternion polynomial evaluator, and then reused by
+        every `WignerDCalculator` instance.
         """
-        coeffs = self.poly_tables
-        n_batch = ra_re.shape[0]
-        f64 = xp.float64
-        ra_re = xp.astype(ra_re, f64)
-        ra_im = xp.astype(ra_im, f64)
-        rb_re = xp.astype(rb_re, f64)
-        rb_im = xp.astype(rb_im, f64)
+        target_lmax = min(max(int(lmax), 2), 10)
+        rng = np.random.default_rng(20260404)
+
+        max_monomials = math.comb(2 * target_lmax + 3, 3)
+        n_fit = min(2048, max(128, 2 * max_monomials))
+        q_fit = rng.standard_normal((n_fit, 4))
+        q_fit = quaternion_normalize(q_fit, eps=float(np.finfo(np.float64).eps))
+        ref_blocks = cls._compute_generic_reference_blocks(q_fit, lmax=target_lmax)
+
+        monomials: dict[int, list[tuple[int, int, int, int]]] = {}
+        exponents: dict[int, np.ndarray] = {}
+        coefficients: dict[int, np.ndarray] = {}
+        cache: dict[str, np.ndarray] = {}
+
+        for ell in range(2, target_lmax + 1):
+            monomials[ell] = cls._generate_monomials(4, 2 * ell)
+            exponents[ell] = cls._monomials_to_exponent_tensor(monomials[ell])
+            coeff = cls._solve_monomial_coefficients(
+                q_fit,
+                ref_blocks[ell],
+                exponents[ell],
+            )
+            if ell == 2:
+                cache["C_l2"] = cls._build_l2_contraction_tensor(coeff, monomials[2])
+            else:
+                coefficients[ell] = coeff
+                cache[f"C_l{ell}"] = coeff
+                cache[f"exp_l{ell}"] = exponents[ell]
+
+        combined_builders = {
+            4: ("C_combined_l3l4", cls._build_combined_l3l4),
+            6: ("C_combined_l5l6", cls._build_combined_l5l6),
+            8: ("C_combined_l7l8", cls._build_combined_l7l8),
+            10: ("C_combined_l9l10", cls._build_combined_l9l10),
+        }
+        for even_ell, (name, builder) in combined_builders.items():
+            if target_lmax >= even_ell:
+                odd_ell = even_ell - 1
+                cache[name] = builder(
+                    coefficients[odd_ell],
+                    coefficients[even_ell],
+                    monomials[odd_ell],
+                    monomials[even_ell],
+                )
+
+        return cache
 
-        def cv(arr: np.ndarray) -> Any:  # constant table -> xp on input device
-            return xp_asarray_nodetach(xp, arr, device=device)
+    @classmethod
+    def _compute_generic_reference_blocks(
+        cls,
+        edge_quaternion: Any,
+        *,
+        lmax: int,
+    ) -> dict[int, np.ndarray]:
+        """Evaluate the generic SeZM polynomial path and extract per-degree blocks."""
+        coeffs = cls._precompute_wigner_coefficients(
+            lmax,
+            lmin=2,
+        )
+        blocks = cls._precompute_real_basis_blocks(
+            lmin=2,
+            lmax=lmax,
+        )
+        ra_re, ra_im, rb_re, rb_im = cls._quaternion_to_ra_rb_real(edge_quaternion)
+        D_re, D_im = cls._wigner_d_matrix_realpair(
+            ra_re,
+            ra_im,
+            rb_re,
+            rb_im,
+            coeffs,
+        )
+        D_ref = cls._wigner_d_pair_to_real(
+            D_re,
+            D_im,
+            blocks,
+            lmax=lmax,
+            lmin=2,
+        )
+        ref_blocks: dict[int, np.ndarray] = {}
+        offset = 0
+        for ell in range(2, lmax + 1):
+            block_size = 2 * ell + 1
+            block_end = offset + block_size
+            ref_blocks[ell] = D_ref[:, offset:block_end, offset:block_end]
+            offset = block_end
+        return ref_blocks
 
-        eps = float(np.finfo(np.float64).eps)
-        eps_sq = eps * eps
-        ra_sq = ra_re * ra_re + ra_im * ra_im
-        rb_sq = rb_re * rb_re + rb_im * rb_im
-        ra_small = ra_sq <= eps_sq
-        rb_small = rb_sq <= eps_sq
-        ra = xp.sqrt(xp.clip(ra_sq, min=eps_sq))
-        rb = xp.sqrt(xp.clip(rb_sq, min=eps_sq))
-        general_mask = ~ra_small & ~rb_small
-        use_case1 = (ra >= rb) & general_mask
-        use_case2 = (ra < rb) & general_mask
+    @classmethod
+    def _solve_monomial_coefficients(
+        cls,
+        edge_quaternion: np.ndarray,
+        D_block: np.ndarray,
+        monomial_exponents: np.ndarray,
+    ) -> np.ndarray:
+        """Solve the flattened monomial coefficient matrix for one low-order block."""
+        max_power = int(monomial_exponents.sum(axis=1).max())
+        powers = cls._precompute_powers(edge_quaternion, max_power)
+        M = cls._build_monomial_matrix(powers, monomial_exponents)
+        Y = np.reshape(D_block, (edge_quaternion.shape[0], -1))
+        return np.ascontiguousarray(np.linalg.lstsq(M, Y, rcond=None)[0].T)
 
-        safe_ra_re = xp.where(ra_small, xp.ones_like(ra_re), ra_re)
-        safe_ra_im = xp.where(ra_small, xp.zeros_like(ra_im), ra_im)
-        safe_rb_re = xp.where(rb_small, xp.ones_like(rb_re), rb_re)
-        safe_rb_im = xp.where(rb_small, xp.zeros_like(rb_im), rb_im)
-        phia = xp.atan2(safe_ra_im, safe_ra_re)
-        phib = xp.atan2(safe_rb_im, safe_rb_re)
+    @staticmethod
+    def _build_l2_contraction_tensor(
+        C_l2_flat: np.ndarray,
+        monomials: list[tuple[int, int, int, int]],
+    ) -> np.ndarray:
+        """Expand degree-4 monomial coefficients into the symmetric einsum tensor form."""
+        C_l2 = np.zeros((5, 5, 4, 4, 4, 4), dtype=C_l2_flat.dtype)
+        for flat_idx, coeff_row in enumerate(C_l2_flat):
+            i = flat_idx // 5
+            j = flat_idx % 5
+            for coeff, (a, b, c, d) in zip(coeff_row, monomials, strict=True):
+                if abs(float(coeff)) < 1e-15:
+                    continue
+                pool = [0] * a + [1] * b + [2] * c + [3] * d
+                unique_permutations = set(permutations(pool, 4))
+                share = coeff / len(unique_permutations)
+                for p0, p1, p2, p3 in unique_permutations:
+                    C_l2[i, j, p0, p1, p2, p3] = share
+        return C_l2
 
-        phase = (
-            phia[:, None] * cv(coeffs.mp_plus_m)[None, :]
+    @staticmethod
+    def _generate_monomials(
+        n_vars: int,
+        total_degree: int,
+    ) -> list[tuple[int, ...]]:
+        """Generate all monomials of fixed total degree in lexicographic order."""
+        monomials: list[tuple[int, ...]] = []
+
+        def _recurse(
+            remaining_vars: int,
+            remaining_degree: int,
+            current: list[int],
+        ) -> None:
+            if remaining_vars == 1:
+                monomials.append((*current, remaining_degree))
+                return
+            for i in range(remaining_degree + 1):
+                _recurse(remaining_vars - 1, remaining_degree - i, [*current, i])
+
+        _recurse(n_vars, total_degree, [])
+        return monomials
+
+    @staticmethod
+    def _monomials_to_exponent_tensor(
+        monomials: list[tuple[int, ...]],
+    ) -> np.ndarray:
+        """Convert monomial tuples to an ``int64`` exponent table."""
+        return np.array(monomials, dtype=np.int64)
+
+    @staticmethod
+    def _build_combined_l3l4(
+        C_l3: np.ndarray,
+        C_l4: np.ndarray,
+        monomials_l3: list[tuple[int, int, int, int]],
+        monomials_l4: list[tuple[int, int, int, int]],
+    ) -> np.ndarray:
+        """Lift the ``l=3`` basis to degree 8 and stack it with the ``l=4`` basis."""
+        mono8_to_idx = {mono: idx for idx, mono in enumerate(monomials_l4)}
+        C_l3_lifted = np.zeros(
+            (C_l3.shape[0], len(monomials_l4)),
+            dtype=C_l3.dtype,
+        )
+        for j, (a, b, c, d) in enumerate(monomials_l3):
+            for mono8 in (
+                (a + 2, b, c, d),
+                (a, b + 2, c, d),
+                (a, b, c + 2, d),
+                (a, b, c, d + 2),
+            ):
+                C_l3_lifted[:, mono8_to_idx[mono8]] += C_l3[:, j]
+        return np.concatenate([C_l3_lifted, C_l4], axis=0)
+
+    @staticmethod
+    def _build_combined_l5l6(
+        C_l5: np.ndarray,
+        C_l6: np.ndarray,
+        monomials_l5: list[tuple[int, int, int, int]],
+        monomials_l6: list[tuple[int, int, int, int]],
+    ) -> np.ndarray:
+        """Lift the ``l=5`` basis to degree 12 and stack it with the ``l=6`` basis."""
+        mono12_to_idx = {mono: idx for idx, mono in enumerate(monomials_l6)}
+        C_l5_lifted = np.zeros(
+            (C_l5.shape[0], len(monomials_l6)),
+            dtype=C_l5.dtype,
+        )
+        for j, (a, b, c, d) in enumerate(monomials_l5):
+            for mono12 in (
+                (a + 2, b, c, d),
+                (a, b + 2, c, d),
+                (a, b, c + 2, d),
+                (a, b, c, d + 2),
+            ):
+                C_l5_lifted[:, mono12_to_idx[mono12]] += C_l5[:, j]
+        return np.concatenate([C_l5_lifted, C_l6], axis=0)
+
+    @staticmethod
+    def _build_combined_l7l8(
+        C_l7: np.ndarray,
+        C_l8: np.ndarray,
+        monomials_l7: list[tuple[int, int, int, int]],
+        monomials_l8: list[tuple[int, int, int, int]],
+    ) -> np.ndarray:
+        """Lift the ``l=7`` basis to degree 16 and stack it with the ``l=8`` basis."""
+        mono16_to_idx = {mono: idx for idx, mono in enumerate(monomials_l8)}
+        C_l7_lifted = np.zeros(
+            (C_l7.shape[0], len(monomials_l8)),
+            dtype=C_l7.dtype,
+        )
+        for j, (a, b, c, d) in enumerate(monomials_l7):
+            for mono16 in (
+                (a + 2, b, c, d),
+                (a, b + 2, c, d),
+                (a, b, c + 2, d),
+                (a, b, c, d + 2),
+            ):
+                C_l7_lifted[:, mono16_to_idx[mono16]] += C_l7[:, j]
+        return np.concatenate([C_l7_lifted, C_l8], axis=0)
+
+    @staticmethod
+    def _build_combined_l9l10(
+        C_l9: np.ndarray,
+        C_l10: np.ndarray,
+        monomials_l9: list[tuple[int, int, int, int]],
+        monomials_l10: list[tuple[int, int, int, int]],
+    ) -> np.ndarray:
+        """Lift the ``l=9`` basis to degree 20 and stack it with the ``l=10`` basis."""
+        mono20_to_idx = {mono: idx for idx, mono in enumerate(monomials_l10)}
+        C_l9_lifted = np.zeros(
+            (C_l9.shape[0], len(monomials_l10)),
+            dtype=C_l9.dtype,
+        )
+        for j, (a, b, c, d) in enumerate(monomials_l9):
+            for mono20 in (
+                (a + 2, b, c, d),
+                (a, b + 2, c, d),
+                (a, b, c + 2, d),
+                (a, b, c, d + 2),
+            ):
+                C_l9_lifted[:, mono20_to_idx[mono20]] += C_l9[:, j]
+        return np.concatenate([C_l9_lifted, C_l10], axis=0)
+
+    @staticmethod
+    def _precompute_powers(
+        q: Any,
+        max_power: int,
+    ) -> Any:
+        """Precompute powers ``q_i^k`` as a dense table with shape ``(4, max_power+1, E)``."""
+        xp = array_api_compat.array_namespace(q)
+        device = array_api_compat.device(q)
+        n_edge = q.shape[0]
+        components = xp.permute_dims(q, (1, 0))
+        ones = xp.ones((4, n_edge), dtype=q.dtype, device=device)
+        if max_power == 0:
+            return xp.reshape(ones, (4, 1, n_edge))
+        # Cumulative products built by iterated multiplication (``max_power`` is a
+        # compile-time constant, so the unrolled loop is export-friendly).
+        levels = [ones]
+        acc = ones
+        for _ in range(max_power):
+            acc = acc * components
+            levels.append(acc)
+        return xp.stack(levels, axis=1)
+
+    @staticmethod
+    def _build_monomial_matrix(
+        powers: Any,
+        monomial_exponents: Any,
+    ) -> Any:
+        """Assemble the monomial design matrix for one fixed degree by gather/prod."""
+        xp = array_api_compat.array_namespace(powers)
+        n_mono = monomial_exponents.shape[0]
+        n_edge = powers.shape[-1]
+        gather_idx = xp.broadcast_to(
+            xp.permute_dims(monomial_exponents, (1, 0))[:, :, None],
+            (4, n_mono, n_edge),
+        )
+        selected = xp_take_along_axis(powers, gather_idx, axis=1)
+        return xp.permute_dims(xp.prod(selected, axis=0), (1, 0))
+
+    def _compute_l1_block(self, edge_quaternion: Any) -> Any:
+        """Compute the vector block directly from the Cartesian rotation matrix."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        rot_mat = quaternion_to_rotation_matrix(edge_quaternion)
+        perm = xp_asarray_nodetach(xp, self.l1_perm, device=device)
+        rot_perm = xp.take(xp.take(rot_mat, perm, axis=-2), perm, axis=-1)
+        sign = xp_asarray_nodetach(
+            xp, self.l1_sign_outer, dtype=edge_quaternion.dtype, device=device
+        )
+        return rot_perm * sign
+
+    def _compute_l2_block(self, edge_quaternion: Any) -> Any:
+        """Compute the ``l=2`` block from the degree-4 quaternion contraction."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        q2 = edge_quaternion[..., :, None] * edge_quaternion[..., None, :]
+        q4 = q2[..., :, :, None, None] * q2[..., None, None, :, :]
+        c_l2 = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_l2,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        # einsum "nabcd,ijabcd->nij" as a flattened matmul over the (a, b, c, d) axes.
+        q4_flat = xp.reshape(q4, (n_edge, 256))
+        c_flat = xp.reshape(c_l2, (25, 256))
+        out = xp.matmul(q4_flat, xp.permute_dims(c_flat, (1, 0)))
+        return xp.reshape(out, (n_edge, 5, 5))
+
+    def _compute_l3_block(self, edge_quaternion: Any) -> Any:
+        """Compute the ``l=3`` block from the dedicated degree-6 monomial kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 6)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l3, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_l3,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        return xp.reshape(D_flat, (n_edge, 7, 7))
+
+    def _compute_l3l4_blocks(
+        self,
+        edge_quaternion: Any,
+    ) -> tuple[Any, Any]:
+        """Compute the ``l=3`` and ``l=4`` blocks from one shared degree-8 kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 8)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l4, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_combined_l3l4,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        D_l3 = xp.reshape(D_flat[:, :49], (n_edge, 7, 7))
+        D_l4 = xp.reshape(D_flat[:, 49:], (n_edge, 9, 9))
+        return D_l3, D_l4
+
+    def _compute_l5_block(self, edge_quaternion: Any) -> Any:
+        """Compute the ``l=5`` block from the dedicated degree-10 monomial kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 10)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l5, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_l5,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        return xp.reshape(D_flat, (n_edge, 11, 11))
+
+    def _compute_l5l6_blocks(
+        self,
+        edge_quaternion: Any,
+    ) -> tuple[Any, Any]:
+        """Compute the ``l=5`` and ``l=6`` blocks from one shared degree-12 kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 12)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l6, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_combined_l5l6,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        D_l5 = xp.reshape(D_flat[:, :121], (n_edge, 11, 11))
+        D_l6 = xp.reshape(D_flat[:, 121:], (n_edge, 13, 13))
+        return D_l5, D_l6
+
+    def _compute_l7_block(self, edge_quaternion: Any) -> Any:
+        """Compute the ``l=7`` block from the dedicated degree-14 monomial kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 14)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l7, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_l7,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        return xp.reshape(D_flat, (n_edge, 15, 15))
+
+    def _compute_l7l8_blocks(
+        self,
+        edge_quaternion: Any,
+    ) -> tuple[Any, Any]:
+        """Compute the ``l=7`` and ``l=8`` blocks from one shared degree-16 kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 16)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l8, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_combined_l7l8,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        D_l7 = xp.reshape(D_flat[:, :225], (n_edge, 15, 15))
+        D_l8 = xp.reshape(D_flat[:, 225:], (n_edge, 17, 17))
+        return D_l7, D_l8
+
+    def _compute_l9_block(self, edge_quaternion: Any) -> Any:
+        """Compute the ``l=9`` block from the dedicated degree-18 monomial kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 18)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l9, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_l9,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        return xp.reshape(D_flat, (n_edge, 19, 19))
+
+    def _compute_l9l10_blocks(
+        self,
+        edge_quaternion: Any,
+    ) -> tuple[Any, Any]:
+        """Compute the ``l=9`` and ``l=10`` blocks from one shared degree-20 kernel."""
+        xp = array_api_compat.array_namespace(edge_quaternion)
+        device = array_api_compat.device(edge_quaternion)
+        n_edge = edge_quaternion.shape[0]
+        powers = self._precompute_powers(edge_quaternion, 20)
+        monomials = self._build_monomial_matrix(
+            powers,
+            xp_asarray_nodetach(xp, self.small_order_kernels.exp_l10, device=device),
+        )
+        c = xp_asarray_nodetach(
+            xp,
+            self.small_order_kernels.C_combined_l9l10,
+            dtype=edge_quaternion.dtype,
+            device=device,
+        )
+        D_flat = xp.matmul(monomials, xp.permute_dims(c, (1, 0)))
+        D_l9 = xp.reshape(D_flat[:, :361], (n_edge, 19, 19))
+        D_l10 = xp.reshape(D_flat[:, 361:], (n_edge, 21, 21))
+        return D_l9, D_l10
+
+    @staticmethod
+    def _factorial_table(n: int) -> np.ndarray:
+        """Return ``[0!, 1!, ..., n!]`` in the requested dtype/device."""
+        table = np.zeros(n + 1, dtype=np.float64)
+        table[0] = 1.0
+        for i in range(1, n + 1):
+            table[i] = table[i - 1] * i
+        return table
+
+    @staticmethod
+    def _binomial(n: int, k: int, factorial: np.ndarray) -> float:
+        """Evaluate ``C(n, k)`` from a precomputed factorial table."""
+        if k < 0 or k > n:
+            return 0.0
+        return float(factorial[n] / (factorial[k] * factorial[n - k]))
+
+    @staticmethod
+    def _allocate_case_coeffs(
+        n_primary: int,
+        max_poly_len: int,
+    ) -> CaseCoefficients:
+        """Allocate one branch of Horner tables for the quaternion Wigner evaluator."""
+        return CaseCoefficients(
+            coeff=np.zeros(n_primary, dtype=np.float64),
+            horner=np.zeros((n_primary, max_poly_len), dtype=np.float64),
+            poly_len=np.zeros(n_primary, dtype=np.int64),
+            ra_exp=np.zeros(n_primary, dtype=np.float64),
+            rb_exp=np.zeros(n_primary, dtype=np.float64),
+            sign=np.zeros(n_primary, dtype=np.float64),
+        )
+
+    @staticmethod
+    def _compute_case_coefficients(
+        case: CaseCoefficients,
+        idx: int,
+        ell: int,
+        mp: int,
+        m: int,
+        sqrt_factor: float,
+        factorial: np.ndarray,
+        *,
+        is_case1: bool,
+    ) -> None:
+        """
+        Fill one Horner branch for a fixed ``(ell, mp, m)`` entry.
+
+        The closed-form quaternion Wigner formula is reorganized so that only the ratio
+        ``-(|Rb|/|Ra|)^2`` or ``-(|Ra|/|Rb|)^2`` enters the Horner chain. This avoids a
+        large family of per-entry runtime branches and keeps the generic path stable for
+        every ``ell``.
+        """
+        if is_case1:
+            rho_min = max(0, mp - m)
+            rho_max = min(ell + mp, ell - m)
+        else:
+            rho_min = max(0, -(mp + m))
+            rho_max = min(ell - m, ell - mp)
+
+        if rho_min > rho_max:
+            return
+
+        if is_case1:
+            binom1 = WignerDCalculator._binomial(ell + mp, rho_min, factorial)
+            binom2 = WignerDCalculator._binomial(ell - mp, ell - m - rho_min, factorial)
+        else:
+            binom1 = WignerDCalculator._binomial(ell + mp, ell - m - rho_min, factorial)
+            binom2 = WignerDCalculator._binomial(ell - mp, rho_min, factorial)
+        case.coeff[idx] = sqrt_factor * binom1 * binom2
+
+        poly_len = rho_max - rho_min + 1
+        case.poly_len[idx] = poly_len
+        for i, rho in enumerate(range(rho_max, rho_min, -1)):
+            if is_case1:
+                n1 = ell + mp - rho + 1
+                n2 = ell - m - rho + 1
+                d1 = rho
+                d2 = m - mp + rho
+            else:
+                n1 = ell - m - rho + 1
+                n2 = ell - mp - rho + 1
+                d1 = rho
+                d2 = mp + m + rho
+            if d1 != 0 and d2 != 0:
+                case.horner[idx, i] = (n1 * n2) / (d1 * d2)
+
+        if is_case1:
+            case.ra_exp[idx] = 2 * ell + mp - m - 2 * rho_min
+            case.rb_exp[idx] = m - mp + 2 * rho_min
+            case.sign[idx] = (-1) ** rho_min
+        else:
+            case.ra_exp[idx] = mp + m + 2 * rho_min
+            case.rb_exp[idx] = 2 * ell - mp - m - 2 * rho_min
+            case.sign[idx] = ((-1) ** (ell - m)) * ((-1) ** rho_min)
+
+    @staticmethod
+    def _finalize_case_coefficients(
+        case: CaseCoefficients,
+        max_poly_len: int,
+    ) -> None:
+        """Attach runtime-ready masks and fused coefficients for one Horner branch."""
+        step_count = np.clip(case.poly_len - 1, 0, None)
+        if max_poly_len > 1:
+            horner_step_mask = (
+                np.arange(max_poly_len - 1, dtype=case.poly_len.dtype)[None, :]
+                < step_count[:, None]
+            )
+        else:
+            horner_step_mask = np.zeros((case.poly_len.shape[0], 0), dtype=np.bool_)
+        case.valid_mask = case.poly_len > 0
+        case.horner_step_mask = horner_step_mask
+        case.signed_coeff = case.sign * case.coeff
+
+    @staticmethod
+    def _vectorized_horner(
+        ratio: Any,
+        horner_coeffs: Any,
+        horner_step_mask: Any,
+    ) -> Any:
+        """Evaluate many varying-length Horner chains in one batched loop."""
+        xp = array_api_compat.array_namespace(ratio)
+        device = array_api_compat.device(ratio)
+        n_batch = ratio.shape[0]
+        n_elements = horner_coeffs.shape[0]
+        result = xp.ones((n_batch, n_elements), dtype=ratio.dtype, device=device)
+        if horner_step_mask.shape[1] == 0:
+            return result
+        ratio = ratio[:, None]
+        for i in range(horner_step_mask.shape[1]):
+            new_result = 1.0 + result * (ratio * horner_coeffs[None, :, i])
+            result = xp.where(horner_step_mask[None, :, i], new_result, result)
+        return result
+
+    @staticmethod
+    def _compute_case_magnitude(
+        log_ra: Any,
+        log_rb: Any,
+        ratio: Any,
+        case: CaseCoefficients,
+    ) -> Any:
+        """Compute the real magnitude factor for one stable Horner branch."""
+        xp = array_api_compat.array_namespace(log_ra)
+        device = array_api_compat.device(log_ra)
+        horner_sum = WignerDCalculator._vectorized_horner(
+            ratio,
+            xp_asarray_nodetach(xp, case.horner, device=device),
+            xp_asarray_nodetach(xp, case.horner_step_mask, device=device),
+        )
+        ra_powers = xp.exp(
+            log_ra[:, None]
+            * xp_asarray_nodetach(xp, case.ra_exp, device=device)[None, :]
+        )
+        rb_powers = xp.exp(
+            log_rb[:, None]
+            * xp_asarray_nodetach(xp, case.rb_exp, device=device)[None, :]
+        )
+        signed_coeff = xp_asarray_nodetach(xp, case.signed_coeff, device=device)
+        magnitude = signed_coeff[None, :] * ra_powers * rb_powers
+        return magnitude * horner_sum
+
+    @staticmethod
+    def _build_complex_to_real_sh_block(ell: int) -> np.ndarray:
+        """
+        Build the complex-to-real basis transform for one ``ell`` block.
+
+        The packed real basis follows the SeZM convention
+        ``m = -ell, ..., +ell`` inside each block. This unitary transform defines the
+        real tesseral basis used by the packed ``D_full`` layout.
+        """
+        size = 2 * ell + 1
+        inv_sqrt2 = 1.0 / math.sqrt(2.0)
+        U = np.zeros((size, size), dtype=np.complex128)
+        for m in range(-ell, ell + 1):
+            row = m + ell
+            if m == 0:
+                U[row, ell] = 1.0
+            elif m > 0:
+                U[row, m + ell] = inv_sqrt2
+                U[row, -m + ell] = ((-1) ** m) * inv_sqrt2
+            else:
+                U[row, -m + ell] = -1j * inv_sqrt2
+                U[row, m + ell] = ((-1) ** m) * 1j * inv_sqrt2
+        return U
+
+    @staticmethod
+    def _precompute_real_basis_blocks(
+        *,
+        lmin: int,
+        lmax: int,
+    ) -> list[tuple[np.ndarray, np.ndarray]]:
+        """Precompute complex-to-real basis transforms for ``ell in [lmin, lmax]``."""
+        if lmin > lmax:
+            return []
+        blocks: list[tuple[np.ndarray, np.ndarray]] = []
+        for ell in range(lmin, lmax + 1):
+            U = WignerDCalculator._build_complex_to_real_sh_block(ell)
+            blocks.append((U.real.astype(np.float64), U.imag.astype(np.float64)))
+        return blocks
+
+    @staticmethod
+    def _assemble_block_diagonal_real_basis(
+        U_blocks: list[tuple[np.ndarray, np.ndarray]],
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Assemble per-``ell`` real-basis blocks into one block-diagonal transform."""
+        if not U_blocks:
+            empty = np.zeros((0, 0), dtype=np.float64)
+            return empty, empty, empty, empty
+
+        size = sum(U_re.shape[0] for U_re, _ in U_blocks)
+        U_re_full = np.zeros((size, size), dtype=np.float64)
+        U_im_full = np.zeros((size, size), dtype=np.float64)
+        offset = 0
+        for U_re, U_im in U_blocks:
+            block_size = U_re.shape[0]
+            block_end = offset + block_size
+            U_re_full[offset:block_end, offset:block_end] = U_re
+            U_im_full[offset:block_end, offset:block_end] = U_im
+            offset = block_end
+        return (
+            U_re_full,
+            U_im_full,
+            np.ascontiguousarray(U_re_full.T),
+            np.ascontiguousarray(U_im_full.T),
+        )
+
+    @staticmethod
+    def _quaternion_to_ra_rb_real(
+        q: Any,
+    ) -> tuple[Any, Any, Any, Any]:
+        """
+        Decompose quaternion components into the Cayley-Klein pair used by the generic path.
+
+        For ``q = (w, x, y, z)`` the SeZM real-basis convention is aligned by
+
+        ``Ra = w - i z`` and ``Rb = y - i x``.
+
+        This pairing matches the packed SeZM real spherical-harmonics ordering used by
+        the block-diagonal ``D_full`` layout.
+        """
+        w = q[..., 0]
+        x = q[..., 1]
+        y = q[..., 2]
+        z = q[..., 3]
+        return w, -z, y, -x
+
+    @staticmethod
+    def _precompute_wigner_coefficients(
+        lmax: int,
+        *,
+        lmin: int = 0,
+    ) -> WignerPolynomialCoefficients:
+        """
+        Precompute the generic quaternion Wigner coefficient tables.
+
+        The runtime path only performs batched Horner evaluation and symmetry scatter.
+        All factorial ratios, branch exponents, and packed matrix indices are resolved once
+        here, which keeps the forward path independent of ``ell`` and stable for arbitrary
+        ``lmax``.
+        """
+        if lmin < 0:
+            raise ValueError("`lmin` must be non-negative")
+        if lmax < lmin:
+            raise ValueError("`lmax` must be >= `lmin`")
+
+        factorial = WignerDCalculator._factorial_table(2 * lmax + 1)
+        n_total = sum((2 * ell + 1) ** 2 for ell in range(lmin, lmax + 1))
+        n_primary = sum(
+            1
+            for ell in range(lmin, lmax + 1)
+            for mp in range(-ell, ell + 1)
+            for m in range(-ell, ell + 1)
+            if mp + m > 0 or (mp + m == 0 and mp >= 0)
+        )
+        n_derived = n_total - n_primary
+        max_poly_len = lmax + 1
+        size = (lmax + 1) ** 2 - lmin * lmin
+
+        primary_row = np.zeros(n_primary, dtype=np.int64)
+        primary_col = np.zeros(n_primary, dtype=np.int64)
+        mp_plus_m = np.zeros(n_primary, dtype=np.float64)
+        m_minus_mp = np.zeros(n_primary, dtype=np.float64)
+        diagonal_mask = np.zeros(n_primary, dtype=np.bool_)
+        anti_diagonal_mask = np.zeros(n_primary, dtype=np.bool_)
+        special_2m = np.zeros(n_primary, dtype=np.float64)
+        anti_diag_sign = np.zeros(n_primary, dtype=np.float64)
+        case1 = WignerDCalculator._allocate_case_coeffs(
+            n_primary,
+            max_poly_len,
+        )
+        case2 = WignerDCalculator._allocate_case_coeffs(
+            n_primary,
+            max_poly_len,
+        )
+        derived_row = np.zeros(n_derived, dtype=np.int64)
+        derived_col = np.zeros(n_derived, dtype=np.int64)
+        derived_primary_idx = np.zeros(n_derived, dtype=np.int64)
+        derived_sign = np.zeros(n_derived, dtype=np.float64)
+
+        primary_map: dict[tuple[int, int], int] = {}
+        primary_idx = 0
+        block_start = 0
+        for ell in range(lmin, lmax + 1):
+            block_size = 2 * ell + 1
+            for mp_local in range(block_size):
+                mp = mp_local - ell
+                for m_local in range(block_size):
+                    m = m_local - ell
+                    row = block_start + mp_local
+                    col = block_start + m_local
+                    is_primary = (mp + m > 0) or (mp + m == 0 and mp >= 0)
+                    if not is_primary:
+                        continue
+
+                    primary_map[(row, col)] = primary_idx
+                    primary_row[primary_idx] = row
+                    primary_col[primary_idx] = col
+                    mp_plus_m[primary_idx] = mp + m
+                    m_minus_mp[primary_idx] = m - mp
+                    diagonal_mask[primary_idx] = mp == m
+                    anti_diagonal_mask[primary_idx] = mp == -m
+                    special_2m[primary_idx] = 2 * m
+                    anti_diag_sign[primary_idx] = (-1) ** (ell - m)
+
+                    sqrt_factor = math.sqrt(
+                        float(factorial[ell + m] * factorial[ell - m])
+                        / float(factorial[ell + mp] * factorial[ell - mp])
+                    )
+                    WignerDCalculator._compute_case_coefficients(
+                        case1,
+                        primary_idx,
+                        ell,
+                        mp,
+                        m,
+                        sqrt_factor,
+                        factorial,
+                        is_case1=True,
+                    )
+                    WignerDCalculator._compute_case_coefficients(
+                        case2,
+                        primary_idx,
+                        ell,
+                        mp,
+                        m,
+                        sqrt_factor,
+                        factorial,
+                        is_case1=False,
+                    )
+                    primary_idx += 1
+            block_start += block_size
+
+        derived_idx = 0
+        block_start = 0
+        for ell in range(lmin, lmax + 1):
+            block_size = 2 * ell + 1
+            for mp_local in range(block_size):
+                mp = mp_local - ell
+                for m_local in range(block_size):
+                    m = m_local - ell
+                    row = block_start + mp_local
+                    col = block_start + m_local
+                    is_primary = (mp + m > 0) or (mp + m == 0 and mp >= 0)
+                    if is_primary:
+                        continue
+
+                    derived_row[derived_idx] = row
+                    derived_col[derived_idx] = col
+                    derived_primary_idx[derived_idx] = primary_map[
+                        (block_start + (-mp + ell), block_start + (-m + ell))
+                    ]
+                    derived_sign[derived_idx] = (-1) ** (mp - m)
+                    derived_idx += 1
+            block_start += block_size
+
+        WignerDCalculator._finalize_case_coefficients(case1, max_poly_len)
+        WignerDCalculator._finalize_case_coefficients(case2, max_poly_len)
+
+        coeffs = WignerPolynomialCoefficients(
+            lmin=lmin,
+            lmax=lmax,
+            size=size,
+            max_poly_len=max_poly_len,
+            n_primary=n_primary,
+            n_derived=n_derived,
+            primary_row=primary_row,
+            primary_col=primary_col,
+            case1=case1,
+            case2=case2,
+            mp_plus_m=mp_plus_m,
+            m_minus_mp=m_minus_mp,
+            diagonal_mask=diagonal_mask,
+            anti_diagonal_mask=anti_diagonal_mask,
+            special_2m=special_2m,
+            anti_diag_sign=anti_diag_sign,
+            derived_row=derived_row,
+            derived_col=derived_col,
+            derived_primary_idx=derived_primary_idx,
+            derived_sign=derived_sign,
+        )
+
+        # Functional scatter index: maps each flat ``(row, col)`` of the packed
+        # ``(size, size)`` matrix to its source slot in
+        # ``concat([primary, derived, zero])``. Off-block positions point at the
+        # trailing zero slot. This replaces pt's in-place ``D[:, row, col] = value``
+        # scatter with an export-safe gather.
+        flat_to_src = np.full(size * size, n_primary + n_derived, dtype=np.int64)
+        flat_to_src[primary_row * size + primary_col] = np.arange(
+            n_primary, dtype=np.int64
+        )
+        flat_to_src[derived_row * size + derived_col] = n_primary + np.arange(
+            n_derived, dtype=np.int64
+        )
+        coeffs.flat_gather_idx = flat_to_src
+
+        return coeffs
+
+    @staticmethod
+    def _wigner_d_matrix_realpair(
+        ra_re: Any,
+        ra_im: Any,
+        rb_re: Any,
+        rb_im: Any,
+        coeffs: WignerPolynomialCoefficients,
+        *,
+        dtype: Any = None,
+    ) -> tuple[Any, Any]:
+        """
+        Evaluate the complex Wigner blocks in real/imaginary form.
+
+        The runtime path uses only real arithmetic. The complex phase is represented by
+        two real tensors, while the polynomial and magnitude algebra is evaluated in
+        ``fp64`` before the result is cast back to the requested output dtype.
+        """
+        xp = array_api_compat.array_namespace(ra_re)
+        device = array_api_compat.device(ra_re)
+        n_batch = ra_re.shape[0]
+        output_dtype = ra_re.dtype if dtype is None else dtype
+        if coeffs.size == 0:
+            zeros = xp.zeros((n_batch, 0, 0), dtype=output_dtype, device=device)
+            return zeros, zeros
+
+        f64 = xp.float64
+        ra_re = xp.astype(ra_re, f64)
+        ra_im = xp.astype(ra_im, f64)
+        rb_re = xp.astype(rb_re, f64)
+        rb_im = xp.astype(rb_im, f64)
+
+        def cv(arr: np.ndarray) -> Any:
+            return xp_asarray_nodetach(xp, arr, device=device)
+
+        eps = float(np.finfo(np.float64).eps)
+        eps_sq = eps * eps
+        ra_sq = ra_re * ra_re + ra_im * ra_im
+        rb_sq = rb_re * rb_re + rb_im * rb_im
+        ra_small = ra_sq <= eps_sq
+        rb_small = rb_sq <= eps_sq
+        ra = xp.sqrt(xp.clip(ra_sq, min=eps_sq))
+        rb = xp.sqrt(xp.clip(rb_sq, min=eps_sq))
+        general_mask = ~ra_small & ~rb_small
+        use_case1 = (ra >= rb) & general_mask
+        use_case2 = (ra < rb) & general_mask
+
+        safe_ra_re = xp.where(ra_small, xp.ones_like(ra_re), ra_re)
+        safe_ra_im = xp.where(ra_small, xp.zeros_like(ra_im), ra_im)
+        safe_rb_re = xp.where(rb_small, xp.ones_like(rb_re), rb_re)
+        safe_rb_im = xp.where(rb_small, xp.zeros_like(rb_im), rb_im)
+        phia = xp.atan2(safe_ra_im, safe_ra_re)
+        phib = xp.atan2(safe_rb_im, safe_rb_re)
+
+        phase = (
+            phia[:, None] * cv(coeffs.mp_plus_m)[None, :]
             + phib[:, None] * cv(coeffs.m_minus_mp)[None, :]
         )
         exp_phase_re = xp.cos(phase)
@@ -868,84 +1721,98 @@ def cv(arr: np.ndarray) -> Any:  # constant table -> xp on input device
         result_re = xp.where(diag_mask, diag_re, result_re)
         result_im = xp.where(diag_mask, diag_im, result_im)
 
-        for case, case_rows, ratio in (
-            (
-                coeffs.case1,
-                use_case1,
-                -(rb * rb) / (safe_ra * safe_ra),
-            ),
-            (
-                coeffs.case2,
-                use_case2,
-                -(ra * ra) / (safe_rb * safe_rb),
-            ),
-        ):
-            magnitude = self._compute_case_magnitude(
-                xp,
-                xp.where(case_rows, log_ra, xp.zeros_like(log_ra)),
-                xp.where(case_rows, log_rb, xp.zeros_like(log_rb)),
-                xp.where(case_rows, ratio, xp.zeros_like(ratio)),
-                case,
-                device,
-            )
-            val_re = magnitude * exp_phase_re
-            val_im = magnitude * exp_phase_im
-            mask = case_rows[:, None] & cv(case.valid_mask)[None, :]
-            result_re = xp.where(mask, val_re, result_re)
-            result_im = xp.where(mask, val_im, result_im)
-
-        # Functional scatter into the dense packed matrix: derive the
-        # symmetry-completed entries by gather, then place primary + derived
-        # values with one precomputed take index (zero slot for off-block).
-        derived_idx = cv(coeffs.derived_primary_idx)
+        ratio1 = -(rb * rb) / (safe_ra * safe_ra)
+        case1_rows = use_case1
+        magnitude1 = WignerDCalculator._compute_case_magnitude(
+            xp.where(case1_rows, log_ra, xp.zeros_like(log_ra)),
+            xp.where(case1_rows, log_rb, xp.zeros_like(log_rb)),
+            xp.where(case1_rows, ratio1, xp.zeros_like(ratio1)),
+            coeffs.case1,
+        )
+        val1_re = magnitude1 * exp_phase_re
+        val1_im = magnitude1 * exp_phase_im
+        mask1 = case1_rows[:, None] & cv(coeffs.case1.valid_mask)[None, :]
+        result_re = xp.where(mask1, val1_re, result_re)
+        result_im = xp.where(mask1, val1_im, result_im)
+
+        ratio2 = -(ra * ra) / (safe_rb * safe_rb)
+        case2_rows = use_case2
+        magnitude2 = WignerDCalculator._compute_case_magnitude(
+            xp.where(case2_rows, log_ra, xp.zeros_like(log_ra)),
+            xp.where(case2_rows, log_rb, xp.zeros_like(log_rb)),
+            xp.where(case2_rows, ratio2, xp.zeros_like(ratio2)),
+            coeffs.case2,
+        )
+        val2_re = magnitude2 * exp_phase_re
+        val2_im = magnitude2 * exp_phase_im
+        mask2 = case2_rows[:, None] & cv(coeffs.case2.valid_mask)[None, :]
+        result_re = xp.where(mask2, val2_re, result_re)
+        result_im = xp.where(mask2, val2_im, result_im)
+
+        # Symmetry completion + scatter as one functional gather (see
+        # ``_precompute_wigner_coefficients`` for ``flat_gather_idx``).
+        derived_primary_idx = cv(coeffs.derived_primary_idx)
         derived_sign = cv(coeffs.derived_sign)
-        primary_re = xp.take(result_re, derived_idx, axis=1)
-        primary_im = xp.take(result_im, derived_idx, axis=1)
+        primary_re = xp.take(result_re, derived_primary_idx, axis=1)
+        primary_im = xp.take(result_im, derived_primary_idx, axis=1)
         derived_re = derived_sign[None, :] * primary_re
         derived_im = -derived_sign[None, :] * primary_im
         zero_col = xp.zeros((n_batch, 1), dtype=f64, device=device)
         flat_idx = cv(coeffs.flat_gather_idx)
         D_re = xp.reshape(
             xp.take(
-                xp.concat([result_re, derived_re, zero_col], axis=1), flat_idx, axis=1
+                xp.concat([result_re, derived_re, zero_col], axis=1),
+                flat_idx,
+                axis=1,
             ),
             (n_batch, coeffs.size, coeffs.size),
         )
         D_im = xp.reshape(
             xp.take(
-                xp.concat([result_im, derived_im, zero_col], axis=1), flat_idx, axis=1
+                xp.concat([result_im, derived_im, zero_col], axis=1),
+                flat_idx,
+                axis=1,
             ),
             (n_batch, coeffs.size, coeffs.size),
         )
-        return xp.astype(D_re, out_dtype), xp.astype(D_im, out_dtype)
+        return xp.astype(D_re, output_dtype), xp.astype(D_im, output_dtype)
 
     @staticmethod
-    def _compute_case_magnitude(
-        xp: Any,
-        log_ra: Any,
-        log_rb: Any,
-        ratio: Any,
-        case: _CaseTables,
-        device: Any,
+    def _wigner_d_pair_to_real(
+        D_re: Any,
+        D_im: Any,
+        U_blocks: list[tuple[np.ndarray, np.ndarray]] | tuple[Any, Any, Any, Any],
+        *,
+        lmax: int,
+        lmin: int,
     ) -> Any:
-        """Compute the real magnitude factor for one stable Horner branch."""
-        horner_sum = _vectorized_horner(
-            xp,
-            ratio,
-            xp_asarray_nodetach(xp, case.horner, device=device),
-            xp_asarray_nodetach(xp, case.horner_step_mask, device=device),
-        )
-        ra_powers = xp.exp(
-            log_ra[:, None]
-            * xp_asarray_nodetach(xp, case.ra_exp, device=device)[None, :]
-        )
-        rb_powers = xp.exp(
-            log_rb[:, None]
-            * xp_asarray_nodetach(xp, case.rb_exp, device=device)[None, :]
-        )
-        signed_coeff = xp_asarray_nodetach(xp, case.signed_coeff, device=device)
-        magnitude = signed_coeff[None, :] * ra_powers * rb_powers
-        return magnitude * horner_sum
+        """
+        Convert complex Wigner blocks to the current real packed basis.
+
+        Each block applies the SeZM complex-to-real basis transform for its degree.
+        This preserves the packed ``(l, m)`` contract of ``D_full`` and ``Dt_full``.
+        """
+        xp = array_api_compat.array_namespace(D_re)
+        device = array_api_compat.device(D_re)
+        n_batch = D_re.shape[0]
+        if lmin > lmax:
+            return xp.zeros((n_batch, 0, 0), dtype=D_re.dtype, device=device)
+
+        if isinstance(U_blocks, list):
+            U_re, U_im, U_re_t, U_im_t = (
+                WignerDCalculator._assemble_block_diagonal_real_basis(U_blocks)
+            )
+        else:
+            U_re, U_im, U_re_t, U_im_t = U_blocks
+
+        U_re = xp_asarray_nodetach(xp, U_re, dtype=D_re.dtype, device=device)
+        U_im = xp_asarray_nodetach(xp, U_im, dtype=D_re.dtype, device=device)
+        U_re_t = xp_asarray_nodetach(xp, U_re_t, dtype=D_re.dtype, device=device)
+        U_im_t = xp_asarray_nodetach(xp, U_im_t, dtype=D_re.dtype, device=device)
+
+        temp_re = xp.matmul(D_re, U_re_t) + xp.matmul(D_im, U_im_t)
+        temp_im = xp.matmul(D_im, U_re_t) - xp.matmul(D_re, U_im_t)
+        return xp.matmul(U_re, temp_re) - xp.matmul(U_im, temp_im)
 
     def serialize(self) -> dict[str, Any]:
         """Serialize WignerDCalculator (lmax and precision are stored by parent)."""
diff --git a/deepmd/pt/model/descriptor/sezm.py b/deepmd/pt/model/descriptor/sezm.py
index eba6ae5b4e..df4ade51bc 100644
--- a/deepmd/pt/model/descriptor/sezm.py
+++ b/deepmd/pt/model/descriptor/sezm.py
@@ -172,6 +172,24 @@ class DescrptSeZM(BaseDescriptor, nn.Module):
         If True, apply a random roll about the edge-aligned local ``+Z`` axis
         before building the Wigner-D blocks. The roll is sampled independently
         per edge and per forward call.
+    edge_cartesian
+        If True, every block whose message-passing degree is ``1`` or ``2``
+        replaces its per-edge SO(2) rotation-frame tensor product with the
+        equivalent global-frame Cartesian rank-2 tensor product, removing the two
+        per-edge Wigner-D rotations. Blocks with degree ``0`` or ``>= 3`` keep
+        the SO(2) path. When every block takes the Cartesian path the full
+        Wigner-D construction is skipped automatically, and the geometric initial
+        embedding falls back to the zonal coupling.
+    node_cartesian
+        Per-node global-frame Cartesian rank-2 tensor product on the aggregated
+        message, applied in every block whose message-passing degree is ``1`` or
+        ``2``. Configured by a ``"<mode>:<layers>"`` string where ``mode`` is
+        ``"default"`` (one-sided product) or ``"parity"`` (symmetrized product)
+        and ``layers`` is the stack depth; a bare integer ``N`` is shorthand for
+        ``"default:N"``, and ``"none"`` disables it. Orthogonal to
+        ``edge_cartesian``: either, both, or neither may be set. Unlike
+        ``edge_cartesian`` it does not affect the Wigner-D construction, since the
+        per-edge message path is left unchanged.
     lmax
         Maximum degree, only used when `l_schedule` is None.
     l_schedule
@@ -198,8 +216,12 @@ class DescrptSeZM(BaseDescriptor, nn.Module):
     so2_norm
         If True, apply intermediate ReducedEquivariantRMSNorm between SO(2) mixing layers.
         When False (default), no normalization is applied between layers.
-    so2_layers
-        Number of SO(2) mixing layers per block.
+    mixing_layers
+        Number of learnable mixing layers in the per-edge message core of each
+        block (legacy alias: ``so2_layers``). ``0`` applies only the
+        edge-condition modulation: the rotation-free per-degree radial scaling on
+        the SO(2) path, or a single ``x @ T_e`` when ``edge_cartesian`` applies.
+        The per-node ``node_cartesian`` stack carries its own independent depth.
     so2_attn_res
         SO(2)-internal depth-wise attention residual mode inside each interaction
         block. Must be one of ``"none"``, ``"independent"``, or ``"dependent"``.
@@ -207,7 +229,9 @@ class DescrptSeZM(BaseDescriptor, nn.Module):
         Dynamic radial degree mixer mode inside SO(2) convolution. ``"none"``
         applies elementwise radial modulation, ``"degree"`` uses a
         channel-shared edge-conditioned cross-degree kernel, and
-        ``"degree_channel"`` uses a per-channel cross-degree kernel.
+        ``"degree_channel"`` uses a per-channel cross-degree kernel. Has no
+        effect on blocks taking the Cartesian path (``edge_cartesian`` with
+        degree 1 or 2), where the dynamic radial degree mixer is bypassed.
     radial_so2_rank
         Low-rank channel factorization rank for
         ``radial_so2_mode="degree_channel"``. ``0`` uses the full
@@ -402,6 +426,8 @@ def __init__(
         radial_mlp: list[int] | None = None,
         use_env_seed: bool = True,
         random_gamma: bool = True,
+        edge_cartesian: bool = False,
+        node_cartesian: str | int = "none",
         lmax: int = 3,
         l_schedule: list[int] | None = None,
         mmax: int | None = 1,
@@ -410,7 +436,8 @@ def __init__(
         extra_node_l: int = 0,
         n_blocks: int = 3,
         so2_norm: bool = False,
-        so2_layers: int = 4,
+        mixing_layers: int = 4,
+        so2_layers: int | None = None,
         so2_attn_res: str = "none",
         radial_so2_mode: str = "degree_channel",
         radial_so2_rank: int = 1,
@@ -570,6 +597,8 @@ def __init__(
         self.trainable = bool(trainable)
         self.seed = seed
         self.random_gamma = bool(random_gamma)
+        self.edge_cartesian = bool(edge_cartesian)
+        self.node_cartesian = str(node_cartesian)
         self.add_chg_spin_ebd = bool(add_chg_spin_ebd)
         if default_chg_spin is not None and len(default_chg_spin) != 2:
             raise ValueError("`default_chg_spin` must contain [charge, spin].")
@@ -640,7 +669,9 @@ def __init__(
         self.rad_sizes_per_block = [l + 1 for l in self.l_schedule]
 
         self.so2_norm = bool(so2_norm)
-        self.so2_layers = int(so2_layers)
+        # ``so2_layers`` is the legacy alias for ``mixing_layers``; when supplied
+        # it takes precedence so existing configs keep working.
+        self.mixing_layers = int(mixing_layers if so2_layers is None else so2_layers)
         self.so2_attn_res_mode = str(so2_attn_res).lower()
         if self.so2_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
@@ -827,12 +858,21 @@ def __init__(
         # === C^3 cutoff envelope for edge weight ===
         self.edge_envelope = C3CutoffEnvelope(rcut=self.rcut, exponent=self.env_exp[1])
 
-        wigner_lmax = self.l_schedule[0]
-        # force fp32+
+        # === Edge-aligned Wigner-D calculator ===
+        # Cartesian blocks (degree 1 or 2) skip the SO(2) rotations, so the full
+        # per-edge Wigner-D blocks are built only when a block keeps the SO(2)
+        # path (tracked by ``_need_full_wigner``).
+        block_edge_cartesian = [
+            self.edge_cartesian and l_b in (1, 2) for l_b in self.l_schedule
+        ]
+        block_node_cartesian = [
+            self.node_cartesian if l_b in (1, 2) else "none" for l_b in self.l_schedule
+        ]
+        self._need_full_wigner = not all(block_edge_cartesian)
         self.wigner_calc = WignerDCalculator(
-            lmax=wigner_lmax,
+            lmax=self.l_schedule[0],
             eps=self.eps,
-            dtype=self.compute_dtype,
+            dtype=self.compute_dtype,  # force fp32+
         )
 
         self.use_gie = self.use_env_seed and self.node_l_schedule[0] > 0
@@ -875,10 +915,12 @@ def __init__(
                     n_focus=self.n_focus,
                     focus_dim=self.focus_dim,
                     so2_norm=self.so2_norm,
-                    so2_layers=self.so2_layers,
+                    mixing_layers=self.mixing_layers,
                     so2_attn_res=self.so2_attn_res_mode,
                     radial_so2_mode=self.radial_so2_mode,
                     radial_so2_rank=self.radial_so2_rank,
+                    edge_cartesian=block_edge_cartesian[block_idx],
+                    node_cartesian=block_node_cartesian[block_idx],
                     ffn_neurons=self.block_ffn_neurons,
                     node_wise_grid_mlp=self.node_wise_grid_mlp,
                     node_wise_grid_branch=self.node_wise_grid_branch,
@@ -971,11 +1013,11 @@ def __init__(
         for p in self.parameters():
             p.requires_grad = self.trainable
 
-        # Pre-allocate empty tensor for interface compatibility (torch.compile + DDP)
+        # Pre-allocate empty tensor for interface compatibility (torch.compile + DDP).
         self.register_buffer(
             "_empty_tensor",
             torch.empty(0, device=env.DEVICE, dtype=env.GLOBAL_PT_FLOAT_PRECISION),
-            persistent=True,
+            persistent=False,
         )
 
         # === Statistics buffers (interface compatibility) ===
@@ -1148,6 +1190,7 @@ def forward(
                 # the model is roll-equivariant, so inference fixes gamma.
                 random_gamma=self.random_gamma and self.training,
                 wigner_calc=self.wigner_calc,
+                build_wigner=self._need_full_wigner,
             )
 
         ebed_dim_0 = self.node_ebed_dims[0]  # (node_lmax+1)^2
@@ -1379,6 +1422,7 @@ def forward_with_edges(
                 # the model is roll-equivariant, so inference fixes gamma.
                 random_gamma=self.random_gamma and self.training,
                 wigner_calc=self.wigner_calc,
+                build_wigner=self._need_full_wigner,
             )
 
         ebed_dim_0 = self.node_ebed_dims[0]  # (node_lmax+1)^2
@@ -1598,6 +1642,31 @@ def node_l0_extractor(v: torch.Tensor) -> torch.Tensor:
         ).to(dtype=self.dtype)
         return x
 
+    def _edge_quaternion(self, edge_cache: EdgeFeatureCache) -> torch.Tensor:
+        """
+        Return the cached global->local edge quaternion, rebuilding if absent.
+
+        Parameters
+        ----------
+        edge_cache : EdgeFeatureCache
+            Per-edge cache. ``edge_quat`` is populated by the cache builder; the
+            fallback covers caches produced without it.
+
+        Returns
+        -------
+        torch.Tensor
+            Unit quaternions with shape (E, 4).
+        """
+        edge_quat = edge_cache.edge_quat
+        if edge_quat is None:
+            edge_len = safe_norm(edge_cache.edge_vec, self.eps)
+            edge_quat = build_edge_quaternion(
+                edge_cache.edge_vec,
+                edge_len=edge_len,
+                eps=self.eps,
+            )
+        return edge_quat
+
     def _build_gie_zonal_coupling(
         self,
         edge_cache: EdgeFeatureCache,
@@ -1608,9 +1677,15 @@ def _build_gie_zonal_coupling(
         Returns
         -------
         torch.Tensor or None
-            Coupling with shape ``(E, D_node - 1)`` when ``extra_node_l > 0``;
-            otherwise None, letting GIE gather from the MP Wigner-D cache.
+            Coupling with shape ``(E, D_node - 1)``. ``None`` is returned only
+            when the full Wigner-D blocks are present and ``extra_node_l == 0``,
+            in which case GIE gathers the coupling from the cache directly. When
+            the blocks are skipped (all-Cartesian model) the full coupling is
+            reconstructed from the edge quaternion via the m=0-only path.
         """
+        if edge_cache.Dt_full is None:
+            calc = self.gie_zonal_wigner_calc or self.wigner_calc
+            return calc.forward_zonal(self._edge_quaternion(edge_cache), lmin=1)
         if self.gie_zonal_wigner_calc is None:
             return None
         mp_row_count = self.ebed_dims[0] - 1
@@ -1621,16 +1696,8 @@ def _build_gie_zonal_coupling(
             mp_row_index,
             mp_m0_col_index,
         ]
-        edge_quat = edge_cache.edge_quat
-        if edge_quat is None:
-            edge_len = safe_norm(edge_cache.edge_vec, self.eps)
-            edge_quat = build_edge_quaternion(
-                edge_cache.edge_vec,
-                edge_len=edge_len,
-                eps=self.eps,
-            )
         extra_coupling = self.gie_zonal_wigner_calc.forward_zonal(
-            edge_quat,
+            self._edge_quaternion(edge_cache),
             lmin=self.lmax + 1,
         )
         return torch.cat([mp_coupling, extra_coupling], dim=1)
@@ -2160,8 +2227,10 @@ def serialize(self) -> dict[str, Any]:
                 "radial_mlp": self.radial_mlp,
                 "use_env_seed": self.use_env_seed,
                 "random_gamma": self.random_gamma,
+                "edge_cartesian": self.edge_cartesian,
+                "node_cartesian": self.node_cartesian,
                 "so2_norm": self.so2_norm,
-                "so2_layers": self.so2_layers,
+                "mixing_layers": self.mixing_layers,
                 "so2_attn_res": self.so2_attn_res_mode,
                 "radial_so2_mode": self.radial_so2_mode,
                 "radial_so2_rank": self.radial_so2_rank,
diff --git a/deepmd/pt/model/descriptor/sezm_nn/__init__.py b/deepmd/pt/model/descriptor/sezm_nn/__init__.py
index a834ea1cd2..ddbf9c959b 100644
--- a/deepmd/pt/model/descriptor/sezm_nn/__init__.py
+++ b/deepmd/pt/model/descriptor/sezm_nn/__init__.py
@@ -19,6 +19,12 @@
 from .block import (
     SeZMInteractionBlock,
 )
+from .cartesian import (
+    EdgeCartesianTensorProduct,
+    NodeCartesianTensorProduct,
+    build_cartesian_basis,
+    build_edge_cartesian_tensors,
+)
 from .dens import (
     ForceEmbedding,
     SeZMDenoisingHead,
@@ -135,6 +141,7 @@
     "ChargeSpinEmbedding",
     "DepthAttnRes",
     "DynamicRadialDegreeMixer",
+    "EdgeCartesianTensorProduct",
     "EdgeFeatureCache",
     "EnvironmentInitialEmbedding",
     "EquivariantFFN",
@@ -148,6 +155,7 @@
     "InnerClamp",
     "LoRASO2",
     "LoRASO3",
+    "NodeCartesianTensorProduct",
     "RMSNorm",
     "RadialBasis",
     "RadialMLP",
@@ -168,8 +176,10 @@
     "SwiGLU",
     "WignerDCalculator",
     "apply_lora_to_sezm",
+    "build_cartesian_basis",
     "build_edge_cache",
     "build_edge_cache_from_edges",
+    "build_edge_cartesian_tensors",
     "build_edge_quaternion",
     "build_edge_type_feat",
     "build_gie_zonal_index",
diff --git a/deepmd/pt/model/descriptor/sezm_nn/block.py b/deepmd/pt/model/descriptor/sezm_nn/block.py
index c38fc78a5b..825ff2a5e3 100644
--- a/deepmd/pt/model/descriptor/sezm_nn/block.py
+++ b/deepmd/pt/model/descriptor/sezm_nn/block.py
@@ -154,8 +154,9 @@ class SeZMInteractionBlock(nn.Module):
     so2_norm
         If True, apply intermediate ReducedEquivariantRMSNorm between SO(2) mixing layers.
         When False (default), no normalization is applied between layers.
-    so2_layers
-        Number of SO(2) mixing layers.
+    mixing_layers
+        Number of learnable mixing layers in the per-edge message core. ``0``
+        applies only the edge-condition modulation.
     so2_attn_res
         Depth-wise attention residual mode across the internal SO(2) layer
         history. Must be one of ``"none"``, ``"independent"``, or
@@ -169,6 +170,16 @@ class SeZMInteractionBlock(nn.Module):
         Low-rank channel factorization rank for
         ``radial_so2_mode="degree_channel"``. ``0`` uses the full
         per-channel dynamic degree kernel.
+    edge_cartesian
+        If True, replace the per-edge SO(2) rotation-frame tensor product inside
+        ``SO2Convolution`` with the global-frame Cartesian rank-2 tensor
+        product. Requires ``lmax`` in ``{1, 2}``.
+    node_cartesian
+        Per-node global-frame Cartesian rank-2 tensor product on the aggregated
+        message inside ``SO2Convolution``, configured by a ``"<mode>:<layers>"``
+        string (``mode`` is ``"default"`` or ``"parity"``); a bare integer ``N``
+        is shorthand for ``"default:N"``, and ``"none"`` disables it. Requires
+        ``lmax`` in ``{1, 2}`` and is orthogonal to ``edge_cartesian``.
     n_atten_head
         Number of attention heads when aggregating messages in SO(2) convolution.
         0 means no attention is used; >0 enables envelope-gated grouped softmax
@@ -289,10 +300,12 @@ def __init__(
         focus_dim: int = 0,
         focus_compete: bool = True,
         so2_norm: bool = False,
-        so2_layers: int = 4,
+        mixing_layers: int = 4,
         so2_attn_res: str = "none",
         radial_so2_mode: str = "none",
         radial_so2_rank: int = 0,
+        edge_cartesian: bool = False,
+        node_cartesian: str | int = "none",
         n_atten_head: int = 1,
         atten_f_mix: bool = False,
         atten_v_proj: bool = False,
@@ -354,7 +367,7 @@ def __init__(
             raise ValueError("`focus_dim` must be >= 0")
         self.focus_compete = bool(focus_compete)
         self.so2_norm = bool(so2_norm)
-        self.so2_layers = int(so2_layers)
+        self.mixing_layers = int(mixing_layers)
         self.so2_attn_res_mode = str(so2_attn_res).lower()
         if self.so2_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
@@ -362,6 +375,8 @@ def __init__(
             )
         self.radial_so2_mode = str(radial_so2_mode).lower()
         self.radial_so2_rank = int(radial_so2_rank)
+        self.edge_cartesian = bool(edge_cartesian)
+        self.node_cartesian = str(node_cartesian)
         self.n_atten_head = int(n_atten_head)
         self.atten_f_mix = bool(atten_f_mix)
         self.use_atten_v_proj = bool(atten_v_proj)
@@ -463,10 +478,12 @@ def __init__(
             focus_dim=self.focus_dim,
             focus_compete=self.focus_compete,
             so2_norm=self.so2_norm,
-            so2_layers=self.so2_layers,
+            mixing_layers=self.mixing_layers,
             so2_attn_res=self.so2_attn_res_mode,
             radial_so2_mode=self.radial_so2_mode,
             radial_so2_rank=self.radial_so2_rank,
+            edge_cartesian=self.edge_cartesian,
+            node_cartesian=self.node_cartesian,
             layer_scale=self.layer_scale,
             n_atten_head=n_atten_head,
             atten_f_mix=self.atten_f_mix,
@@ -1021,10 +1038,12 @@ def serialize(self) -> dict[str, Any]:
                 "focus_dim": self.focus_dim,
                 "focus_compete": self.focus_compete,
                 "so2_norm": self.so2_norm,
-                "so2_layers": self.so2_layers,
+                "mixing_layers": self.mixing_layers,
                 "so2_attn_res": self.so2_attn_res_mode,
                 "radial_so2_mode": self.radial_so2_mode,
                 "radial_so2_rank": self.radial_so2_rank,
+                "edge_cartesian": self.edge_cartesian,
+                "node_cartesian": self.node_cartesian,
                 "n_atten_head": self.n_atten_head,
                 "atten_f_mix": self.atten_f_mix,
                 "atten_v_proj": self.use_atten_v_proj,
diff --git a/deepmd/pt/model/descriptor/sezm_nn/cartesian.py b/deepmd/pt/model/descriptor/sezm_nn/cartesian.py
new file mode 100644
index 0000000000..b01ca0c697
--- /dev/null
+++ b/deepmd/pt/model/descriptor/sezm_nn/cartesian.py
@@ -0,0 +1,596 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""
+Cartesian rank-2 tensor-product mixers for SeZM.
+
+For message-passing degree ``lmax <= 2`` the per-channel spherical-harmonic
+feature ``(l = 0, 1, 2)`` is isomorphic to a rank-2 Cartesian tensor (a ``3x3``
+matrix) that decomposes into a scalar (the trace), a vector (the antisymmetric
+part), and a symmetric-traceless tensor. A matrix product of two such tensors
+mixes these irreducible components while staying SO(3)-equivariant in the global
+frame, because ``(R X R^T)(R Y R^T) = R (X Y) R^T`` for any rotation ``R``. This
+replaces the rotate-to-local / ``SO2Linear`` stack / rotate-back core of
+:class:`SO2Convolution` without constructing any Wigner-D rotation.
+
+Two placements share the same scaffold (a per-degree channel linear, a gated
+nonlinearity, and a residual stack) but differ in the right operand of the
+``3x3`` product:
+
+* :class:`EdgeCartesianTensorProduct` runs per edge, before aggregation. The
+  right operand is the edge tensor
+  ``T_e = f_iso I + f_aniso A(r_hat) + f_sym S(r_hat)``, whose per-degree radial
+  weights ``f_*`` carry the edge condition. Because ``T_e`` depends only on the
+  edge direction it is shared across channels, so the product is evaluated
+  through channel-shared packed operators (below) without materializing any
+  ``3x3`` matrix per channel. With ``n_layers = 0`` the message is the single
+  modulation ``x @ T_e`` (no learnable channel-mixing layers); ``n_layers > 0``
+  refines it with the residual stack.
+* :class:`NodeCartesianTensorProduct` runs per node, after aggregation. It
+  couples the aggregated message with the destination node feature through the
+  product of ``linear(message)`` with ``node`` lifted by the orthonormal basis,
+  serving as the Cartesian counterpart of the ``message_node`` grid product.
+  Both operands are per-channel, so the product is the literal ``3x3`` form. The
+  one-sided product ``linear(message) @ node`` is SO(3)-equivariant; the
+  symmetrized product ``linear(message) @ node + node @ linear(message)``
+  additionally preserves the parity of each irreducible component.
+
+Placing the product per node makes its cost scale with the number of nodes
+rather than the number of edges, which is the regime where the Cartesian form is
+cheaper than the per-edge SO(2) rotation.
+
+Channel-shared edge evaluation
+------------------------------
+A literal ``to_cart -> Y @ T_e -> from_cart`` round trip materializes a ``3x3``
+matrix for every (edge, channel) pair and runs both basis changes once per
+layer, which is memory-bandwidth and kernel-launch bound. Instead, for a fixed
+edge the map ``y -> from_cart(to_cart(y) @ T_e)`` is linear in the packed
+coefficient ``y`` and splits, by linearity of ``T_e``, into
+
+    m = (f_iso / sqrt(3)) y + f_aniso (K_A y) + f_sym (K_S y),
+
+where ``K_A`` and ``K_S`` are ``(D, D)`` packed-basis operators for
+"right-multiply by ``A(r_hat)`` / ``S(r_hat)``". They depend only on the edge
+direction, hence are shared across channels: building them once per edge turns
+the per-layer geometry into a single channel-batched ``bmm(K, y)`` instead of
+two per-channel basis changes plus a per-channel ``3x3`` product. The identity
+component collapses to a scalar rescaling because the basis is orthonormal
+(``<B_p, B_d> = delta_{pd}``).
+
+With ``B`` the orthonormal packed-to-Cartesian basis, the projection from an
+edge component ``G`` to its packed right-multiply operator is
+``K_G[p, d] = sum_{k,j} W[p, d, k, j] G[k, j]`` with the fixed tensor
+``W[p, d, k, j] = sum_i B[p, i, j] B[d, i, k]``. The per-degree overall scale of
+``B`` is arbitrary (absorbed by the learnable layers), so it is chosen
+orthonormal for an exact, transpose-free round trip.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import math
+from typing import (
+    TYPE_CHECKING,
+)
+
+import torch
+import torch.nn as nn
+
+from deepmd.dpmodel.utils.seed import (
+    child_seed,
+)
+from deepmd.pt.utils import (
+    env,
+)
+
+from .activation import (
+    GatedActivation,
+)
+from .indexing import (
+    get_so3_dim_of_lmax,
+)
+from .so3 import (
+    SO3Linear,
+)
+from .utils import (
+    get_promoted_dtype,
+    safe_norm,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+    )
+
+
+def build_cartesian_basis(
+    lmax: int,
+    *,
+    dtype: torch.dtype,
+    device: torch.device,
+) -> torch.Tensor:
+    """
+    Build the orthonormal ``3x3`` basis aligned with the SeZM packed (l, m) layout.
+
+    Entry ``basis[d]`` is the Cartesian image of the d-th packed
+    spherical-harmonic coefficient, ordered ``l = 0``, ``l = 1`` (``m = -1, 0,
+    +1``), ``l = 2`` (``m = -2 .. +2``). The basis is orthonormal under the
+    Frobenius inner product; the inverse map reuses the same basis and the
+    coefficient round trip is exact.
+
+    The convention-critical part is the within-degree sign and ordering: it
+    matches the SeZM ``WignerDCalculator`` (``l = 1`` follows its
+    ``l1_perm``/``l1_sign``), which makes the basis intertwine the packed
+    Wigner-D rotation with the Cartesian rotation ``X -> R X R^T``. The
+    per-degree overall scale is free and absorbed by the learnable layers.
+
+    Parameters
+    ----------
+    lmax : int
+        Message-passing degree, must be 1 or 2.
+    dtype : torch.dtype
+        Output dtype.
+    device : torch.device
+        Output device.
+
+    Returns
+    -------
+    torch.Tensor
+        Basis with shape ``(D, 3, 3)`` where ``D = (lmax + 1) ** 2``.
+
+    Raises
+    ------
+    ValueError
+        If ``lmax`` is not 1 or 2.
+    """
+    if lmax not in (1, 2):
+        raise ValueError("Cartesian tensor product requires lmax in {1, 2}")
+    a = 1.0 / math.sqrt(2.0)
+    b = 1.0 / math.sqrt(3.0)
+    c = 1.0 / math.sqrt(6.0)
+    matrices: list[list[list[float]]] = [
+        # l = 0 : isotropic (trace)
+        [[b, 0.0, 0.0], [0.0, b, 0.0], [0.0, 0.0, b]],
+        # l = 1 : antisymmetric, m = -1, 0, +1
+        [[0.0, 0.0, -a], [0.0, 0.0, 0.0], [a, 0.0, 0.0]],
+        [[0.0, a, 0.0], [-a, 0.0, 0.0], [0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 0.0], [0.0, 0.0, -a], [0.0, a, 0.0]],
+    ]
+    if lmax == 2:
+        matrices += [
+            # l = 2 : symmetric traceless, m = -2 .. +2
+            [[0.0, -a, 0.0], [-a, 0.0, 0.0], [0.0, 0.0, 0.0]],
+            [[0.0, 0.0, 0.0], [0.0, 0.0, a], [0.0, a, 0.0]],
+            [[-c, 0.0, 0.0], [0.0, -c, 0.0], [0.0, 0.0, 2.0 * c]],
+            [[0.0, 0.0, -a], [0.0, 0.0, 0.0], [-a, 0.0, 0.0]],
+            [[a, 0.0, 0.0], [0.0, -a, 0.0], [0.0, 0.0, 0.0]],
+        ]
+    return torch.tensor(matrices, dtype=dtype, device=device)
+
+
+def build_edge_cartesian_tensors(
+    r_hat: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Build the antisymmetric and symmetric-traceless edge tensors from unit vectors.
+
+    Parameters
+    ----------
+    r_hat : torch.Tensor
+        Unit edge vectors with shape ``(E, 3)``.
+
+    Returns
+    -------
+    tuple[torch.Tensor, torch.Tensor]
+        Tuple containing (A0, S0), each with shape (E, 3, 3).
+        - A0: The antisymmetric (l=1, vector) part, computed as skew(r_hat).
+        - S0: The symmetric traceless (l=2, tensor) part, given by r_hat r_hat^T minus the identity matrix divided by 3.
+        Both are 3x3 matrices which transform via matrix conjugation (M -> R M R^T) under rotation of r_hat, but occupy different irreducible SO(3) subspaces (l=1 for A0, l=2 for S0).
+    """
+    rx, ry, rz = r_hat[:, 0], r_hat[:, 1], r_hat[:, 2]
+    zero = torch.zeros_like(rx)
+    a0 = torch.stack(
+        [
+            torch.stack([zero, -rz, ry], dim=-1),
+            torch.stack([rz, zero, -rx], dim=-1),
+            torch.stack([-ry, rx, zero], dim=-1),
+        ],
+        dim=-2,
+    )  # (E, 3, 3)
+    eye = torch.eye(3, dtype=r_hat.dtype, device=r_hat.device)
+    s0 = r_hat.unsqueeze(-1) * r_hat.unsqueeze(-2) - eye / 3.0  # (E, 3, 3)
+    return a0, s0
+
+
+class _CartesianTensorProduct(nn.Module):
+    """
+    Shared scaffold for the Cartesian rank-2 tensor-product mixers.
+
+    Holds the per-degree channel linears, the gated nonlinearities, and the
+    residual layer loop. Subclasses register the geometry buffer they need and
+    define ``forward``; the only per-layer difference is the equivariant ``3x3``
+    product supplied to :meth:`_run_layers`.
+
+    Parameters
+    ----------
+    lmax : int
+        Message-passing degree, must be 1 or 2.
+    focus_dim : int
+        Channel width per focus stream.
+    n_focus : int
+        Number of focus streams; the flattened channel width is
+        ``n_focus * focus_dim``.
+    n_layers : int
+        Number of stacked tensor-product layers.
+    activation_function : str
+        Activation function for the intermediate gated nonlinearities.
+    mlp_bias : bool
+        Whether the per-degree channel linear carries an ``l = 0`` bias.
+    dtype : torch.dtype
+        Parameter dtype.
+    seed : int | list[int] | None
+        Base seed for deterministic initialization.
+    trainable : bool
+        Whether parameters are trainable.
+
+    Raises
+    ------
+    ValueError
+        If ``lmax`` is not 1 or 2, or ``n_layers`` is negative.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        focus_dim: int,
+        n_focus: int,
+        n_layers: int,
+        activation_function: str,
+        mlp_bias: bool,
+        dtype: torch.dtype,
+        seed: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        super().__init__()
+        if lmax not in (1, 2):
+            raise ValueError("`lmax` must be 1 or 2 for the Cartesian tensor product")
+        self.lmax = int(lmax)
+        self.focus_dim = int(focus_dim)
+        self.n_focus = int(n_focus)
+        self.n_layers = int(n_layers)
+        if self.n_layers < 0:
+            raise ValueError("`n_layers` must be >= 0")
+        self.ebed_dim = get_so3_dim_of_lmax(self.lmax)
+        self.c_wide = self.n_focus * self.focus_dim
+        self.dtype = dtype
+        self.device = env.DEVICE
+        self.compute_dtype = get_promoted_dtype(self.dtype)
+
+        # Separate seed namespaces so the linear and activation seeds never
+        # collide regardless of ``n_layers``.
+        seed_linears = child_seed(seed, 0)
+        seed_activations = child_seed(seed, 1)
+
+        # === Step 1. Per-degree channel linears (cross-degree mixing comes from
+        # the matrix product, not the linear) ===
+        self.linears = nn.ModuleList(
+            SO3Linear(
+                lmax=self.lmax,
+                in_channels=self.focus_dim,
+                out_channels=self.focus_dim,
+                n_focus=self.n_focus,
+                dtype=self.dtype,
+                mlp_bias=mlp_bias,
+                trainable=trainable,
+                seed=child_seed(seed_linears, i),
+            )
+            for i in range(self.n_layers)
+        )
+
+        # === Step 2. Gated nonlinearities; the last layer stays linear to mirror
+        # the trailing identity of the SO(2) mixing stack ===
+        activations: list[nn.Module] = []
+        for i in range(self.n_layers):
+            if i < self.n_layers - 1:
+                activations.append(
+                    GatedActivation(
+                        lmax=self.lmax,
+                        channels=self.focus_dim,
+                        n_focus=self.n_focus,
+                        dtype=self.compute_dtype,
+                        activation_function=activation_function,
+                        mlp_bias=mlp_bias,
+                        layout="ndfc",
+                        trainable=trainable,
+                        seed=child_seed(seed_activations, i),
+                    )
+                )
+            else:
+                activations.append(nn.Identity())
+        self.activations = nn.ModuleList(activations)
+
+    def _run_layers(
+        self,
+        h: torch.Tensor,
+        apply_product: Callable[[torch.Tensor], torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Run the residual tensor-product stack in packed ``(B, D, C_wide)`` layout.
+
+        Each layer mixes channels per degree (``linear``), forms the equivariant
+        ``3x3`` product (``apply_product``), and adds a gated-nonlinear residual.
+
+        Parameters
+        ----------
+        h : torch.Tensor
+            Input features with shape ``(B, D, C_wide)``.
+        apply_product : Callable[[torch.Tensor], torch.Tensor]
+            Maps the per-degree channel-mixed feature ``y`` to the equivariant
+            product term, both in ``(B, D, C_wide)`` layout.
+
+        Returns
+        -------
+        torch.Tensor
+            Mixed features with shape ``(B, D, C_wide)``.
+        """
+        n = h.shape[0]
+        d, f, cf, cw = self.ebed_dim, self.n_focus, self.focus_dim, self.c_wide
+        for linear, activation in zip(self.linears, self.activations, strict=True):
+            y = linear(h.reshape(n, d, f, cf)).reshape(n, d, cw)
+            m = apply_product(y)
+            h = h + activation(m.reshape(n, d, f, cf)).reshape(n, d, cw)
+        return h
+
+
+class EdgeCartesianTensorProduct(_CartesianTensorProduct):
+    """
+    Edge-wise Cartesian rank-2 tensor-product mixer (SO(3)-equivariant).
+
+    Per edge, the source spherical-harmonic feature is mixed with the edge tensor
+    ``T_e = f_iso I + f_aniso A(r_hat) + f_sym S(r_hat)``, whose per-degree radial
+    weights ``f_*`` carry the edge condition. The product is evaluated through
+    channel-shared packed operators (see the module docstring) so no ``3x3``
+    matrix is materialized per channel. Stacking ``n_layers`` such products
+    supplies the cross-degree mixing that the local-frame ``SO2Linear`` provided,
+    but in the global frame and without any Wigner-D rotation.
+
+    Parameters
+    ----------
+    lmax : int
+        Message-passing degree, must be 1 or 2.
+    focus_dim : int
+        Channel width per focus stream.
+    n_focus : int
+        Number of focus streams; the flattened channel width is
+        ``n_focus * focus_dim``.
+    n_layers : int
+        Number of stacked tensor-product layers.
+    activation_function : str
+        Activation function for the intermediate gated nonlinearities.
+    mlp_bias : bool
+        Whether the per-degree channel linear carries an ``l = 0`` bias.
+    eps : float
+        Epsilon for the edge-vector normalization.
+    dtype : torch.dtype
+        Parameter dtype.
+    seed : int | list[int] | None
+        Base seed for deterministic initialization.
+    trainable : bool
+        Whether parameters are trainable.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        focus_dim: int,
+        n_focus: int,
+        n_layers: int,
+        activation_function: str,
+        mlp_bias: bool,
+        eps: float,
+        dtype: torch.dtype,
+        seed: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        super().__init__(
+            lmax=lmax,
+            focus_dim=focus_dim,
+            n_focus=n_focus,
+            n_layers=n_layers,
+            activation_function=activation_function,
+            mlp_bias=mlp_bias,
+            dtype=dtype,
+            seed=seed,
+            trainable=trainable,
+        )
+        self.eps = float(eps)
+
+        # Non-persistent: a deterministic constant rebuilt on construction and
+        # moved with the module, so it never enters the serialized state. The
+        # orthonormal basis ``B`` is contracted into the right-multiply
+        # projection ``W[p, d, k, j] = sum_i B[p, i, j] B[d, i, k]`` that maps an
+        # edge component to its channel-shared packed operator (see ``forward``).
+        basis = build_cartesian_basis(self.lmax, dtype=self.dtype, device=self.device)
+        self.register_buffer(
+            "right_mult_proj",
+            torch.einsum("pij,dik->pdkj", basis, basis),
+            persistent=False,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        edge_vec: torch.Tensor,
+        rad_feat: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Parameters
+        ----------
+        x : torch.Tensor
+            Source node features in packed SO(3) layout with shape
+            ``(E, D, C_wide)``, where ``D = (lmax + 1) ** 2`` and
+            ``C_wide = n_focus * focus_dim``.
+        edge_vec : torch.Tensor
+            Edge vectors with shape ``(E, 3)``, in Å.
+        rad_feat : torch.Tensor
+            Per-degree radial weights with shape ``(E, lmax + 1, C_wide)``,
+            already projected to the hidden width.
+
+        Returns
+        -------
+        torch.Tensor
+            Edge messages in packed SO(3) layout with shape ``(E, D, C_wide)``.
+        """
+        d = self.ebed_dim
+        proj = self.right_mult_proj.to(dtype=x.dtype)
+
+        # === Step 1. Channel-shared packed operators for the edge tensor ===
+        # A(r_hat) and S(r_hat) are rescaled to unit Frobenius norm (raw norms are
+        # sqrt(2) and sqrt(2/3)) so the per-degree radial weights modulate
+        # components of equal magnitude. Each is projected into a packed
+        # right-multiply operator of shape (E, D, D), shared by all channels; the
+        # identity component reduces to the scalar ``c_iso`` rescaling in Step 3.
+        r_hat = edge_vec / safe_norm(edge_vec, self.eps)  # (E, 3)
+        a0, s0 = build_edge_cartesian_tensors(r_hat.to(dtype=x.dtype))
+        a_hat = a0 / math.sqrt(2.0)
+        k_op = torch.einsum("pdkj,ekj->epd", proj, a_hat)  # (E, D, D)
+        if self.lmax == 2:
+            s_hat = s0 / math.sqrt(2.0 / 3.0)
+            k_sym = torch.einsum("pdkj,ekj->epd", proj, s_hat)  # (E, D, D)
+            # Stack so one batched matmul per layer covers both components.
+            k_op = torch.cat((k_op, k_sym), dim=1)  # (E, 2D, D)
+
+        # === Step 2. Per-degree radial weights, broadcast over the degree axis ===
+        c_iso = (rad_feat[:, 0, :] / math.sqrt(3.0)).unsqueeze(1)  # (E, 1, C_wide)
+        c_aniso = rad_feat[:, 1, :].unsqueeze(1)  # (E, 1, C_wide)
+        c_sym = (
+            rad_feat[:, 2, :].unsqueeze(1) if self.lmax == 2 else None
+        )  # (E, 1, C_wide)
+
+        # === Step 3. Edge-tensor modulation, optionally refined by a mixing stack ===
+        def apply_product(y: torch.Tensor) -> torch.Tensor:
+            ky = torch.bmm(k_op, y)  # (E, lmax * D, C_wide)
+            m = c_iso * y + c_aniso * ky[:, :d, :]
+            if c_sym is not None:
+                m = m + c_sym * ky[:, d:, :]
+            return m
+
+        # ``n_layers == 0`` keeps only the edge-condition modulation ``x @ T_e``
+        # (radial scale + directional cross-degree coupling), with no learnable
+        # channel-mixing layers; ``n_layers > 0`` refines it with the residual
+        # stack of per-degree channel linears.
+        if self.n_layers == 0:
+            return apply_product(x)
+
+        return self._run_layers(x, apply_product)
+
+
+class NodeCartesianTensorProduct(_CartesianTensorProduct):
+    """
+    Node-wise Cartesian rank-2 tensor-product mixer (SO(3)-equivariant).
+
+    Applied per node after aggregation, this couples the aggregated message with
+    the destination node feature, serving as the Cartesian counterpart of the
+    ``message_node`` grid product. The node feature is the fixed operator and the
+    message is the residual stream, so each layer forms the product of
+    ``linear(message)`` with ``node`` lifted by the orthonormal basis, then adds
+    a gated-nonlinear residual. There is no per-edge geometry, so the cost scales
+    with the number of nodes instead of the number of edges.
+
+    The ``symmetric`` flag selects the product form. The one-sided product
+    ``linear(message) @ node`` is SO(3)-equivariant and cheapest. The symmetrized
+    product ``linear(message) @ node + node @ linear(message)`` additionally
+    gives each irreducible component a definite parity under spatial inversion
+    (even scalar and symmetric-traceless parts, odd skew-symmetric part), which
+    the one-sided product mixes, at the cost of a second matrix product.
+
+    Parameters
+    ----------
+    lmax : int
+        Node degree, must be 1 or 2.
+    focus_dim : int
+        Channel width per focus stream.
+    n_focus : int
+        Number of focus streams; the flattened channel width is
+        ``n_focus * focus_dim``.
+    n_layers : int
+        Number of stacked tensor-product layers.
+    symmetric : bool
+        If True, use the parity-preserving symmetrized product ``Y N + N Y``;
+        if False, use the one-sided product ``Y N``.
+    activation_function : str
+        Activation function for the intermediate gated nonlinearities.
+    mlp_bias : bool
+        Whether the per-degree channel linear carries an ``l = 0`` bias.
+    dtype : torch.dtype
+        Parameter dtype.
+    seed : int | list[int] | None
+        Base seed for deterministic initialization.
+    trainable : bool
+        Whether parameters are trainable.
+    """
+
+    def __init__(
+        self,
+        *,
+        lmax: int,
+        focus_dim: int,
+        n_focus: int,
+        n_layers: int,
+        symmetric: bool,
+        activation_function: str,
+        mlp_bias: bool,
+        dtype: torch.dtype,
+        seed: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        super().__init__(
+            lmax=lmax,
+            focus_dim=focus_dim,
+            n_focus=n_focus,
+            n_layers=n_layers,
+            activation_function=activation_function,
+            mlp_bias=mlp_bias,
+            dtype=dtype,
+            seed=seed,
+            trainable=trainable,
+        )
+        self.symmetric = bool(symmetric)
+        self.register_buffer(
+            "basis",
+            build_cartesian_basis(self.lmax, dtype=self.dtype, device=self.device),
+            persistent=False,
+        )
+
+    def forward(self, message: torch.Tensor, node: torch.Tensor) -> torch.Tensor:
+        """
+        Parameters
+        ----------
+        message : torch.Tensor
+            Aggregated message in packed SO(3) layout with shape
+            ``(N, D, C_wide)``, where ``D = (lmax + 1) ** 2`` and
+            ``C_wide = n_focus * focus_dim``. This is the residual stream.
+        node : torch.Tensor
+            Destination node feature in the same packed layout and shape. It is
+            the fixed right operand of the product across all layers.
+
+        Returns
+        -------
+        torch.Tensor
+            Mixed message in packed SO(3) layout with shape ``(N, D, C_wide)``.
+        """
+        basis = self.basis.to(dtype=message.dtype)
+
+        # The node feature is the fixed per-node operator; lifting it to its
+        # per-(node, channel) 3x3 form once lets every layer reuse it.
+        node_cart = torch.einsum("ndc,dij->ncij", node, basis)  # (N, C_wide, 3, 3)
+
+        def apply_product(y: torch.Tensor) -> torch.Tensor:
+            y_cart = torch.einsum("ndc,dij->ncij", y, basis)  # (N, C_wide, 3, 3)
+            m_cart = torch.matmul(y_cart, node_cart)
+            if self.symmetric:
+                m_cart = m_cart + torch.matmul(node_cart, y_cart)
+            return torch.einsum("ncij,dij->ndc", m_cart, basis)  # (N, D, C_wide)
+
+        return self._run_layers(message, apply_product)
diff --git a/deepmd/pt/model/descriptor/sezm_nn/edge_cache.py b/deepmd/pt/model/descriptor/sezm_nn/edge_cache.py
index 19d25e8d71..383c4d3e5d 100644
--- a/deepmd/pt/model/descriptor/sezm_nn/edge_cache.py
+++ b/deepmd/pt/model/descriptor/sezm_nn/edge_cache.py
@@ -224,6 +224,7 @@ def build_edge_cache(
     n_radial: int,
     random_gamma: bool,
     wigner_calc: WignerCalculatorFn,
+    build_wigner: bool = True,
 ) -> EdgeFeatureCache:
     """
     Build the global edge cache from DeePMD padded neighbor list.
@@ -348,6 +349,7 @@ def build_edge_cache(
             eps=eps,
             random_gamma=random_gamma,
             wigner_calc=wigner_calc,
+            build_full=build_wigner,
         )  # (E, D, D), (E, D, D), (E, 4)
 
     edge_type_feat = build_edge_type_feat(type_ebed, src, dst)  # (E, C)
@@ -386,6 +388,7 @@ def build_edge_cache_from_edges(
     edge_type_keep_mask: EdgeTypeKeepMaskFn,
     random_gamma: bool,
     wigner_calc: WignerCalculatorFn,
+    build_wigner: bool = True,
 ) -> EdgeFeatureCache:
     """
     Build the global edge cache from a sparse edge list.
@@ -475,6 +478,7 @@ def build_edge_cache_from_edges(
             eps=eps,
             random_gamma=random_gamma,
             wigner_calc=wigner_calc,
+            build_full=build_wigner,
         )  # (E, D, D), (E, D, D), (E, 4)
 
     # === Step 5. Edge type features ===
@@ -520,7 +524,8 @@ def _build_edge_wigner(
     eps: float,
     random_gamma: bool,
     wigner_calc: WignerCalculatorFn,
-) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    build_full: bool = True,
+) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor]:
     """
     Build packed Wigner-D blocks from edge vectors.
 
@@ -537,12 +542,18 @@ def _build_edge_wigner(
     wigner_calc
         Callable that converts edge-aligned quaternions into packed Wigner-D
         blocks.
+    build_full
+        Whether to materialize the full ``(E, D, D)`` Wigner-D blocks. When
+        False (all message-passing blocks take the Cartesian path), only the
+        quaternion is returned and the blocks are ``None``; the geometric
+        initial embedding reconstructs the zonal coupling from the quaternion.
 
     Returns
     -------
-    tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+    tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor]
         Packed Wigner-D matrices ``(D_full, Dt_full)`` with shape ``(E, D, D)``
-        and the quaternion used to build them with shape ``(E, 4)``.
+        (or ``None`` when ``build_full`` is False) and the quaternion used to
+        build them with shape ``(E, 4)``.
     """
     # === Step 1. Build edge-aligned quaternions ===
     edge_quat = build_edge_quaternion(
@@ -561,6 +572,8 @@ def _build_edge_wigner(
         edge_quat = quaternion_multiply(quaternion_z_rotation(gamma), edge_quat)
 
     # === Step 3. Convert quaternions to packed Wigner-D blocks ===
+    if not build_full:
+        return None, None, edge_quat
     D_full, Dt_full = wigner_calc(edge_quat)
     return D_full, Dt_full, edge_quat
 
@@ -574,8 +587,8 @@ def _finalize_edge_cache(
     edge_vec: torch.Tensor,
     edge_rbf: torch.Tensor,
     edge_env: torch.Tensor,
-    D_full: torch.Tensor,
-    Dt_full: torch.Tensor,
+    D_full: torch.Tensor | None,
+    Dt_full: torch.Tensor | None,
     edge_quat: torch.Tensor,
     deg_norm_floor: float,
     edge_src_gate: torch.Tensor | None = None,
@@ -600,9 +613,11 @@ def _finalize_edge_cache(
     edge_env
         Smooth edge envelope weights with shape (E, 1).
     D_full
-        Packed Wigner-D matrices with shape (E, D, D).
+        Packed Wigner-D matrices with shape (E, D, D), or None when the
+        full Wigner-D construction is skipped (all-Cartesian model).
     Dt_full
-        Transposed packed Wigner-D matrices with shape (E, D, D).
+        Transposed packed Wigner-D matrices with shape (E, D, D), or None
+        when the full Wigner-D construction is skipped.
     edge_quat
         Global-to-local quaternions used to build the Wigner-D matrices with
         shape (E, 4).
diff --git a/deepmd/pt/model/descriptor/sezm_nn/so2.py b/deepmd/pt/model/descriptor/sezm_nn/so2.py
index 91839c82d4..5d6e3298ff 100644
--- a/deepmd/pt/model/descriptor/sezm_nn/so2.py
+++ b/deepmd/pt/model/descriptor/sezm_nn/so2.py
@@ -45,6 +45,10 @@
 from .attn_res import (
     DepthAttnRes,
 )
+from .cartesian import (
+    EdgeCartesianTensorProduct,
+    NodeCartesianTensorProduct,
+)
 from .grid_net import (
     S2GridNet,
     SO3GridNet,
@@ -777,6 +781,53 @@ def forward(self, x_local: torch.Tensor, radial_feat: torch.Tensor) -> torch.Ten
         return torch.einsum("eoic,eic->eoc", kernel, x_local)
 
 
+def _parse_node_cartesian(spec: str) -> tuple[bool, bool, int]:
+    """
+    Parse the ``node_cartesian`` configuration string.
+
+    Grammar: ``"<mode>:<layers>"`` where ``mode`` is ``"default"`` (the one-sided
+    product ``Y N``) or ``"parity"`` (the symmetrized product ``Y N + N Y``), and
+    ``layers`` is a non-negative integer. A bare mode defaults to one layer; a
+    bare integer uses the default mode. ``"none"``, an empty string, or any zero
+    layer count disables the per-node product.
+
+    Parameters
+    ----------
+    spec : str
+        The configuration string.
+
+    Returns
+    -------
+    tuple[bool, bool, int]
+        ``(enabled, symmetric, n_layers)``.
+
+    Raises
+    ------
+    ValueError
+        If the mode is not ``"default"`` or ``"parity"``, or the layer count is
+        negative.
+    """
+    text = str(spec).strip().lower()
+    if text in ("", "none"):
+        return False, False, 0
+    if ":" in text:
+        mode, _, num = text.partition(":")
+        mode = mode.strip() or "default"
+        layers = int(num.strip())
+    elif text.isdigit():
+        mode, layers = "default", int(text)
+    else:
+        mode, layers = text, 1
+    if mode not in ("default", "parity"):
+        raise ValueError(
+            "`node_cartesian` mode must be 'default' or 'parity', got "
+            f"'{mode}' (expected '<mode>:<layers>', 'none', or a layer count)"
+        )
+    if layers < 0:
+        raise ValueError("`node_cartesian` layer count must be non-negative")
+    return layers > 0, mode == "parity", layers
+
+
 class SO2Convolution(nn.Module):
     """
     SO(2)-equivariant edge convolution with cached geometry and rotations.
@@ -788,7 +839,7 @@ class SO2Convolution(nn.Module):
     1. `pre_focus_mix`: project node features `(N, D, C)` to the SO(2) hidden width.
     2. rotate global -> local reduced basis with cached `D_to_m`.
     3. radial modulation in reduced layout.
-    4. `so2_layers` stacked local mixers:
+    4. `mixing_layers` stacked local mixers:
        `inter_norm -> SO2Linear -> non_linearity -> residual(+LayerScale)`.
     5. rotate local -> global with cached `Dt_from_m`.
     6. edge aggregation (plain envelope scatter or envelope-aware grouped
@@ -822,8 +873,12 @@ class SO2Convolution(nn.Module):
     so2_norm
         If True, apply intermediate ReducedEquivariantRMSNorm as pre-norm before
         each SO(2) mixing layer. The last SO(2) layer always uses Identity.
-    so2_layers
-        Number of SO2Linear layers per convolution (default: 1).
+    mixing_layers
+        Number of learnable mixing layers in the per-edge message core (SO2Linear
+        layers for the SO(2) path, or refinement layers for ``edge_cartesian``).
+        ``0`` applies only the edge-condition modulation: the rotation-free
+        per-degree radial scaling for the SO(2) path, or a single ``x @ T_e`` for
+        ``edge_cartesian``.
     so2_attn_res
         Depth-wise attention residual mode across the internal SO(2) layer
         history. Must be one of ``"none"``, ``"independent"``, or
@@ -893,6 +948,23 @@ class SO2Convolution(nn.Module):
     radial_so2_rank
         Low-rank channel factorization rank for ``radial_so2_mode="degree_channel"``.
         ``0`` uses the full per-channel dynamic degree kernel.
+    edge_cartesian
+        If True, replace the rotate-to-local / ``SO2Linear`` stack / rotate-back
+        core with the per-edge global-frame Cartesian rank-2 tensor product.
+        Requires ``lmax`` in ``{1, 2}`` and is incompatible with the S2/SO(3)
+        grid product branches. The dynamic radial degree mixer is bypassed
+        because the radial edge condition is carried by the Cartesian edge tensor
+        instead.
+    node_cartesian
+        Per-node global-frame Cartesian rank-2 tensor product applied to the
+        aggregated message, coupling it with the destination node feature after
+        the optional message-node grid product and before ``post_focus_mix``. The
+        Cartesian analog of the message-node grid product. Configured by a string
+        ``"<mode>:<layers>"`` where ``mode`` is ``"default"`` (one-sided product)
+        or ``"parity"`` (symmetrized product), and ``layers`` is the stack depth;
+        a bare integer ``N`` is shorthand for ``"default:N"``, and ``"none"`` (or
+        ``0``) disables it. Requires ``lmax`` in ``{1, 2}`` and is orthogonal to
+        ``edge_cartesian``.
     eps
         Small epsilon for normalization modules.
     dtype
@@ -914,7 +986,7 @@ def __init__(
         focus_dim: int = 0,
         focus_compete: bool = True,
         so2_norm: bool = False,
-        so2_layers: int = 4,
+        mixing_layers: int = 4,
         so2_attn_res: str = "none",
         layer_scale: bool = False,
         n_atten_head: int = 1,
@@ -935,6 +1007,8 @@ def __init__(
         mlp_bias: bool = False,
         radial_so2_mode: str = "none",
         radial_so2_rank: int = 0,
+        edge_cartesian: bool = False,
+        node_cartesian: str | int = "none",
         eps: float = 1e-7,
         dtype: torch.dtype,
         seed: int | list[int] | None,
@@ -964,9 +1038,9 @@ def __init__(
         self.focus_softmax_tau = 1.0
         self.focus_label_smoothing = 0.02
         self.so2_norm = bool(so2_norm)
-        self.so2_layers = int(so2_layers)
-        if self.so2_layers < 1:
-            raise ValueError("`so2_layers` must be >= 1")
+        self.mixing_layers = int(mixing_layers)
+        if self.mixing_layers < 0:
+            raise ValueError("`mixing_layers` must be >= 0")
         self.so2_attn_res_mode = str(so2_attn_res).lower()
         if self.so2_attn_res_mode not in ATTN_RES_MODES:
             raise ValueError(
@@ -1036,6 +1110,28 @@ def __init__(
         self.radial_so2_rank = int(radial_so2_rank)
         if self.radial_so2_rank < 0:
             raise ValueError("`radial_so2_rank` must be non-negative")
+        self.edge_cartesian = bool(edge_cartesian)
+        self.node_cartesian = str(node_cartesian)
+        (
+            self._node_cartesian_enabled,
+            self._node_cartesian_symmetric,
+            self._node_cartesian_layers,
+        ) = _parse_node_cartesian(self.node_cartesian)
+        if self.edge_cartesian:
+            if self.lmax not in (1, 2):
+                raise ValueError("`edge_cartesian` requires lmax in {1, 2}")
+            if (
+                self.node_wise_s2
+                or self.node_wise_so3
+                or self.message_node_s2
+                or self.message_node_so3
+            ):
+                raise ValueError(
+                    "`edge_cartesian` is incompatible with the S2/SO(3) grid "
+                    "product branches"
+                )
+        if self._node_cartesian_enabled and self.lmax not in (1, 2):
+            raise ValueError("`node_cartesian` requires lmax in {1, 2}")
         self.eps = float(eps)
         self.ebed_dim_full = get_so3_dim_of_lmax(self.lmax)
         self.dtype = dtype
@@ -1048,54 +1144,8 @@ def __init__(
         # is a compile-time constant in the traced (``make_fx``) graph, and it
         # only takes effect during inference.
         self.use_triton_infer = use_triton_infer()
-        # Triton rotation kernels: block for the mmax == 1 layout, dense otherwise.
-        self._rotate_to_local_fn = None
-        self._rotate_back_fn = None
-        if self.use_triton_infer:
-            from .triton.so2_rotation import (
-                rotate_back_block_so2,
-                rotate_back_dense,
-                rotate_to_local_block,
-                rotate_to_local_dense,
-            )
-
-            if self.mmax == 1:
-                self._rotate_to_local_fn = lambda x, src, wigner: rotate_to_local_block(
-                    x, src, wigner, self.lmax
-                )
-                # The block kernel reads the (E, F, D_m, Cf) focus layout directly,
-                # so the rotate-back path passes ``x_local`` before the global
-                # reshape and the transpose-back copy is skipped (see Step 7).
-                self._rotate_back_fn = lambda x_local, wigner: rotate_back_block_so2(
-                    x_local, wigner, self.lmax
-                )
-            else:
-                self._rotate_to_local_fn = lambda x, src, wigner: rotate_to_local_dense(
-                    x, src, wigner, self.coeff_index_m, self.ebed_dim_full
-                )
-                self._rotate_back_fn = lambda x_local, wigner: rotate_back_dense(
-                    x_local, wigner, self.coeff_index_m, self.ebed_dim_full
-                )
-
-        # === Step 1. Precompute coefficient indices for m-major reduced layout ===
-        coeff_index_m = build_m_major_index(self.lmax, self.mmax, device=self.device)
-        degree_index_m = build_m_major_l_index(self.lmax, self.mmax, device=self.device)
-        degree_index_full = map_degree_idx(self.lmax, device=self.device)
-        rotate_inv_rescale_full = build_rotate_inv_rescale(
-            lmax=self.lmax,
-            mmax=self.mmax,
-            degree_index=degree_index_full,
-            device=self.device,
-            dtype=self.dtype,
-        )
-        self.register_buffer("coeff_index_m", coeff_index_m, persistent=True)
-        self.register_buffer("degree_index_m", degree_index_m, persistent=True)
-        self.register_buffer(
-            "rotate_inv_rescale_full", rotate_inv_rescale_full, persistent=True
-        )
-        self.reduced_dim = int(coeff_index_m.numel())
 
-        # === Step 2. Split deterministic seeds at the module top-level ===
+        # === Step 1. Split deterministic seeds at the module top-level ===
         seed_so2_stack = child_seed(seed, 0)
         seed_non_linearities = child_seed(seed, 1)
         seed_so3_pre = child_seed(seed, 2)
@@ -1106,128 +1156,45 @@ def __init__(
         seed_radial_degree = child_seed(seed, 7)
         seed_node_wise_s2 = child_seed(seed, 8)
         seed_message_node_s2 = child_seed(seed, 9)
+        seed_node_cartesian = child_seed(seed, 10)
 
-        # === Step 3. Multiple SO2Linear layers ===
-        self.so2_linears = nn.ModuleList(
-            [
-                SO2Linear(
-                    lmax=self.lmax,
-                    mmax=self.mmax,
-                    in_channels=self.so2_focus_dim,
-                    out_channels=(
-                        2 * self.so2_focus_dim
-                        if self.s2_activation and i < self.so2_layers - 1
-                        else self.so2_focus_dim
-                    ),
-                    n_focus=self.n_focus,
-                    dtype=self.dtype,
-                    mlp_bias=self.mlp_bias,
-                    seed=child_seed(seed_so2_stack, i),
-                    trainable=trainable,
-                )
-                for i in range(self.so2_layers)
-            ]
-        )
-
-        # === Step 4. Intermediate norms (Optional) ===
-        inter_norms: list[nn.Module] = []
-        if self.so2_norm:
-            for _ in range(max(0, self.so2_layers - 1)):
-                inter_norms.append(
-                    ReducedEquivariantRMSNorm(
-                        lmax=self.lmax,
-                        mmax=self.mmax,
-                        channels=self.so2_focus_dim,
-                        degree_index_m=self.degree_index_m,
-                        n_focus=self.n_focus,
-                        dtype=self.compute_dtype,
-                        trainable=trainable,
-                    )
-                )
-        else:
-            for _ in range(max(0, self.so2_layers - 1)):
-                inter_norms.append(nn.Identity())
-        inter_norms.append(nn.Identity())
-        self.so2_inter_norms = nn.ModuleList(inter_norms)
-
-        # === Step 5. Intermediate non-linearity ===
-        non_linearities: list[nn.Module] = []
-        for i in range(max(0, self.so2_layers - 1)):
-            if self.s2_activation:
-                non_linearities.append(
-                    S2GridNet(
-                        lmax=self.lmax,
-                        mmax=self.mmax,
-                        channels=self.so2_focus_dim,
-                        n_focus=self.n_focus,
-                        mode="self",
-                        op_type="glu",
-                        dtype=self.compute_dtype,
-                        layout="nfdc",
-                        grid_resolution_list=self.s2_grid_resolution,
-                        coefficient_layout="m_major",
-                        grid_method=self.s2_grid_method,
-                        mlp_bias=self.mlp_bias,
-                        trainable=trainable,
-                        seed=child_seed(seed_non_linearities, i),
-                    )
-                )
-            else:
-                non_linearities.append(
-                    GatedActivation(
-                        lmax=self.lmax,
-                        mmax=self.mmax,
-                        channels=self.so2_focus_dim,
-                        n_focus=self.n_focus,
-                        dtype=self.compute_dtype,
-                        activation_function=self.activation_function,
-                        mlp_bias=self.mlp_bias,
-                        layout="nfdc",
-                        trainable=trainable,
-                        seed=child_seed(seed_non_linearities, i),
-                    )
-                )
-        non_linearities.append(nn.Identity())
-        self.non_linearities = nn.ModuleList(non_linearities)
-
-        # === Step 5.5. Optional depth-wise attention residuals across SO(2) layers ===
-        if self.use_so2_attn_res:
-            self.so2_layer_attn_res: nn.ModuleList | None = nn.ModuleList(
-                [
-                    DepthAttnRes(
-                        channels=self.hidden_channels,
-                        input_dependent=self.so2_attn_res_mode == "dependent",
-                        eps=self.eps,
-                        bias=self.mlp_bias,
-                        dtype=self.compute_dtype,
-                        trainable=trainable,
-                        seed=child_seed(seed_depth_attn, i),
-                    )
-                    for i in range(self.so2_layers)
-                ]
+        # === Step 2. Edge mixing core: SO(2) rotation stack or Cartesian product ===
+        if self.edge_cartesian:
+            self.edge_cartesian_tp = EdgeCartesianTensorProduct(
+                lmax=self.lmax,
+                focus_dim=self.so2_focus_dim,
+                n_focus=self.n_focus,
+                n_layers=self.mixing_layers,
+                activation_function=self.activation_function,
+                mlp_bias=self.mlp_bias,
+                eps=self.eps,
+                dtype=self.dtype,
+                seed=seed_so2_stack,
+                trainable=trainable,
             )
         else:
-            self.so2_layer_attn_res = None
+            self._build_so2_mixing(
+                seed_so2_stack=seed_so2_stack,
+                seed_non_linearities=seed_non_linearities,
+                seed_depth_attn=seed_depth_attn,
+                trainable=trainable,
+            )
 
-        # === Step 6. Optional per-layer LayerScale for SO(2) residual branches ===
-        if self.layer_scale:
-            self.adam_so2_layer_scales = nn.ParameterList(
-                [
-                    nn.Parameter(
-                        torch.ones(
-                            self.n_focus,
-                            self.so2_focus_dim,
-                            dtype=self.dtype,
-                            device=self.device,
-                        )
-                        * 1e-3,
-                        requires_grad=trainable,
-                    )
-                    for _ in range(self.so2_layers)
-                ]
+        # === Step 2b. Optional per-node Cartesian mixing on the aggregated message ===
+        self.node_cartesian_tp: NodeCartesianTensorProduct | None = None
+        if self._node_cartesian_enabled:
+            self.node_cartesian_tp = NodeCartesianTensorProduct(
+                lmax=self.lmax,
+                focus_dim=self.so2_focus_dim,
+                n_focus=self.n_focus,
+                n_layers=self._node_cartesian_layers,
+                symmetric=self._node_cartesian_symmetric,
+                activation_function=self.activation_function,
+                mlp_bias=self.mlp_bias,
+                dtype=self.dtype,
+                seed=seed_node_cartesian,
+                trainable=trainable,
             )
-        else:
-            self.adam_so2_layer_scales = None
 
         # === Step 7. Optional attention projections (n_atten_head > 0) ===
         self.attn_qk_norm: ScalarRMSNorm | None = None
@@ -1398,7 +1365,7 @@ def __init__(
                 trainable=trainable,
             )
         self.radial_degree_mixer: DynamicRadialDegreeMixer | None = None
-        if self.radial_so2_mode != "none":
+        if not self.edge_cartesian and self.radial_so2_mode != "none":
             self.radial_degree_mixer = DynamicRadialDegreeMixer(
                 lmax=self.lmax,
                 mmax=self.mmax,
@@ -1525,6 +1492,13 @@ def __init__(
             init_std=0.0,
         )
 
+        # === Step 11. Edge-frame requirement for the SO(2) message ===
+        self.needs_local_frame = (not self.edge_cartesian) and (
+            self.mixing_layers > 0
+            or self.radial_so2_mode != "none"
+            or self.node_wise_grid_product is not None
+        )
+
     def forward(
         self,
         x: torch.Tensor,
@@ -1557,62 +1531,307 @@ def forward(
             # (N, D, C_wide), C_wide = F * Cf
             x = self.pre_focus_mix(x.unsqueeze(2)).squeeze(2)
 
-        # === Step 2. Rotate to edge-aligned local frame ===
-        with nvtx_range("SO2Conv/rotate_to_local"):
-            D_full = edge_cache.D_full
-            x_dst_local: torch.Tensor | None = None
-            if self.use_triton_infer and not self.training:
-                # ``self._rotate_to_local_fn`` was bound in ``__init__`` (the
-                # block kernel for the m-major ``mmax == 1`` layout, dense
-                # otherwise).
-                x_local = self._rotate_to_local_fn(x, src, D_full)  # (E, D_m, C_wide)
-                if self.node_wise_grid_product is not None:
-                    x_dst_local = self._rotate_to_local_fn(
-                        x, dst, D_full
-                    )  # (E, D_m, C_wide)
-            else:
-                D_m_prime = project_D_to_m(
-                    D_full=D_full,
-                    coeff_index_m=self.coeff_index_m,
-                    ebed_dim_full=self.ebed_dim_full,
-                    cache=edge_cache.D_to_m_cache,
-                    key_lmax=self.lmax,
-                    key_mmax=self.mmax,
-                )
-                x_src = x.index_select(0, src)  # (E, D, C_wide)
-                x_local = torch.bmm(D_m_prime, x_src)  # (E, D_m, C_wide)
-                if self.node_wise_grid_product is not None:
-                    x_dst = x.index_select(0, dst)  # (E, D, C_wide)
-                    x_dst_local = torch.bmm(D_m_prime, x_dst)  # (E, D_m, C_wide)
+        # === Step 2. Edge message: Cartesian product, SO(2) mixing, or the
+        # rotation-free radial message when no local-frame operation is needed ===
+        if self.edge_cartesian:
+            x_message, rad_feat = self.cartesian_message(x, edge_cache, radial_feat)
+        elif self.needs_local_frame:
+            x_message, rad_feat = self.so2_message(x, edge_cache, radial_feat)
+        else:
+            x_message, rad_feat = self.radial_message(x, edge_cache, radial_feat)
 
-        # === Step 3. Select radial/type features for reduced layout ===
-        with nvtx_range("SO2Conv/radial_fuse"):
-            rad_feat = radial_feat[:, self.degree_index_m, :]  # (E, D_m, C)
-            if self.radial_hidden_proj is not None:
-                rad_feat = self.radial_hidden_proj(rad_feat)
-            if self.radial_degree_mixer is None:
-                x_local.mul_(rad_feat)
+        # === Step 3. Optional focus mixing for the attention stream ===
+        if self.attn_focus_mix is not None:
+            x_message = self.attn_focus_mix(x_message.unsqueeze(2)).squeeze(2)
+
+        # === Step 4. Aggregate with optional head-wise gating ===
+        with nvtx_range("SO2Conv/aggregate"):
+            # Source Freeze Propagation Gate: broadcast the per-edge scalar
+            # eta[src] to the edge message before destination aggregation.
+            # ``edge_src_gate`` is ``None`` outside bridging mode, in which
+            # case this branch disappears and the baseline / attention paths
+            # run unchanged.
+            edge_src_gate = edge_cache.edge_src_gate
+            if self.n_atten_head == 0:
+                # Baseline path: fused envelope-weighted scatter add -> degree norm.
+                # Folding edge_src_gate into the scalar envelope keeps the
+                # op count unchanged.
+                edge_weight = edge_cache.edge_env  # (E, 1)
+                if edge_src_gate is not None:
+                    edge_weight = edge_weight * edge_src_gate.to(
+                        dtype=edge_weight.dtype
+                    )
+                x_message = x_message * edge_weight.unsqueeze(-1)
+                out = x.new_zeros(x.shape, dtype=self.compute_dtype)
+                out.index_add_(0, dst, x_message.to(dtype=self.compute_dtype))
+                out.mul_(edge_cache.inv_sqrt_deg.to(dtype=self.compute_dtype))
+                out = out.to(dtype=self.dtype)  # (N, D, C_wide)
             else:
-                x_local = self.radial_degree_mixer(x_local, rad_feat)
-            if self.node_wise_grid_product is not None:
-                x_local = x_local + self.node_wise_grid_product(
-                    x_local,
-                    x_dst_local,
+                # === Step 4.1. Build attention logits from scalar channels ===
+                compute_dtype = self.compute_dtype
+                x_l0_node = x[:, 0, :].reshape(
+                    n_node, self.attn_n_focus, self.attn_focus_dim
+                )  # (N, Fa, Ca)
+                qk_input = self.attn_qk_norm(x_l0_node.to(dtype=compute_dtype))
+                q_node = self.attn_q_proj(qk_input)  # (N, Fa, Ca)
+                k_node = self.attn_k_proj(qk_input)  # (N, Fa, Ca)
+                q_edge = q_node.index_select(0, dst).reshape(
+                    n_edge, self.attn_n_focus, self.n_atten_head, self.head_dim
+                )  # (E, Fa, H, Ch), Ca = H * Ch
+                k_edge = k_node.index_select(0, src).reshape(
+                    n_edge, self.attn_n_focus, self.n_atten_head, self.head_dim
+                )  # (E, Fa, H, Ch)
+                radial_l0 = rad_feat[:, 0, :].reshape(
+                    n_edge, self.attn_n_focus, self.attn_focus_dim
+                )  # (E, Fa, Ca)
+                radial_bias = torch.einsum(
+                    "efi,ifo->efo",
+                    radial_l0.to(dtype=compute_dtype),
+                    self.adamw_attn_logit_w,
+                )  # (E, F, H)
+                attn_logits: torch.Tensor = (q_edge * k_edge).sum(-1) * (
+                    self.head_dim**-0.5
                 )
-            rad_feat_l0_focus = rad_feat[:, 0, :].reshape(
-                n_edge, self.n_focus, self.so2_focus_dim
-            )  # (E, F, Cf)
+                attn_logits = attn_logits + radial_bias
 
-        # === Step 4. Convert to SO(2) internal focus layout ===
-        focus_gate_src: torch.Tensor | None = None
-        with nvtx_range("SO2Conv/reshape_for_so2"):
+                # === Step 4.2. Destination-wise stable envelope-gated softmax ===
+                # ``src_weight=edge_src_gate`` folds SFPG into both the
+                # numerator and the denominator of the softmax. A muted
+                # source (``eta_src = 0``) therefore drops out of the
+                # destination's attention normalization entirely, which
+                # is required for the attention path to honor the
+                # frozen-zone invariance: a post-multiplication on
+                # ``attn_alpha`` alone would still leave the muted
+                # source leaking through the shared denominator.
+                attn_alpha = segment_envelope_gated_softmax(
+                    logits=attn_logits,
+                    edge_env=edge_cache.edge_env.to(dtype=compute_dtype),
+                    dst=dst,
+                    n_nodes=n_node,
+                    z_bias_raw=self.adamw_attn_z_bias_raw,
+                    eps=self.eps,
+                    src_weight=(
+                        None
+                        if edge_src_gate is None
+                        else edge_src_gate.to(dtype=compute_dtype)
+                    ),
+                )  # (E, F, H)
+
+                # === Step 4.3. Value projection and head-wise aggregation ===
+                value_focus = x_message.reshape(
+                    n_edge,
+                    self.ebed_dim_full,
+                    self.attn_n_focus,
+                    self.attn_focus_dim,
+                ).to(dtype=compute_dtype)  # (E, D, Fa, Ca)
+                if self.attn_v_proj is not None:
+                    value_focus = self.attn_v_proj(value_focus)
+                value_heads = value_focus.reshape(
+                    n_edge,
+                    self.ebed_dim_full,
+                    self.attn_n_focus,
+                    self.n_atten_head,
+                    self.head_dim,
+                )  # (E, D, Fa, H, Ch)
+                weighted_value = value_heads * attn_alpha.reshape(
+                    n_edge, 1, self.attn_n_focus, self.n_atten_head, 1
+                )
+                out_heads = torch.zeros(
+                    n_node,
+                    self.ebed_dim_full,
+                    self.attn_n_focus,
+                    self.n_atten_head,
+                    self.head_dim,
+                    device=x.device,
+                    dtype=compute_dtype,
+                )  # (N, D, Fa, H, Ch)
+                out_heads.index_add_(0, dst, weighted_value)
+
+                # === Step 4.4. Output-side head gate ===
+                attn_output_gate = torch.sigmoid(
+                    torch.einsum(
+                        "nfi,ifo->nfo",
+                        self.attn_output_gate_norm(x_l0_node.to(dtype=compute_dtype)),
+                        self.adamw_attn_gate_w,
+                    )
+                )  # (N, F, H)
+                out_heads = out_heads * attn_output_gate.reshape(
+                    n_node, 1, self.attn_n_focus, self.n_atten_head, 1
+                )  # (N, D, Fa, H, Ch)
+
+                # === Step 4.5. Output projection and merge heads ===
+                out_focus = out_heads.reshape(
+                    n_node,
+                    self.ebed_dim_full,
+                    self.attn_n_focus,
+                    self.attn_focus_dim,
+                )  # (N, D, Fa, Ca)
+                if self.attn_o_proj is not None:
+                    out_focus = self.attn_o_proj(out_focus)
+                out = out_focus.reshape(
+                    n_node, self.ebed_dim_full, self.hidden_channels
+                ).to(dtype=self.dtype)  # (N, D, C_wide)
+
+        # === Step 5. Optional message-node grid product ===
+        if self.message_node_grid_product is not None:
+            with nvtx_range("SO2Conv/message_node_grid"):
+                out = out + self.message_node_grid_product(out, x)
+
+        # === Step 6. Optional per-node Cartesian tensor-product mixing ===
+        # Couples the aggregated message with the destination node feature ``x``,
+        # the Cartesian analog of the message-node grid product.
+        if self.node_cartesian_tp is not None:
+            with nvtx_range("SO2Conv/node_cartesian"):
+                out = self.node_cartesian_tp(out, x)
+
+        # === Step 7. Final channel mixing ===
+        with nvtx_range("SO2Conv/post_focus_mix"):
+            out = self.post_focus_mix(out.unsqueeze(2)).squeeze(2)
+        return out  # (N, D, C)
+
+    def radial_message(
+        self,
+        x: torch.Tensor,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Build edge messages by rotation-free per-degree radial scaling.
+
+        Used when no local-frame operation is required (``mixing_layers == 0``,
+        ``radial_so2_mode == "none"``, and no node-wise grid product). Per-degree
+        scalar radial scaling commutes with rotation, so the edge-aligned frame
+        is unnecessary and the message reduces to a source gather, an elementwise
+        per-degree scale, and the optional cross-focus competition.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeFeatureCache
+            Precomputed edge cache.
+        radial_feat : torch.Tensor
+            Per-edge radial features with shape (E, lmax+1, C).
+
+        Returns
+        -------
+        tuple[torch.Tensor, torch.Tensor]
+            ``(x_message, rad_feat)`` with shapes (E, D, C_wide) and
+            (E, lmax+1, C_wide). The ``l=0`` slice of ``rad_feat`` is consumed by
+            the attention aggregation.
+        """
+        src = edge_cache.src
+        n_edge = src.numel()
+
+        rad_feat = radial_feat  # (E, lmax+1, C)
+        if self.radial_hidden_proj is not None:
+            rad_feat = self.radial_hidden_proj(rad_feat)  # (E, lmax+1, C_wide)
+
+        # Broadcast each degree's radial weight over its 2l+1 orders and scale the
+        # gathered source feature in the global frame.
+        x_src = x.index_select(0, src)  # (E, D, C_wide)
+        rad_packed = rad_feat.index_select(1, self.degree_index_full)  # (E, D, C_wide)
+        x_message = x_src * rad_packed
+
+        # === Cross-focus softmax competition ===
+        # Gate on the radial-fused source l=0 scalar, matching the SO(2) path.
+        if self.focus_compete and self.n_focus > 1:
+            focus_gate_src = (x_src[:, 0, :] * rad_feat[:, 0, :]).reshape(
+                n_edge, self.n_focus, self.so2_focus_dim
+            )  # (E, F, Cf)
+            alpha = self._focus_alpha(focus_gate_src)
+            x_message = (
+                x_message.reshape(
+                    n_edge, self.ebed_dim_full, self.n_focus, self.so2_focus_dim
+                )
+                * alpha.to(dtype=x_message.dtype)[:, None, :, None]
+            ).reshape(n_edge, self.ebed_dim_full, self.hidden_channels)
+        return x_message, rad_feat
+
+    def so2_message(
+        self,
+        x: torch.Tensor,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Build edge messages by rotate-to-local, SO(2) mixing, and rotate-back.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeFeatureCache
+            Precomputed edge cache.
+        radial_feat : torch.Tensor
+            Per-edge radial features with shape (E, lmax+1, C).
+
+        Returns
+        -------
+        tuple[torch.Tensor, torch.Tensor]
+            ``(x_message, rad_feat)`` with shapes (E, D, C_wide) and
+            (E, D_m, C_wide). The ``l=0`` slice of ``rad_feat`` is consumed by
+            the attention aggregation.
+        """
+        src, dst = edge_cache.src, edge_cache.dst
+        n_edge = src.numel()
+
+        # === Step 1. Rotate to edge-aligned local frame ===
+        with nvtx_range("SO2Conv/rotate_to_local"):
+            D_full = edge_cache.D_full
+            x_dst_local: torch.Tensor | None = None
+            if self.use_triton_infer and not self.training:
+                # ``self._rotate_to_local_fn`` was bound in ``__init__`` (the
+                # block kernel for the m-major ``mmax == 1`` layout, dense
+                # otherwise).
+                x_local = self._rotate_to_local_fn(x, src, D_full)  # (E, D_m, C_wide)
+                if self.node_wise_grid_product is not None:
+                    x_dst_local = self._rotate_to_local_fn(
+                        x, dst, D_full
+                    )  # (E, D_m, C_wide)
+            else:
+                D_m_prime = project_D_to_m(
+                    D_full=D_full,
+                    coeff_index_m=self.coeff_index_m,
+                    ebed_dim_full=self.ebed_dim_full,
+                    cache=edge_cache.D_to_m_cache,
+                    key_lmax=self.lmax,
+                    key_mmax=self.mmax,
+                )
+                x_src = x.index_select(0, src)  # (E, D, C_wide)
+                x_local = torch.bmm(D_m_prime, x_src)  # (E, D_m, C_wide)
+                if self.node_wise_grid_product is not None:
+                    x_dst = x.index_select(0, dst)  # (E, D, C_wide)
+                    x_dst_local = torch.bmm(D_m_prime, x_dst)  # (E, D_m, C_wide)
+
+        # === Step 2. Select radial/type features for reduced layout ===
+        with nvtx_range("SO2Conv/radial_fuse"):
+            rad_feat = radial_feat[:, self.degree_index_m, :]  # (E, D_m, C)
+            if self.radial_hidden_proj is not None:
+                rad_feat = self.radial_hidden_proj(rad_feat)
+            if self.radial_degree_mixer is None:
+                x_local.mul_(rad_feat)
+            else:
+                x_local = self.radial_degree_mixer(x_local, rad_feat)
+            if self.node_wise_grid_product is not None:
+                x_local = x_local + self.node_wise_grid_product(
+                    x_local,
+                    x_dst_local,
+                )
+            rad_feat_l0_focus = rad_feat[:, 0, :].reshape(
+                n_edge, self.n_focus, self.so2_focus_dim
+            )  # (E, F, Cf)
+
+        # === Step 3. Convert to SO(2) internal focus layout ===
+        focus_gate_src: torch.Tensor | None = None
+        with nvtx_range("SO2Conv/reshape_for_so2"):
             x_local = x_local.reshape(
                 n_edge, self.reduced_dim, self.n_focus, self.so2_focus_dim
             ).transpose(1, 2)  # (E, F, D_m, Cf), strided
             if self.focus_compete and self.n_focus > 1:
                 focus_gate_src = x_local[:, :, 0, :]
 
-        # === Step 5. Multi-layer SO(2) mixing (pre-norm + residual) ===
+        # === Step 4. Multi-layer SO(2) mixing (pre-norm + residual) ===
         with nvtx_range("SO2Conv/so2_layers"):
 
             def so2_l0_extractor(v: torch.Tensor) -> torch.Tensor:
@@ -1698,25 +1917,14 @@ def apply_bias_correction(
                     else:
                         x_local = residual + x_local
 
-        # === Step 6. Cross-focus softmax competition ===
+        # === Step 5. Cross-focus softmax competition ===
         if self.focus_compete and self.n_focus > 1:
-            focus_gate_src = focus_gate_src.to(dtype=self.compute_dtype)
-            focus_logits = torch.einsum(
-                "efi,if->ef",
-                self.focus_compete_norm(focus_gate_src),
-                self.adamw_focus_compete_w,
-            )
-            if self.mlp_bias:
-                focus_logits = focus_logits + self.focus_compete_bias.unsqueeze(0)
-            alpha = torch.softmax(focus_logits / self.focus_softmax_tau, dim=1).to(
-                dtype=x_local.dtype
-            )
-            alpha = alpha * (1.0 - self.focus_label_smoothing) + (
-                self.focus_label_smoothing / float(self.n_focus)
+            alpha = self._focus_alpha(focus_gate_src)
+            x_local = x_local * alpha.to(dtype=x_local.dtype).unsqueeze(-1).unsqueeze(
+                -1
             )
-            x_local = x_local * alpha.unsqueeze(-1).unsqueeze(-1)
 
-        # === Step 7. Rotate back to global frame ===
+        # === Step 6. Rotate back to global frame ===
         with nvtx_range("SO2Conv/rotate_back"):
             Dt_full = edge_cache.Dt_full
             if self.use_triton_infer and self.mmax == 1 and not self.training:
@@ -1746,146 +1954,291 @@ def apply_bias_correction(
             # inverse-rotation degree rescale after the global lift restores the
             # full-basis amplitude expected by the block output contract.
             x_message = x_message * self.rotate_inv_rescale_full.view(1, -1, 1)
-            if self.attn_focus_mix is not None:
-                x_message = self.attn_focus_mix(x_message.unsqueeze(2)).squeeze(2)
+        return x_message, rad_feat
 
-        # === Step 8. Aggregate with optional head-wise gating ===
-        with nvtx_range("SO2Conv/aggregate"):
-            # Source Freeze Propagation Gate: broadcast the per-edge scalar
-            # eta[src] to the edge message before destination aggregation.
-            # ``edge_src_gate`` is ``None`` outside bridging mode, in which
-            # case this branch disappears and the baseline / attention paths
-            # run unchanged.
-            edge_src_gate = edge_cache.edge_src_gate
-            if self.n_atten_head == 0:
-                # Baseline path: fused envelope-weighted scatter add -> degree norm.
-                # Folding edge_src_gate into the scalar envelope keeps the
-                # op count unchanged.
-                edge_weight = edge_cache.edge_env  # (E, 1)
-                if edge_src_gate is not None:
-                    edge_weight = edge_weight * edge_src_gate.to(
-                        dtype=edge_weight.dtype
-                    )
-                x_message = x_message * edge_weight.unsqueeze(-1)
-                out = x.new_zeros(x.shape, dtype=self.compute_dtype)
-                out.index_add_(0, dst, x_message.to(dtype=self.compute_dtype))
-                out.mul_(edge_cache.inv_sqrt_deg.to(dtype=self.compute_dtype))
-                out = out.to(dtype=self.dtype)  # (N, D, C_wide)
+    def cartesian_message(
+        self,
+        x: torch.Tensor,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Build edge messages via the global-frame Cartesian rank-2 tensor product.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Node features with shape (N, D, C_wide) after pre-focus mixing.
+        edge_cache : EdgeFeatureCache
+            Precomputed edge cache.
+        radial_feat : torch.Tensor
+            Per-edge radial features with shape (E, lmax+1, C).
+
+        Returns
+        -------
+        tuple[torch.Tensor, torch.Tensor]
+            ``(x_message, rad_feat)`` with shapes (E, D, C_wide) and
+            (E, lmax+1, C_wide). The ``l=0`` slice of ``rad_feat`` is consumed by
+            the attention aggregation.
+        """
+        src = edge_cache.src
+        n_edge = src.numel()
+
+        # === Step 1. Per-degree radial weights projected to the hidden width ===
+        with nvtx_range("SO2Conv/radial_fuse"):
+            rad_feat = radial_feat  # (E, lmax+1, C)
+            if self.radial_hidden_proj is not None:
+                rad_feat = self.radial_hidden_proj(rad_feat)  # (E, lmax+1, C_wide)
+
+        # === Step 2. Global-frame Cartesian tensor product ===
+        with nvtx_range("SO2Conv/cartesian_tp"):
+            x_src = x.index_select(0, src)  # (E, D, C_wide)
+            x_message = self.edge_cartesian_tp(
+                x_src, edge_cache.edge_vec, rad_feat
+            )  # (E, D, C_wide)
+
+        # === Step 3. Cross-focus softmax competition ===
+        # Gate on the radial-fused source l=0 scalar, matching the SO(2) path,
+        # whose competition reads the pre-mixing input (its l=0 equals the
+        # rotation-invariant source l=0 times the l=0 radial weight).
+        if self.focus_compete and self.n_focus > 1:
+            focus_gate_src = (x_src[:, 0, :] * rad_feat[:, 0, :]).reshape(
+                n_edge, self.n_focus, self.so2_focus_dim
+            )  # (E, F, Cf)
+            alpha = self._focus_alpha(focus_gate_src)
+            x_message = (
+                x_message.reshape(
+                    n_edge, self.ebed_dim_full, self.n_focus, self.so2_focus_dim
+                )
+                * alpha.to(dtype=x_message.dtype)[:, None, :, None]
+            ).reshape(n_edge, self.ebed_dim_full, self.hidden_channels)
+        return x_message, rad_feat
+
+    def _focus_alpha(self, focus_gate_src: torch.Tensor) -> torch.Tensor:
+        """
+        Compute per-focus softmax competition weights from l=0 scalars.
+
+        Parameters
+        ----------
+        focus_gate_src : torch.Tensor
+            Per-edge l=0 scalar features with shape (E, F, Cf).
+
+        Returns
+        -------
+        torch.Tensor
+            Label-smoothed competition weights with shape (E, F), in the compute
+            dtype.
+        """
+        focus_logits = torch.einsum(
+            "efi,if->ef",
+            self.focus_compete_norm(focus_gate_src.to(dtype=self.compute_dtype)),
+            self.adamw_focus_compete_w,
+        )
+        if self.mlp_bias:
+            focus_logits = focus_logits + self.focus_compete_bias.unsqueeze(0)
+        alpha = torch.softmax(focus_logits / self.focus_softmax_tau, dim=1)
+        return alpha * (1.0 - self.focus_label_smoothing) + (
+            self.focus_label_smoothing / float(self.n_focus)
+        )
+
+    def _build_so2_mixing(
+        self,
+        *,
+        seed_so2_stack: int | list[int] | None,
+        seed_non_linearities: int | list[int] | None,
+        seed_depth_attn: int | list[int] | None,
+        trainable: bool,
+    ) -> None:
+        """
+        Build the SO(2) rotation-frame mixing stack.
+
+        Populates the m-major reduced-layout buffers, the optional Triton
+        rotation kernels, the multi-layer ``SO2Linear`` stack, its intermediate
+        norms and nonlinearities, the optional depth-wise attention residuals,
+        and the optional per-layer LayerScale. These are the SO(2)-only tensors;
+        they are skipped entirely when ``edge_cartesian`` is True.
+
+        Parameters
+        ----------
+        seed_so2_stack : int | list[int] | None
+            Seed for the ``SO2Linear`` layers.
+        seed_non_linearities : int | list[int] | None
+            Seed for the intermediate nonlinearities.
+        seed_depth_attn : int | list[int] | None
+            Seed for the depth-wise attention residuals.
+        trainable : bool
+            Whether parameters are trainable.
+        """
+        # === Step 1. Precompute coefficient indices for m-major reduced layout ===
+        coeff_index_m = build_m_major_index(self.lmax, self.mmax, device=self.device)
+        degree_index_m = build_m_major_l_index(self.lmax, self.mmax, device=self.device)
+        degree_index_full = map_degree_idx(self.lmax, device=self.device)
+        rotate_inv_rescale_full = build_rotate_inv_rescale(
+            lmax=self.lmax,
+            mmax=self.mmax,
+            degree_index=degree_index_full,
+            device=self.device,
+            dtype=self.dtype,
+        )
+        self.register_buffer("coeff_index_m", coeff_index_m, persistent=False)
+        self.register_buffer("degree_index_m", degree_index_m, persistent=False)
+        # Packed (l, m) -> l index, used by the rotation-free radial message to
+        # broadcast each degree's radial weight over its orders.
+        self.register_buffer("degree_index_full", degree_index_full, persistent=False)
+        self.register_buffer(
+            "rotate_inv_rescale_full", rotate_inv_rescale_full, persistent=False
+        )
+        self.reduced_dim = int(coeff_index_m.numel())
+
+        # === Step 2. Triton rotation kernels: block for mmax == 1, dense otherwise ===
+        self._rotate_to_local_fn = None
+        self._rotate_back_fn = None
+        if self.use_triton_infer:
+            from .triton.so2_rotation import (
+                rotate_back_block_so2,
+                rotate_back_dense,
+                rotate_to_local_block,
+                rotate_to_local_dense,
+            )
+
+            if self.mmax == 1:
+                self._rotate_to_local_fn = lambda x, src, wigner: rotate_to_local_block(
+                    x, src, wigner, self.lmax
+                )
+                # The block kernel reads the (E, F, D_m, Cf) focus layout directly,
+                # so the rotate-back path passes ``x_local`` before the global
+                # reshape and the transpose-back copy is skipped.
+                self._rotate_back_fn = lambda x_local, wigner: rotate_back_block_so2(
+                    x_local, wigner, self.lmax
+                )
             else:
-                # === Step 8.1. Build attention logits from scalar channels ===
-                compute_dtype = self.compute_dtype
-                x_l0_node = x[:, 0, :].reshape(
-                    n_node, self.attn_n_focus, self.attn_focus_dim
-                )  # (N, Fa, Ca)
-                qk_input = self.attn_qk_norm(x_l0_node.to(dtype=compute_dtype))
-                q_node = self.attn_q_proj(qk_input)  # (N, Fa, Ca)
-                k_node = self.attn_k_proj(qk_input)  # (N, Fa, Ca)
-                q_edge = q_node.index_select(0, dst).reshape(
-                    n_edge, self.attn_n_focus, self.n_atten_head, self.head_dim
-                )  # (E, Fa, H, Ch), Ca = H * Ch
-                k_edge = k_node.index_select(0, src).reshape(
-                    n_edge, self.attn_n_focus, self.n_atten_head, self.head_dim
-                )  # (E, Fa, H, Ch)
-                radial_l0 = rad_feat[:, 0, :].reshape(
-                    n_edge, self.attn_n_focus, self.attn_focus_dim
-                )  # (E, Fa, Ca)
-                radial_bias = torch.einsum(
-                    "efi,ifo->efo",
-                    radial_l0.to(dtype=compute_dtype),
-                    self.adamw_attn_logit_w,
-                )  # (E, F, H)
-                attn_logits: torch.Tensor = (q_edge * k_edge).sum(-1) * (
-                    self.head_dim**-0.5
+                self._rotate_to_local_fn = lambda x, src, wigner: rotate_to_local_dense(
+                    x, src, wigner, self.coeff_index_m, self.ebed_dim_full
+                )
+                self._rotate_back_fn = lambda x_local, wigner: rotate_back_dense(
+                    x_local, wigner, self.coeff_index_m, self.ebed_dim_full
                 )
-                attn_logits = attn_logits + radial_bias
 
-                # === Step 8.2. Destination-wise stable envelope-gated softmax ===
-                # ``src_weight=edge_src_gate`` folds SFPG into both the
-                # numerator and the denominator of the softmax. A muted
-                # source (``eta_src = 0``) therefore drops out of the
-                # destination's attention normalization entirely, which
-                # is required for the attention path to honor the
-                # frozen-zone invariance: a post-multiplication on
-                # ``attn_alpha`` alone would still leave the muted
-                # source leaking through the shared denominator.
-                attn_alpha = segment_envelope_gated_softmax(
-                    logits=attn_logits,
-                    edge_env=edge_cache.edge_env.to(dtype=compute_dtype),
-                    dst=dst,
-                    n_nodes=n_node,
-                    z_bias_raw=self.adamw_attn_z_bias_raw,
-                    eps=self.eps,
-                    src_weight=(
-                        None
-                        if edge_src_gate is None
-                        else edge_src_gate.to(dtype=compute_dtype)
+        # === Step 3. Multiple SO2Linear layers ===
+        self.so2_linears = nn.ModuleList(
+            [
+                SO2Linear(
+                    lmax=self.lmax,
+                    mmax=self.mmax,
+                    in_channels=self.so2_focus_dim,
+                    out_channels=(
+                        2 * self.so2_focus_dim
+                        if self.s2_activation and i < self.mixing_layers - 1
+                        else self.so2_focus_dim
                     ),
-                )  # (E, F, H)
-
-                # === Step 8.3. Value projection and head-wise aggregation ===
-                value_focus = x_message.reshape(
-                    n_edge,
-                    self.ebed_dim_full,
-                    self.attn_n_focus,
-                    self.attn_focus_dim,
-                ).to(dtype=compute_dtype)  # (E, D, Fa, Ca)
-                if self.attn_v_proj is not None:
-                    value_focus = self.attn_v_proj(value_focus)
-                value_heads = value_focus.reshape(
-                    n_edge,
-                    self.ebed_dim_full,
-                    self.attn_n_focus,
-                    self.n_atten_head,
-                    self.head_dim,
-                )  # (E, D, Fa, H, Ch)
-                weighted_value = value_heads * attn_alpha.reshape(
-                    n_edge, 1, self.attn_n_focus, self.n_atten_head, 1
+                    n_focus=self.n_focus,
+                    dtype=self.dtype,
+                    mlp_bias=self.mlp_bias,
+                    seed=child_seed(seed_so2_stack, i),
+                    trainable=trainable,
                 )
-                out_heads = torch.zeros(
-                    n_node,
-                    self.ebed_dim_full,
-                    self.attn_n_focus,
-                    self.n_atten_head,
-                    self.head_dim,
-                    device=x.device,
-                    dtype=compute_dtype,
-                )  # (N, D, Fa, H, Ch)
-                out_heads.index_add_(0, dst, weighted_value)
+                for i in range(self.mixing_layers)
+            ]
+        )
 
-                # === Step 8.4. Output-side head gate ===
-                attn_output_gate = torch.sigmoid(
-                    torch.einsum(
-                        "nfi,ifo->nfo",
-                        self.attn_output_gate_norm(x_l0_node.to(dtype=compute_dtype)),
-                        self.adamw_attn_gate_w,
+        # === Step 4. Intermediate norms (the last layer always uses Identity) ===
+        inter_norms: list[nn.Module] = []
+        for i in range(self.mixing_layers):
+            if self.so2_norm and i < self.mixing_layers - 1:
+                inter_norms.append(
+                    ReducedEquivariantRMSNorm(
+                        lmax=self.lmax,
+                        mmax=self.mmax,
+                        channels=self.so2_focus_dim,
+                        degree_index_m=self.degree_index_m,
+                        n_focus=self.n_focus,
+                        dtype=self.compute_dtype,
+                        trainable=trainable,
                     )
-                )  # (N, F, H)
-                out_heads = out_heads * attn_output_gate.reshape(
-                    n_node, 1, self.attn_n_focus, self.n_atten_head, 1
-                )  # (N, D, Fa, H, Ch)
+                )
+            else:
+                inter_norms.append(nn.Identity())
+        self.so2_inter_norms = nn.ModuleList(inter_norms)
 
-                # === Step 8.5. Output projection and merge heads ===
-                out_focus = out_heads.reshape(
-                    n_node,
-                    self.ebed_dim_full,
-                    self.attn_n_focus,
-                    self.attn_focus_dim,
-                )  # (N, D, Fa, Ca)
-                if self.attn_o_proj is not None:
-                    out_focus = self.attn_o_proj(out_focus)
-                out = out_focus.reshape(
-                    n_node, self.ebed_dim_full, self.hidden_channels
-                ).to(dtype=self.dtype)  # (N, D, C_wide)
+        # === Step 5. Intermediate non-linearity (the last layer stays linear) ===
+        non_linearities: list[nn.Module] = []
+        for i in range(self.mixing_layers):
+            if i >= self.mixing_layers - 1:
+                non_linearities.append(nn.Identity())
+            elif self.s2_activation:
+                non_linearities.append(
+                    S2GridNet(
+                        lmax=self.lmax,
+                        mmax=self.mmax,
+                        channels=self.so2_focus_dim,
+                        n_focus=self.n_focus,
+                        mode="self",
+                        op_type="glu",
+                        dtype=self.compute_dtype,
+                        layout="nfdc",
+                        grid_resolution_list=self.s2_grid_resolution,
+                        coefficient_layout="m_major",
+                        grid_method=self.s2_grid_method,
+                        mlp_bias=self.mlp_bias,
+                        trainable=trainable,
+                        seed=child_seed(seed_non_linearities, i),
+                    )
+                )
+            else:
+                non_linearities.append(
+                    GatedActivation(
+                        lmax=self.lmax,
+                        mmax=self.mmax,
+                        channels=self.so2_focus_dim,
+                        n_focus=self.n_focus,
+                        dtype=self.compute_dtype,
+                        activation_function=self.activation_function,
+                        mlp_bias=self.mlp_bias,
+                        layout="nfdc",
+                        trainable=trainable,
+                        seed=child_seed(seed_non_linearities, i),
+                    )
+                )
+        self.non_linearities = nn.ModuleList(non_linearities)
 
-        # === Step 9. Optional message-node grid product ===
-        if self.message_node_grid_product is not None:
-            with nvtx_range("SO2Conv/message_node_grid"):
-                out = out + self.message_node_grid_product(out, x)
+        # === Step 6. Optional depth-wise attention residuals across SO(2) layers ===
+        if self.use_so2_attn_res:
+            self.so2_layer_attn_res: nn.ModuleList | None = nn.ModuleList(
+                [
+                    DepthAttnRes(
+                        channels=self.hidden_channels,
+                        input_dependent=self.so2_attn_res_mode == "dependent",
+                        eps=self.eps,
+                        bias=self.mlp_bias,
+                        dtype=self.compute_dtype,
+                        trainable=trainable,
+                        seed=child_seed(seed_depth_attn, i),
+                    )
+                    for i in range(self.mixing_layers)
+                ]
+            )
+        else:
+            self.so2_layer_attn_res = None
 
-        # === Step 10. Final channel mixing ===
-        with nvtx_range("SO2Conv/post_focus_mix"):
-            out = self.post_focus_mix(out.unsqueeze(2)).squeeze(2)
-        return out  # (N, D, C)
+        # === Step 7. Optional per-layer LayerScale for SO(2) residual branches ===
+        if self.layer_scale:
+            self.adam_so2_layer_scales = nn.ParameterList(
+                [
+                    nn.Parameter(
+                        torch.ones(
+                            self.n_focus,
+                            self.so2_focus_dim,
+                            dtype=self.dtype,
+                            device=self.device,
+                        )
+                        * 1e-3,
+                        requires_grad=trainable,
+                    )
+                    for _ in range(self.mixing_layers)
+                ]
+            )
+        else:
+            self.adam_so2_layer_scales = None
 
     def serialize(self) -> dict[str, Any]:
         trainable = all(p.requires_grad for p in self.parameters())
@@ -1902,7 +2255,7 @@ def serialize(self) -> dict[str, Any]:
                 "focus_dim": self.focus_dim,
                 "focus_compete": self.focus_compete,
                 "so2_norm": self.so2_norm,
-                "so2_layers": self.so2_layers,
+                "mixing_layers": self.mixing_layers,
                 "so2_attn_res": self.so2_attn_res_mode,
                 "layer_scale": self.layer_scale,
                 "n_atten_head": self.n_atten_head,
@@ -1923,6 +2276,8 @@ def serialize(self) -> dict[str, Any]:
                 "mlp_bias": self.mlp_bias,
                 "radial_so2_mode": self.radial_so2_mode,
                 "radial_so2_rank": self.radial_so2_rank,
+                "edge_cartesian": self.edge_cartesian,
+                "node_cartesian": self.node_cartesian,
                 "eps": self.eps,
                 "precision": RESERVED_PRECISION_DICT[self.dtype],
                 "trainable": trainable,
diff --git a/deepmd/pt/model/model/sezm_model.py b/deepmd/pt/model/model/sezm_model.py
index 2f07f2522c..a9ade1c79d 100644
--- a/deepmd/pt/model/model/sezm_model.py
+++ b/deepmd/pt/model/model/sezm_model.py
@@ -2086,12 +2086,28 @@ def compute_fn(  # type: ignore[misc]
         # FakeTensors, so we need concrete values to resolve their
         # control flow exactly once; shapes become symbolic immediately
         # afterwards.
-        traced = make_fx(
-            compute_fn,
-            tracing_mode="symbolic",
-            _allow_non_fake_inputs=True,
-            decomposition_table=decomp_table,
-        )(*trace_args)
+        # Eval lowers this make_fx graph verbatim through AOTAutograd, reusing
+        # the traced placeholders' symbolic shapes; any duck-shape symbol
+        # collision is therefore baked into the inference artifact. Disable
+        # duck-shaping for the eval trace so every size and stride receives an
+        # independent symbol -- a first frame whose ``nloc`` equals a trace axis
+        # size (e.g. the edge count) can then never unify an unrelated axis onto
+        # that symbol. Training re-derives its symbols through Dynamo from real
+        # contiguous inputs, so it keeps the default duck-shaped behavior.
+        from torch.fx.experimental import _config as fx_experimental_config
+
+        saved_use_duck_shape = fx_experimental_config.use_duck_shape
+        if mode == "eval":
+            fx_experimental_config.use_duck_shape = False
+        try:
+            traced = make_fx(
+                compute_fn,
+                tracing_mode="symbolic",
+                _allow_non_fake_inputs=True,
+                decomposition_table=decomp_table,
+            )(*trace_args)
+        finally:
+            fx_experimental_config.use_duck_shape = saved_use_duck_shape
 
         if self.training:
             # Only the training trace runs with ``create_graph=True``, so only
diff --git a/deepmd/pt/utils/compile_compat.py b/deepmd/pt/utils/compile_compat.py
index b1b533f740..a5dea86e26 100644
--- a/deepmd/pt/utils/compile_compat.py
+++ b/deepmd/pt/utils/compile_compat.py
@@ -179,8 +179,19 @@ def trace_pad_dim(t: torch.Tensor, dim: int, target: int) -> torch.Tensor:
     index-bearing tensors (``nlist`` neighbor indices, ``mapping``
     extended-to-local indices) because the duplicated row reuses the
     previously-valid row's values.  Trimming likewise never invalidates
-    indices.  Only shapes flow downstream during ``make_fx`` tracing,
-    so the exact replicated/trimmed values do not affect the FX graph.
+    indices.
+
+    The result is always contiguous, which matters as much as its shape.
+    Trimming a non-leading dimension by slicing returns a view whose stride
+    still encodes the *pre-trim* length; ``make_fx`` symbolic tracing records
+    that stale stride as a free symbol, and duck-shaping then unifies it with
+    any size symbol that happens to share the same trace-time value -- e.g. the
+    trimmed ``atype`` stride (= the frame's ``nloc``) colliding with the edge
+    count when both equal a ``next_safe_prime`` value. The compiled graph would
+    then guard unrelated axes against one another and fail ``assert_size_stride``
+    at runtime. Materializing a contiguous copy keeps the trace inputs' memory
+    layout identical to the contiguous runtime inputs, so strides never carry a
+    stale length into the symbol pool.
     """
     cur = int(t.shape[dim])
     if cur == target:
@@ -188,7 +199,7 @@ def trace_pad_dim(t: torch.Tensor, dim: int, target: int) -> torch.Tensor:
     if cur > target:
         sl: list[slice] = [slice(None)] * t.ndim
         sl[dim] = slice(None, target)
-        return t[tuple(sl)]
+        return t[tuple(sl)].contiguous()
     sl = [slice(None)] * t.ndim
     sl[dim] = slice(-1, None)
     last = t[tuple(sl)]
diff --git a/deepmd/pt_expt/descriptor/__init__.py b/deepmd/pt_expt/descriptor/__init__.py
index 163a6788d8..af1fa69893 100644
--- a/deepmd/pt_expt/descriptor/__init__.py
+++ b/deepmd/pt_expt/descriptor/__init__.py
@@ -1,6 +1,10 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-# Import to register converters
+# Import to register converters. ``dpa4_nn`` registers the dpmodel -> pt_expt
+# converters for the DPA4 interaction block (activation checkpointing) and the
+# SO(2) modules / radial MLP (opt-in Triton kernels, trainable-weight promotion),
+# so the auto-wrapped descriptor tree picks up those subclasses.
 from . import (  # noqa: F401
+    dpa4_nn,
     repflows,
     repformers,
     se_t_tebd_block,
diff --git a/deepmd/pt_expt/descriptor/dpa4.py b/deepmd/pt_expt/descriptor/dpa4.py
index cd41b1aaf8..baca905d64 100644
--- a/deepmd/pt_expt/descriptor/dpa4.py
+++ b/deepmd/pt_expt/descriptor/dpa4.py
@@ -7,6 +7,10 @@
 
 from deepmd.dpmodel.descriptor.dpa4 import DescrptDPA4 as DescrptDPA4DP
 from deepmd.dpmodel.descriptor.dpa4_nn.activation import SwiGLU as SwiGLUDP
+from deepmd.dpmodel.descriptor.dpa4_nn.grid_net import GridProduct as GridProductDP
+from deepmd.dpmodel.descriptor.dpa4_nn.radial import (
+    C3CutoffEnvelope as C3CutoffEnvelopeDP,
+)
 from deepmd.dpmodel.descriptor.dpa4_nn.wignerd import (
     WignerDCalculator as WignerDCalculatorDP,
 )
@@ -46,6 +50,33 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
 register_dpmodel_mapping(SwiGLUDP, lambda v: SwiGLU())
 
 
+@torch_module
+class C3CutoffEnvelope(C3CutoffEnvelopeDP):
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        return self.call(*args, **kwargs)
+
+
+# C3CutoffEnvelope carries only scalar configuration (cutoff radius and
+# polynomial exponent) and holds no trainable arrays, so it implements no
+# serialize()/deserialize() that the generic auto-wrap path relies on; rebuild
+# it directly from the stored constructor arguments (``p`` is the exponent).
+register_dpmodel_mapping(
+    C3CutoffEnvelopeDP,
+    lambda v: C3CutoffEnvelope(v.rcut, v.p, precision=v.precision),
+)
+
+
+@torch_module
+class GridProduct(GridProductDP):
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        return self.call(*args, **kwargs)
+
+
+# GridProduct is a parameter-free quadratic grid product with no constructor
+# arguments and no serialize()/deserialize(); rebuild a fresh instance.
+register_dpmodel_mapping(GridProductDP, lambda v: GridProduct())
+
+
 # ---------------------------------------------------------------------------
 # Trainable-weight promotion
 #
@@ -86,6 +117,12 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
     ),
     # dpa4_nn.embedding
     "SeZMTypeEmbedding": ("adam_type_embedding",),
+    # dpa4_nn.attn_res
+    "DepthAttnRes": ("adamw_pseudo_query",),
+    # dpa4_nn.grid_net (residual_scale is None when disabled; _promote_trainable
+    # skips the missing buffer, so listing both concrete subclasses is safe)
+    "S2GridNet": ("residual_scale",),
+    "SO3GridNet": ("residual_scale",),
     # descriptor-level FiLM strengths
     "DescrptDPA4": ("film_scale_strength_log", "film_shift_strength_log"),
 }
@@ -147,6 +184,25 @@ def deserialize(cls, data: dict) -> "DescrptDPA4":
     def forward(self, *args: Any, **kwargs: Any) -> Any:
         return self.call(*args, **kwargs)
 
+    def _forward_blocks(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> Any:
+        """Run the interaction blocks under the pt_expt AMP policy.
+
+        This is the torch (pt_expt) implementation of the descriptor's
+        ``use_amp`` switch, mirroring the reference pt ``_compute_mode_ctx``:
+        bfloat16 autocast wraps only the interaction-block region, while the
+        geometry, edge cache, radial, env-seed, GIE and output FFN stages stay
+        in fp32 (or higher). The dpmodel base stores ``use_amp`` only as a
+        config flag and never autocasts (array-API has no autocast), so the
+        real automatic mixed precision lives here. ``x`` is the node-feature
+        tensor entering the blocks; its device equals the working device, so
+        autocast engages only when ``self.use_amp`` is set, the module is in
+        training mode, and the inputs live on a CUDA device.
+        """
+        if self.use_amp and self.training and x.device.type == "cuda":
+            with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
+                return super()._forward_blocks(x, *args, **kwargs)
+        return super()._forward_blocks(x, *args, **kwargs)
+
     def share_params(
         self,
         base_class: "DescrptDPA4",
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/__init__.py b/deepmd/pt_expt/descriptor/dpa4_nn/__init__.py
new file mode 100644
index 0000000000..4b649efaae
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/__init__.py
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt overrides for DPA4/SeZM sub-modules.
+
+These wrappers inject PyTorch-runtime behavior that the array-API dpmodel
+implementation cannot express:
+
+- :mod:`block` -- eval-time activation checkpointing of the interaction units.
+- :mod:`so2` -- opt-in fused Triton kernels for the SO(2) rotation and the
+  dynamic radial degree mixer.
+- :mod:`radial` -- a torch-native radial embedding MLP whose linear / norm
+  weights are trainable parameters (the dpmodel list mixes modules with a bare
+  activation function, which the generic conversion cannot turn into a
+  ``ModuleList``).
+
+Importing this package registers the dpmodel -> pt_expt converters (via
+``torch_module``), so the auto-wrapped descriptor tree picks up these subclasses
+instead of the generic dpmodel wrappers.
+"""
+
+from . import (  # noqa: F401
+    block,
+    radial,
+    so2,
+)
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/block.py b/deepmd/pt_expt/descriptor/dpa4_nn/block.py
new file mode 100644
index 0000000000..c78196ff58
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/block.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt interaction block with eval-time activation checkpointing.
+
+The dpmodel :class:`SeZMInteractionBlock` is array-API only and never
+checkpoints (array-API has no ``torch.utils.checkpoint``).  This wrapper injects
+the reference pt activation-checkpoint policy around the two recomputable units
+of the block -- the SO(2) convolution and each FFN subblock -- mirroring
+``deepmd.pt.model.descriptor.sezm_nn.block``.  Checkpointing trades compute for
+memory on the eval-time autograd path (force from ``autograd.grad``) and is
+opt-in through the ``DP_ACT_INFER`` environment variable.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import dataclasses
+import os
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+import torch
+from torch.utils.checkpoint import (
+    checkpoint,
+)
+
+from deepmd.dpmodel.descriptor.dpa4_nn.block import (
+    SeZMInteractionBlock as SeZMInteractionBlockDP,
+)
+from deepmd.dpmodel.descriptor.dpa4_nn.block import (
+    exchange_ghost_features,
+)
+from deepmd.pt_expt.common import (
+    torch_module,
+)
+
+if TYPE_CHECKING:
+    from deepmd.dpmodel.descriptor.dpa4_nn.edge_cache import (
+        EdgeFeatureCache,
+    )
+
+# Environment values that enable an inference flag.
+_TRUTHY = {"1", "true", "yes", "on"}
+
+
+@torch_module
+class SeZMInteractionBlock(SeZMInteractionBlockDP):
+    """SeZM interaction block with eval-time activation checkpointing."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Inference env policy, sampled once here (see
+        # ``_use_infer_activation_checkpoint``).
+        self._act_infer = os.environ.get("DP_ACT_INFER", "").strip().lower() in _TRUTHY
+        self._compile_infer = (
+            os.environ.get("DP_COMPILE_INFER", "").strip().lower() in _TRUTHY
+        )
+
+    def _use_infer_activation_checkpoint(self, *tensors: torch.Tensor) -> bool:
+        """Return whether eval-time activation checkpointing should be used.
+
+        Disabled on the compiled inference path (``DP_COMPILE_INFER``): Inductor
+        already reuses activation buffers, so recomputation only adds latency for
+        a negligible memory gain there.
+        """
+        return (
+            not self.training
+            and self._act_infer
+            and not self._compile_infer
+            and torch.is_grad_enabled()
+            and any(tensor.requires_grad for tensor in tensors)
+        )
+
+    def _run_so2_unit(
+        self,
+        x: torch.Tensor,
+        edge_cache: EdgeFeatureCache,
+        radial_feat: torch.Tensor,
+        comm_dict: dict[str, torch.Tensor] | None = None,
+    ) -> torch.Tensor:
+        if comm_dict is not None:
+            x = exchange_ghost_features(x, comm_dict)
+        if self._use_infer_activation_checkpoint(x, radial_feat):
+            edge_cache_no_proj = dataclasses.replace(
+                edge_cache,
+                D_to_m_cache=None,
+                Dt_from_m_cache=None,
+            )
+            return checkpoint(
+                lambda x_, radial_feat_: self._run_so2_unit_impl(
+                    x_,
+                    edge_cache_no_proj,
+                    radial_feat_,
+                ),
+                x,
+                radial_feat,
+                use_reentrant=False,
+                preserve_rng_state=True,
+            )
+        return self._run_so2_unit_impl(x, edge_cache, radial_feat)
+
+    def _run_ffn_unit(self, x: torch.Tensor, unit_idx: int) -> torch.Tensor:
+        if self._use_infer_activation_checkpoint(x):
+            return checkpoint(
+                lambda x_: self._run_ffn_unit_impl(x_, unit_idx),
+                x,
+                use_reentrant=False,
+                preserve_rng_state=True,
+            )
+        return self._run_ffn_unit_impl(x, unit_idx)
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/radial.py b/deepmd/pt_expt/descriptor/dpa4_nn/radial.py
new file mode 100644
index 0000000000..2ac636d5e6
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/radial.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt wrapper for the shared radial embedding MLP.
+
+The dpmodel :class:`RadialMLP` stores its layers in a single ``net`` list whose
+entries alternate between ``NativeOP`` modules (``NativeLayer`` linear maps and
+``RMSNorm``) and a *plain activation function* returned by ``get_activation_fn``.
+The generic ``dpmodel_setattr`` list conversion only turns a list into a
+``torch.nn.ModuleList`` when every entry is a module (or ``None``); the bare
+activation function makes that check fail, so the list -- and therefore the
+linear / norm weights nested inside it -- would stay raw numpy arrays, invisible
+to autograd and the optimizer.
+
+This wrapper rebuilds ``net`` as a ``ModuleList`` once the tree is constructed,
+converting each ``NativeOP`` entry through the standard registry (so the linear
+maps become trainable :class:`~deepmd.pt_expt.utils.network.NativeLayer` weights
+and the norms become promotable buffers) and replacing the plain activation with
+a parameter-free torch module.  The activation reuses the backend's
+``_torch_activation`` so it is bit-identical to the reference pt
+``ActivationFn`` and safe under ``make_fx`` tracing.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+from typing import (
+    Any,
+)
+
+import torch
+
+from deepmd.dpmodel.common import (
+    NativeOP,
+)
+from deepmd.dpmodel.descriptor.dpa4_nn.radial import RadialMLP as RadialMLPDP
+from deepmd.pt_expt.common import (
+    register_dpmodel_mapping,
+    torch_module,
+    try_convert_module,
+)
+from deepmd.pt_expt.utils.network import (
+    _torch_activation,
+)
+
+
+class _ScalarActivation(torch.nn.Module):
+    """Parameter-free torch module applying a named scalar activation.
+
+    Mirrors the position the plain activation function occupies in the dpmodel
+    ``RadialMLP.net`` list, so the whole list can become a ``ModuleList``.
+    """
+
+    def __init__(self, activation_function: str) -> None:
+        super().__init__()
+        self.activation_function = str(activation_function)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return _torch_activation(x, self.activation_function)
+
+
+@torch_module
+class RadialMLP(RadialMLPDP):
+    """Radial embedding MLP with a torch-native, trainable ``net``."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # ``self.net`` is still the raw dpmodel list here (the bare activation
+        # function blocked the generic list -> ModuleList conversion).  Convert
+        # every entry explicitly so the linear / norm weights live in trainable
+        # torch sub-modules.
+        self.net = torch.nn.ModuleList(self._convert_layer(layer) for layer in self.net)
+
+    def _convert_layer(self, layer: Any) -> torch.nn.Module:
+        if isinstance(layer, torch.nn.Module):
+            return layer
+        if isinstance(layer, NativeOP):
+            return try_convert_module(layer)
+        return _ScalarActivation(self.activation_function)
+
+
+# Build the torch-native RadialMLP wherever the dpmodel one is assigned in the
+# auto-wrapped descriptor tree (e.g. ``DescrptDPA4.radial_embedding``).
+register_dpmodel_mapping(RadialMLPDP, lambda v: RadialMLP.deserialize(v.serialize()))
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/so2.py b/deepmd/pt_expt/descriptor/dpa4_nn/so2.py
new file mode 100644
index 0000000000..ae9ffc43aa
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/so2.py
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt SO(2) convolution and radial mixer with opt-in fused Triton kernels.
+
+The dpmodel SO(2) modules are array-API only.  These wrappers inject the
+reference pt opt-in Triton inference path (``DP_TRITON_INFER``) around the two
+rotation hot paths of the SO(2) convolution and the low-rank branch of the
+dynamic radial degree mixer, mirroring
+``deepmd.pt.model.descriptor.sezm_nn.so2``.  The kernels run only during
+inference (``not self.training``); training and CPU / fp64 inference fall back to
+the dpmodel dense path.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import os
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from deepmd.dpmodel.descriptor.dpa4_nn.so2 import (
+    DynamicRadialDegreeMixer as DynamicRadialDegreeMixerDP,
+)
+from deepmd.dpmodel.descriptor.dpa4_nn.so2 import SO2Convolution as SO2ConvolutionDP
+from deepmd.pt_expt.common import (
+    torch_module,
+)
+
+if TYPE_CHECKING:
+    import torch
+
+    from deepmd.dpmodel.descriptor.dpa4_nn.edge_cache import (
+        EdgeFeatureCache,
+    )
+
+_TRITON_INFER_TRUE = ("1", "true", "yes", "on")
+
+
+def use_triton_infer() -> bool:
+    """Return whether the opt-in Triton inference kernels are enabled.
+
+    The flag is controlled by the ``DP_TRITON_INFER`` environment variable and
+    is read at module construction time so that it becomes a compile-time
+    constant in the traced (``make_fx``) graph. It only takes effect during
+    inference; training always uses the dense reference path.
+
+    Returns
+    -------
+    bool
+        ``True`` when ``DP_TRITON_INFER`` is set to a truthy value.
+    """
+    return os.environ.get("DP_TRITON_INFER", "0").strip().lower() in _TRITON_INFER_TRUE
+
+
+@torch_module
+class DynamicRadialDegreeMixer(DynamicRadialDegreeMixerDP):
+    """Dynamic radial degree mixer with an opt-in fused Triton low-rank branch."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # Inference fast path (opt-in via ``DP_TRITON_INFER``): a fused Triton
+        # kernel replaces the dense scatter and the tiny batched matmul of the
+        # ``degree_channel`` low-rank branch in the ``mmax == 1`` layout.
+        self.use_triton_infer = use_triton_infer()
+        self._radial_mix_block = None
+        if (
+            self.use_triton_infer
+            and self.mode == "degree_channel"
+            and self.rank > 0
+            and self.mmax == 1
+        ):
+            from .triton.radial_mix import (
+                radial_mix_block,
+            )
+
+            self._radial_mix_block = radial_mix_block
+
+    def _mix_rank_compact(
+        self, compact: torch.Tensor, x_local: torch.Tensor
+    ) -> torch.Tensor:
+        if self._radial_mix_block is not None and not self.training:
+            return self._radial_mix_block(
+                compact, x_local, self.channel_basis, self.lmax
+            )
+        return super()._mix_rank_compact(compact, x_local)
+
+
+@torch_module
+class SO2Convolution(SO2ConvolutionDP):
+    """SO(2) convolution with opt-in fused Triton rotation kernels."""
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+        # ``use_triton_infer`` is read once at construction so it is a
+        # compile-time constant in the traced (``make_fx``) graph, and it only
+        # takes effect during inference.
+        self.use_triton_infer = use_triton_infer()
+
+        # === Triton rotation kernels: block for mmax == 1, dense otherwise ===
+        self._rotate_to_local_fn = None
+        self._rotate_back_fn = None
+        if self.use_triton_infer:
+            from .triton.so2_rotation import (
+                rotate_back_block_so2,
+                rotate_back_dense,
+                rotate_to_local_block,
+                rotate_to_local_dense,
+            )
+
+            if self.mmax == 1:
+                self._rotate_to_local_fn = lambda x, src, wigner: rotate_to_local_block(
+                    x, src, wigner, self.lmax
+                )
+                # The block kernel reads the (E, F, D_m, Cf) focus layout directly,
+                # so the rotate-back path passes ``x_local`` before the global
+                # reshape and the transpose-back copy is skipped.
+                self._rotate_back_fn = lambda x_local, wigner: rotate_back_block_so2(
+                    x_local, wigner, self.lmax
+                )
+            else:
+                self._rotate_to_local_fn = lambda x, src, wigner: rotate_to_local_dense(
+                    x, src, wigner, self.coeff_index_m, self.ebed_dim_full
+                )
+                self._rotate_back_fn = lambda x_local, wigner: rotate_back_dense(
+                    x_local, wigner, self.coeff_index_m, self.ebed_dim_full
+                )
+
+    def _rotate_to_local(
+        self, x: torch.Tensor, edge_cache: EdgeFeatureCache
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if self.use_triton_infer and not self.training:
+            # ``self._rotate_to_local_fn`` was bound in ``__init__`` (the block
+            # kernel for the m-major ``mmax == 1`` layout, dense otherwise).
+            D_full = edge_cache.D_full
+            x_local = self._rotate_to_local_fn(x, edge_cache.src, D_full)
+            x_dst_local: torch.Tensor | None = None
+            if self.node_wise_grid_product is not None:
+                x_dst_local = self._rotate_to_local_fn(x, edge_cache.dst, D_full)
+            return x_local, x_dst_local
+        return super()._rotate_to_local(x, edge_cache)
+
+    def _rotate_back(
+        self, x_local: torch.Tensor, edge_cache: EdgeFeatureCache, n_edge: int
+    ) -> torch.Tensor:
+        if self.use_triton_infer and not self.training:
+            Dt_full = edge_cache.Dt_full
+            if self.mmax == 1:
+                # The block kernel consumes the (E, F, D_m, Cf) focus layout in
+                # place, folding the inverse transpose into its channel addressing.
+                return self._rotate_back_fn(x_local, Dt_full)
+            # Restore reduced global layout (E, D_m, C_wide) for the dense kernel.
+            x_std = (
+                x_local.transpose(1, 2)
+                .contiguous()
+                .reshape(n_edge, self.reduced_dim, self.hidden_channels)
+            )
+            return self._rotate_back_fn(x_std, Dt_full)
+        return super()._rotate_back(x_local, edge_cache, n_edge)
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/triton/__init__.py b/deepmd/pt_expt/descriptor/dpa4_nn/triton/__init__.py
new file mode 100644
index 0000000000..3cc27f40d4
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/triton/__init__.py
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Hardware-accelerated SeZM/DPA4 operators.
+
+This package hosts ``make_fx``-composable Triton implementations of SeZM hot
+paths.  Kernel entry points are internal implementation details of the SeZM
+descriptor; the package-level API only exposes availability.
+"""
+
+from .radial_mix import (
+    RADIAL_MIX_TRITON_AVAILABLE,
+)
+from .so2_rotation import (
+    TRITON_ROTATION_AVAILABLE,
+)
+
+# Both kernel modules guard their ``@triton.jit`` definitions behind a ``triton``
+# import, so the two module-level checks are equivalent. Expose a single
+# package-level availability flag.
+TRITON_AVAILABLE = TRITON_ROTATION_AVAILABLE and RADIAL_MIX_TRITON_AVAILABLE
+
+__all__ = [
+    "TRITON_AVAILABLE",
+]
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/triton/radial_mix.py b/deepmd/pt_expt/descriptor/dpa4_nn/triton/radial_mix.py
new file mode 100644
index 0000000000..c563887834
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/triton/radial_mix.py
@@ -0,0 +1,836 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+# pyright: reportMissingImports=false
+# ruff: noqa: ANN001, ANN202
+"""Fused Triton dynamic radial degree mixer for the SeZM/DPA4 descriptor.
+
+This module provides a clean-room Triton implementation of the
+``degree_channel`` branch of :class:`DynamicRadialDegreeMixer` for the
+``mmax == 1`` reduced layout.  The eager reference applies, per edge ``e`` and
+output coefficient ``o``::
+
+    out[e, o, c] = sum_r channel_basis[r, c] * sum_i K_r[e, o, i] * x[e, i, c]
+
+where ``K_r`` is the edge-conditioned degree kernel obtained by scattering the
+projected radial features ``compact`` into a ``(reduced_dim, reduced_dim)``
+matrix.  ``K_r`` is block-diagonal over the ``|m|`` groups, so for
+``mmax == 1`` only a ``(lmax+1) x (lmax+1)`` block (orders ``m = 0``) and two
+identical ``lmax x lmax`` blocks (orders ``m = -1`` and ``m = +1``) are
+non-zero.
+
+Design goals
+------------
+1. **Skip the structural zeros and the dense scratch.** The eager path
+   materializes the dense kernel ``(E, reduced_dim, reduced_dim, rank)`` via a
+   scatter and then contracts it with a batched ``einsum``/``bmm`` whose matrices
+   are tiny (``reduced_dim <= 16``), which is inefficient on cuBLAS and wastes
+   roughly two thirds of the multiply-adds on off-block zeros. The kernel
+   instead reads ``compact`` directly and contracts only the structural
+   non-zeros, with the channel axis vectorized and one program per edge.
+2. **Match eager fp32 accuracy.** Accumulation is in fp32, matching the smooth
+   potential-energy surface contract used throughout the SeZM descriptor.
+3. **Compose with the SeZM ``make_fx`` lowering *and* the AOTInductor freeze.**
+   The forward and backward are functional ``torch.library.triton_op`` instances
+   (``mutates_args=()``) with registered fake kernels and an autograd formula, so
+   ``make_fx(tracing_mode="symbolic")`` captures the energy path together with
+   the force autograd graph used by inference. ``triton_op`` + ``wrap_triton``
+   (vs ``custom_op``) lets Inductor see through to the Triton kernel and bake the
+   cubin into the AOTInductor ``.pt2``, so the frozen package runs the fused
+   mixer inside the LAMMPS C++ runtime without any Python op registration.
+
+Inference-only contract
+-----------------------
+The operator is opt-in through ``DP_TRITON_INFER`` and is only used in
+evaluation, where the force is obtained from ``autograd.grad(energy, coord)``.
+The backward therefore returns gradients with respect to ``compact`` and
+``x_local`` (both of which carry a path to the coordinates) and ``None`` for
+``channel_basis``, which is a parameter and never differentiated by the force
+computation.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import torch
+from torch import (
+    Tensor,
+)
+from torch.library import (
+    wrap_triton,
+)
+
+__all__ = [
+    "RADIAL_MIX_TRITON_AVAILABLE",
+    "radial_mix_block",
+    "radial_mix_reference",
+]
+
+try:
+    import triton
+    import triton.language as tl
+
+    RADIAL_MIX_TRITON_AVAILABLE = True
+except ImportError:  # pragma: no cover - exercised only without triton
+    RADIAL_MIX_TRITON_AVAILABLE = False
+
+
+# ======================================================================
+# Eager reference / fallback implementation
+# ======================================================================
+def _block_layout(lmax: int) -> list[tuple[int, int, int]]:
+    """Return ``(coeff_start, compact_start, num_l)`` for the ``mmax == 1`` blocks.
+
+    The reduced m-major layout keeps, for each degree ``l``, the orders
+    ``m = 0`` (the leading ``lmax + 1`` coefficients) followed by ``m = -1`` and
+    ``m = +1`` (``lmax`` coefficients each). The degree kernel for the two
+    signed-``m`` blocks is shared, hence the identical ``compact_start``.
+    """
+    num_l0 = lmax + 1
+    return [
+        (0, 0, num_l0),
+        (num_l0, num_l0 * num_l0, lmax),
+        (num_l0 + lmax, num_l0 * num_l0, lmax),
+    ]
+
+
+def radial_mix_reference(
+    compact: Tensor, x_local: Tensor, channel_basis: Tensor, lmax: int
+) -> Tensor:
+    """Eager ground truth for :func:`radial_mix_block`.
+
+    Parameters
+    ----------
+    compact : Tensor
+        Projected radial degree kernel with shape ``(E, degree_kernel_size, R)``.
+    x_local : Tensor
+        Edge-local reduced features with shape ``(E, reduced_dim, C)``.
+    channel_basis : Tensor
+        Per-rank channel basis with shape ``(R, C)``.
+    lmax : int
+        Maximum spherical-harmonic degree.
+
+    Returns
+    -------
+    Tensor
+        Mixed features with shape ``(E, reduced_dim, C)``.
+    """
+    n_edge, reduced_dim, channels = x_local.shape
+    out = x_local.new_zeros(n_edge, reduced_dim, channels)
+    for coeff0, comp0, num_l in _block_layout(int(lmax)):
+        # K[e, o, i, r] = compact[e, comp0 + i * num_l + o, r]
+        block = compact[:, comp0 : comp0 + num_l * num_l, :].reshape(
+            n_edge, num_l, num_l, -1
+        )
+        block = block.permute(0, 2, 1, 3)  # (E, o, i, R)
+        x_block = x_local[:, coeff0 : coeff0 + num_l, :]  # (E, i, C)
+        inner = torch.einsum("eoir,eic->eocr", block, x_block)  # (E, o, C, R)
+        out[:, coeff0 : coeff0 + num_l, :] = torch.einsum(
+            "eocr,rc->eoc", inner, channel_basis
+        )
+    return out
+
+
+def _radial_mix_backward_reference(
+    grad_out: Tensor, compact: Tensor, x_local: Tensor, channel_basis: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    """Closed-form eager backward of :func:`radial_mix_reference`.
+
+    Gradients are evaluated analytically per diagonal block, mirroring the
+    contractions of the Triton backward. A closed form is required rather than a
+    nested ``autograd.grad``: this routine is the CPU backend of the
+    ``radial_mix_block_bwd`` operator, which carries no autograd formula and is
+    consequently dispatched under ``_AutoDispatchBelowAutograd`` whenever the
+    force graph is replayed without grad (the SeZM ``.pt2`` freeze does so under
+    :func:`torch.no_grad`). That guard excludes the autograd key, so a nested
+    ``autograd.grad`` would observe an output without a ``grad_fn``.
+
+    Parameters
+    ----------
+    grad_out : Tensor
+        Upstream gradient with shape ``(E, reduced_dim, C)``.
+    compact : Tensor
+        Projected radial degree kernel with shape ``(E, degree_kernel_size, R)``.
+    x_local : Tensor
+        Edge-local reduced features with shape ``(E, reduced_dim, C)``.
+    channel_basis : Tensor
+        Per-rank channel basis with shape ``(R, C)``.
+    lmax : int
+        Maximum spherical-harmonic degree.
+
+    Returns
+    -------
+    tuple[Tensor, Tensor]
+        Gradients ``(grad_compact, grad_x_local)``, matching ``compact`` and
+        ``x_local`` in shape respectively.
+    """
+    n_edge, reduced_dim, channels = x_local.shape
+    grad_x_local = torch.zeros_like(x_local)
+    grad_compact = torch.zeros_like(compact)
+    for coeff0, comp0, num_l in _block_layout(int(lmax)):
+        # Forward of this block (see ``radial_mix_reference``):
+        #   out[e, o, c] = sum_{i, r} K[e, o, i, r] * x[e, i, c] * cb[r, c]
+        #   with K[e, o, i, r] = compact[e, comp0 + i * num_l + o, r].
+        k_block = (
+            compact[:, comp0 : comp0 + num_l * num_l, :]
+            .reshape(n_edge, num_l, num_l, -1)
+            .permute(0, 2, 1, 3)
+        )  # (E, o, i, R)
+        x_block = x_local[:, coeff0 : coeff0 + num_l, :]  # (E, i, C)
+        g_block = grad_out[:, coeff0 : coeff0 + num_l, :]  # (E, o, C)
+
+        # grad_x[e, i, c] = sum_r cb[r, c] * sum_o K[e, o, i, r] * g[e, o, c].
+        gx = torch.einsum("eoir,eoc->eicr", k_block, g_block)  # (E, i, C, R)
+        grad_x_local[:, coeff0 : coeff0 + num_l, :] += torch.einsum(
+            "eicr,rc->eic", gx, channel_basis
+        )
+
+        # grad_K[e, o, i, r] = sum_c cb[r, c] * x[e, i, c] * g[e, o, c], scattered
+        # back to the compact slot comp0 + i * num_l + o. The shared m = +-1
+        # blocks address the same slots, so the in-place add accumulates both.
+        gk = torch.einsum("eoc,eic,rc->eoir", g_block, x_block, channel_basis)
+        grad_compact[:, comp0 : comp0 + num_l * num_l, :] += gk.permute(
+            0, 2, 1, 3
+        ).reshape(n_edge, num_l * num_l, -1)
+    return grad_compact, grad_x_local
+
+
+# ======================================================================
+# Triton kernels (mmax == 1; LMAX and RANK are constexpr; channels vectorized)
+# ======================================================================
+if RADIAL_MIX_TRITON_AVAILABLE:
+    # The per-edge work is tiny and memory-light, so only the warp count and
+    # pipeline depth are swept, keyed on the channel width.
+    _CONFIGS = [
+        triton.Config({}, num_warps=1, num_stages=1),
+        triton.Config({}, num_warps=2, num_stages=1),
+        triton.Config({}, num_warps=4, num_stages=1),
+        triton.Config({}, num_warps=2, num_stages=2),
+        triton.Config({}, num_warps=4, num_stages=2),
+    ]
+    _KEY = ["channels"]
+
+    @triton.jit
+    def _mix_fwd_block(
+        edge,
+        chan,
+        cmask,
+        x_ptr,
+        k_ptr,
+        cb_ptr,
+        out_ptr,
+        x_se,
+        x_sr,
+        x_sc,
+        k_se,
+        k_sk,
+        k_sr,
+        cb_sr,
+        cb_sc,
+        o_se,
+        o_sr,
+        o_sc,
+        COEFF0: tl.constexpr,
+        COMPACT0: tl.constexpr,
+        NUM_L: tl.constexpr,
+        RANK: tl.constexpr,
+    ):
+        """Contract one diagonal block: ``out[o] = sum_r cb[r] sum_i K_r[o,i] x[i]``."""
+        for o in tl.static_range(0, NUM_L):
+            acc = tl.zeros(chan.shape, dtype=tl.float32)
+            for r in tl.static_range(0, RANK):
+                partial = tl.zeros(chan.shape, dtype=tl.float32)
+                for i in tl.static_range(0, NUM_L):
+                    kval = tl.load(
+                        k_ptr
+                        + edge * k_se
+                        + (COMPACT0 + i * NUM_L + o) * k_sk
+                        + r * k_sr
+                    ).to(tl.float32)
+                    x_vec = tl.load(
+                        x_ptr + edge * x_se + (COEFF0 + i) * x_sr + chan * x_sc,
+                        mask=cmask,
+                        other=0.0,
+                    ).to(tl.float32)
+                    partial += kval * x_vec
+                cb_vec = tl.load(
+                    cb_ptr + r * cb_sr + chan * cb_sc, mask=cmask, other=0.0
+                ).to(tl.float32)
+                acc += partial * cb_vec
+            tl.store(
+                out_ptr + edge * o_se + (COEFF0 + o) * o_sr + chan * o_sc,
+                acc.to(out_ptr.dtype.element_ty),
+                mask=cmask,
+            )
+
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _radial_mix_fwd_kernel(
+        x_ptr,
+        k_ptr,
+        cb_ptr,
+        out_ptr,
+        n_edge,
+        channels,
+        x_se,
+        x_sr,
+        x_sc,
+        k_se,
+        k_sk,
+        k_sr,
+        cb_sr,
+        cb_sc,
+        o_se,
+        o_sr,
+        o_sc,
+        LMAX: tl.constexpr,
+        RANK: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+        num_l0: tl.constexpr = LMAX + 1
+        strides = (
+            x_se,
+            x_sr,
+            x_sc,
+            k_se,
+            k_sk,
+            k_sr,
+            cb_sr,
+            cb_sc,
+            o_se,
+            o_sr,
+            o_sc,
+        )
+        # m = 0 block, then the shared m = -1 and m = +1 blocks.
+        _mix_fwd_block(
+            edge,
+            chan,
+            cmask,
+            x_ptr,
+            k_ptr,
+            cb_ptr,
+            out_ptr,
+            *strides,
+            0,
+            0,
+            num_l0,
+            RANK,
+        )
+        _mix_fwd_block(
+            edge,
+            chan,
+            cmask,
+            x_ptr,
+            k_ptr,
+            cb_ptr,
+            out_ptr,
+            *strides,
+            num_l0,
+            num_l0 * num_l0,
+            LMAX,
+            RANK,
+        )
+        _mix_fwd_block(
+            edge,
+            chan,
+            cmask,
+            x_ptr,
+            k_ptr,
+            cb_ptr,
+            out_ptr,
+            *strides,
+            num_l0 + LMAX,
+            num_l0 * num_l0,
+            LMAX,
+            RANK,
+        )
+
+    @triton.jit
+    def _mix_bwd_grad_x_block(
+        edge,
+        chan,
+        cmask,
+        go_ptr,
+        k_ptr,
+        cb_ptr,
+        gx_ptr,
+        go_se,
+        go_sr,
+        go_sc,
+        k_se,
+        k_sk,
+        k_sr,
+        cb_sr,
+        cb_sc,
+        gx_se,
+        gx_sr,
+        gx_sc,
+        COEFF0: tl.constexpr,
+        COMPACT0: tl.constexpr,
+        NUM_L: tl.constexpr,
+        RANK: tl.constexpr,
+    ):
+        """Input gradient of one diagonal block.
+
+        Computes ``grad_x[i] = sum_r cb[r] sum_o K_r[o,i] grad_out[o]``. Each edge
+        owns its rows and the three blocks address disjoint coefficient rows, so
+        the result is written once with a plain store rather than an atomic add.
+        """
+        for i in tl.static_range(0, NUM_L):
+            grad_x = tl.zeros(chan.shape, dtype=tl.float32)
+            for r in tl.static_range(0, RANK):
+                cb_vec = tl.load(
+                    cb_ptr + r * cb_sr + chan * cb_sc, mask=cmask, other=0.0
+                ).to(tl.float32)
+                partial = tl.zeros(chan.shape, dtype=tl.float32)
+                for o in tl.static_range(0, NUM_L):
+                    kval = tl.load(
+                        k_ptr
+                        + edge * k_se
+                        + (COMPACT0 + i * NUM_L + o) * k_sk
+                        + r * k_sr
+                    ).to(tl.float32)
+                    go_vec = tl.load(
+                        go_ptr + edge * go_se + (COEFF0 + o) * go_sr + chan * go_sc,
+                        mask=cmask,
+                        other=0.0,
+                    ).to(tl.float32)
+                    partial += kval * go_vec
+                grad_x += cb_vec * partial
+            tl.store(
+                gx_ptr + edge * gx_se + (COEFF0 + i) * gx_sr + chan * gx_sc,
+                grad_x.to(gx_ptr.dtype.element_ty),
+                mask=cmask,
+            )
+
+    @triton.jit
+    def _mix_bwd_grad_k_block(
+        edge,
+        chan,
+        cmask,
+        go_ptr,
+        x_ptr,
+        cb_ptr,
+        gk_ptr,
+        go_se,
+        go_sr,
+        go_sc,
+        x_se,
+        x_sr,
+        x_sc,
+        cb_sr,
+        cb_sc,
+        gk_se,
+        gk_sk,
+        gk_sr,
+        COEFF0: tl.constexpr,
+        COEFF1: tl.constexpr,
+        COMPACT0: tl.constexpr,
+        NUM_L: tl.constexpr,
+        RANK: tl.constexpr,
+        SHARED: tl.constexpr,
+    ):
+        """Kernel gradient of one diagonal block.
+
+        Computes ``grad_K_r[o,i] = sum_c cb[r,c] x[i,c] grad_out[o,c]``. The
+        ``m = -1`` and ``m = +1`` blocks (``SHARED``) write the same ``compact``
+        slots; their contributions are summed in registers and stored once, which
+        removes the atomic add and the zero-initialization the original required.
+        """
+        for o in tl.static_range(0, NUM_L):
+            go_vec = tl.load(
+                go_ptr + edge * go_se + (COEFF0 + o) * go_sr + chan * go_sc,
+                mask=cmask,
+                other=0.0,
+            ).to(tl.float32)
+            if SHARED:
+                go_vec_sh = tl.load(
+                    go_ptr + edge * go_se + (COEFF1 + o) * go_sr + chan * go_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+            for i in tl.static_range(0, NUM_L):
+                x_vec = tl.load(
+                    x_ptr + edge * x_se + (COEFF0 + i) * x_sr + chan * x_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                prod = go_vec * x_vec
+                if SHARED:
+                    x_vec_sh = tl.load(
+                        x_ptr + edge * x_se + (COEFF1 + i) * x_sr + chan * x_sc,
+                        mask=cmask,
+                        other=0.0,
+                    ).to(tl.float32)
+                    prod += go_vec_sh * x_vec_sh
+                for r in tl.static_range(0, RANK):
+                    cb_vec = tl.load(
+                        cb_ptr + r * cb_sr + chan * cb_sc, mask=cmask, other=0.0
+                    ).to(tl.float32)
+                    grad_k = tl.sum(tl.where(cmask, prod * cb_vec, 0.0))
+                    tl.store(
+                        gk_ptr
+                        + edge * gk_se
+                        + (COMPACT0 + i * NUM_L + o) * gk_sk
+                        + r * gk_sr,
+                        grad_k.to(gk_ptr.dtype.element_ty),
+                    )
+
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _radial_mix_bwd_kernel(
+        go_ptr,
+        x_ptr,
+        k_ptr,
+        cb_ptr,
+        gx_ptr,
+        gk_ptr,
+        n_edge,
+        channels,
+        go_se,
+        go_sr,
+        go_sc,
+        x_se,
+        x_sr,
+        x_sc,
+        k_se,
+        k_sk,
+        k_sr,
+        cb_sr,
+        cb_sc,
+        gx_se,
+        gx_sr,
+        gx_sc,
+        gk_se,
+        gk_sk,
+        gk_sr,
+        LMAX: tl.constexpr,
+        RANK: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+        num_l0: tl.constexpr = LMAX + 1
+
+        # === Step 1. Input gradient: three disjoint coefficient blocks ===
+        grad_x_strides = (
+            go_se,
+            go_sr,
+            go_sc,
+            k_se,
+            k_sk,
+            k_sr,
+            cb_sr,
+            cb_sc,
+            gx_se,
+            gx_sr,
+            gx_sc,
+        )
+        _mix_bwd_grad_x_block(
+            edge,
+            chan,
+            cmask,
+            go_ptr,
+            k_ptr,
+            cb_ptr,
+            gx_ptr,
+            *grad_x_strides,
+            0,
+            0,
+            num_l0,
+            RANK,
+        )
+        _mix_bwd_grad_x_block(
+            edge,
+            chan,
+            cmask,
+            go_ptr,
+            k_ptr,
+            cb_ptr,
+            gx_ptr,
+            *grad_x_strides,
+            num_l0,
+            num_l0 * num_l0,
+            LMAX,
+            RANK,
+        )
+        _mix_bwd_grad_x_block(
+            edge,
+            chan,
+            cmask,
+            go_ptr,
+            k_ptr,
+            cb_ptr,
+            gx_ptr,
+            *grad_x_strides,
+            num_l0 + LMAX,
+            num_l0 * num_l0,
+            LMAX,
+            RANK,
+        )
+
+        # === Step 2. Kernel gradient: m=0 block, then summed m=+-1 blocks ===
+        grad_k_strides = (
+            go_se,
+            go_sr,
+            go_sc,
+            x_se,
+            x_sr,
+            x_sc,
+            cb_sr,
+            cb_sc,
+            gk_se,
+            gk_sk,
+            gk_sr,
+        )
+        _mix_bwd_grad_k_block(
+            edge,
+            chan,
+            cmask,
+            go_ptr,
+            x_ptr,
+            cb_ptr,
+            gk_ptr,
+            *grad_k_strides,
+            0,
+            0,
+            0,
+            num_l0,
+            RANK,
+            False,
+        )
+        _mix_bwd_grad_k_block(
+            edge,
+            chan,
+            cmask,
+            go_ptr,
+            x_ptr,
+            cb_ptr,
+            gk_ptr,
+            *grad_k_strides,
+            num_l0,
+            num_l0 + LMAX,
+            num_l0 * num_l0,
+            LMAX,
+            RANK,
+            True,
+        )
+
+
+# ======================================================================
+# Triton launch wrappers
+# ======================================================================
+def _tile_channels(channels: int) -> int:
+    """Smallest power-of-two channel tile of at least 16 covering ``channels``."""
+    tile = 16
+    while tile < int(channels):
+        tile *= 2
+    return tile
+
+
+def _has_no_edges(n_edge: int) -> bool:
+    """Return true for eager zero-edge calls without guarding symbolic edges."""
+    return type(n_edge) is int and n_edge == 0
+
+
+def _launch_forward(
+    x_local: Tensor, compact: Tensor, channel_basis: Tensor, lmax: int
+) -> Tensor:
+    n_edge, reduced_dim, channels = x_local.shape
+    rank = int(compact.shape[-1])
+    out = torch.empty_like(x_local)
+    if _has_no_edges(n_edge):
+        return out
+    wrap_triton(_radial_mix_fwd_kernel)[(n_edge,)](
+        x_local,
+        compact,
+        channel_basis,
+        out,
+        n_edge,
+        channels,
+        x_local.stride(0),
+        x_local.stride(1),
+        x_local.stride(2),
+        compact.stride(0),
+        compact.stride(1),
+        compact.stride(2),
+        channel_basis.stride(0),
+        channel_basis.stride(1),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        LMAX=int(lmax),
+        RANK=rank,
+        BLOCK_C=_tile_channels(channels),
+    )
+    return out
+
+
+def _launch_backward(
+    grad_out: Tensor,
+    x_local: Tensor,
+    compact: Tensor,
+    channel_basis: Tensor,
+    lmax: int,
+) -> tuple[Tensor, Tensor]:
+    n_edge, reduced_dim, channels = x_local.shape
+    rank = int(compact.shape[-1])
+    # Every output element is written exactly once (input rows are disjoint and
+    # the shared m=+-1 kernel slots are summed in-register), so no zero-init.
+    grad_x = torch.empty_like(x_local)
+    grad_compact = torch.empty_like(compact)
+    if _has_no_edges(n_edge):
+        return grad_compact, grad_x
+    wrap_triton(_radial_mix_bwd_kernel)[(n_edge,)](
+        grad_out.contiguous(),
+        x_local,
+        compact,
+        channel_basis,
+        grad_x,
+        grad_compact,
+        n_edge,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        x_local.stride(0),
+        x_local.stride(1),
+        x_local.stride(2),
+        compact.stride(0),
+        compact.stride(1),
+        compact.stride(2),
+        channel_basis.stride(0),
+        channel_basis.stride(1),
+        grad_x.stride(0),
+        grad_x.stride(1),
+        grad_x.stride(2),
+        grad_compact.stride(0),
+        grad_compact.stride(1),
+        grad_compact.stride(2),
+        LMAX=int(lmax),
+        RANK=rank,
+        BLOCK_C=_tile_channels(channels),
+    )
+    return grad_compact, grad_x
+
+
+# ======================================================================
+# Dispatch helpers (triton on CUDA float, eager otherwise)
+# ======================================================================
+def _use_triton(tensor: Tensor) -> bool:
+    return (
+        RADIAL_MIX_TRITON_AVAILABLE
+        and tensor.is_cuda
+        and tensor.dtype in (torch.float16, torch.bfloat16, torch.float32)
+    )
+
+
+def _forward_impl(
+    compact: Tensor, x_local: Tensor, channel_basis: Tensor, lmax: int
+) -> Tensor:
+    if not _use_triton(x_local):
+        return radial_mix_reference(compact, x_local, channel_basis, lmax)
+    return _launch_forward(
+        x_local.contiguous(),
+        compact.contiguous(),
+        channel_basis.contiguous(),
+        int(lmax),
+    )
+
+
+def _backward_impl(
+    grad_out: Tensor,
+    compact: Tensor,
+    x_local: Tensor,
+    channel_basis: Tensor,
+    lmax: int,
+) -> tuple[Tensor, Tensor]:
+    if not _use_triton(x_local):
+        return _radial_mix_backward_reference(
+            grad_out, compact, x_local, channel_basis, lmax
+        )
+    return _launch_backward(
+        grad_out,
+        x_local.contiguous(),
+        compact.contiguous(),
+        channel_basis.contiguous(),
+        int(lmax),
+    )
+
+
+# ======================================================================
+# Functional triton_op + fake + autograd registration
+# ======================================================================
+# ``triton_op`` (not ``custom_op``) so Inductor bakes the Triton cubin into the
+# AOTInductor ``.pt2``; the LAMMPS C++ runtime then needs no Python registration.
+_radial_mix_op = torch.library.triton_op(
+    "dpa4_triton::radial_mix_block", mutates_args=()
+)(_forward_impl)
+
+_radial_mix_bwd_op = torch.library.triton_op(
+    "dpa4_triton::radial_mix_block_bwd", mutates_args=()
+)(_backward_impl)
+
+
+@_radial_mix_op.register_fake
+def _(compact, x_local, channel_basis, lmax):
+    return torch.empty_like(x_local)
+
+
+@_radial_mix_bwd_op.register_fake
+def _(grad_out, compact, x_local, channel_basis, lmax):
+    return torch.empty_like(compact), torch.empty_like(x_local)
+
+
+def _radial_mix_setup_context(ctx, inputs, output):
+    compact, x_local, channel_basis, lmax = inputs
+    ctx.save_for_backward(compact, x_local, channel_basis)
+    ctx.lmax = lmax
+
+
+def _radial_mix_backward(ctx, grad_out):
+    compact, x_local, channel_basis = ctx.saved_tensors
+    grad_compact, grad_x = _radial_mix_bwd_op(
+        grad_out, compact, x_local, channel_basis, ctx.lmax
+    )
+    # ``channel_basis`` is a parameter; the inference force differentiates only
+    # w.r.t. coordinates, so its gradient is intentionally not produced.
+    return grad_compact, grad_x, None, None
+
+
+_radial_mix_op.register_autograd(
+    _radial_mix_backward, setup_context=_radial_mix_setup_context
+)
+
+
+# ======================================================================
+# Public API
+# ======================================================================
+def radial_mix_block(
+    compact: Tensor, x_local: Tensor, channel_basis: Tensor, lmax: int
+) -> Tensor:
+    """Apply the block-diagonal dynamic radial degree mixer (``mmax == 1``).
+
+    Computes the same operation as :func:`radial_mix_reference` while avoiding
+    the dense scattered kernel and the tiny batched matmul on CUDA.
+
+    Parameters
+    ----------
+    compact : Tensor
+        Projected radial degree kernel with shape ``(E, degree_kernel_size, R)``.
+    x_local : Tensor
+        Edge-local reduced features with shape ``(E, reduced_dim, C)``.
+    channel_basis : Tensor
+        Per-rank channel basis with shape ``(R, C)``.
+    lmax : int
+        Maximum spherical-harmonic degree.
+
+    Returns
+    -------
+    Tensor
+        Mixed features with shape ``(E, reduced_dim, C)``.
+    """
+    return _radial_mix_op(compact, x_local, channel_basis, int(lmax))
diff --git a/deepmd/pt_expt/descriptor/dpa4_nn/triton/so2_rotation.py b/deepmd/pt_expt/descriptor/dpa4_nn/triton/so2_rotation.py
new file mode 100644
index 0000000000..e34e300259
--- /dev/null
+++ b/deepmd/pt_expt/descriptor/dpa4_nn/triton/so2_rotation.py
@@ -0,0 +1,2003 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+# pyright: reportMissingImports=false
+# ruff: noqa: ANN001, ANN202
+"""Fused Triton SO(2)/Wigner rotation operators for the SeZM/DPA4 descriptor.
+
+This module provides a *clean room* Triton implementation of the two rotation
+hot paths used by the SeZM SO(2) convolution:
+
+``rotate_to_local`` (global -> edge-local reduced frame)
+    For every edge ``e`` with source node ``src[e]``::
+
+        out[e] = Wrows[e] @ x[src[e]]  # (Dm, C)
+        Wrows[e][m, k] = wigner[e, coeff_index[m], k]  # (Dm, D), k < D
+
+    i.e. the eager reference ``bmm(D_to_m, x[src])`` where
+    ``D_to_m = wigner[:, :D, :D].index_select(1, coeff_index)``.
+
+``rotate_back`` (edge-local reduced frame -> global)
+    For every edge ``e``::
+
+        out[e] = Wcols[e] @ x_local[e]  # (D, C)
+        Wcols[e][d, m] = wigner[e, d, coeff_index[m]]  # (D, Dm), d < D
+
+    i.e. the eager reference ``bmm(Dt_from_m, x_local)`` where
+    ``Dt_from_m = wigner[:, :D, :D].index_select(2, coeff_index)``.
+
+Design goals
+------------
+1. **Fuse the gathers into the GEMM.** The eager / ``torch.compile`` path first
+   materializes ``D_to_m`` (or ``Dt_from_m``), shape ``(E, Dm, D)``, *and*
+   ``x[src]``, shape ``(E, D, C)``, before calling ``bmm``. For lmax 10 with
+   E=100k that is ~9 GB of scratch that is written and immediately re-read.
+   We instead gather the Wigner rows/columns (by ``coeff_index``) and the node
+   features (by ``src``) *inside* the kernel, so neither scratch tensor is ever
+   created. Each edge is one tiny GEMM; this also sidesteps the well-known
+   inefficiency of cuBLAS strided-batched GEMM on very small matrices.
+
+2. **Match eager FP32 accuracy.** Every ``tl.dot`` uses
+   ``input_precision="ieee"`` so the contraction runs in true IEEE FP32 (no
+   TF32). This keeps the potential-energy surface smooth.
+
+3. **Compose with SeZM's ``make_fx`` lowering *and* the AOTInductor freeze.**
+   The operators are functional ``torch.library.triton_op`` instances
+   (``mutates_args=()``) with registered fake kernels and autograd formulas; the
+   backward is itself a ``triton_op``, so ``make_fx(tracing_mode="symbolic")``
+   can capture the energy path together with the force autograd graph used by
+   inference. Unlike ``torch.library.custom_op`` (opaque to the compiler, hence
+   emitted as a *runtime dispatcher* call that the C++ ``.pt2`` runtime cannot
+   resolve), a ``triton_op`` wraps its kernel launch in ``wrap_triton`` so
+   Inductor sees through to the Triton kernel and **bakes the cubin into the
+   AOTInductor package**. That is what lets the frozen ``.pt2`` run the Triton
+   path inside the LAMMPS C++ runtime (``DeepPotPTExpt`` /
+   ``AOTIModelPackageLoader``), with no Python op registration available. The
+   ``_use_triton`` device/dtype branch below stays a plain Python ``if``: the
+   op is opaque under ``make_fx`` (CPU trace), and Inductor resolves the branch
+   at compile time on the post-``move_to_device`` CUDA tensors, so CUDA fp32
+   targets bake the Triton kernel while CPU / fp64 targets bake the eager
+   reference.
+
+Shapes / dtypes
+---------------
+``x``/``x_local`` and ``wigner`` are float tensors; fp32 is the supported
+precision for the smooth potential-energy surface, while fp16/bf16 inputs
+accumulate in fp32. ``src`` and ``coeff_index`` are int64 tensors. ``E`` (edges)
+may exceed 2**31 elements once multiplied by the per-edge matrix size, so all
+kernels use int64 addressing.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import torch
+from torch import (
+    Tensor,
+)
+from torch.library import (
+    wrap_triton,
+)
+
+from deepmd.dpmodel.descriptor.dpa4_nn.indexing import (
+    build_m_major_index as _build_m_major_index_np,
+)
+
+__all__ = [
+    "TRITON_ROTATION_AVAILABLE",
+]
+
+
+def build_m_major_index(lmax: int, mmax: int, device: torch.device) -> Tensor:
+    """Torch m-major reduced coefficient index on ``device``.
+
+    The dpmodel index builder is numpy-only; the Triton eager-fallback paths
+    need it as an int64 tensor on the working device.
+    """
+    return torch.as_tensor(_build_m_major_index_np(int(lmax), int(mmax)), device=device)
+
+
+try:
+    import triton
+    import triton.language as tl
+
+    TRITON_ROTATION_AVAILABLE = True
+except ImportError:  # pragma: no cover - exercised only without triton
+    TRITON_ROTATION_AVAILABLE = False
+
+
+# ======================================================================
+# Eager reference / fallback implementations
+# ======================================================================
+def rotate_to_local_reference(
+    x: Tensor,
+    src: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> Tensor:
+    """Eager ground-truth for ``rotate_to_local`` (``bmm(D_to_m, x[src])``)."""
+    d_to_m = wigner[:, :dim_full, :dim_full].index_select(1, coeff_index)
+    return torch.bmm(d_to_m, x.index_select(0, src))
+
+
+def rotate_back_reference(
+    x_local: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> Tensor:
+    """Eager ground-truth for ``rotate_back`` (``bmm(Dt_from_m, x_local)``)."""
+    dt_from_m = wigner[:, :dim_full, :dim_full].index_select(2, coeff_index)
+    return torch.bmm(dt_from_m, x_local)
+
+
+def _rotate_to_local_bwd_eager(
+    grad_out: Tensor,
+    x: Tensor,
+    src: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> tuple[Tensor, Tensor]:
+    """Eager backward of ``rotate_to_local`` returning ``(grad_x, grad_wigner)``."""
+    w_rows = wigner[:, :dim_full, :dim_full].index_select(1, coeff_index)  # (E,Dm,D)
+    x_src = x.index_select(0, src)  # (E,D,C)
+    grad_x_src = torch.bmm(w_rows.transpose(1, 2), grad_out)  # (E,D,C)
+    grad_x = torch.zeros_like(x).index_add_(0, src, grad_x_src)
+    grad_rows = torch.bmm(grad_out, x_src.transpose(1, 2))  # (E,Dm,D)
+    grad_block = torch.zeros(
+        grad_out.shape[0], dim_full, dim_full, dtype=wigner.dtype, device=wigner.device
+    )
+    grad_block.index_copy_(1, coeff_index, grad_rows)
+    grad_wigner = torch.zeros_like(wigner)
+    grad_wigner[:, :dim_full, :dim_full] = grad_block
+    return grad_x, grad_wigner
+
+
+def _rotate_back_bwd_eager(
+    grad_out: Tensor,
+    x_local: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> tuple[Tensor, Tensor]:
+    """Eager backward of ``rotate_back`` returning ``(grad_x_local, grad_wigner)``."""
+    w_cols = wigner[:, :dim_full, :dim_full].index_select(2, coeff_index)  # (E,D,Dm)
+    grad_x_local = torch.bmm(w_cols.transpose(1, 2), grad_out)  # (E,Dm,C)
+    grad_cols = torch.bmm(grad_out, x_local.transpose(1, 2))  # (E,D,Dm)
+    grad_block = torch.zeros(
+        grad_out.shape[0], dim_full, dim_full, dtype=wigner.dtype, device=wigner.device
+    )
+    grad_block.index_copy_(2, coeff_index, grad_cols)
+    grad_wigner = torch.zeros_like(wigner)
+    grad_wigner[:, :dim_full, :dim_full] = grad_block
+    return grad_x_local, grad_wigner
+
+
+# ======================================================================
+# Tile-size helpers and autotuning configs
+# ======================================================================
+def _tile_dim(value: int) -> int:
+    """Pick a single-tile edge: the next power of two, at least 16.
+
+    Tiles spanning a whole dimension (the non-tiled ``N`` axis and the static
+    ``BLOCK_N``) must be a power of two (``tl.arange``) *and* a multiple of 16
+    (``tl.dot``); powers of two ``>= 16`` satisfy both. Packed dims map as
+    ``16 -> 16`` (lmax 3), ``36 -> 64`` (lmax 5), ``64 -> 64`` (lmax 7),
+    ``121 -> 128`` (lmax 10), ``C=64 -> 64``.
+    """
+    tile = 16
+    target = max(int(value), 1)
+    while tile < target:
+        tile *= 2
+    return tile
+
+
+def _autotune_configs() -> list:
+    """A small curated set of (BLOCK_M, BLOCK_K, num_warps, num_stages) configs.
+
+    The per-edge GEMMs are tiny (M, K, N <= 128). We tile the output-row axis
+    ``M`` across the grid and stream the contraction axis ``K`` in a pipelined
+    loop, so the dominant Wigner load overlaps with the matmul. Autotuning over
+    a handful of shapes lets one source kernel serve lmax 3..10 well (small
+    tiles for lmax 3, larger tiles / more warps for lmax 10).
+    """
+    return [
+        # Tiny tiles: best for lmax 3 (D=16), where a single 16x16 row tile and a
+        # one-shot K step behave like a per-edge matvec with minimal overhead.
+        triton.Config({"BLOCK_M": 16, "BLOCK_K": 16}, num_warps=1, num_stages=2),
+        triton.Config({"BLOCK_M": 16, "BLOCK_K": 16}, num_warps=2, num_stages=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_K": 16}, num_warps=2, num_stages=2),
+        triton.Config({"BLOCK_M": 16, "BLOCK_K": 64}, num_warps=2, num_stages=2),
+        triton.Config({"BLOCK_M": 32, "BLOCK_K": 32}, num_warps=4, num_stages=3),
+        triton.Config({"BLOCK_M": 32, "BLOCK_K": 64}, num_warps=4, num_stages=3),
+        triton.Config({"BLOCK_M": 64, "BLOCK_K": 32}, num_warps=4, num_stages=3),
+        triton.Config({"BLOCK_M": 64, "BLOCK_K": 64}, num_warps=4, num_stages=4),
+        triton.Config({"BLOCK_M": 64, "BLOCK_K": 64}, num_warps=8, num_stages=3),
+        triton.Config({"BLOCK_M": 128, "BLOCK_K": 32}, num_warps=8, num_stages=3),
+    ]
+
+
+if TRITON_ROTATION_AVAILABLE:
+    _CONFIGS = _autotune_configs()
+    _KEY = ["dim_full", "reduced_dim", "channels"]
+
+    # Block-diagonal kernels are fully unrolled over l (LMAX constexpr) and over
+    # each l-block, with channels vectorized -- there is no GEMM tile to tune, so
+    # we only sweep the warp count / pipeline depth, keyed on the channel width.
+    _BD_CONFIGS = [
+        triton.Config({}, num_warps=1, num_stages=1),
+        triton.Config({}, num_warps=2, num_stages=1),
+        triton.Config({}, num_warps=4, num_stages=1),
+        triton.Config({}, num_warps=2, num_stages=2),
+        triton.Config({}, num_warps=4, num_stages=2),
+    ]
+    _BD_KEY = ["channels"]
+
+    # ==================================================================
+    # Triton kernels
+    #
+    # Every kernel is one fused-gather GEMM ``C_out = A @ B`` with:
+    #   * grid = (edge, ceil(M / BLOCK_M)) -- one program per (edge, row-tile),
+    #   * a pipelined K-loop streaming BLOCK_K of the contraction at a time,
+    #   * the Wigner row/column gather (by ``coeff_index``) and the node-feature
+    #     gather (by ``src``) folded into the pointer arithmetic, so neither
+    #     ``D_to_m``/``Dt_from_m`` nor ``x[src]`` is ever materialized.
+    # All stores overwrite their tile (idempotent), which keeps autotuning safe.
+    # ==================================================================
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _to_local_fwd_kernel(
+        x_ptr,
+        src_ptr,
+        w_ptr,
+        idx_ptr,
+        out_ptr,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        x_sn,
+        x_sd,
+        x_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        o_se,
+        o_sr,
+        o_sc,
+        BLOCK_M: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        """``out[e,m,c] = sum_k W[e, coeff[m], k] * x[src[e], k, c]`` (M=Dm,K=D,N=C)."""
+        edge = tl.program_id(0).to(tl.int64)
+        row = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)  # over Dm
+        chan = tl.arange(0, BLOCK_N)  # over C
+        row_mask = row < reduced_dim
+        chan_mask = chan < channels
+
+        src_idx = tl.load(src_ptr + edge).to(tl.int64)
+        coeff_rows = tl.load(idx_ptr + row, mask=row_mask, other=0).to(tl.int64)
+
+        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+        for k0 in range(0, tl.cdiv(dim_full, BLOCK_K)):
+            kk = k0 * BLOCK_K + tl.arange(0, BLOCK_K)  # over D
+            k_mask = kk < dim_full
+            w_tile = tl.load(
+                w_ptr + edge * w_se + coeff_rows[:, None] * w_sr + kk[None, :] * w_sk,
+                mask=row_mask[:, None] & k_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_M, BLOCK_K) = W[coeff[m], k]
+            x_tile = tl.load(
+                x_ptr + src_idx * x_sn + kk[:, None] * x_sd + chan[None, :] * x_sc,
+                mask=k_mask[:, None] & chan_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_K, BLOCK_N) = x[src, k, c]
+            acc = tl.dot(w_tile.to(x_tile.dtype), x_tile, acc, input_precision="ieee")
+
+        tl.store(
+            out_ptr + edge * o_se + row[:, None] * o_sr + chan[None, :] * o_sc,
+            acc.to(out_ptr.dtype.element_ty),
+            mask=row_mask[:, None] & chan_mask[None, :],
+        )
+
+    @triton.autotune(configs=_CONFIGS, key=_KEY, reset_to_zero=["gx_ptr"])
+    @triton.jit
+    def _to_local_bwd_dx_kernel(
+        go_ptr,
+        src_ptr,
+        w_ptr,
+        idx_ptr,
+        gx_ptr,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        go_se,
+        go_sr,
+        go_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        gx_sn,
+        gx_sd,
+        gx_sc,
+        BLOCK_M: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        """``grad_x[src[e],d,c] += sum_m W[e, coeff[m], d] * grad_out[e,m,c]``.
+
+        (M=D, K=Dm, N=C). The per-edge source gradient is atomically scattered
+        straight into the zero-initialized ``grad_x`` (no ``x[src]``-sized
+        scratch). ``reset_to_zero`` keeps the autotuner's trial runs from
+        polluting the accumulator.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        drow = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)  # over D
+        chan = tl.arange(0, BLOCK_N)  # over C
+        d_mask = drow < dim_full
+        chan_mask = chan < channels
+
+        src_idx = tl.load(src_ptr + edge).to(tl.int64)
+
+        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+        for k0 in range(0, tl.cdiv(reduced_dim, BLOCK_K)):
+            mm = k0 * BLOCK_K + tl.arange(0, BLOCK_K)  # over Dm
+            m_mask = mm < reduced_dim
+            coeff = tl.load(idx_ptr + mm, mask=m_mask, other=0).to(tl.int64)
+            w_tile = tl.load(
+                w_ptr + edge * w_se + coeff[None, :] * w_sr + drow[:, None] * w_sk,
+                mask=d_mask[:, None] & m_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_M(d), BLOCK_K(m)) = W[coeff[m], d]
+            go_tile = tl.load(
+                go_ptr + edge * go_se + mm[:, None] * go_sr + chan[None, :] * go_sc,
+                mask=m_mask[:, None] & chan_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_K(m), BLOCK_N(c))
+            acc = tl.dot(w_tile.to(go_tile.dtype), go_tile, acc, input_precision="ieee")
+
+        tl.atomic_add(
+            gx_ptr + src_idx * gx_sn + drow[:, None] * gx_sd + chan[None, :] * gx_sc,
+            acc,
+            mask=d_mask[:, None] & chan_mask[None, :],
+        )
+
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _to_local_bwd_dw_kernel(
+        go_ptr,
+        x_ptr,
+        src_ptr,
+        idx_ptr,
+        gw_ptr,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        go_se,
+        go_sr,
+        go_sc,
+        x_sn,
+        x_sd,
+        x_sc,
+        gw_se,
+        gw_sr,
+        gw_sk,
+        BLOCK_M: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        """``grad_W[e, coeff[m], d] = sum_c grad_out[e,m,c] * x[src[e], d, c]``.
+
+        (M=Dm, K=C, N=D). Writes directly into rows ``coeff_index`` of the
+        zero-initialized ``grad_wigner``.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        mrow = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)  # over Dm
+        dcol = tl.arange(0, BLOCK_N)  # over D
+        m_mask = mrow < reduced_dim
+        d_mask = dcol < dim_full
+
+        coeff = tl.load(idx_ptr + mrow, mask=m_mask, other=0).to(tl.int64)
+        src_idx = tl.load(src_ptr + edge).to(tl.int64)
+
+        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+        for k0 in range(0, tl.cdiv(channels, BLOCK_K)):
+            cc = k0 * BLOCK_K + tl.arange(0, BLOCK_K)  # over C
+            c_mask = cc < channels
+            go_tile = tl.load(
+                go_ptr + edge * go_se + mrow[:, None] * go_sr + cc[None, :] * go_sc,
+                mask=m_mask[:, None] & c_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_M(m), BLOCK_K(c))
+            x_tile = tl.load(
+                x_ptr + src_idx * x_sn + dcol[None, :] * x_sd + cc[:, None] * x_sc,
+                mask=c_mask[:, None] & d_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_K(c), BLOCK_N(d)) = x[src, d, c]
+            acc = tl.dot(go_tile.to(x_tile.dtype), x_tile, acc, input_precision="ieee")
+
+        tl.store(
+            gw_ptr + edge * gw_se + coeff[:, None] * gw_sr + dcol[None, :] * gw_sk,
+            acc.to(gw_ptr.dtype.element_ty),
+            mask=m_mask[:, None] & d_mask[None, :],
+        )
+
+    # ``rotate_back`` reads the Wigner *columns* selected by ``coeff_index``.
+    # Gathering columns of a row-major ``(E, D, D)`` tensor is uncoalesced, so
+    # instead we read *dense* Wigner rows (coalesced last axis) and gather /
+    # scatter the small ``x_local`` through the inverse permutation
+    # ``inv[k] = m  if coeff[m]==k else -1``. For ``mmax==lmax`` (a full
+    # permutation) this is the same flop count with far better memory behaviour.
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _back_fwd_kernel(
+        xl_ptr,
+        w_ptr,
+        inv_ptr,
+        out_ptr,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        xl_se,
+        xl_sr,
+        xl_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        o_se,
+        o_sd,
+        o_sc,
+        BLOCK_M: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        """``out[e,d,c] = sum_k W[e,d,k] * x_local[e, inv[k], c]`` (M=D, K=D, N=C)."""
+        edge = tl.program_id(0).to(tl.int64)
+        drow = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)  # over D
+        chan = tl.arange(0, BLOCK_N)  # over C
+        d_mask = drow < dim_full
+        chan_mask = chan < channels
+
+        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+        for k0 in range(0, tl.cdiv(dim_full, BLOCK_K)):
+            kk = k0 * BLOCK_K + tl.arange(0, BLOCK_K)  # over D (contraction)
+            k_mask = kk < dim_full
+            inv_k = tl.load(inv_ptr + kk, mask=k_mask, other=-1).to(tl.int64)
+            keep = inv_k >= 0
+            w_tile = tl.load(
+                w_ptr + edge * w_se + drow[:, None] * w_sr + kk[None, :] * w_sk,
+                mask=d_mask[:, None] & k_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_M(d), BLOCK_K(k)) = W[d, k]  (k contiguous -> coalesced)
+            xl_tile = tl.load(
+                xl_ptr + edge * xl_se + inv_k[:, None] * xl_sr + chan[None, :] * xl_sc,
+                mask=keep[:, None] & chan_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_K(k), BLOCK_N(c)) = x_local[inv[k], c]
+            acc = tl.dot(w_tile.to(xl_tile.dtype), xl_tile, acc, input_precision="ieee")
+
+        tl.store(
+            out_ptr + edge * o_se + drow[:, None] * o_sd + chan[None, :] * o_sc,
+            acc.to(out_ptr.dtype.element_ty),
+            mask=d_mask[:, None] & chan_mask[None, :],
+        )
+
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _back_bwd_dx_kernel(
+        go_ptr,
+        w_ptr,
+        inv_ptr,
+        gxl_ptr,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        go_se,
+        go_sd,
+        go_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        gxl_se,
+        gxl_sr,
+        gxl_sc,
+        BLOCK_M: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        """``grad_x_local[e, inv[k], c] = sum_d W[e,d,k] * grad_out[e,d,c]``.
+
+        (M=D, K=D, N=C). Computes the dense ``k``-indexed gradient with coalesced
+        Wigner reads, then scatters each full row ``k`` into reduced row
+        ``inv[k]`` of ``grad_x_local``.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        krow = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)  # over D
+        chan = tl.arange(0, BLOCK_N)  # over C
+        k_mask = krow < dim_full
+        chan_mask = chan < channels
+
+        inv_k = tl.load(inv_ptr + krow, mask=k_mask, other=-1).to(tl.int64)
+        keep = inv_k >= 0
+
+        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+        for k0 in range(0, tl.cdiv(dim_full, BLOCK_K)):
+            dd = k0 * BLOCK_K + tl.arange(0, BLOCK_K)  # over D (contraction)
+            d_mask = dd < dim_full
+            w_tile = tl.load(
+                w_ptr + edge * w_se + dd[None, :] * w_sr + krow[:, None] * w_sk,
+                mask=k_mask[:, None] & d_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_M(k), BLOCK_K(d)) = W[d, k]  (k contiguous -> coalesced)
+            go_tile = tl.load(
+                go_ptr + edge * go_se + dd[:, None] * go_sd + chan[None, :] * go_sc,
+                mask=d_mask[:, None] & chan_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_K(d), BLOCK_N(c))
+            acc = tl.dot(w_tile.to(go_tile.dtype), go_tile, acc, input_precision="ieee")
+
+        tl.store(
+            gxl_ptr + edge * gxl_se + inv_k[:, None] * gxl_sr + chan[None, :] * gxl_sc,
+            acc.to(gxl_ptr.dtype.element_ty),
+            mask=keep[:, None] & chan_mask[None, :],
+        )
+
+    @triton.autotune(configs=_CONFIGS, key=_KEY)
+    @triton.jit
+    def _back_bwd_dw_kernel(
+        go_ptr,
+        xl_ptr,
+        inv_ptr,
+        gw_ptr,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        go_se,
+        go_sd,
+        go_sc,
+        xl_se,
+        xl_sr,
+        xl_sc,
+        gw_se,
+        gw_sr,
+        gw_sk,
+        BLOCK_M: tl.constexpr,
+        BLOCK_K: tl.constexpr,
+        BLOCK_N: tl.constexpr,
+    ):
+        """``grad_W[e,d,k] = sum_c grad_out[e,d,c] * x_local[e, inv[k], c]``.
+
+        (M=D, K=C, N=D). Writes the dense ``(D, D)`` block of ``grad_wigner``
+        with a coalesced last axis; columns ``k`` not selected by ``coeff_index``
+        receive zero (``inv[k] < 0``), matching the eager column gather.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        drow = tl.program_id(1) * BLOCK_M + tl.arange(0, BLOCK_M)  # over D
+        kcol = tl.arange(0, BLOCK_N)  # over D
+        d_mask = drow < dim_full
+        k_mask = kcol < dim_full
+
+        inv_k = tl.load(inv_ptr + kcol, mask=k_mask, other=-1).to(tl.int64)
+        keep = inv_k >= 0
+
+        acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+        for k0 in range(0, tl.cdiv(channels, BLOCK_K)):
+            cc = k0 * BLOCK_K + tl.arange(0, BLOCK_K)  # over C (contraction)
+            c_mask = cc < channels
+            go_tile = tl.load(
+                go_ptr + edge * go_se + drow[:, None] * go_sd + cc[None, :] * go_sc,
+                mask=d_mask[:, None] & c_mask[None, :],
+                other=0.0,
+            )  # (BLOCK_M(d), BLOCK_K(c))
+            xl_tile = tl.load(
+                xl_ptr + edge * xl_se + inv_k[None, :] * xl_sr + cc[:, None] * xl_sc,
+                mask=c_mask[:, None] & keep[None, :],
+                other=0.0,
+            )  # (BLOCK_K(c), BLOCK_N(k)) = x_local[inv[k], c]
+            acc = tl.dot(
+                go_tile.to(xl_tile.dtype), xl_tile, acc, input_precision="ieee"
+            )
+
+        tl.store(
+            gw_ptr + edge * gw_se + drow[:, None] * gw_sr + kcol[None, :] * gw_sk,
+            acc.to(gw_ptr.dtype.element_ty),
+            mask=d_mask[:, None] & k_mask[None, :],
+        )
+
+    # ==================================================================
+    # Block-diagonal kernels (mmax == 1, block-diagonal Wigner-D)
+    #
+    # The Wigner-D matrix is block-diagonal by degree ``l``: block ``l`` is the
+    # ``(2l+1) x (2l+1)`` sub-matrix on rows/cols ``[l^2 : (l+1)^2]`` and every
+    # off-(l-block) entry is exactly 0. With ``mmax == 1`` the reduced layout
+    # keeps, per degree ``l``, the orders ``m in {0}`` (l == 0) or
+    # ``{0, -1, +1}`` (l >= 1). Output coefficient ``(l, m)`` therefore contracts
+    # ONLY over the ``2l+1`` inputs of block ``l`` -- never the full ``D``.
+    #
+    # The m-major reduced index and the packed Wigner row/col are pure functions
+    # of ``(l, m, LMAX)``::
+    #
+    #     reduced index:  m=0 -> l,  m=-1 -> LMAX+l,  m=+1 -> 2*LMAX+l
+    #     packed (l, m):  l^2 + l + m   (so m=0 -> l^2+l, m=-1 -> -1, m=+1 -> +1)
+    #
+    # so the kernels need no ``coeff_index`` tensor: with ``LMAX`` a constexpr we
+    # fully unroll over ``l`` and over each block, contracting exactly the
+    # structural non-zeros (no padding, no wasted FLOPs). Channels are the
+    # vectorized axis (``BLOCK_C`` spans the full width ``C``), so the backward
+    # Wigner gradient is a single in-program ``tl.sum`` over channels.
+    @triton.autotune(configs=_BD_CONFIGS, key=_BD_KEY)
+    @triton.jit
+    def _bd_to_local_fwd_kernel(
+        x_ptr,
+        src_ptr,
+        w_ptr,
+        out_ptr,
+        n_edge,
+        channels,
+        x_sn,
+        x_sd,
+        x_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        o_se,
+        o_sr,
+        o_sc,
+        LMAX: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        """``out[e,(l,m),c] = sum_{j} W[e, l^2+l+m, l^2+j] * x[src[e], l^2+j, c]``."""
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+        src_idx = tl.load(src_ptr + edge).to(tl.int64)
+
+        for l in tl.static_range(0, LMAX + 1):
+            base = l * l
+            r0 = base + l  # packed row of order m=0
+            acc0 = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            acc_m = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            acc_p = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            for j in tl.static_range(0, 2 * l + 1):
+                col = base + j
+                x_vec = tl.load(
+                    x_ptr + src_idx * x_sn + col * x_sd + chan * x_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                acc0 += tl.load(w_ptr + edge * w_se + r0 * w_sr + col * w_sk) * x_vec
+                if l >= 1:
+                    acc_m += (
+                        tl.load(w_ptr + edge * w_se + (r0 - 1) * w_sr + col * w_sk)
+                        * x_vec
+                    )
+                    acc_p += (
+                        tl.load(w_ptr + edge * w_se + (r0 + 1) * w_sr + col * w_sk)
+                        * x_vec
+                    )
+            tl.store(
+                out_ptr + edge * o_se + l * o_sr + chan * o_sc,
+                acc0.to(out_ptr.dtype.element_ty),
+                mask=cmask,
+            )
+            if l >= 1:
+                tl.store(
+                    out_ptr + edge * o_se + (LMAX + l) * o_sr + chan * o_sc,
+                    acc_m.to(out_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+                tl.store(
+                    out_ptr + edge * o_se + (2 * LMAX + l) * o_sr + chan * o_sc,
+                    acc_p.to(out_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+
+    @triton.autotune(configs=_BD_CONFIGS, key=_BD_KEY, reset_to_zero=["gx_ptr"])
+    @triton.jit
+    def _bd_to_local_bwd_kernel(
+        go_ptr,
+        x_ptr,
+        src_ptr,
+        w_ptr,
+        gx_ptr,
+        gw_ptr,
+        n_edge,
+        channels,
+        go_se,
+        go_sr,
+        go_sc,
+        x_sn,
+        x_sd,
+        x_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        gx_sn,
+        gx_sd,
+        gx_sc,
+        gw_se,
+        gw_sr,
+        gw_sk,
+        LMAX: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        """Fused block-diagonal backward of ``rotate_to_local``.
+
+        Per edge (full channel width in one program): scatters
+        ``grad_x[src, l^2+j, :] += sum_m W[l^2+l+m, l^2+j] * grad_out[(l,m), :]``
+        and writes ``grad_W[l^2+l+m, l^2+j] = sum_c grad_out[(l,m),c] * x[l^2+j,c]``
+        for the structural non-zeros only.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+        src_idx = tl.load(src_ptr + edge).to(tl.int64)
+
+        for l in tl.static_range(0, LMAX + 1):
+            base = l * l
+            r0 = base + l
+            go0 = tl.load(
+                go_ptr + edge * go_se + l * go_sr + chan * go_sc,
+                mask=cmask,
+                other=0.0,
+            ).to(tl.float32)
+            if l >= 1:
+                go_m = tl.load(
+                    go_ptr + edge * go_se + (LMAX + l) * go_sr + chan * go_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                go_p = tl.load(
+                    go_ptr + edge * go_se + (2 * LMAX + l) * go_sr + chan * go_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+            for j in tl.static_range(0, 2 * l + 1):
+                col = base + j
+                x_vec = tl.load(
+                    x_ptr + src_idx * x_sn + col * x_sd + chan * x_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                w0 = tl.load(w_ptr + edge * w_se + r0 * w_sr + col * w_sk)
+                gx_row = w0 * go0
+                tl.store(
+                    gw_ptr + edge * gw_se + r0 * gw_sr + col * gw_sk,
+                    tl.sum(go0 * x_vec).to(gw_ptr.dtype.element_ty),
+                )
+                if l >= 1:
+                    wm = tl.load(w_ptr + edge * w_se + (r0 - 1) * w_sr + col * w_sk)
+                    wp = tl.load(w_ptr + edge * w_se + (r0 + 1) * w_sr + col * w_sk)
+                    gx_row += wm * go_m + wp * go_p
+                    tl.store(
+                        gw_ptr + edge * gw_se + (r0 - 1) * gw_sr + col * gw_sk,
+                        tl.sum(go_m * x_vec).to(gw_ptr.dtype.element_ty),
+                    )
+                    tl.store(
+                        gw_ptr + edge * gw_se + (r0 + 1) * gw_sr + col * gw_sk,
+                        tl.sum(go_p * x_vec).to(gw_ptr.dtype.element_ty),
+                    )
+                tl.atomic_add(
+                    gx_ptr + src_idx * gx_sn + col * gx_sd + chan * gx_sc,
+                    gx_row,
+                    mask=cmask,
+                )
+
+    @triton.autotune(configs=_BD_CONFIGS, key=_BD_KEY)
+    @triton.jit
+    def _bd_back_fwd_kernel(
+        xl_ptr,
+        w_ptr,
+        out_ptr,
+        n_edge,
+        channels,
+        xl_se,
+        xl_sr,
+        xl_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        o_se,
+        o_sd,
+        o_sc,
+        LMAX: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        """``out[e, l^2+j, c] = sum_m W[e, l^2+j, l^2+l+m] * x_local[(l,m), c]``."""
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+
+        for l in tl.static_range(0, LMAX + 1):
+            base = l * l
+            r0 = base + l  # packed col of order m=0
+            xl0 = tl.load(
+                xl_ptr + edge * xl_se + l * xl_sr + chan * xl_sc,
+                mask=cmask,
+                other=0.0,
+            ).to(tl.float32)
+            if l >= 1:
+                xl_m = tl.load(
+                    xl_ptr + edge * xl_se + (LMAX + l) * xl_sr + chan * xl_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                xl_p = tl.load(
+                    xl_ptr + edge * xl_se + (2 * LMAX + l) * xl_sr + chan * xl_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+            for j in tl.static_range(0, 2 * l + 1):
+                d = base + j  # full packed output row
+                acc = tl.load(w_ptr + edge * w_se + d * w_sr + r0 * w_sk) * xl0
+                if l >= 1:
+                    acc += (
+                        tl.load(w_ptr + edge * w_se + d * w_sr + (r0 - 1) * w_sk) * xl_m
+                    )
+                    acc += (
+                        tl.load(w_ptr + edge * w_se + d * w_sr + (r0 + 1) * w_sk) * xl_p
+                    )
+                tl.store(
+                    out_ptr + edge * o_se + d * o_sd + chan * o_sc,
+                    acc.to(out_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+
+    @triton.autotune(configs=_BD_CONFIGS, key=_BD_KEY)
+    @triton.jit
+    def _bd_back_bwd_kernel(
+        go_ptr,
+        xl_ptr,
+        w_ptr,
+        gxl_ptr,
+        gw_ptr,
+        n_edge,
+        channels,
+        go_se,
+        go_sd,
+        go_sc,
+        xl_se,
+        xl_sr,
+        xl_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        gxl_se,
+        gxl_sr,
+        gxl_sc,
+        gw_se,
+        gw_sr,
+        gw_sk,
+        LMAX: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        """Fused block-diagonal backward of ``rotate_back``.
+
+        Per edge (full channel width in one program): writes
+        ``grad_x_local[(l,m), :] = sum_j W[l^2+j, l^2+l+m] * grad_out[l^2+j, :]``
+        (no scatter -- ``x_local`` is per-edge) and
+        ``grad_W[l^2+j, l^2+l+m] = sum_c grad_out[l^2+j, c] * x_local[(l,m), c]``.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+
+        for l in tl.static_range(0, LMAX + 1):
+            base = l * l
+            r0 = base + l  # packed col of order m=0
+            xl0 = tl.load(
+                xl_ptr + edge * xl_se + l * xl_sr + chan * xl_sc,
+                mask=cmask,
+                other=0.0,
+            ).to(tl.float32)
+            gxl0 = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            if l >= 1:
+                xl_m = tl.load(
+                    xl_ptr + edge * xl_se + (LMAX + l) * xl_sr + chan * xl_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                xl_p = tl.load(
+                    xl_ptr + edge * xl_se + (2 * LMAX + l) * xl_sr + chan * xl_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                gxl_m = tl.zeros((BLOCK_C,), dtype=tl.float32)
+                gxl_p = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            for j in tl.static_range(0, 2 * l + 1):
+                d = base + j  # full packed row (output of forward / grad_out row)
+                go_d = tl.load(
+                    go_ptr + edge * go_se + d * go_sd + chan * go_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                w0 = tl.load(w_ptr + edge * w_se + d * w_sr + r0 * w_sk)
+                gxl0 += w0 * go_d
+                tl.store(
+                    gw_ptr + edge * gw_se + d * gw_sr + r0 * gw_sk,
+                    tl.sum(go_d * xl0).to(gw_ptr.dtype.element_ty),
+                )
+                if l >= 1:
+                    wm = tl.load(w_ptr + edge * w_se + d * w_sr + (r0 - 1) * w_sk)
+                    wp = tl.load(w_ptr + edge * w_se + d * w_sr + (r0 + 1) * w_sk)
+                    gxl_m += wm * go_d
+                    gxl_p += wp * go_d
+                    tl.store(
+                        gw_ptr + edge * gw_se + d * gw_sr + (r0 - 1) * gw_sk,
+                        tl.sum(go_d * xl_m).to(gw_ptr.dtype.element_ty),
+                    )
+                    tl.store(
+                        gw_ptr + edge * gw_se + d * gw_sr + (r0 + 1) * gw_sk,
+                        tl.sum(go_d * xl_p).to(gw_ptr.dtype.element_ty),
+                    )
+            tl.store(
+                gxl_ptr + edge * gxl_se + l * gxl_sr + chan * gxl_sc,
+                gxl0.to(gxl_ptr.dtype.element_ty),
+                mask=cmask,
+            )
+            if l >= 1:
+                tl.store(
+                    gxl_ptr + edge * gxl_se + (LMAX + l) * gxl_sr + chan * gxl_sc,
+                    gxl_m.to(gxl_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+                tl.store(
+                    gxl_ptr + edge * gxl_se + (2 * LMAX + l) * gxl_sr + chan * gxl_sc,
+                    gxl_p.to(gxl_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+
+    @triton.autotune(configs=_BD_CONFIGS, key=["channels"])
+    @triton.jit
+    def _bd_back_so2_fwd_kernel(
+        xl_ptr,
+        w_ptr,
+        out_ptr,
+        n_edge,
+        channels,
+        xl_se,
+        xl_sf,
+        xl_sr,
+        xl_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        o_se,
+        o_sd,
+        o_sc,
+        LMAX: tl.constexpr,
+        FOCUS_DIM: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        """Block-diagonal rotate_back reading the per-focus layout in place.
+
+        ``out[e, l^2+j, c] = sum_m W[e, l^2+j, l^2+l+m] * x_local[e, f, (l,m), cf]``
+        with ``c = f * FOCUS_DIM + cf``. Decoding the channel as ``(f, cf)`` folds
+        the ``(F, D_m, Cf) -> (D_m, C_wide)`` transpose into the addressing, so the
+        caller passes the SO(2) focus tensor without an explicit copy.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+        xl_co = (chan // FOCUS_DIM) * xl_sf + (chan % FOCUS_DIM) * xl_sc
+        for l in tl.static_range(0, LMAX + 1):
+            base = l * l
+            r0 = base + l
+            xl0 = tl.load(
+                xl_ptr + edge * xl_se + l * xl_sr + xl_co, mask=cmask, other=0.0
+            ).to(tl.float32)
+            if l >= 1:
+                xl_m = tl.load(
+                    xl_ptr + edge * xl_se + (LMAX + l) * xl_sr + xl_co,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                xl_p = tl.load(
+                    xl_ptr + edge * xl_se + (2 * LMAX + l) * xl_sr + xl_co,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+            for j in tl.static_range(0, 2 * l + 1):
+                d = base + j
+                acc = tl.load(w_ptr + edge * w_se + d * w_sr + r0 * w_sk) * xl0
+                if l >= 1:
+                    acc += (
+                        tl.load(w_ptr + edge * w_se + d * w_sr + (r0 - 1) * w_sk) * xl_m
+                    )
+                    acc += (
+                        tl.load(w_ptr + edge * w_se + d * w_sr + (r0 + 1) * w_sk) * xl_p
+                    )
+                tl.store(
+                    out_ptr + edge * o_se + d * o_sd + chan * o_sc,
+                    acc.to(out_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+
+    @triton.autotune(configs=_BD_CONFIGS, key=["channels"])
+    @triton.jit
+    def _bd_back_so2_bwd_kernel(
+        go_ptr,
+        xl_ptr,
+        w_ptr,
+        gxl_ptr,
+        gw_ptr,
+        n_edge,
+        channels,
+        go_se,
+        go_sd,
+        go_sc,
+        xl_se,
+        xl_sf,
+        xl_sr,
+        xl_sc,
+        w_se,
+        w_sr,
+        w_sk,
+        gxl_se,
+        gxl_sf,
+        gxl_sr,
+        gxl_sc,
+        gw_se,
+        gw_sr,
+        gw_sk,
+        LMAX: tl.constexpr,
+        FOCUS_DIM: tl.constexpr,
+        BLOCK_C: tl.constexpr,
+    ):
+        """Backward of :func:`_bd_back_so2_fwd_kernel`.
+
+        Writes ``grad_x_local`` in the per-focus layout (decoding the channel as
+        ``(f, cf)`` exactly as the forward) and accumulates ``grad_W`` over the
+        full channel width, i.e. summed across focus streams.
+        """
+        edge = tl.program_id(0).to(tl.int64)
+        chan = tl.arange(0, BLOCK_C)
+        cmask = chan < channels
+        xl_co = (chan // FOCUS_DIM) * xl_sf + (chan % FOCUS_DIM) * xl_sc
+        gxl_co = (chan // FOCUS_DIM) * gxl_sf + (chan % FOCUS_DIM) * gxl_sc
+        for l in tl.static_range(0, LMAX + 1):
+            base = l * l
+            r0 = base + l
+            xl0 = tl.load(
+                xl_ptr + edge * xl_se + l * xl_sr + xl_co, mask=cmask, other=0.0
+            ).to(tl.float32)
+            gxl0 = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            if l >= 1:
+                xl_m = tl.load(
+                    xl_ptr + edge * xl_se + (LMAX + l) * xl_sr + xl_co,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                xl_p = tl.load(
+                    xl_ptr + edge * xl_se + (2 * LMAX + l) * xl_sr + xl_co,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                gxl_m = tl.zeros((BLOCK_C,), dtype=tl.float32)
+                gxl_p = tl.zeros((BLOCK_C,), dtype=tl.float32)
+            for j in tl.static_range(0, 2 * l + 1):
+                d = base + j
+                go_d = tl.load(
+                    go_ptr + edge * go_se + d * go_sd + chan * go_sc,
+                    mask=cmask,
+                    other=0.0,
+                ).to(tl.float32)
+                gxl0 += tl.load(w_ptr + edge * w_se + d * w_sr + r0 * w_sk) * go_d
+                tl.store(
+                    gw_ptr + edge * gw_se + d * gw_sr + r0 * gw_sk,
+                    tl.sum(go_d * xl0).to(gw_ptr.dtype.element_ty),
+                )
+                if l >= 1:
+                    gxl_m += (
+                        tl.load(w_ptr + edge * w_se + d * w_sr + (r0 - 1) * w_sk) * go_d
+                    )
+                    gxl_p += (
+                        tl.load(w_ptr + edge * w_se + d * w_sr + (r0 + 1) * w_sk) * go_d
+                    )
+                    tl.store(
+                        gw_ptr + edge * gw_se + d * gw_sr + (r0 - 1) * gw_sk,
+                        tl.sum(go_d * xl_m).to(gw_ptr.dtype.element_ty),
+                    )
+                    tl.store(
+                        gw_ptr + edge * gw_se + d * gw_sr + (r0 + 1) * gw_sk,
+                        tl.sum(go_d * xl_p).to(gw_ptr.dtype.element_ty),
+                    )
+            tl.store(
+                gxl_ptr + edge * gxl_se + l * gxl_sr + gxl_co,
+                gxl0.to(gxl_ptr.dtype.element_ty),
+                mask=cmask,
+            )
+            if l >= 1:
+                tl.store(
+                    gxl_ptr + edge * gxl_se + (LMAX + l) * gxl_sr + gxl_co,
+                    gxl_m.to(gxl_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+                tl.store(
+                    gxl_ptr + edge * gxl_se + (2 * LMAX + l) * gxl_sr + gxl_co,
+                    gxl_p.to(gxl_ptr.dtype.element_ty),
+                    mask=cmask,
+                )
+
+
+# ======================================================================
+# Triton launch wrappers
+# ======================================================================
+def _grid_over_rows(n_edge: int, rows: int):
+    """Grid callable: one program per (edge, BLOCK_M-sized row tile)."""
+    return lambda meta: (n_edge, triton.cdiv(rows, meta["BLOCK_M"]))
+
+
+def _has_no_edges(n_edge: int) -> bool:
+    """Return true for eager zero-edge calls without guarding symbolic edges.
+
+    Under ``torch.library.triton_op`` decomposition (AOTInductor freeze), the
+    edge dimension can be a data-dependent SymInt produced from the neighbour
+    list.  Converting it to a Python ``int`` would force a guard such as
+    ``u0 + 2`` and abort export.  We only need the zero-edge early return in
+    eager Python execution; compiled production graphs always see a non-empty
+    representative trace and use dynamic shapes for later calls.
+    """
+    return type(n_edge) is int and n_edge == 0
+
+
+def _inverse_index(coeff_index: Tensor, dim_full: int) -> Tensor:
+    """Inverse permutation ``inv[k] = m`` where ``coeff_index[m] == k`` else ``-1``.
+
+    Maps a full packed position ``k`` back to its reduced-layout slot. Used by the
+    ``rotate_back`` kernels so they can read dense Wigner rows (coalesced) and
+    gather/scatter the small ``x_local`` instead of gathering Wigner columns.
+    """
+    inv = torch.full((int(dim_full),), -1, dtype=torch.int64, device=coeff_index.device)
+    inv[coeff_index] = torch.arange(
+        coeff_index.numel(), dtype=torch.int64, device=coeff_index.device
+    )
+    return inv
+
+
+def _launch_rotate_to_local_fwd(
+    x: Tensor,
+    src: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> Tensor:
+    n_edge = src.shape[0]
+    reduced_dim = int(coeff_index.shape[0])
+    channels = int(x.shape[2])
+    out = torch.empty((n_edge, reduced_dim, channels), dtype=x.dtype, device=x.device)
+    if _has_no_edges(n_edge):
+        return out
+    wrap_triton(_to_local_fwd_kernel)[_grid_over_rows(n_edge, reduced_dim)](
+        x,
+        src,
+        wigner,
+        coeff_index,
+        out,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        x.stride(0),
+        x.stride(1),
+        x.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        BLOCK_N=_tile_dim(channels),
+    )
+    return out
+
+
+def _launch_rotate_to_local_bwd(
+    grad_out: Tensor,
+    x: Tensor,
+    src: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> tuple[Tensor, Tensor]:
+    n_edge = src.shape[0]
+    reduced_dim = int(coeff_index.shape[0])
+    channels = int(x.shape[2])
+    grad_x = torch.zeros_like(x)
+    grad_wigner = torch.zeros_like(wigner)
+    if _has_no_edges(n_edge):
+        return grad_x, grad_wigner
+
+    # --- grad_x: per-edge GEMM atomically scattered into grad_x by src ---
+    wrap_triton(_to_local_bwd_dx_kernel)[_grid_over_rows(n_edge, dim_full)](
+        grad_out,
+        src,
+        wigner,
+        coeff_index,
+        grad_x,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        grad_x.stride(0),
+        grad_x.stride(1),
+        grad_x.stride(2),
+        BLOCK_N=_tile_dim(channels),
+    )
+
+    # --- grad_wigner: per-edge GEMM written into rows ``coeff_index`` ---
+    wrap_triton(_to_local_bwd_dw_kernel)[_grid_over_rows(n_edge, reduced_dim)](
+        grad_out,
+        x,
+        src,
+        coeff_index,
+        grad_wigner,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        x.stride(0),
+        x.stride(1),
+        x.stride(2),
+        grad_wigner.stride(0),
+        grad_wigner.stride(1),
+        grad_wigner.stride(2),
+        BLOCK_N=_tile_dim(dim_full),
+    )
+    return grad_x, grad_wigner
+
+
+def _launch_rotate_back_fwd(
+    x_local: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> Tensor:
+    n_edge = x_local.shape[0]
+    reduced_dim = int(coeff_index.shape[0])
+    channels = int(x_local.shape[2])
+    out = torch.empty(
+        (n_edge, dim_full, channels), dtype=x_local.dtype, device=x_local.device
+    )
+    if _has_no_edges(n_edge):
+        return out
+    inv_index = _inverse_index(coeff_index, dim_full)
+    wrap_triton(_back_fwd_kernel)[_grid_over_rows(n_edge, dim_full)](
+        x_local,
+        wigner,
+        inv_index,
+        out,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        x_local.stride(0),
+        x_local.stride(1),
+        x_local.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        BLOCK_N=_tile_dim(channels),
+    )
+    return out
+
+
+def _launch_rotate_back_bwd(
+    grad_out: Tensor,
+    x_local: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> tuple[Tensor, Tensor]:
+    n_edge = x_local.shape[0]
+    reduced_dim = int(coeff_index.shape[0])
+    channels = int(x_local.shape[2])
+    grad_x_local = torch.empty_like(x_local)
+    grad_wigner = torch.zeros_like(wigner)
+    if _has_no_edges(n_edge):
+        return grad_x_local, grad_wigner
+
+    inv_index = _inverse_index(coeff_index, dim_full)
+    wrap_triton(_back_bwd_dx_kernel)[_grid_over_rows(n_edge, dim_full)](
+        grad_out,
+        wigner,
+        inv_index,
+        grad_x_local,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        grad_x_local.stride(0),
+        grad_x_local.stride(1),
+        grad_x_local.stride(2),
+        BLOCK_N=_tile_dim(channels),
+    )
+    wrap_triton(_back_bwd_dw_kernel)[_grid_over_rows(n_edge, dim_full)](
+        grad_out,
+        x_local,
+        inv_index,
+        grad_wigner,
+        n_edge,
+        reduced_dim,
+        dim_full,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        x_local.stride(0),
+        x_local.stride(1),
+        x_local.stride(2),
+        grad_wigner.stride(0),
+        grad_wigner.stride(1),
+        grad_wigner.stride(2),
+        BLOCK_N=_tile_dim(dim_full),
+    )
+    return grad_x_local, grad_wigner
+
+
+# ======================================================================
+# Block-diagonal launch wrappers (mmax == 1)
+# ======================================================================
+def _launch_bd_to_local_fwd(
+    x: Tensor, src: Tensor, wigner: Tensor, lmax: int
+) -> Tensor:
+    n_edge = src.shape[0]
+    channels = int(x.shape[2])
+    out = torch.empty((n_edge, 3 * lmax + 1, channels), dtype=x.dtype, device=x.device)
+    if _has_no_edges(n_edge):
+        return out
+    wrap_triton(_bd_to_local_fwd_kernel)[(n_edge,)](
+        x,
+        src,
+        wigner,
+        out,
+        n_edge,
+        channels,
+        x.stride(0),
+        x.stride(1),
+        x.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        LMAX=lmax,
+        BLOCK_C=_tile_dim(channels),
+    )
+    return out
+
+
+def _launch_bd_to_local_bwd(
+    grad_out: Tensor, x: Tensor, src: Tensor, wigner: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    n_edge = src.shape[0]
+    channels = int(x.shape[2])
+    grad_x = torch.zeros_like(x)
+    grad_wigner = torch.zeros_like(wigner)
+    if _has_no_edges(n_edge):
+        return grad_x, grad_wigner
+    wrap_triton(_bd_to_local_bwd_kernel)[(n_edge,)](
+        grad_out,
+        x,
+        src,
+        wigner,
+        grad_x,
+        grad_wigner,
+        n_edge,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        x.stride(0),
+        x.stride(1),
+        x.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        grad_x.stride(0),
+        grad_x.stride(1),
+        grad_x.stride(2),
+        grad_wigner.stride(0),
+        grad_wigner.stride(1),
+        grad_wigner.stride(2),
+        LMAX=lmax,
+        BLOCK_C=_tile_dim(channels),
+    )
+    return grad_x, grad_wigner
+
+
+def _launch_bd_back_fwd(x_local: Tensor, wigner: Tensor, lmax: int) -> Tensor:
+    n_edge = x_local.shape[0]
+    channels = int(x_local.shape[2])
+    dim_full = (lmax + 1) ** 2
+    out = torch.empty(
+        (n_edge, dim_full, channels), dtype=x_local.dtype, device=x_local.device
+    )
+    if _has_no_edges(n_edge):
+        return out
+    wrap_triton(_bd_back_fwd_kernel)[(n_edge,)](
+        x_local,
+        wigner,
+        out,
+        n_edge,
+        channels,
+        x_local.stride(0),
+        x_local.stride(1),
+        x_local.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        LMAX=lmax,
+        BLOCK_C=_tile_dim(channels),
+    )
+    return out
+
+
+def _launch_bd_back_bwd(
+    grad_out: Tensor, x_local: Tensor, wigner: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    n_edge = x_local.shape[0]
+    channels = int(x_local.shape[2])
+    grad_x_local = torch.empty_like(x_local)
+    grad_wigner = torch.zeros_like(wigner)
+    if _has_no_edges(n_edge):
+        return grad_x_local, grad_wigner
+    wrap_triton(_bd_back_bwd_kernel)[(n_edge,)](
+        grad_out,
+        x_local,
+        wigner,
+        grad_x_local,
+        grad_wigner,
+        n_edge,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        x_local.stride(0),
+        x_local.stride(1),
+        x_local.stride(2),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        grad_x_local.stride(0),
+        grad_x_local.stride(1),
+        grad_x_local.stride(2),
+        grad_wigner.stride(0),
+        grad_wigner.stride(1),
+        grad_wigner.stride(2),
+        LMAX=lmax,
+        BLOCK_C=_tile_dim(channels),
+    )
+    return grad_x_local, grad_wigner
+
+
+def _launch_bd_back_so2_fwd(x_local_4d: Tensor, wigner: Tensor, lmax: int) -> Tensor:
+    n_edge = x_local_4d.shape[0]
+    n_focus = int(x_local_4d.shape[1])
+    focus_dim = int(x_local_4d.shape[3])
+    channels = n_focus * focus_dim
+    dim_full = (lmax + 1) ** 2
+    out = torch.empty(
+        (n_edge, dim_full, channels), dtype=x_local_4d.dtype, device=x_local_4d.device
+    )
+    if _has_no_edges(n_edge):
+        return out
+    wrap_triton(_bd_back_so2_fwd_kernel)[(n_edge,)](
+        x_local_4d,
+        wigner,
+        out,
+        n_edge,
+        channels,
+        x_local_4d.stride(0),
+        x_local_4d.stride(1),
+        x_local_4d.stride(2),
+        x_local_4d.stride(3),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        out.stride(0),
+        out.stride(1),
+        out.stride(2),
+        LMAX=lmax,
+        FOCUS_DIM=focus_dim,
+        BLOCK_C=_tile_dim(channels),
+    )
+    return out
+
+
+def _launch_bd_back_so2_bwd(
+    grad_out: Tensor, x_local_4d: Tensor, wigner: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    n_edge = x_local_4d.shape[0]
+    n_focus = int(x_local_4d.shape[1])
+    focus_dim = int(x_local_4d.shape[3])
+    channels = n_focus * focus_dim
+    grad_x_local = torch.empty_like(x_local_4d)
+    grad_wigner = torch.zeros_like(wigner)
+    if _has_no_edges(n_edge):
+        return grad_x_local, grad_wigner
+    wrap_triton(_bd_back_so2_bwd_kernel)[(n_edge,)](
+        grad_out,
+        x_local_4d,
+        wigner,
+        grad_x_local,
+        grad_wigner,
+        n_edge,
+        channels,
+        grad_out.stride(0),
+        grad_out.stride(1),
+        grad_out.stride(2),
+        x_local_4d.stride(0),
+        x_local_4d.stride(1),
+        x_local_4d.stride(2),
+        x_local_4d.stride(3),
+        wigner.stride(0),
+        wigner.stride(1),
+        wigner.stride(2),
+        grad_x_local.stride(0),
+        grad_x_local.stride(1),
+        grad_x_local.stride(2),
+        grad_x_local.stride(3),
+        grad_wigner.stride(0),
+        grad_wigner.stride(1),
+        grad_wigner.stride(2),
+        LMAX=lmax,
+        FOCUS_DIM=focus_dim,
+        BLOCK_C=_tile_dim(channels),
+    )
+    return grad_x_local, grad_wigner
+
+
+# ======================================================================
+# Dispatch helpers (triton on CUDA float, eager otherwise)
+# ======================================================================
+def _use_triton(tensor: Tensor) -> bool:
+    return (
+        TRITON_ROTATION_AVAILABLE
+        and tensor.is_cuda
+        and tensor.dtype in (torch.float16, torch.bfloat16, torch.float32)
+    )
+
+
+def _rotate_to_local_impl(
+    x: Tensor,
+    src: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> Tensor:
+    if not _use_triton(x):
+        return rotate_to_local_reference(x, src, wigner, coeff_index, dim_full)
+    return _launch_rotate_to_local_fwd(
+        x, src.contiguous(), wigner, coeff_index.contiguous(), int(dim_full)
+    )
+
+
+def _rotate_to_local_bwd_impl(
+    grad_out: Tensor,
+    x: Tensor,
+    src: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> tuple[Tensor, Tensor]:
+    if not _use_triton(x):
+        return _rotate_to_local_bwd_eager(
+            grad_out, x, src, wigner, coeff_index, dim_full
+        )
+    return _launch_rotate_to_local_bwd(
+        grad_out.contiguous(),
+        x,
+        src.contiguous(),
+        wigner,
+        coeff_index.contiguous(),
+        int(dim_full),
+    )
+
+
+def _rotate_back_impl(
+    x_local: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> Tensor:
+    if not _use_triton(x_local):
+        return rotate_back_reference(x_local, wigner, coeff_index, dim_full)
+    return _launch_rotate_back_fwd(
+        x_local, wigner, coeff_index.contiguous(), int(dim_full)
+    )
+
+
+def _rotate_back_bwd_impl(
+    grad_out: Tensor,
+    x_local: Tensor,
+    wigner: Tensor,
+    coeff_index: Tensor,
+    dim_full: int,
+) -> tuple[Tensor, Tensor]:
+    if not _use_triton(x_local):
+        return _rotate_back_bwd_eager(grad_out, x_local, wigner, coeff_index, dim_full)
+    return _launch_rotate_back_bwd(
+        grad_out.contiguous(),
+        x_local,
+        wigner,
+        coeff_index.contiguous(),
+        int(dim_full),
+    )
+
+
+# --- block-diagonal impls (mmax == 1; assume block-diagonal Wigner-D) ---
+def _block_rotate_to_local_impl(
+    x: Tensor, src: Tensor, wigner: Tensor, lmax: int
+) -> Tensor:
+    if not _use_triton(x):
+        coeff = build_m_major_index(int(lmax), 1, device=x.device)
+        return rotate_to_local_reference(x, src, wigner, coeff, (int(lmax) + 1) ** 2)
+    return _launch_bd_to_local_fwd(x, src.contiguous(), wigner, int(lmax))
+
+
+def _block_rotate_to_local_bwd_impl(
+    grad_out: Tensor, x: Tensor, src: Tensor, wigner: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    if not _use_triton(x):
+        coeff = build_m_major_index(int(lmax), 1, device=x.device)
+        return _rotate_to_local_bwd_eager(
+            grad_out, x, src, wigner, coeff, (int(lmax) + 1) ** 2
+        )
+    return _launch_bd_to_local_bwd(
+        grad_out.contiguous(), x, src.contiguous(), wigner, int(lmax)
+    )
+
+
+def _block_rotate_back_impl(x_local: Tensor, wigner: Tensor, lmax: int) -> Tensor:
+    if not _use_triton(x_local):
+        coeff = build_m_major_index(int(lmax), 1, device=x_local.device)
+        return rotate_back_reference(x_local, wigner, coeff, (int(lmax) + 1) ** 2)
+    return _launch_bd_back_fwd(x_local, wigner, int(lmax))
+
+
+def _block_rotate_back_bwd_impl(
+    grad_out: Tensor, x_local: Tensor, wigner: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    if not _use_triton(x_local):
+        coeff = build_m_major_index(int(lmax), 1, device=x_local.device)
+        return _rotate_back_bwd_eager(
+            grad_out, x_local, wigner, coeff, (int(lmax) + 1) ** 2
+        )
+    return _launch_bd_back_bwd(grad_out.contiguous(), x_local, wigner, int(lmax))
+
+
+# ======================================================================
+# Functional triton_op + fake + autograd registration
+# ======================================================================
+# Forward and backward are both *functional* triton_ops (mutates_args=()), so
+# functionalization keeps the full gradient path -- including grad w.r.t.
+# ``wigner`` -- intact under ``torch.compile``. ``triton_op`` (vs ``custom_op``)
+# additionally lets Inductor see through to the wrapped Triton kernel and bake
+# the cubin into the AOTInductor ``.pt2`` so the LAMMPS C++ runtime needs no
+# Python registration.
+
+_rotate_to_local_op = torch.library.triton_op(
+    "dpa4_triton::rotate_to_local", mutates_args=()
+)(_rotate_to_local_impl)
+
+_rotate_to_local_bwd_op = torch.library.triton_op(
+    "dpa4_triton::rotate_to_local_bwd", mutates_args=()
+)(_rotate_to_local_bwd_impl)
+
+_rotate_back_op = torch.library.triton_op("dpa4_triton::rotate_back", mutates_args=())(
+    _rotate_back_impl
+)
+
+_rotate_back_bwd_op = torch.library.triton_op(
+    "dpa4_triton::rotate_back_bwd", mutates_args=()
+)(_rotate_back_bwd_impl)
+
+
+@_rotate_to_local_op.register_fake
+def _(x, src, wigner, coeff_index, dim_full):
+    return x.new_empty((src.shape[0], coeff_index.shape[0], x.shape[2]))
+
+
+@_rotate_to_local_bwd_op.register_fake
+def _(grad_out, x, src, wigner, coeff_index, dim_full):
+    return torch.empty_like(x), torch.empty_like(wigner)
+
+
+@_rotate_back_op.register_fake
+def _(x_local, wigner, coeff_index, dim_full):
+    return x_local.new_empty((x_local.shape[0], dim_full, x_local.shape[2]))
+
+
+@_rotate_back_bwd_op.register_fake
+def _(grad_out, x_local, wigner, coeff_index, dim_full):
+    return torch.empty_like(x_local), torch.empty_like(wigner)
+
+
+def _rotate_to_local_setup_context(ctx, inputs, output):
+    x, src, wigner, coeff_index, dim_full = inputs
+    ctx.save_for_backward(x, src, wigner, coeff_index)
+    ctx.dim_full = dim_full
+
+
+def _rotate_to_local_backward(ctx, grad_out):
+    x, src, wigner, coeff_index = ctx.saved_tensors
+    grad_x, grad_wigner = _rotate_to_local_bwd_op(
+        grad_out, x, src, wigner, coeff_index, ctx.dim_full
+    )
+    return grad_x, None, grad_wigner, None, None
+
+
+def _rotate_back_setup_context(ctx, inputs, output):
+    x_local, wigner, coeff_index, dim_full = inputs
+    ctx.save_for_backward(x_local, wigner, coeff_index)
+    ctx.dim_full = dim_full
+
+
+def _rotate_back_backward(ctx, grad_out):
+    x_local, wigner, coeff_index = ctx.saved_tensors
+    grad_x_local, grad_wigner = _rotate_back_bwd_op(
+        grad_out, x_local, wigner, coeff_index, ctx.dim_full
+    )
+    return grad_x_local, grad_wigner, None, None
+
+
+_rotate_to_local_op.register_autograd(
+    _rotate_to_local_backward, setup_context=_rotate_to_local_setup_context
+)
+_rotate_back_op.register_autograd(
+    _rotate_back_backward, setup_context=_rotate_back_setup_context
+)
+
+
+# --- block-diagonal custom ops (carry only ``lmax``; no coeff_index tensor) ---
+_block_to_local_op = torch.library.triton_op(
+    "dpa4_triton::rotate_to_local_block", mutates_args=()
+)(_block_rotate_to_local_impl)
+
+_block_to_local_bwd_op = torch.library.triton_op(
+    "dpa4_triton::rotate_to_local_block_bwd", mutates_args=()
+)(_block_rotate_to_local_bwd_impl)
+
+_block_back_op = torch.library.triton_op(
+    "dpa4_triton::rotate_back_block", mutates_args=()
+)(_block_rotate_back_impl)
+
+_block_back_bwd_op = torch.library.triton_op(
+    "dpa4_triton::rotate_back_block_bwd", mutates_args=()
+)(_block_rotate_back_bwd_impl)
+
+
+@_block_to_local_op.register_fake
+def _(x, src, wigner, lmax):
+    return x.new_empty((src.shape[0], 3 * int(lmax) + 1, x.shape[2]))
+
+
+@_block_to_local_bwd_op.register_fake
+def _(grad_out, x, src, wigner, lmax):
+    return torch.empty_like(x), torch.empty_like(wigner)
+
+
+@_block_back_op.register_fake
+def _(x_local, wigner, lmax):
+    return x_local.new_empty((x_local.shape[0], (int(lmax) + 1) ** 2, x_local.shape[2]))
+
+
+@_block_back_bwd_op.register_fake
+def _(grad_out, x_local, wigner, lmax):
+    return torch.empty_like(x_local), torch.empty_like(wigner)
+
+
+def _block_to_local_setup_context(ctx, inputs, output):
+    x, src, wigner, lmax = inputs
+    ctx.save_for_backward(x, src, wigner)
+    ctx.lmax = lmax
+
+
+def _block_to_local_backward(ctx, grad_out):
+    x, src, wigner = ctx.saved_tensors
+    grad_x, grad_wigner = _block_to_local_bwd_op(grad_out, x, src, wigner, ctx.lmax)
+    return grad_x, None, grad_wigner, None
+
+
+def _block_back_setup_context(ctx, inputs, output):
+    x_local, wigner, lmax = inputs
+    ctx.save_for_backward(x_local, wigner)
+    ctx.lmax = lmax
+
+
+def _block_back_backward(ctx, grad_out):
+    x_local, wigner = ctx.saved_tensors
+    grad_x_local, grad_wigner = _block_back_bwd_op(grad_out, x_local, wigner, ctx.lmax)
+    return grad_x_local, grad_wigner, None
+
+
+_block_to_local_op.register_autograd(
+    _block_to_local_backward, setup_context=_block_to_local_setup_context
+)
+_block_back_op.register_autograd(
+    _block_back_backward, setup_context=_block_back_setup_context
+)
+
+
+# ======================================================================
+# Public API
+# ======================================================================
+# --- Public entry points -----------------------------------------------------
+def rotate_to_local_dense(
+    x: Tensor, src: Tensor, wigner: Tensor, coeff_index: Tensor, dim_full: int
+) -> Tensor:
+    """Apply the general ``global -> local`` rotation.
+
+    This entry point honors every value in ``coeff_index`` and supports any
+    reduced coefficient layout.  It computes the same operation as
+    ``rotate_to_local_reference`` while avoiding materialized gather operands on
+    CUDA.
+    """
+    return _rotate_to_local_op(x, src, wigner, coeff_index, int(dim_full))
+
+
+def rotate_back_dense(
+    x_local: Tensor, wigner: Tensor, coeff_index: Tensor, dim_full: int
+) -> Tensor:
+    """Apply the general ``local -> global`` rotation.
+
+    This entry point honors every value in ``coeff_index`` and supports any
+    reduced coefficient layout.  It computes the same operation as
+    ``rotate_back_reference`` while avoiding materialized gather operands on
+    CUDA.
+    """
+    return _rotate_back_op(x_local, wigner, coeff_index, int(dim_full))
+
+
+def rotate_to_local_block(x: Tensor, src: Tensor, wigner: Tensor, lmax: int) -> Tensor:
+    """Apply the block-diagonal ``global -> local`` rotation.
+
+    Use this when the caller owns the invariant that the reduced layout is the
+    canonical m-major ``mmax=1`` layout for ``lmax``.  The block kernel derives
+    the reduced row order from ``lmax`` and does not consume a coefficient-index
+    tensor.
+    """
+    return _block_to_local_op(x, src, wigner, int(lmax))
+
+
+def rotate_back_block(x_local: Tensor, wigner: Tensor, lmax: int) -> Tensor:
+    """Apply the block-diagonal ``local -> global`` rotation.
+
+    Use this when the caller owns the invariant that ``x_local`` is ordered in
+    the canonical m-major ``mmax=1`` layout for ``lmax``.  The block kernel
+    derives the reduced column order from ``lmax`` and does not consume a
+    coefficient-index tensor.
+    """
+    return _block_back_op(x_local, wigner, int(lmax))
+
+
+# ======================================================================
+# Layout-aware block rotate_back (per-focus SO(2) layout, mmax == 1)
+# ======================================================================
+# Consumes the (E, F, D_m, Cf) focus layout produced by the SO(2) layers so the
+# caller can skip the ``transpose(1, 2).contiguous()`` that would otherwise
+# materialize (E, D_m, F * Cf) before the inverse rotation.
+
+
+def _block_rotate_back_so2_impl(
+    x_local_4d: Tensor, wigner: Tensor, lmax: int
+) -> Tensor:
+    if not _use_triton(x_local_4d):
+        n_edge, n_focus, reduced_dim, focus_dim = x_local_4d.shape
+        x_std = x_local_4d.transpose(1, 2).reshape(
+            n_edge, reduced_dim, n_focus * focus_dim
+        )
+        coeff = build_m_major_index(int(lmax), 1, device=x_local_4d.device)
+        return rotate_back_reference(x_std, wigner, coeff, (int(lmax) + 1) ** 2)
+    return _launch_bd_back_so2_fwd(x_local_4d, wigner, int(lmax))
+
+
+def _block_rotate_back_so2_bwd_impl(
+    grad_out: Tensor, x_local_4d: Tensor, wigner: Tensor, lmax: int
+) -> tuple[Tensor, Tensor]:
+    if not _use_triton(x_local_4d):
+        n_edge, n_focus, reduced_dim, focus_dim = x_local_4d.shape
+        x_std = x_local_4d.transpose(1, 2).reshape(
+            n_edge, reduced_dim, n_focus * focus_dim
+        )
+        coeff = build_m_major_index(int(lmax), 1, device=x_local_4d.device)
+        grad_x_std, grad_wigner = _rotate_back_bwd_eager(
+            grad_out, x_std, wigner, coeff, (int(lmax) + 1) ** 2
+        )
+        grad_x_local = grad_x_std.reshape(
+            n_edge, reduced_dim, n_focus, focus_dim
+        ).transpose(1, 2)
+        return grad_x_local, grad_wigner
+    return _launch_bd_back_so2_bwd(grad_out.contiguous(), x_local_4d, wigner, int(lmax))
+
+
+_block_back_so2_op = torch.library.triton_op(
+    "dpa4_triton::rotate_back_block_so2", mutates_args=()
+)(_block_rotate_back_so2_impl)
+
+_block_back_so2_bwd_op = torch.library.triton_op(
+    "dpa4_triton::rotate_back_block_so2_bwd", mutates_args=()
+)(_block_rotate_back_so2_bwd_impl)
+
+
+@_block_back_so2_op.register_fake
+def _(x_local_4d, wigner, lmax):
+    n_edge, n_focus, _reduced, focus_dim = x_local_4d.shape
+    return x_local_4d.new_empty((n_edge, (int(lmax) + 1) ** 2, n_focus * focus_dim))
+
+
+@_block_back_so2_bwd_op.register_fake
+def _(grad_out, x_local_4d, wigner, lmax):
+    return torch.empty_like(x_local_4d), torch.empty_like(wigner)
+
+
+def _block_back_so2_setup_context(ctx, inputs, output):
+    x_local_4d, wigner, lmax = inputs
+    ctx.save_for_backward(x_local_4d, wigner)
+    ctx.lmax = lmax
+
+
+def _block_back_so2_backward(ctx, grad_out):
+    x_local_4d, wigner = ctx.saved_tensors
+    grad_x_local, grad_wigner = _block_back_so2_bwd_op(
+        grad_out, x_local_4d, wigner, ctx.lmax
+    )
+    return grad_x_local, grad_wigner, None
+
+
+_block_back_so2_op.register_autograd(
+    _block_back_so2_backward, setup_context=_block_back_so2_setup_context
+)
+
+
+def rotate_back_block_so2(x_local_4d: Tensor, wigner: Tensor, lmax: int) -> Tensor:
+    """Block-diagonal ``local -> global`` rotation reading the per-focus layout.
+
+    Parameters
+    ----------
+    x_local_4d : Tensor
+        Local features with shape (E, F, reduced_dim, Cf) in the canonical m-major
+        ``mmax=1`` layout, where C_wide = F * Cf.
+    wigner : Tensor
+        Transposed Wigner-D with shape (E, D, D), D = (lmax + 1) ** 2.
+    lmax : int
+        Maximum degree.
+
+    Returns
+    -------
+    Tensor
+        Global-frame message with shape (E, D, C_wide). The per-focus to packed
+        channel mapping ``c = f * Cf + cf`` folds the inverse transpose into the
+        kernel addressing, avoiding an explicit copy.
+    """
+    return _block_back_so2_op(x_local_4d, wigner, int(lmax))
diff --git a/deepmd/pt_expt/utils/serialization.py b/deepmd/pt_expt/utils/serialization.py
index 9c03783574..55ae2e2b57 100644
--- a/deepmd/pt_expt/utils/serialization.py
+++ b/deepmd/pt_expt/utils/serialization.py
@@ -985,6 +985,17 @@ def _deserialize_to_file_pt2(
     # descriptors (DPA1, DPA2). Setting threshold=0 prevents fusion and
     # avoids the NaN. Only applied on CUDA; CPU compilation is unaffected.
     #
+    # ``assert_indirect_indexing`` (default True) makes inductor emit an
+    # ``AOTI_TORCH_CHECK`` bounds assertion for every indirect (data-dependent)
+    # index. In the CPU-vectorised codegen for DPA4/SeZM's per-node
+    # gather/scatter (the descriptor broadcasts a per-node value across its
+    # edges), inductor mis-hoists that assertion ABOVE the declaration of the
+    # index temporary, emitting C++ that references an undeclared ``tmpN`` and
+    # fails to compile ("use of undeclared identifier"). The asserted indices
+    # are loop counters that are in-bounds by construction, so the check is
+    # redundant; disabling it removes the broken assertion while leaving
+    # vectorisation (and therefore inference throughput) untouched.
+    #
     # NOTE: ``torch._inductor.config`` is a process-wide singleton. The
     # save/restore pattern here is NOT thread-safe — concurrent AOTInductor
     # compilations from multiple threads would race on this global. Callers
@@ -996,12 +1007,15 @@ def _deserialize_to_file_pt2(
 
     is_cuda = _env.DEVICE.type == "cuda"
     saved_threshold = _inductor_config.realize_opcount_threshold
+    saved_assert_indexing = _inductor_config.assert_indirect_indexing
     if is_cuda:
         _inductor_config.realize_opcount_threshold = 0
+    _inductor_config.assert_indirect_indexing = False
     try:
         aoti_compile_and_package(exported, package_path=model_file)
     finally:
         _inductor_config.realize_opcount_threshold = saved_threshold
+        _inductor_config.assert_indirect_indexing = saved_assert_indexing
 
     # Second artifact: with-comm. Only for descriptors whose message
     # passing extends across rank boundaries. The flag was computed
@@ -1020,12 +1034,15 @@ def _deserialize_to_file_pt2(
         with tempfile.TemporaryDirectory() as td:
             wc_path = os.path.join(td, "forward_lower_with_comm.pt2")
             saved_threshold = _inductor_config.realize_opcount_threshold
+            saved_assert_indexing = _inductor_config.assert_indirect_indexing
             if is_cuda:
                 _inductor_config.realize_opcount_threshold = 0
+            _inductor_config.assert_indirect_indexing = False
             try:
                 aoti_compile_and_package(exported_wc, package_path=wc_path)
             finally:
                 _inductor_config.realize_opcount_threshold = saved_threshold
+                _inductor_config.assert_indirect_indexing = saved_assert_indexing
             with open(wc_path, "rb") as f:
                 with_comm_bytes = f.read()
         # The output keys are identical between the two artifacts (same
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 07b256e21b..53765d22c5 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -398,6 +398,25 @@ def descrpt_se_zm_args() -> list[Argument]:
         "building Wigner-D blocks. The roll is sampled independently per edge and "
         "per forward call."
     )
+    doc_edge_cartesian = (
+        "If True, every interaction block whose message-passing degree is 1 or 2 "
+        "replaces its per-edge SO(2) rotation-frame tensor product with an "
+        "equivalent global-frame Cartesian rank-2 tensor product, removing the "
+        "two per-edge Wigner-D rotations. Blocks with degree 0 or at least 3 keep "
+        "the SO(2) path. When every block takes the Cartesian path, the full "
+        "Wigner-D construction is skipped automatically."
+    )
+    doc_node_cartesian = (
+        "Per-node global-frame Cartesian rank-2 tensor product applied to the "
+        "aggregated message in every interaction block whose message-passing "
+        "degree is 1 or 2, coupling it with the destination node feature. "
+        "Configured by a string `<mode>:<layers>` where `mode` is `default` (the "
+        "one-sided product) or `parity` (the symmetrized product), and `layers` "
+        "is the stack depth; a bare integer `N` is shorthand for `default:N`, and "
+        "`none` (or `0`) disables it. Orthogonal to `edge_cartesian`: "
+        "either, both, or neither may be enabled. Its cost scales with the number "
+        "of nodes rather than edges, leaving the per-edge message path unchanged."
+    )
     doc_lmax = "Maximum degree, only used when `l_schedule` is None."
     doc_l_schedule = "Pyramid schedule of lmax per block, e.g. [3, 3, 2]. Must be non-increasing. If set, lmax and n_blocks will be ignored."
     doc_mmax = "Maximum SO(2) order (|m|), only used when `m_schedule` is None. If None, defaults to the per-block lmax."
@@ -427,7 +446,13 @@ def descrpt_se_zm_args() -> list[Argument]:
         "If True, apply intermediate ReducedEquivariantRMSNorm between SO(2) mixing layers. "
         "When False (default), no normalization is applied between layers."
     )
-    doc_so2_layers = "Number of SO(2) mixing layers per block."
+    doc_mixing_layers = (
+        "Number of learnable mixing layers in the per-edge message core of each "
+        "block (legacy alias: so2_layers). `0` applies only the edge-condition "
+        "modulation: the rotation-free per-degree radial scaling on the SO(2) "
+        "path, or a single `x @ T_e` when edge_cartesian applies. The per-node "
+        "node_cartesian stack carries its own independent depth."
+    )
     doc_so2_attn_res = (
         "Depth-wise attention residual mode across the internal SO(2) layer "
         "history inside each interaction block. Must be one of `none`, "
@@ -441,7 +466,10 @@ def descrpt_se_zm_args() -> list[Argument]:
         "`degree` uses an edge-conditioned cross-degree kernel "
         "`W[l_in,l_out,|m|](r)` shared by all channels. "
         "`degree_channel` uses `W[l_in,l_out,|m|,c](r)`, optionally low-rank "
-        "when `radial_so2_rank > 0`."
+        "when `radial_so2_rank > 0`. "
+        "This setting has no effect on blocks that take the Cartesian path "
+        "(edge_cartesian with degree 1 or 2), where the dynamic radial degree "
+        "mixer is bypassed."
     )
     doc_radial_so2_rank = (
         "Low-rank channel factorization rank for `radial_so2_mode=degree_channel`. "
@@ -671,6 +699,20 @@ def descrpt_se_zm_args() -> list[Argument]:
             default=True,
             doc=doc_only_pt_supported + doc_random_gamma,
         ),
+        Argument(
+            "edge_cartesian",
+            bool,
+            optional=True,
+            default=False,
+            doc=doc_only_pt_supported + doc_edge_cartesian,
+        ),
+        Argument(
+            "node_cartesian",
+            [str, int],
+            optional=True,
+            default="none",
+            doc=doc_only_pt_supported + doc_node_cartesian,
+        ),
         Argument("lmax", int, optional=True, default=3, doc=doc_lmax),
         Argument(
             "l_schedule", list[int], optional=True, default=None, doc=doc_l_schedule
@@ -705,7 +747,14 @@ def descrpt_se_zm_args() -> list[Argument]:
         ),
         Argument("n_blocks", int, optional=True, default=3, doc=doc_n_blocks),
         Argument("so2_norm", bool, optional=True, default=False, doc=doc_so2_norm),
-        Argument("so2_layers", int, optional=True, default=4, doc=doc_so2_layers),
+        Argument(
+            "mixing_layers",
+            int,
+            optional=True,
+            default=4,
+            alias=["so2_layers"],
+            doc=doc_mixing_layers,
+        ),
         Argument(
             "so2_attn_res",
             str,
diff --git a/examples/water/dpa4/input.json b/examples/water/dpa4/input.json
index 9a2260d0d6..c0e12b9be4 100644
--- a/examples/water/dpa4/input.json
+++ b/examples/water/dpa4/input.json
@@ -11,10 +11,12 @@
       "channels": 32,
       "n_radial": 16,
       "use_env_seed": true,
+      "edge_cartesian": false,
+      "node_cartesian": "none",
       "lmax": 3,
       "mmax": 1,
       "n_blocks": 2,
-      "so2_layers": 3,
+      "mixing_layers": 3,
       "radial_so2_mode": "degree_channel",
       "radial_so2_rank": 1,
       "n_focus": 2,
diff --git a/source/tests/common/dpmodel/test_descrpt_dpa4.py b/source/tests/common/dpmodel/test_descrpt_dpa4.py
index e4ab24c064..a687fffcb8 100644
--- a/source/tests/common/dpmodel/test_descrpt_dpa4.py
+++ b/source/tests/common/dpmodel/test_descrpt_dpa4.py
@@ -92,10 +92,17 @@ def test_shapes_and_interface(self) -> None:
         assert dd.need_sorted_nlist_for_lower() is False
         assert dd.get_env_protection() == dd.eps
 
-    def test_has_message_passing_false(self) -> None:
-        # scalar-only model: lmax=0 carries no directional messages
-        dd = make_descriptor(lmax=0, mmax=0, kmax=0, n_blocks=1)
-        assert dd.has_message_passing() is False
+    def test_message_passing_semantics(self) -> None:
+        # SeZM always resolves ghost neighbours on the lower path, so it always
+        # reports message passing; cross-rank ghost exchange is needed only when
+        # zone bridging is disabled (a BridgingSwitch cannot be reproduced by a
+        # single rank for ghost owners).
+        dd = make_descriptor()
+        assert dd.has_message_passing() is True
+        assert dd.has_message_passing_across_ranks() is True
+        dd_bridge = make_descriptor(inner_clamp_r_inner=0.5, inner_clamp_r_outer=1.0)
+        assert dd_bridge.has_message_passing() is True
+        assert dd_bridge.has_message_passing_across_ranks() is False
 
     def test_serialize_roundtrip_exact(self) -> None:
         dd = make_descriptor()
@@ -149,51 +156,39 @@ def test_masked_edge_inertness(self) -> None:
         np.testing.assert_allclose(out2, out, rtol=1e-12, atol=1e-14)
 
     @pytest.mark.parametrize(
-        "flag,value",
-        [
-            # guarded at the descriptor level
-            ("lebedev_quadrature", False),  # tensor-product S2 grid
-            ("lebedev_quadrature", [False, True]),  # tensor-product S2 grid (so2)
-            ("lebedev_quadrature", [True, False]),  # tensor-product S2 grid (ffn)
-            ("add_chg_spin_ebd", True),  # ChargeSpinEmbedding
-            ("inner_clamp_r_inner", 0.5),  # zone bridging
-            ("inner_clamp_r_outer", 1.0),  # zone bridging
-            # delegated to the owning submodules
-            ("layer_scale", True),  # block LayerScale
-            ("full_attn_res", "independent"),  # DepthAttnRes
-            ("block_attn_res", "dependent"),  # DepthAttnRes
-            ("so2_attn_res", "independent"),  # SO(2) DepthAttnRes
-            ("s2_activation", [True, True]),  # so2-side S2 activation
-            ("atten_f_mix", True),  # SO(2) attention focus mix
-            ("atten_v_proj", True),  # SO(2) attention value projection
-            ("atten_o_proj", True),  # SO(2) attention output projection
-        ],
-    )
-    def test_not_implemented_guards(self, flag, value) -> None:
-        with pytest.raises(NotImplementedError):
-            make_descriptor(**{flag: value})
-
-    @pytest.mark.parametrize(
-        "flag,value",
+        "overrides",
         [
-            ("lebedev_quadrature", True),  # supported branch of every guard
-            ("add_chg_spin_ebd", False),
-            ("inner_clamp_r_inner", None),
-            ("layer_scale", False),
-            ("full_attn_res", "none"),
-            ("s2_activation", [False, True]),
-            ("node_wise_s2", False),
-            ("node_wise_so3", True),  # SO(2) edge-local SO(3) cross-grid product
-            ("message_node_so3", True),  # SO(2) post-agg SO(3) cross-grid product
-            ("ffn_so3_grid", False),  # SO(3) Wigner-D FFN grid off
-            ("ffn_so3_grid", True),  # SO(3) Wigner-D FFN grid on (now wired)
-            ("use_amp", True),  # pt-runtime-only switch: accepted and ignored
-            ("use_amp", False),
+            # Charge/spin condition embedding; default_chg_spin lets forward run
+            # without an explicit charge_spin input.
+            pytest.param(
+                {"add_chg_spin_ebd": True, "default_chg_spin": [0.5, -0.5]},
+                id="add_chg_spin_ebd",
+            ),
+            pytest.param({"so2_attn_res": "independent"}, id="so2_attn_res"),
+            pytest.param({"full_attn_res": "dependent"}, id="full_attn_res"),
+            pytest.param({"layer_scale": True}, id="layer_scale"),
+            pytest.param(
+                {"lebedev_quadrature": [False, False]}, id="lebedev_quadrature_off"
+            ),
+            pytest.param({"atten_v_proj": True}, id="atten_v_proj"),
+            pytest.param({"node_wise_so3": True}, id="node_wise_so3"),
+            pytest.param({"message_node_so3": True}, id="message_node_so3"),
+            pytest.param({"ffn_so3_grid": True}, id="ffn_so3_grid"),
         ],
     )
-    def test_supported_branches_construct(self, flag, value) -> None:
-        dd = make_descriptor(**{flag: value})
-        assert isinstance(dd, DescrptDPA4)
+    def test_supported_feature_roundtrip(self, overrides) -> None:
+        # Each flag enables a feature the migration now implements. Verify the
+        # key steps: forward is finite with the right shape, and the descriptor
+        # survives a serialize -> deserialize round-trip bit-exactly.
+        dd = make_descriptor(**overrides)
+        coord, atype, nlist = make_inputs()
+        nf, nloc = atype.shape
+        out1 = np.asarray(dd.call(coord.reshape(nf, -1), atype, nlist)[0])
+        assert out1.shape == (nf, nloc, dd.get_dim_out())
+        assert np.isfinite(out1).all()
+        dd2 = DescrptDPA4.deserialize(dd.serialize())
+        out2 = np.asarray(dd2.call(coord.reshape(nf, -1), atype, nlist)[0])
+        np.testing.assert_array_equal(out1, out2)
 
     def test_value_errors(self) -> None:
         with pytest.raises(ValueError):  # kmax must be <= lmax
diff --git a/source/tests/common/dpmodel/test_dpa4_basegridnet_cross.py b/source/tests/common/dpmodel/test_dpa4_basegridnet_cross.py
index 03d4e62987..b72bc6917e 100644
--- a/source/tests/common/dpmodel/test_dpa4_basegridnet_cross.py
+++ b/source/tests/common/dpmodel/test_dpa4_basegridnet_cross.py
@@ -126,28 +126,6 @@ def _run(net, query, context, backend):
     return np.asarray(out)
 
 
-@pytest.mark.parametrize("op_type", ["glu", "mlp", "branch"])  # grid operation
-def test_s2_self_regression(op_type) -> None:
-    """mode='self' S2GridNet still matches pt at 1e-12 (guards the self path)."""
-    lmax, n_focus, n_batch = 2, 2, 5
-    pt_net, dp_net = _build_nets(
-        mode="self", op_type=op_type, layout="ndfc", lmax=lmax, n_focus=n_focus
-    )
-    rng = np.random.default_rng(11)
-    query, context = _make_inputs(
-        mode="self",
-        layout="ndfc",
-        n_batch=n_batch,
-        lmax=lmax,
-        n_focus=n_focus,
-        channels=dp_net.channels,
-        rng=rng,
-    )
-    dp_out = _run(dp_net, query, context, "dp")
-    pt_out = _run(pt_net, query, context, "pt")
-    np.testing.assert_allclose(dp_out, pt_out, rtol=1e-12, atol=1e-12)
-
-
 @pytest.mark.parametrize("op_type", ["glu", "mlp", "branch"])  # grid operation
 def test_s2_cross_parity(op_type) -> None:
     """mode='cross' S2GridNet matches pt at 1e-12 (separate query/context)."""
diff --git a/source/tests/common/dpmodel/test_dpa4_frame_mixers.py b/source/tests/common/dpmodel/test_dpa4_frame_mixers.py
index 7432353a71..38498c84ec 100644
--- a/source/tests/common/dpmodel/test_dpa4_frame_mixers.py
+++ b/source/tests/common/dpmodel/test_dpa4_frame_mixers.py
@@ -121,41 +121,6 @@ def test_frame_expand_parity(lmax, channels, kmax) -> None:
     )
 
 
-@pytest.mark.parametrize("lmax,channels,kmax", _CASES)  # degree, channels, kmax
-def test_expand_then_contract_shapes(lmax, channels, kmax) -> None:
-    """Shape round-trip ``(N,D,F,C) -> expand -> (N,D,F,K*C) -> contract -> (N,D,F,C)``."""
-    n_frames = 2 * kmax + 1
-    coeff_dim = (lmax + 1) ** 2
-    n_batch, n_focus = 3, 2
-    rng = np.random.default_rng(404)
-
-    expand = DPFrameExpand(
-        lmax=lmax,
-        mmax=lmax,
-        coefficient_layout="packed",
-        n_frames=n_frames,
-        channels=channels,
-        precision="float64",
-        trainable=True,
-        seed=1,
-    )
-    contract = DPFrameContract(
-        lmax=lmax,
-        mmax=lmax,
-        coefficient_layout="packed",
-        n_frames=n_frames,
-        channels=channels,
-        precision="float64",
-        trainable=True,
-        seed=2,
-    )
-    coeff = rng.normal(size=(n_batch, coeff_dim, n_focus, channels))
-    expanded = expand.call(coeff)
-    assert expanded.shape == (n_batch, coeff_dim, n_focus, n_frames * channels)
-    contracted = contract.call(expanded)
-    assert contracted.shape == (n_batch, coeff_dim, n_focus, channels)
-
-
 @pytest.mark.parametrize("cls", [DPFrameContract, DPFrameExpand])  # mixer class
 def test_serialize_roundtrip(cls) -> None:
     """Serialize -> deserialize -> forward is identical; @version == 1."""
diff --git a/source/tests/common/dpmodel/test_dpa4_grid_descriptor.py b/source/tests/common/dpmodel/test_dpa4_grid_descriptor.py
index 92b46dbae6..23242a3185 100644
--- a/source/tests/common/dpmodel/test_dpa4_grid_descriptor.py
+++ b/source/tests/common/dpmodel/test_dpa4_grid_descriptor.py
@@ -70,7 +70,7 @@ def make_inputs(seed=5, nf=2, nloc=6, rcut=6.0, nnei=20, ntypes=2):
 def example_descriptor_kwargs(**overrides) -> dict:
     """Small example-config-like (examples/water/dpa4/input.json) descriptor block.
 
-    Sizes are shrunk (channels=8, sel=20, so2_layers=2) for fast fp64 parity,
+    Sizes are shrunk (channels=8, sel=20, mixing_layers=2) for fast fp64 parity,
     but the grid-relevant structure (lmax=3, mmax=1, n_focus=2, n_blocks=2,
     grid_branch=[1,1,1], ffn_so3_grid + message_node_so3) mirrors the flagship
     config.
@@ -84,7 +84,7 @@ def example_descriptor_kwargs(**overrides) -> dict:
         "lmax": 3,
         "mmax": 1,
         "n_blocks": 2,
-        "so2_layers": 2,
+        "mixing_layers": 2,
         "n_focus": 2,
         "focus_dim": 0,
         "ffn_so3_grid": True,
@@ -181,6 +181,7 @@ def test_masked_edge_noop() -> None:
     nf, nloc = atype.shape
     coord_ext = coord.reshape(nf, -1)
     out = np.asarray(dd.call(coord_ext, atype, nlist)[0])
+    assert np.abs(out).max() > 1e-6
 
     pad = -np.ones((nf, nloc, 1), dtype=nlist.dtype)
     nlist2 = np.concatenate([nlist, pad], axis=-1)
diff --git a/source/tests/common/dpmodel/test_dpa4_gridmlp_frames.py b/source/tests/common/dpmodel/test_dpa4_gridmlp_frames.py
index 19a4f979e2..d6c96e53d2 100644
--- a/source/tests/common/dpmodel/test_dpa4_gridmlp_frames.py
+++ b/source/tests/common/dpmodel/test_dpa4_gridmlp_frames.py
@@ -123,7 +123,7 @@ def test_gridmlp_parity(n_frames, mode) -> None:
     left = rng.normal(size=(n_batch, coeff_dim, n_focus, n_frames * channels))
     right = rng.normal(size=(n_batch, coeff_dim, n_focus, n_frames * channels))
 
-    dp_out = dp_mlp.call(left, right, to_grid=np_to_grid, from_grid=np_from_grid)
+    dp_out = dp_mlp.call(left, right, None, to_grid=np_to_grid, from_grid=np_from_grid)
     pt_out = pt_mlp(
         torch.from_numpy(left),
         torch.from_numpy(right),
@@ -211,7 +211,7 @@ def test_gridmlp_s2_regression(mode) -> None:
     right = rng.normal(size=(5, coeff_dim, n_focus, channels))
 
     dp_out = dp_mlp.call(
-        left, right, to_grid=dp_net._to_grid, from_grid=dp_net._from_grid
+        left, right, None, to_grid=dp_net._to_grid, from_grid=dp_net._from_grid
     )
     pt_out = pt_mlp(
         torch.from_numpy(left),
@@ -264,8 +264,8 @@ def test_gridmlp_serialize_roundtrip(mode) -> None:
 
     left = rng.normal(size=(n_batch, coeff_dim, n_focus, n_frames * channels))
     right = rng.normal(size=(n_batch, coeff_dim, n_focus, n_frames * channels))
-    out0 = mlp.call(left, right, to_grid=to_grid, from_grid=from_grid)
-    out1 = restored.call(left, right, to_grid=to_grid, from_grid=from_grid)
+    out0 = mlp.call(left, right, None, to_grid=to_grid, from_grid=from_grid)
+    out1 = restored.call(left, right, None, to_grid=to_grid, from_grid=from_grid)
     np.testing.assert_allclose(
         np.asarray(out0), np.asarray(out1), rtol=1e-12, atol=1e-12
     )
@@ -296,10 +296,11 @@ def test_gridmlp_torch_namespace(mode) -> None:
     left = rng.normal(size=(n_batch, coeff_dim, n_focus, n_frames * channels))
     right = rng.normal(size=(n_batch, coeff_dim, n_focus, n_frames * channels))
 
-    np_out = mlp.call(left, right, to_grid=to_grid, from_grid=from_grid)
+    np_out = mlp.call(left, right, None, to_grid=to_grid, from_grid=from_grid)
     torch_out = mlp.call(
         torch.from_numpy(left),
         torch.from_numpy(right),
+        None,
         to_grid=to_grid,
         from_grid=from_grid,
     )
diff --git a/source/tests/common/dpmodel/test_dpa4_lora.py b/source/tests/common/dpmodel/test_dpa4_lora.py
new file mode 100644
index 0000000000..54d4257394
--- /dev/null
+++ b/source/tests/common/dpmodel/test_dpa4_lora.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Torch-free tests for the dpmodel DPA4 (SeZM) LoRA fine-tune freeze policy."""
+
+from deepmd.dpmodel.descriptor.dpa4 import (
+    DescrptDPA4,
+)
+from deepmd.dpmodel.descriptor.dpa4_nn.lora import (
+    _iter_named_modules,
+    apply_lora_to_sezm,
+    has_lora,
+)
+
+
+def make_descriptor(**overrides) -> DescrptDPA4:
+    kwargs = {
+        "ntypes": 2,
+        "sel": 8,
+        "rcut": 4.0,
+        "channels": 16,
+        "n_radial": 8,
+        "lmax": 2,
+        "mmax": 1,
+        "n_blocks": 2,
+        "grid_branch": [1, 1, 1],
+        "s2_activation": [False, True],
+        "random_gamma": False,
+        "precision": "float64",
+        "seed": 7,
+    }
+    kwargs.update(overrides)
+    return DescrptDPA4(**kwargs)
+
+
+def test_apply_lora_marks_adapters_trainable() -> None:
+    # apply_lora freezes the pre-trained backbone and injects LoRASO3 / LoRASO2
+    # adapters.  The dpmodel tracks trainability per module, so every injected
+    # adapter module must be marked trainable for its low-rank delta to receive
+    # gradients.  Regression for the ``_UNFREEZE_LEAF_NAMES`` adapter entries:
+    # without them the adapters inherit ``trainable=False`` from the frozen base
+    # (the base is built frozen) and would stay frozen, so fine-tuning would be
+    # a no-op.
+    dd = make_descriptor()
+    apply_lora_to_sezm(dd, rank=2)
+    assert has_lora(dd)
+
+    modules = list(_iter_named_modules(dd))
+    adapters = [m for _name, m in modules if type(m).__name__ in ("LoRASO3", "LoRASO2")]
+    assert adapters, "apply_lora injected no LoRA adapter modules"
+    still_frozen = [m for m in adapters if not m.trainable]
+    assert not still_frozen, f"{len(still_frozen)} LoRA adapter module(s) left frozen"
+
+    # The pre-trained backbone is otherwise frozen: the type embedding carries a
+    # converged ``adam_type_embedding`` that ``apply_lora`` override-freezes, so
+    # the policy is a genuine freeze (not a trivial unfreeze-everything).
+    type_embeddings = [
+        m for _name, m in modules if type(m).__name__ == "SeZMTypeEmbedding"
+    ]
+    assert type_embeddings
+    assert all(not m.trainable for m in type_embeddings)
diff --git a/source/tests/common/dpmodel/test_dpa4_so2_grid.py b/source/tests/common/dpmodel/test_dpa4_so2_grid.py
index 4bf13567fe..64934bfbd1 100644
--- a/source/tests/common/dpmodel/test_dpa4_so2_grid.py
+++ b/source/tests/common/dpmodel/test_dpa4_so2_grid.py
@@ -48,7 +48,7 @@ def _base_kwargs(**overrides):
         "focus_dim": 0,
         "focus_compete": True,
         "so2_norm": False,
-        "so2_layers": 2,
+        "mixing_layers": 2,
         "so2_attn_res": "none",
         "layer_scale": False,
         "n_atten_head": 1,
@@ -235,14 +235,14 @@ def test_so2_message_node_so3_parity(masked, lmax, mmax) -> None:
 
 def test_so2_both_so3_parity() -> None:
     # node_wise_so3 + message_node_so3 together, example-config-like
-    # (lmax=3, mmax=1, n_focus=2, so2_layers=3, degree_channel radial).
+    # (lmax=3, mmax=1, n_focus=2, mixing_layers=3, degree_channel radial).
     pt_mod, dp_mod, kwargs = _build_conv_pair(
         node_wise_so3=True,
         message_node_so3=True,
         lmax=3,
         mmax=1,
         n_focus=2,
-        so2_layers=3,
+        mixing_layers=3,
         lebedev_quadrature=False,
     )
     _assert_conv_parity(pt_mod, dp_mod, kwargs)
@@ -298,19 +298,3 @@ def test_so2_no_grid_regression(masked) -> None:
     assert dp_mod.node_wise_grid_product is None
     assert dp_mod.message_node_grid_product is None
     _assert_conv_parity(pt_mod, dp_mod, kwargs, masked=masked)
-
-
-def test_so2_deserialize_rejects_drift_key() -> None:
-    """A drift key under a grid-product prefix fails deserialization loudly."""
-    pt_mod, _dp_mod, _kwargs = _build_conv_pair(
-        message_node_so3=True, lmax=3, mmax=1, lebedev_quadrature=False
-    )
-    data = pt_mod.serialize()
-    var_key = next(
-        k for k in data["@variables"] if k.startswith("message_node_grid_product.")
-    )
-    data["@variables"]["message_node_grid_product.__bogus__"] = data["@variables"][
-        var_key
-    ]
-    with pytest.raises(ValueError, match="message_node_grid_product"):
-        DPSO2Conv.deserialize(data)
diff --git a/source/tests/common/dpmodel/test_dpa4_so3_grid_utils.py b/source/tests/common/dpmodel/test_dpa4_so3_grid_utils.py
index 1e29eeb4e2..67f819fcbe 100644
--- a/source/tests/common/dpmodel/test_dpa4_so3_grid_utils.py
+++ b/source/tests/common/dpmodel/test_dpa4_so3_grid_utils.py
@@ -1,29 +1,17 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
-"""Parity tests for the DPA4 SO(3) grid utility functions.
+"""Parity tests for the DPA4 SO(3) grid resolver.
 
-Compares the dpmodel ports of ``resolve_so3_grid`` and ``_build_so3_frame_set``
-against the reference pt implementations.
+Checks that the dpmodel port of ``resolve_so3_grid`` matches the reference pt
+implementation and honors its documented quadrature-resolution contract.
 """
 
 import pytest
 
 from deepmd.dpmodel.descriptor.dpa4_nn.projection import (
-    _build_so3_frame_set,
     resolve_so3_grid,
 )
 
 
-@pytest.mark.parametrize("kmax", [0, 1, 2, 3])  # frame-index half-width
-def test_build_so3_frame_set(kmax) -> None:
-    from deepmd.pt.model.descriptor.sezm_nn.projection import (
-        _build_so3_frame_set as pt_build_so3_frame_set,
-    )
-
-    assert _build_so3_frame_set(kmax) == pt_build_so3_frame_set(kmax)
-    if kmax == 2:
-        assert _build_so3_frame_set(kmax) == [0, -1, 1, -2, 2]
-
-
 @pytest.mark.parametrize(
     "lmax,kmax",  # max angular momentum, frame-index half-width
     [(1, 1), (2, 1), (2, 2), (3, 1), (3, 2)],
@@ -74,8 +62,6 @@ def test_resolve_so3_grid_unpackaged_precision_raises() -> None:
 
 @pytest.mark.parametrize("kmax", [-1, -2])  # negative half-width is invalid
 def test_negative_kmax_raises(kmax) -> None:
-    """Both utilities reject negative kmax."""
-    with pytest.raises(ValueError, match="non-negative"):
-        _build_so3_frame_set(kmax)
+    """The resolver rejects negative kmax."""
     with pytest.raises(ValueError, match="non-negative"):
         resolve_so3_grid(2, kmax=kmax)
diff --git a/source/tests/common/dpmodel/test_dpa4_so3_gridnet.py b/source/tests/common/dpmodel/test_dpa4_so3_gridnet.py
index 2abbb80dbe..097992883d 100644
--- a/source/tests/common/dpmodel/test_dpa4_so3_gridnet.py
+++ b/source/tests/common/dpmodel/test_dpa4_so3_gridnet.py
@@ -13,8 +13,6 @@
 pinned to CPU (``.to("cpu")``) under the CUDA-default-device CI.
 """
 
-import copy
-
 import array_api_compat
 import numpy as np
 import pytest
@@ -330,7 +328,8 @@ def test_so3_serialize_roundtrip(mode, op_type) -> None:
 
     data = dp_net.serialize()
     assert data["@version"] == 1
-    assert data["config"]["projector"]["@class"] == "SO3GridProjector"
+    assert data["@class"] == "SO3GridNet"
+    assert data["config"]["kmax"] == kmax
     assert "residual_scale" in data["@variables"]
     if mode == "cross":
         assert "frame_expand.weight" in data["@variables"]
@@ -439,10 +438,9 @@ def test_s2_regression(mode) -> None:
 def test_so3_cross_mixed_precision_runs() -> None:
     """fp32 inputs through an fp64 SO3GridNet cross net run cleanly.
 
-    ``_FrameMixer`` casts its weights to the operand dtype, so operands are
-    lifted to compute precision before frame expansion (matching pt's fp64
-    FrameExpand); the mixed-precision path must run and stay close to the
-    fp64-input result.
+    ``call`` lifts the operands to the fp64 compute precision (and
+    ``FrameExpand`` holds fp64 weights), so the mixed-precision path must run
+    and stay close to the all-fp64 result while returning the fp32 input dtype.
     """
     _pt, dp_net = _build_so3_nets(
         mode="cross", op_type="glu", layout="ndfc", precision="float64"
@@ -457,21 +455,3 @@ def test_so3_cross_mixed_precision_runs() -> None:
     out64 = np.asarray(dp_net.call(query, context))
     assert np.all(np.isfinite(out32))
     np.testing.assert_allclose(out32, out64, rtol=1e-4, atol=1e-4)
-
-
-def test_so3_deserialize_rejects_bad_projector() -> None:
-    """SO3GridNet.deserialize validates the nested projector @class/@version."""
-    _pt, dp_net = _build_so3_nets(mode="self", op_type="glu", layout="ndfc")
-    data = dp_net.serialize()
-    bad_class = copy.deepcopy(data)
-    bad_class["config"]["projector"]["@class"] = "S2GridProjector"
-    with pytest.raises(ValueError, match="projector"):
-        DPSO3GridNet.deserialize(bad_class)
-    bad_ver = copy.deepcopy(data)
-    bad_ver["config"]["projector"]["@version"] = 99
-    with pytest.raises(ValueError, match="version"):
-        DPSO3GridNet.deserialize(bad_ver)
-    missing_ver = copy.deepcopy(data)
-    del missing_ver["config"]["projector"]["@version"]
-    with pytest.raises(ValueError, match="@version"):
-        DPSO3GridNet.deserialize(missing_ver)
diff --git a/source/tests/common/dpmodel/test_dpa4_so3_projector.py b/source/tests/common/dpmodel/test_dpa4_so3_projector.py
index 01274f134c..8aa2a69ec5 100644
--- a/source/tests/common/dpmodel/test_dpa4_so3_projector.py
+++ b/source/tests/common/dpmodel/test_dpa4_so3_projector.py
@@ -68,14 +68,21 @@ def test_projection_matrices_match_pt(lmax, kmax, mmax) -> None:
 
 @pytest.mark.parametrize("lmax", [1, 2, 3, 4, 5, 6])  # max degree
 def test_roundtrip_preserves_legal_frame_coeffs(lmax) -> None:
-    """Project legal-frame coefficients to grid and back; recovery to 1e-12."""
+    """Project legal-frame coefficients to grid and back; recovery to 1e-11.
+
+    The round-trip is a chain of Wigner-D / Lebedev-quadrature matrix products,
+    so the float64 recovery residual grows with the coefficient count and hence
+    with ``lmax``; at ``lmax=6`` it sits at ~1e-12. A tolerance of 1e-11 keeps
+    an order-of-magnitude margin over that floor while still asserting recovery
+    to eleven significant digits.
+    """
     rng = np.random.default_rng(8100 + lmax)
     projector = SO3GridProjector(lmax=lmax, kmax=1, precision="float64")
     x = rng.standard_normal((2, projector.coeff_dim, 2)).astype(np.float64)
     mask = _legal_so3_frame_mask(projector)
     x[:, ~mask, :] = 0.0
     y = projector.from_grid(projector.to_grid(x))
-    np.testing.assert_allclose(y[:, mask, :], x[:, mask, :], atol=1e-12, rtol=1e-12)
+    np.testing.assert_allclose(y[:, mask, :], x[:, mask, :], atol=1e-11, rtol=1e-11)
     assert float(np.max(np.abs(y[:, ~mask, :]))) < 1e-14
 
 
diff --git a/source/tests/common/dpmodel/test_dpa4_sparse_edges.py b/source/tests/common/dpmodel/test_dpa4_sparse_edges.py
new file mode 100644
index 0000000000..edf6107b54
--- /dev/null
+++ b/source/tests/common/dpmodel/test_dpa4_sparse_edges.py
@@ -0,0 +1,167 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Layout-agnostic edge-aggregation regression test for the dpmodel DPA4.
+
+The standard ``call`` path consumes a padded edge cache (``E = n_nodes * nnei``
+with ``dst == repeat(arange(n_nodes), nnei)``), while ``call_with_edges``
+consumes an arbitrary sparse edge list. The destination-wise aggregations
+(geometric initial embedding, environment initial embedding, and the attention
+softmax) are scatter reductions over ``dst``, so both layouts must yield the
+same descriptor for the same physical edges. These tests feed the sparse path
+the padded path's valid edges -- once in row-major order and once permuted into
+an arbitrary order with a non-uniform per-node degree -- and assert the two
+descriptors agree. They are the regression guard for the scatter-by-``dst``
+aggregation.
+"""
+
+import numpy as np
+
+from deepmd.dpmodel.descriptor.dpa4 import (
+    DescrptDPA4,
+)
+
+
+def build_neighbor_list_np(coord, rcut, nnei):
+    """Build a padded, distance-sorted gas-phase neighbor list (no PBC).
+
+    Parameters
+    ----------
+    coord
+        Coordinates with shape (nf, nloc, 3).
+    rcut
+        Cutoff radius.
+    nnei
+        Number of neighbor slots; pads with -1.
+
+    Returns
+    -------
+    np.ndarray
+        Neighbor list with shape (nf, nloc, nnei) holding local indices.
+    """
+    nf, nloc, _ = coord.shape
+    nlist = -np.ones((nf, nloc, nnei), dtype=np.int64)
+    for f in range(nf):
+        dist = np.linalg.norm(coord[f][:, None, :] - coord[f][None, :, :], axis=-1)
+        for i in range(nloc):
+            neighbors = [
+                (dist[i, j], j) for j in range(nloc) if j != i and dist[i, j] < rcut
+            ]
+            neighbors.sort()
+            for slot, (_, j) in enumerate(neighbors[:nnei]):
+                nlist[f, i, slot] = j
+    return nlist
+
+
+def build_sparse_edges_from_nlist(coord, nlist):
+    """Extract the valid physical edges of a padded neighbor list.
+
+    The padded layout keeps one slot per neighbor (``-1`` marks padding). The
+    sparse contract for :meth:`DescrptDPA4.call_with_edges` is one explicit edge
+    per kept slot, indexing the flattened frame-major node axis
+    (``node = f * nloc + i``). The edge vector points from the center toward the
+    neighbor, matching the padded path's ``r_j - r_i``.
+
+    Parameters
+    ----------
+    coord
+        Coordinates with shape (nf, nloc, 3).
+    nlist
+        Neighbor list with shape (nf, nloc, nnei); -1 marks padding.
+
+    Returns
+    -------
+    tuple[np.ndarray, np.ndarray]
+        ``edge_index`` with shape (2, E) (rows are src, dst) and ``edge_vec``
+        with shape (E, 3), aligned on the same edge axis in row-major
+        ``(frame, center, slot)`` order.
+    """
+    nf, nloc, nnei = nlist.shape
+    src, dst, vec = [], [], []
+    for f in range(nf):
+        for i in range(nloc):
+            for s in range(nnei):
+                j = int(nlist[f, i, s])
+                if j < 0:
+                    continue
+                src.append(f * nloc + j)
+                dst.append(f * nloc + i)
+                vec.append(coord[f, j] - coord[f, i])
+    edge_index = np.asarray([src, dst], dtype=np.int64)  # (2, E)
+    edge_vec = np.asarray(vec, dtype=np.float64)  # (E, 3)
+    return edge_index, edge_vec
+
+
+def make_descriptor() -> DescrptDPA4:
+    return DescrptDPA4(
+        ntypes=3,
+        sel=8,
+        rcut=4.0,
+        channels=16,
+        n_radial=8,
+        lmax=2,
+        mmax=1,
+        n_blocks=2,
+        precision="float64",
+        seed=7,
+        random_gamma=False,
+    )
+
+
+def make_inputs(seed=7, nf=2, nloc=6, rcut=4.0, nnei=8, ntypes=3):
+    rng = np.random.default_rng(seed)
+    coord = rng.uniform(0.0, 3.5, size=(nf, nloc, 3))
+    atype = rng.integers(0, ntypes, size=(nf, nloc))
+    nlist = build_neighbor_list_np(coord, rcut, nnei)
+    return coord, atype, nlist
+
+
+def _run_sparse(dd, coord, atype, edge_index, edge_vec):
+    nf = atype.shape[0]
+    edge_mask = np.ones(edge_index.shape[1], dtype=bool)
+    return np.asarray(
+        dd.call_with_edges(
+            coord_ext=coord,
+            atype_ext=atype,
+            edge_index=edge_index,
+            edge_vec=edge_vec,
+            edge_mask=edge_mask,
+        )[0]
+    )
+
+
+def test_sparse_edges_match_padded_rowmajor() -> None:
+    # Row-major sparse edges reproduce the padded scatter order exactly: the
+    # masked padding slots of the padded path contribute zero, so dropping them
+    # leaves the destination accumulation order unchanged.
+    dd = make_descriptor()
+    coord, atype, nlist = make_inputs()
+    nf, nloc = atype.shape
+
+    out_pad = np.asarray(dd.call(coord.reshape(nf, -1), atype, nlist)[0])
+    edge_index, edge_vec = build_sparse_edges_from_nlist(coord, nlist)
+    out_sparse = _run_sparse(dd, coord, atype, edge_index, edge_vec)
+
+    assert out_sparse.shape == out_pad.shape == (nf, nloc, dd.get_dim_out())
+    assert np.isfinite(out_sparse).all()
+    np.testing.assert_allclose(out_sparse, out_pad, rtol=1e-10, atol=1e-12)
+
+
+def test_sparse_edges_match_padded_permuted() -> None:
+    # Permuted sparse edges exercise an arbitrary (non-row-major) ``dst`` order
+    # and a non-uniform per-node degree. The destination scatter reductions are
+    # order-agnostic, so the descriptor must still match the padded path within
+    # float64 reassociation tolerance.
+    dd = make_descriptor()
+    coord, atype, nlist = make_inputs()
+    nf, nloc = atype.shape
+
+    out_pad = np.asarray(dd.call(coord.reshape(nf, -1), atype, nlist)[0])
+
+    edge_index, edge_vec = build_sparse_edges_from_nlist(coord, nlist)
+    perm = np.random.default_rng(31).permutation(edge_index.shape[1])
+    edge_index = edge_index[:, perm]
+    edge_vec = edge_vec[perm]
+    out_sparse = _run_sparse(dd, coord, atype, edge_index, edge_vec)
+
+    assert out_sparse.shape == out_pad.shape == (nf, nloc, dd.get_dim_out())
+    assert np.isfinite(out_sparse).all()
+    np.testing.assert_allclose(out_sparse, out_pad, rtol=1e-10, atol=1e-12)
diff --git a/source/tests/common/dpmodel/test_lebedev_sh.py b/source/tests/common/dpmodel/test_lebedev_sh.py
index 5b7a448ed1..393316751f 100644
--- a/source/tests/common/dpmodel/test_lebedev_sh.py
+++ b/source/tests/common/dpmodel/test_lebedev_sh.py
@@ -17,31 +17,6 @@ def test_rule_basic(self, precision):
         np.testing.assert_allclose(np.linalg.norm(pts, axis=1), 1.0, rtol=1e-12)
         np.testing.assert_allclose(wts.sum(), 1.0, rtol=1e-12)
 
-    def test_unpackaged_precision_raises(self):
-        with pytest.raises(ValueError, match="not packaged"):
-            lebedev_module.load_lebedev_rule(4)
-
-    @pytest.mark.parametrize("precision", [3.5, 11.0, "11", None])  # non-integers
-    def test_non_integer_precision_raises(self, precision):
-        with pytest.raises(TypeError, match="integer"):
-            lebedev_module.load_lebedev_rule(precision)
-
-    def test_missing_data_file_raises(self, monkeypatch, tmp_path):
-        monkeypatch.setattr(lebedev_module, "LEBEDEV_RULES_FILE", tmp_path / "nope.npz")
-        with pytest.raises(FileNotFoundError, match="missing"):
-            lebedev_module.load_lebedev_rule(11)
-
-    def test_pt_loader_matches(self):
-        torch = pytest.importorskip("torch")
-        from deepmd.pt.model.descriptor.sezm_nn.lebedev import (
-            load_lebedev_rule as pt_rule,
-        )
-
-        pts, wts = lebedev_module.load_lebedev_rule(11)
-        tpts, twts = pt_rule(11, dtype=torch.float64, device="cpu")
-        np.testing.assert_allclose(pts, tpts.numpy(), rtol=0, atol=0)
-        np.testing.assert_allclose(wts, twts.numpy(), rtol=0, atol=0)
-
 
 class TestRealSphericalHarmonics:
     @pytest.mark.parametrize("lmax", [0, 1, 2, 3, 4, 6])  # maximum angular degree
@@ -68,38 +43,6 @@ def test_matches_e3nn(self, lmax):
         assert out.dtype == np.float64
         np.testing.assert_allclose(out, ref, rtol=1e-12, atol=1e-13)
 
-    @pytest.mark.parametrize("lmax", [2, 4])  # maximum angular degree
-    def test_scale_invariance(self, lmax):
-        # normalize=True in the e3nn call: input vectors are normalized
-        # internally, so non-unit inputs must give identical output.
-        rng = np.random.default_rng(1)
-        v = rng.standard_normal((32, 3))
-        v /= np.linalg.norm(v, axis=1, keepdims=True)
-        scale = rng.uniform(0.1, 10.0, size=(32, 1))
-        np.testing.assert_allclose(
-            real_spherical_harmonics(v * scale, lmax),
-            real_spherical_harmonics(v, lmax),
-            rtol=1e-12,
-            atol=1e-14,
-        )
-
-    def test_batched_leading_dims(self):
-        rng = np.random.default_rng(2)
-        v = rng.standard_normal((4, 5, 3))
-        out = real_spherical_harmonics(v, 3)
-        assert out.shape == (4, 5, 16)
-        flat = real_spherical_harmonics(v.reshape(-1, 3), 3)
-        np.testing.assert_allclose(out.reshape(-1, 16), flat, rtol=0, atol=0)
-
-    # e3nn-free convention pin: the l=1 block under normalization="norm"
-    # is exactly the unit input vector in (x, y, z) order (m = -1, 0, +1).
-    def test_l1_block_is_unit_vector(self):
-        rng = np.random.default_rng(3)
-        v = rng.standard_normal((128, 3))
-        v /= np.linalg.norm(v, axis=1, keepdims=True)
-        out = real_spherical_harmonics(v, 1)
-        np.testing.assert_allclose(out[:, 1:4], v, rtol=1e-12, atol=1e-14)
-
     def test_basis_vectors_lmax2(self):
         # e3nn-free convention pin: analytic SH values at lmax=2 for the
         # Cartesian basis vectors. Cross-checked against
@@ -116,51 +59,6 @@ def test_basis_vectors_lmax2(self):
             out = real_spherical_harmonics(np.array([vec]), 2)
             np.testing.assert_allclose(out[0], ref, rtol=1e-12, atol=1e-15)
 
-    @pytest.mark.filterwarnings("error")
-    @pytest.mark.parametrize("vecs", [1.0, np.float64(2.0), [1.0, 0.0]])  # bad shapes
-    def test_invalid_input_shape_raises(self, vecs):
-        # scalar/0-d inputs and wrong last-axis sizes must raise, not crash
-        with pytest.raises(ValueError, match="shape"):
-            real_spherical_harmonics(vecs, 2)
-
-    def test_negative_lmax_raises(self):
-        with pytest.raises(ValueError, match="lmax"):
-            real_spherical_harmonics(np.zeros((4, 3)), -1)
-
-    def test_zero_vector(self):
-        # e3nn's normalize=True clamps the norm, so a zero vector maps to
-        # [Y00, 0, 0, ...] = [1, 0, ...]. Verified against
-        # e3nn.o3.spherical_harmonics([0, 1, 2], zeros, normalize=True,
-        # normalization="norm") -> [1, 0, 0, 0, 0, 0, 0, 0, 0].
-        expected = np.zeros(9)
-        expected[0] = 1.0
-        with np.errstate(invalid="raise", divide="raise"):
-            out = real_spherical_harmonics(np.zeros((1, 3)), 2)
-        np.testing.assert_allclose(out[0], expected, rtol=0, atol=0)
-
-    @pytest.mark.filterwarnings("error")
-    def test_zero_vector_mixed_batch(self):
-        # batch mixing zero and unit vectors: zero rows give [1, 0, ...],
-        # nonzero rows are unaffected by the zero-vector guard
-        rng = np.random.default_rng(4)
-        v = rng.standard_normal((6, 3))
-        v /= np.linalg.norm(v, axis=1, keepdims=True)
-        v[1] = 0.0
-        v[4] = 0.0
-        with np.errstate(invalid="raise", divide="raise"):
-            out = real_spherical_harmonics(v, 2)
-        expected_zero = np.zeros(9)
-        expected_zero[0] = 1.0
-        for i in (1, 4):
-            np.testing.assert_allclose(out[i], expected_zero, rtol=0, atol=0)
-        nonzero = [0, 2, 3, 5]
-        np.testing.assert_allclose(
-            out[nonzero],
-            real_spherical_harmonics(v[nonzero], 2),
-            rtol=0,
-            atol=0,
-        )
-
     def test_quadrature_orthogonality(self):
         lmax = 3
         pts, wts = lebedev_module.load_lebedev_rule(2 * lmax + 1)
diff --git a/source/tests/pt/model/test_descriptor_sezm.py b/source/tests/pt/model/test_descriptor_sezm.py
index 1f155f1b40..093ef2b8d0 100644
--- a/source/tests/pt/model/test_descriptor_sezm.py
+++ b/source/tests/pt/model/test_descriptor_sezm.py
@@ -10,16 +10,21 @@
 )
 from deepmd.pt.model.descriptor.sezm_nn import (
     DynamicRadialDegreeMixer,
+    EdgeCartesianTensorProduct,
     ForceEmbedding,
     InnerClamp,
+    NodeCartesianTensorProduct,
     SeZMDirectForceHead,
     SO2Linear,
     WignerDCalculator,
+    build_cartesian_basis,
+    build_edge_cartesian_tensors,
     build_edge_quaternion,
     build_gie_zonal_index,
     build_m_major_l_index,
     quaternion_multiply,
     quaternion_to_rotation_matrix,
+    safe_norm,
 )
 from deepmd.pt.model.model import (
     get_sezm_model,
@@ -155,6 +160,106 @@ def _assert_forward_backward_smoke(self, **model_kwargs) -> DescrptSeZM:
         self.assertTrue(torch.all(torch.isfinite(extended_coord.grad)))
         return model
 
+    def test_cartesian_config_wiring(self) -> None:
+        """Each Cartesian/mixing config builds the intended submodules.
+
+        Guards against silent fallback: a mis-wired flag would still run and stay
+        equivariant, so the structural assertions are the only safeguard that the
+        requested path is actually taken.
+        """
+        # edge_cartesian replaces the SO(2) core and elides the full Wigner-D.
+        edge_model = self._assert_forward_backward_smoke(
+            **_descriptor_kwargs(
+                l_schedule=[2, 1], edge_cartesian=True, channels=4, n_focus=2
+            )
+        )
+        self.assertFalse(edge_model._need_full_wigner)
+        for block in edge_model.blocks:
+            conv = block.so2_conv
+            self.assertTrue(conv.edge_cartesian)
+            self.assertTrue(hasattr(conv, "edge_cartesian_tp"))
+            self.assertFalse(hasattr(conv, "so2_linears"))
+
+        # node_cartesian adds a per-node product on top of the SO(2) core, leaving
+        # the per-edge message path (and the full Wigner-D) intact.
+        node_model = self._assert_forward_backward_smoke(
+            **_descriptor_kwargs(
+                l_schedule=[2, 1], node_cartesian="parity:2", channels=4, n_focus=2
+            )
+        )
+        self.assertTrue(node_model._need_full_wigner)
+        for block in node_model.blocks:
+            tp = block.so2_conv.node_cartesian_tp
+            self.assertIsNotNone(tp)
+            self.assertTrue(tp.symmetric)
+            self.assertEqual(tp.n_layers, 2)
+
+        # mixing_layers=0 without a degree mixer skips the edge-aligned frame.
+        radial_model = self._assert_forward_backward_smoke(
+            **_descriptor_kwargs(
+                l_schedule=[2, 1],
+                mixing_layers=0,
+                radial_so2_mode="none",
+                channels=4,
+                n_focus=2,
+            )
+        )
+        for block in radial_model.blocks:
+            conv = block.so2_conv
+            self.assertFalse(conv.needs_local_frame)
+            self.assertEqual(len(conv.so2_linears), 0)
+
+    def test_cartesian_rotation_invariance(self) -> None:
+        """Descriptor scalar output is rotation-invariant across Cartesian modes.
+
+        This end-to-end check covers every per-edge and per-node path introduced
+        by the Cartesian options, so it also guards the edge/node engine
+        equivariance within the full pipeline.
+        """
+        dtype = torch.float64
+        coord = torch.tensor(
+            [[0.0, 0.0, 0.0], [0.9, 0.2, 0.1], [0.1, 1.0, 0.3]],
+            dtype=dtype,
+            device=self.device,
+        ).view(1, -1, 3)
+        atype = torch.tensor([[0, 1, 1]], dtype=torch.int32, device=self.device)
+        nlist = torch.tensor(
+            [[[1, 2], [0, 2], [0, 1]]], dtype=torch.int64, device=self.device
+        )
+        rot = quaternion_to_rotation_matrix(
+            _random_quaternion(1, device=self.device, dtype=dtype)
+        )[0]
+        configs = {
+            "edge": {"edge_cartesian": True},
+            "node_default": {"node_cartesian": "default:1"},
+            "node_parity": {"node_cartesian": "parity:2"},
+            "edge_and_node": {"edge_cartesian": True, "node_cartesian": "parity:1"},
+            "mixing0_radial": {"mixing_layers": 0, "radial_so2_mode": "none"},
+            "mixing0_degree": {"mixing_layers": 0, "radial_so2_mode": "degree_channel"},
+        }
+        for name, override in configs.items():
+            with self.subTest(config=name):
+                model = DescrptSeZM(
+                    **_descriptor_kwargs(
+                        rcut=3.0,
+                        sel=[2, 2],
+                        l_schedule=[2, 2],
+                        channels=4,
+                        n_focus=2,
+                        n_atten_head=1,
+                        precision="float64",
+                        use_amp=False,
+                        random_gamma=False,
+                        **override,
+                    )
+                )
+                model.eval()
+                with torch.no_grad():
+                    desc, *_ = model(coord.reshape(1, -1), atype, nlist)
+                    coord_rot = (rot @ coord.reshape(-1, 3).T).T.reshape(1, -1)
+                    desc_rot, *_ = model(coord_rot, atype, nlist)
+                torch.testing.assert_close(desc, desc_rot, atol=1e-10, rtol=1e-10)
+
     def test_so3_readout_empty_edge_shrinking_schedule(self) -> None:
         """so3_readout glu/mlp must handle the empty-edge path.
 
@@ -499,6 +604,18 @@ def test_serialization_deserialization(self) -> None:
                 radial_so2_mode="degree_channel",
                 radial_so2_rank=2,
             ),
+            "cartesian": _descriptor_kwargs(
+                precision="float32",
+                l_schedule=[2, 1],
+                edge_cartesian=True,
+                channels=4,
+                n_focus=2,
+                focus_dim=0,
+                so2_layers=2,
+                n_radial=3,
+                radial_mlp=[6],
+                ffn_neurons=8,
+            ),
         }
         dtype = PRECISION_DICT["float32"]
         for case_name, model_kwargs in cases.items():
@@ -1106,6 +1223,146 @@ def test_dynamic_radial_degree_mixer_equivariance(self) -> None:
                 torch.testing.assert_close(lhs, rhs, atol=1e-5, rtol=1e-5)
 
 
+class TestCartesianTensorProduct(_SeZMTestCase):
+    """Test the Cartesian rank-2 tensor-product building blocks."""
+
+    def test_basis_intertwines_wigner(self) -> None:
+        """cart(D @ sh) == R cart(sh) R^T links Wigner-D to Cartesian rotation.
+
+        Guards the hand-written sign/ordering convention of
+        ``build_cartesian_basis``: the change of basis must intertwine the packed
+        Wigner-D rotation with the Cartesian conjugation ``X -> R X R^T``.
+        """
+        for lmax in (1, 2):
+            dim = (lmax + 1) ** 2
+            basis = build_cartesian_basis(lmax, dtype=torch.float64, device=self.device)
+            wigner = WignerDCalculator(lmax=lmax, dtype=torch.float64).to(self.device)
+            quat = _random_quaternion(6, device=self.device, dtype=torch.float64)
+            d_full, _ = wigner(quat)
+            rot = quaternion_to_rotation_matrix(quat)
+            sh = torch.randn(6, dim, dtype=torch.float64, device=self.device)
+            cart = torch.einsum("bd,dij->bij", sh, basis)
+            lhs = torch.einsum(
+                "bd,dij->bij", torch.einsum("bde,be->bd", d_full, sh), basis
+            )
+            rhs = rot @ cart @ rot.transpose(-1, -2)
+            torch.testing.assert_close(lhs, rhs, atol=1e-9, rtol=1e-9)
+
+    def test_edge_cartesian_matches_dense_reference(self) -> None:
+        """Channel-shared edge evaluation equals the naive per-channel ``Y @ T_e``.
+
+        Covers the ``mixing_layers > 0`` stack and the ``mixing_layers == 0``
+        single-modulation path; equivariance of the edge path is covered
+        end-to-end by ``test_cartesian_rotation_invariance``.
+        """
+        for lmax in (1, 2):
+            for n_layers in (0, 3):
+                dim = (lmax + 1) ** 2
+                n_focus, focus_dim, n_edge = 2, 4, 16
+                width = n_focus * focus_dim
+                engine = EdgeCartesianTensorProduct(
+                    lmax=lmax,
+                    focus_dim=focus_dim,
+                    n_focus=n_focus,
+                    n_layers=n_layers,
+                    activation_function="silu",
+                    mlp_bias=True,
+                    eps=1e-7,
+                    dtype=torch.float64,
+                    seed=7,
+                    trainable=True,
+                ).to(self.device)
+                x = torch.randn(
+                    n_edge, dim, width, dtype=torch.float64, device=self.device
+                )
+                edge_vec = torch.randn(
+                    n_edge, 3, dtype=torch.float64, device=self.device
+                )
+                rad = torch.randn(
+                    n_edge, lmax + 1, width, dtype=torch.float64, device=self.device
+                )
+                out = engine(x, edge_vec, rad)
+                ref = self._dense_cartesian_reference(engine, x, edge_vec, rad)
+                torch.testing.assert_close(out, ref, atol=1e-11, rtol=1e-11)
+
+    @staticmethod
+    def _dense_cartesian_reference(
+        engine: EdgeCartesianTensorProduct,
+        x: torch.Tensor,
+        edge_vec: torch.Tensor,
+        rad: torch.Tensor,
+    ) -> torch.Tensor:
+        """Reference Cartesian product via explicit per-channel ``Y @ T_e``."""
+        n_edge = x.shape[0]
+        f, cf = engine.n_focus, engine.focus_dim
+        basis = build_cartesian_basis(engine.lmax, dtype=x.dtype, device=x.device)
+        r_hat = edge_vec / safe_norm(edge_vec, engine.eps)
+        a0, s0 = build_edge_cartesian_tensors(r_hat)
+        eye = torch.eye(3, dtype=x.dtype, device=x.device) / math.sqrt(3.0)
+        a0 = a0 / math.sqrt(2.0)
+        f_iso = rad[:, 0, :].reshape(n_edge, f, cf, 1, 1)
+        f_aniso = rad[:, 1, :].reshape(n_edge, f, cf, 1, 1)
+        t_e = f_iso * eye + f_aniso * a0[:, None, None, :, :]
+        if engine.lmax == 2:
+            s0 = s0 / math.sqrt(2.0 / 3.0)
+            f_sym = rad[:, 2, :].reshape(n_edge, f, cf, 1, 1)
+            t_e = t_e + f_sym * s0[:, None, None, :, :]
+
+        def modulate(coeff: torch.Tensor) -> torch.Tensor:
+            cart = torch.einsum("edfc,dij->efcij", coeff, basis)
+            return torch.einsum("efcij,dij->edfc", torch.matmul(cart, t_e), basis)
+
+        h = x.reshape(n_edge, engine.ebed_dim, f, cf)
+        if engine.n_layers == 0:
+            # Single modulation ``x @ T_e`` with no learnable channel-mixing layer.
+            return modulate(h).reshape(n_edge, engine.ebed_dim, f * cf)
+        for linear, activation in zip(engine.linears, engine.activations, strict=True):
+            h = h + activation(modulate(linear(h)))
+        return h.reshape(n_edge, engine.ebed_dim, f * cf)
+
+    def test_node_engine_rotation_equivariance(self) -> None:
+        """NodeCartesianTensorProduct commutes with a global Wigner-D rotation."""
+        for lmax in (1, 2):
+            for symmetric in (False, True):
+                dim = (lmax + 1) ** 2
+                n_focus, focus_dim, n_node = 2, 4, 16
+                width = n_focus * focus_dim
+                engine = NodeCartesianTensorProduct(
+                    lmax=lmax,
+                    focus_dim=focus_dim,
+                    n_focus=n_focus,
+                    n_layers=3,
+                    symmetric=symmetric,
+                    activation_function="silu",
+                    mlp_bias=True,
+                    dtype=torch.float64,
+                    seed=7,
+                    trainable=True,
+                ).to(self.device)
+                message = torch.randn(
+                    n_node, dim, width, dtype=torch.float64, device=self.device
+                )
+                node = torch.randn(
+                    n_node, dim, width, dtype=torch.float64, device=self.device
+                )
+                wigner = WignerDCalculator(lmax=lmax, dtype=torch.float64).to(
+                    self.device
+                )
+                quat = _random_quaternion(1, device=self.device, dtype=torch.float64)
+                d_mat = wigner(quat)[0][0]  # (D, D), one shared global rotation
+                out = engine(message, node)
+                out_rot = engine(
+                    torch.einsum("ij,njc->nic", d_mat, message),
+                    torch.einsum("ij,njc->nic", d_mat, node),
+                )
+                torch.testing.assert_close(
+                    out_rot,
+                    torch.einsum("ij,njc->nic", d_mat, out),
+                    atol=1e-9,
+                    rtol=1e-9,
+                )
+
+
 class TestInnerClamp(_SeZMTestCase):
     """Test InnerClamp C3-continuous septic Hermite clamping."""
 
@@ -1274,6 +1531,7 @@ def _build_model_params(
         *,
         use_amp: bool,
         n_focus: int = 1,
+        edge_cartesian: bool = False,
         bridging_method: str = "none",
         bridging_r_inner: float = 0.8,
         bridging_r_outer: float = 1.2,
@@ -1293,7 +1551,8 @@ def _build_model_params(
                 "n_radial": 6,
                 "radial_mlp": [16],
                 "use_env_seed": True,
-                "l_schedule": [1, 0],
+                "edge_cartesian": edge_cartesian,
+                "l_schedule": [2, 1] if edge_cartesian else [1, 0],
                 "mmax": 1,
                 "so2_norm": False,
                 "so2_layers": 1,
@@ -1349,6 +1608,7 @@ def _build_random_weight_model(
         *,
         use_amp: bool,
         n_focus: int = 1,
+        edge_cartesian: bool = False,
         bridging_method: str = "none",
         bridging_r_inner: float = 0.8,
         bridging_r_outer: float = 1.2,
@@ -1359,6 +1619,7 @@ def _build_random_weight_model(
                 n_atten_head,
                 use_amp=use_amp,
                 n_focus=n_focus,
+                edge_cartesian=edge_cartesian,
                 bridging_method=bridging_method,
                 bridging_r_inner=bridging_r_inner,
                 bridging_r_outer=bridging_r_outer,
@@ -1492,12 +1753,14 @@ def _assert_cutoff_near_energy_curve_is_smooth(
         *,
         use_amp: bool,
         n_focus: int,
+        edge_cartesian: bool = False,
     ) -> None:
         """Check that the non-bridged near-cutoff probe keeps one smooth extremum."""
         model = self._build_random_weight_model(
             n_atten_head,
             use_amp=use_amp,
             n_focus=n_focus,
+            edge_cartesian=edge_cartesian,
         )
         displacements, energies = self._scan_total_energy_curve(
             model,
@@ -1536,6 +1799,18 @@ def test_scaled_cutoff_near_energy_curve_is_smooth_across_attention_modes(
                             n_focus=n_focus,
                         )
 
+    def test_cartesian_near_cutoff_energy_curve_is_smooth(self) -> None:
+        """The Cartesian path keeps a single smooth near-cutoff PES extremum."""
+        for n_atten_head in (0, 1):
+            for n_focus in (1, 2):
+                with self.subTest(n_atten_head=n_atten_head, n_focus=n_focus):
+                    self._assert_cutoff_near_energy_curve_is_smooth(
+                        n_atten_head,
+                        use_amp=False,
+                        n_focus=n_focus,
+                        edge_cartesian=True,
+                    )
+
     def _assert_bridging_force_consistent_across_switch(
         self,
         model: torch.nn.Module,
diff --git a/source/tests/pt/model/test_dpa4_dpmodel_parity.py b/source/tests/pt/model/test_dpa4_dpmodel_parity.py
index 8009f81bc0..128073bb60 100644
--- a/source/tests/pt/model/test_dpa4_dpmodel_parity.py
+++ b/source/tests/pt/model/test_dpa4_dpmodel_parity.py
@@ -41,6 +41,28 @@ def pt_state_to_numpy(module: torch.nn.Module) -> dict[str, np.ndarray]:
     return {k: v.detach().cpu().numpy() for k, v in module.state_dict().items()}
 
 
+# The pt SO(2) convolution registers persistent buffers for the derived
+# m-major index / inverse-rotation rescale tables, and the descriptor carries a
+# scalar ``_empty_tensor`` placeholder. These are non-learnable constants that
+# the dpmodel serialize() rebuilds from config and omits from ``@variables``,
+# so the cross-backend key-set contract compares against the pt ``state_dict``
+# with these derived keys removed.
+_DERIVED_PT_BUFFER_SUFFIXES = (
+    ".coeff_index_m",
+    ".degree_index_m",
+    ".rotate_inv_rescale_full",
+)
+
+
+def _learnable_pt_keys(module: torch.nn.Module) -> set[str]:
+    """Pt ``state_dict`` keys minus the derived (non-learnable) buffer keys."""
+    return {
+        k
+        for k in module.state_dict()
+        if k != "_empty_tensor" and not k.endswith(_DERIVED_PT_BUFFER_SUFFIXES)
+    }
+
+
 def assert_parity(a, t, rtol=PT_RTOL, atol=PT_ATOL):
     np.testing.assert_allclose(
         np.asarray(a), t.detach().cpu().numpy(), rtol=rtol, atol=atol
@@ -143,9 +165,7 @@ def test_build_rotate_inv_rescale(self, lmax, mmax) -> None:
             pytest.skip("mmax must be <= lmax")
         degree_index_np = dp_indexing.build_m_major_l_index(lmax, mmax)
         degree_index_pt = pt_indexing.build_m_major_l_index(lmax, mmax, device=CPU)
-        res = dp_indexing.build_rotate_inv_rescale(
-            lmax, mmax, degree_index_np, dtype=np.float64
-        )
+        res = dp_indexing.build_rotate_inv_rescale(lmax, mmax, degree_index_np)
         ref = pt_indexing.build_rotate_inv_rescale(
             lmax, mmax, degree_index_pt, device=CPU, dtype=torch.float64
         )
@@ -375,18 +395,6 @@ def test_envelope(self, exponent) -> None:
         np.testing.assert_array_equal(np.asarray(res)[0], 1.0)
         np.testing.assert_array_equal(np.asarray(res)[r[:, 0] >= self.rcut], 0.0)
 
-    def test_envelope_roundtrip(self) -> None:
-        from deepmd.dpmodel.descriptor.dpa4_nn.radial import (
-            C3CutoffEnvelope as DPEnvelope,
-        )
-
-        dp_mod = DPEnvelope(rcut=self.rcut, exponent=5, precision="float64")
-        dp_mod2 = DPEnvelope.deserialize(dp_mod.serialize())
-        r = self._r_grid()
-        np.testing.assert_array_equal(
-            np.asarray(dp_mod.call(r)), np.asarray(dp_mod2.call(r))
-        )
-
     @pytest.mark.parametrize("basis_type", ["bessel", "gaussian"])  # both bases
     @pytest.mark.parametrize("exponent", [5, 7])  # envelope exponent
     def test_radial_basis(self, basis_type, exponent) -> None:
@@ -496,9 +504,9 @@ def test_radial_mlp_zero_input_is_zero(self) -> None:
     def test_radial_mlp_unsupported_activation(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.radial import RadialMLP as DPRadialMLP
 
-        dp_mod = DPRadialMLP([4, 8, 4], activation_function="nope", seed=0)
+        # the activation is resolved when the network is built (construction time)
         with pytest.raises(NotImplementedError):
-            dp_mod.call(np.zeros((2, 4), dtype=np.float64))
+            DPRadialMLP([4, 8, 4], activation_function="nope", seed=0)
 
     def test_rmsnorm_parity_and_roundtrip(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.norm import RMSNorm as DPRMSNorm
@@ -560,15 +568,14 @@ def test_constructor_errors(self) -> None:
 
     def test_deserialize_wrong_class(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.norm import RMSNorm as DPRMSNorm
-        from deepmd.dpmodel.descriptor.dpa4_nn.radial import (
-            C3CutoffEnvelope as DPEnvelope,
-        )
         from deepmd.dpmodel.descriptor.dpa4_nn.radial import (
             RadialBasis as DPRadialBasis,
         )
         from deepmd.dpmodel.descriptor.dpa4_nn.radial import RadialMLP as DPRadialMLP
 
-        for klass in (DPEnvelope, DPRadialBasis, DPRadialMLP, DPRMSNorm):
+        # C3CutoffEnvelope is a parameter-free derived module with no
+        # serialize/deserialize, so it is not part of this contract.
+        for klass in (DPRadialBasis, DPRadialMLP, DPRMSNorm):
             with pytest.raises(ValueError):
                 klass.deserialize({"@class": "Nope", "@version": 1})
 
@@ -683,8 +690,15 @@ def test_quat_and_d(self, lmax) -> None:
         D_pt, Dt_pt = calc_pt(quat_pt)
         dim = (lmax + 1) ** 2
         assert D_dp.shape == (vec.shape[0], dim, dim)
-        assert_parity(D_dp, D_pt)
-        assert_parity(Dt_dp, Dt_pt)
+        # The Wigner-D rotation recursion accumulates O((lmax+1)^2) fp64 terms,
+        # so numpy- and torch-summed entries diverge at a degree-dependent
+        # round-off floor (~4e-15 at lmax=2, ~4e-14 at lmax=4). The relative
+        # gate stays tight; only the near-zero absolute floor follows the
+        # measured high-degree accumulation (still ~1e10 below any logic-level
+        # divergence, and D @ Dt == I is verified independently below).
+        wig_atol = max(PT_ATOL, 1e-13 * (lmax + 1))
+        assert_parity(D_dp, D_pt, atol=wig_atol)
+        assert_parity(Dt_dp, Dt_pt, atol=wig_atol)
         # rotation property: D @ Dt == I
         eye = np.broadcast_to(np.eye(dim), D_dp.shape)
         np.testing.assert_allclose(D_dp @ Dt_dp, eye, atol=1e-11)
@@ -717,7 +731,9 @@ def test_forward_zonal(self, lmax, lmin) -> None:
         n_expected = max((lmax + 1) ** 2 - lmin * lmin, 0)
         assert z_dp.shape == (vec.shape[0], n_expected)
         assert tuple(z_pt.shape) == (vec.shape[0], n_expected)
-        assert_parity(z_dp, z_pt)
+        # zonal projection inherits the degree-dependent fp64 round-off floor of
+        # the full Wigner-D matrices (see test_quat_and_d).
+        assert_parity(z_dp, z_pt, atol=max(PT_ATOL, 1e-13 * (lmax + 1)))
 
     def test_call_works_on_torch_tensors(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.wignerd import (
@@ -903,21 +919,6 @@ def test_reduced_equivariant_rmsnorm_invalid_degree_index(self) -> None:
                 precision="float64",
             )
 
-    @pytest.mark.parametrize("mmax", [-1, 3])  # below 0 / above lmax
-    def test_reduced_equivariant_rmsnorm_invalid_mmax(self, mmax) -> None:
-        from deepmd.dpmodel.descriptor.dpa4_nn.norm import (
-            ReducedEquivariantRMSNorm as DPReducedEquivariantRMSNorm,
-        )
-
-        with pytest.raises(ValueError, match="mmax"):
-            DPReducedEquivariantRMSNorm(
-                lmax=2,
-                mmax=mmax,
-                channels=4,
-                degree_index_m=np.array([0, 1, 2], dtype=np.int64),
-                precision="float64",
-            )
-
     @pytest.mark.parametrize("ndim", [2, 3])  # (B, C) and (B, F, C) branches
     def test_scalar_rmsnorm(self, ndim) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.norm import (
@@ -1312,7 +1313,12 @@ def test_gated_activation_lmax0(self, use_gate) -> None:
             layout="nfdc",
             activation="silu",
         )
-        assert dp_mod.gate_linear is None
+        from deepmd.dpmodel.utils.network import (
+            Identity,
+        )
+
+        # lmax=0 has no l>0 coefficients to gate: the gate projection is a no-op
+        assert isinstance(dp_mod.gate_linear, Identity)
         rng = np.random.default_rng(2063)
         shape = self._shape(0, None, 1, "nfdc")
         x = rng.normal(size=shape)
@@ -1648,6 +1654,7 @@ def test_grid_branch(self, n_branches) -> None:
             n_frames=1,
             precision="float64",
             seed=9,
+            trainable=True,
         )
         for name in ("left_proj", "right_proj", "router", "out_proj"):
             getattr(dp_mod, name).weight = state[f"{name}.weight"]
@@ -1716,6 +1723,7 @@ def test_grid_mlp(self, mode) -> None:
             n_frames=1,
             precision="float64",
             seed=9,
+            trainable=True,
         )
         for name in ("left_proj", "right_proj", "out_proj"):
             getattr(dp_mod, name).weight = state[f"{name}.weight"]
@@ -1786,36 +1794,6 @@ def test_grid_branch_deserialize_wrong_class(self) -> None:
         with pytest.raises(ValueError):
             DPGridBranch.deserialize({"@class": "Nope", "@version": 1})
 
-    # ------------------------------------------------ (d) not-ported guards
-    def test_not_ported_guards(self) -> None:
-        from deepmd.dpmodel.descriptor.dpa4_nn.grid_net import S2GridNet as DPS2GridNet
-        from deepmd.dpmodel.descriptor.dpa4_nn.projection import (
-            S2GridProjector as DPS2GridProjector,
-        )
-
-        common = {
-            "lmax": 2,
-            "channels": 4,
-            "mode": "self",
-            "op_type": "glu",
-            "precision": "float64",
-            "layout": "ndfc",
-            "grid_method": "lebedev",
-        }
-        with pytest.raises(NotImplementedError, match="lebedev_quadrature"):
-            # e3nn product grid (lebedev_quadrature=False) is not ported
-            DPS2GridProjector(lmax=2, precision="float64", grid_method="e3nn")
-        with pytest.raises(NotImplementedError, match="lebedev_quadrature"):
-            DPS2GridNet(**{**common, "grid_method": "e3nn"})
-        # default grid_method is "lebedev" (deliberate divergence from pt's
-        # "e3nn" default, which dp rejects): default construction works
-        net = DPS2GridNet(**{k: v for k, v in common.items() if k != "grid_method"})
-        assert net.grid_method == "lebedev"
-        # cross mode and residual_scale_init are now ported (see
-        # test_dpa4_basegridnet_cross.py for parity coverage); they construct.
-        DPS2GridNet(**{**common, "mode": "cross"})
-        DPS2GridNet(**common, residual_scale_init=1e-3)
-
     def test_value_errors(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.grid_net import (
             GridBranch as DPGridBranch,
@@ -1833,6 +1811,7 @@ def test_value_errors(self) -> None:
             "precision": "float64",
             "layout": "ndfc",
             "grid_method": "lebedev",
+            "trainable": True,
         }
         with pytest.raises(ValueError):  # unknown grid method
             DPS2GridProjector(lmax=2, grid_method="cartesian")
@@ -1859,7 +1838,13 @@ def test_value_errors(self) -> None:
         with pytest.raises(ValueError):  # flat layout is cross-only
             DPS2GridNet(**{**common, "layout": "flat"})
         with pytest.raises(ValueError):  # n_branches must be positive
-            DPGridBranch(channels=4, n_branches=0, n_frames=1, precision="float64")
+            DPGridBranch(
+                channels=4,
+                n_branches=0,
+                n_frames=1,
+                precision="float64",
+                trainable=True,
+            )
         dp_net = DPS2GridNet(**common)
         rng = np.random.default_rng(2086)
         with pytest.raises(ValueError):  # wrong query channel count
@@ -1884,7 +1869,10 @@ def _build_so2_edge_data(
     row-major slot order pt's ``torch.nonzero`` would produce). Both sides
     share identical Wigner-D blocks built from the (parity-proven) dpmodel
     ``WignerDCalculator``. Invalid slots intentionally keep garbage (nonzero)
-    envelope/feature values so a missing mask shows up as a parity failure.
+    per-edge feature values so a consumer that forgets to mask them surfaces as
+    a parity failure. ``edge_env`` is the exception: the production
+    ``build_edge_cache`` multiplies the envelope by the slot mask, so it is
+    exactly zero on invalid slots, and this fixture mirrors that contract.
 
     ``n_radial``: when not None, ``edge_rbf`` is filled with random values of
     width ``n_radial`` (garbage in masked slots too); otherwise it stays the
@@ -1936,7 +1924,11 @@ def _build_so2_edge_data(
         edge_rbf = np.zeros((n_edge, 1))
     else:
         edge_rbf = rng.normal(size=(n_edge, n_radial))
-    edge_env = rng.uniform(0.2, 1.0, size=(n_edge, 1))
+    # edge_env follows the production contract: zero on invalid slots (the real
+    # build_edge_cache applies ``envelope * mask``). The envelope-summing
+    # baseline aggregation (n_atten_head=0) relies on this; the attention path
+    # masks independently, so both stay parity-correct.
+    edge_env = rng.uniform(0.2, 1.0, size=(n_edge, 1)) * mask[:, None]
     deg = ((edge_env[:, 0] ** 2) * mask).reshape(nloc, nnei).sum(axis=1)
     inv_sqrt_deg = (1.0 / np.sqrt(deg + 1.0)).reshape(nloc, 1, 1)
     edge_src_gate = rng.uniform(0.1, 1.0, size=(n_edge, 1)) if with_gate else None
@@ -2024,6 +2016,7 @@ def test_so2_linear_roundtrip(self) -> None:
             precision="float64",
             mlp_bias=True,
             seed=4,
+            trainable=True,
         )
         dp_mod2 = DPSO2Linear.deserialize(dp_mod.serialize())
         rng = np.random.default_rng(2054)
@@ -2036,9 +2029,13 @@ def test_so2_linear_errors(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.so2 import SO2Linear as DPSO2Linear
 
         with pytest.raises(ValueError):  # mmax > lmax
-            DPSO2Linear(lmax=2, mmax=3, in_channels=2, out_channels=2)
+            DPSO2Linear(
+                lmax=2, mmax=3, in_channels=2, out_channels=2, seed=0, trainable=True
+            )
         with pytest.raises(ValueError):  # negative mmax
-            DPSO2Linear(lmax=2, mmax=-1, in_channels=2, out_channels=2)
+            DPSO2Linear(
+                lmax=2, mmax=-1, in_channels=2, out_channels=2, seed=0, trainable=True
+            )
         with pytest.raises(ValueError):  # wrong class tag
             DPSO2Linear.deserialize({"@class": "NotSO2Linear", "@version": 1})
 
@@ -2079,9 +2076,13 @@ def test_radial_degree_mixer(self, mode, rank) -> None:
             rank=rank,
             precision="float64",
             seed=5,
+            trainable=True,
         )
-        # pt has no standalone serialize(); load the pt state_dict fragment
-        dp_mod._load_variables(pt_state_to_numpy(pt_mod))
+        # pt has no standalone serialize(); reuse the dp config and load the pt
+        # state_dict fragment as @variables (key names match) via deserialize.
+        ser = dp_mod.serialize()
+        ser["@variables"] = pt_state_to_numpy(pt_mod)
+        dp_mod = DPMixer.deserialize(ser)
         rng = np.random.default_rng(2056)
         x_local = rng.normal(size=(17, dp_mod.reduced_dim, 4))
         radial = rng.normal(size=(17, dp_mod.reduced_dim, 4))
@@ -2103,6 +2104,7 @@ def test_radial_degree_mixer_roundtrip(self) -> None:
             rank=1,
             precision="float64",
             seed=6,
+            trainable=True,
         )
         dp_mod2 = DPMixer.deserialize(dp_mod.serialize())
         rng = np.random.default_rng(2057)
@@ -2118,15 +2120,22 @@ def test_radial_degree_mixer_errors(self) -> None:
             DynamicRadialDegreeMixer as DPMixer,
         )
 
-        common = {"lmax": 2, "mmax": 1, "channels": 4, "precision": "float64"}
+        common = {
+            "lmax": 2,
+            "mmax": 1,
+            "channels": 4,
+            "precision": "float64",
+            "seed": 0,
+            "trainable": True,
+        }
         with pytest.raises(ValueError):  # unknown mode
             DPMixer(mode="channel", **common)
         with pytest.raises(ValueError):  # negative rank
             DPMixer(mode="degree_channel", rank=-1, **common)
         with pytest.raises(ValueError):  # non-positive channels
-            DPMixer(lmax=2, mmax=1, channels=0, mode="degree")
+            DPMixer(lmax=2, mmax=1, channels=0, mode="degree", seed=0, trainable=True)
         with pytest.raises(ValueError):  # mmax > lmax
-            DPMixer(lmax=2, mmax=3, channels=4, mode="degree")
+            DPMixer(lmax=2, mmax=3, channels=4, mode="degree", seed=0, trainable=True)
         dp_mod = DPMixer(mode="degree", **common)
         rng = np.random.default_rng(2058)
         good = rng.normal(size=(3, dp_mod.reduced_dim, 4))
@@ -2168,6 +2177,7 @@ def test_segment_envelope_gated_softmax(self, masked, use_src_weight) -> None:
         alpha_dp = dp_softmax(
             logits=logits,
             edge_env=dp_cache.edge_env,
+            dst=dp_cache.dst,
             n_nodes=nloc,
             z_bias_raw=z_bias_raw,
             eps=1e-7,
@@ -2194,20 +2204,29 @@ def test_segment_envelope_gated_softmax(self, masked, use_src_weight) -> None:
         np.testing.assert_array_equal(alpha_dp[~valid], 0.0)
         assert np.all(np.isfinite(alpha_dp))
 
-    def test_segment_softmax_errors(self) -> None:
+    def test_segment_softmax_arbitrary_degree(self) -> None:
+        # The destination scatter is layout-agnostic: E need not be a multiple
+        # of n_nodes and dst may carry an arbitrary (non-row-major) order with a
+        # non-uniform per-node degree (here node 2 has three edges, node 0 two,
+        # node 1 two). The reduction must still produce a finite, correctly
+        # shaped result.
         from deepmd.dpmodel.descriptor.dpa4_nn.attention import (
             segment_envelope_gated_softmax as dp_softmax,
         )
 
         rng = np.random.default_rng(2062)
-        with pytest.raises(ValueError):  # E not a multiple of n_nodes
-            dp_softmax(
-                logits=rng.normal(size=(7, 1, 1)),
-                edge_env=rng.uniform(size=(7, 1)),
-                n_nodes=3,
-                z_bias_raw=np.zeros((1, 1)),
-                eps=1e-7,
-            )
+        dst = np.array([2, 0, 0, 1, 2, 2, 1], dtype=np.int64)  # n_nodes=3, E=7
+        alpha = dp_softmax(
+            logits=rng.normal(size=(7, 1, 1)),
+            edge_env=rng.uniform(size=(7, 1)),
+            dst=dst,
+            n_nodes=3,
+            z_bias_raw=np.zeros((1, 1)),
+            eps=1e-7,
+        )
+        alpha = np.asarray(alpha)
+        assert alpha.shape == (7, 1, 1)
+        assert np.all(np.isfinite(alpha))
 
     # ---------- SO2Convolution ----------
     def _conv_kwargs(self, **overrides):
@@ -2220,7 +2239,7 @@ def _conv_kwargs(self, **overrides):
             "focus_dim": 0,
             "focus_compete": True,
             "so2_norm": False,
-            "so2_layers": 2,
+            "mixing_layers": 2,
             "so2_attn_res": "none",
             "layer_scale": False,
             "n_atten_head": 1,
@@ -2263,9 +2282,9 @@ def _assert_conv_parity(
         assert_parity(out_dp, out_pt)
 
     @pytest.mark.parametrize("masked", ["none", "slots"])  # padded-slot pattern
-    @pytest.mark.parametrize("so2_layers", [2, 4])  # SO(2) layer loop depth (core=4)
-    def test_so2_convolution(self, masked, so2_layers) -> None:
-        pt_mod, dp_mod, kwargs = self._build_conv_pair(so2_layers=so2_layers)
+    @pytest.mark.parametrize("mixing_layers", [2, 4])  # SO(2) layer loop depth (core=4)
+    def test_so2_convolution(self, masked, mixing_layers) -> None:
+        pt_mod, dp_mod, kwargs = self._build_conv_pair(mixing_layers=mixing_layers)
         self._assert_conv_parity(pt_mod, dp_mod, kwargs, masked=masked)
 
     def test_so2_convolution_all_masked_node(self) -> None:
@@ -2305,7 +2324,7 @@ def test_so2_convolution_multi_focus(self, focus_compete) -> None:
         self._assert_conv_parity(pt_mod, dp_mod, kwargs)
 
     def test_so2_convolution_so2_norm(self) -> None:
-        pt_mod, dp_mod, kwargs = self._build_conv_pair(so2_norm=True, so2_layers=3)
+        pt_mod, dp_mod, kwargs = self._build_conv_pair(so2_norm=True, mixing_layers=3)
         self._assert_conv_parity(pt_mod, dp_mod, kwargs)
 
     def test_so2_convolution_mlp_bias(self) -> None:
@@ -2367,55 +2386,44 @@ def test_so2_convolution_roundtrip(self) -> None:
         out2 = np.asarray(dp_mod2.call(x, dp_cache, radial))
         np.testing.assert_array_equal(out1, out2)
 
-    @pytest.mark.parametrize(
-        "flag,value",
-        [
-            ("so2_attn_res", "independent"),  # DepthAttnRes
-            ("so2_attn_res", "dependent"),  # DepthAttnRes
-            ("layer_scale", True),  # per-layer LayerScale
-            ("n_atten_head", -1),  # ValueError, not NotImplementedError
-            ("atten_f_mix", True),  # focus-merged attention
-            ("atten_v_proj", True),  # value projection
-            ("atten_o_proj", True),  # output projection
-            ("s2_activation", True),  # S2-grid SwiGLU non-linearity
-        ],
-    )
-    def test_so2_convolution_guards(self, flag, value) -> None:
-        from deepmd.dpmodel.descriptor.dpa4_nn.so2 import SO2Convolution as DPSO2Conv
-
-        kwargs = self._conv_kwargs(**{flag: value})
-        if flag == "n_atten_head":
-            with pytest.raises(ValueError):
-                DPSO2Conv(**kwargs, precision="float64")
-        else:
-            with pytest.raises(NotImplementedError, match=flag):
-                DPSO2Conv(**kwargs, precision="float64")
-
     def test_so2_convolution_errors(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.so2 import SO2Convolution as DPSO2Conv
 
         with pytest.raises(ValueError):  # head count must divide focus width
-            DPSO2Conv(**self._conv_kwargs(n_atten_head=3), precision="float64")
-        with pytest.raises(ValueError):  # so2_layers must be >= 1
-            DPSO2Conv(**self._conv_kwargs(so2_layers=0), precision="float64")
+            DPSO2Conv(
+                **self._conv_kwargs(n_atten_head=3),
+                precision="float64",
+                seed=0,
+                trainable=True,
+            )
         with pytest.raises(ValueError):  # n_focus must be >= 1
-            DPSO2Conv(**self._conv_kwargs(n_focus=0), precision="float64")
+            DPSO2Conv(
+                **self._conv_kwargs(n_focus=0),
+                precision="float64",
+                seed=0,
+                trainable=True,
+            )
         with pytest.raises(ValueError):  # unknown radial mode
             DPSO2Conv(
                 **self._conv_kwargs(radial_so2_mode="degree_rank"),
                 precision="float64",
+                seed=0,
+                trainable=True,
             )
         with pytest.raises(ValueError):  # mmax > lmax
-            DPSO2Conv(**self._conv_kwargs(mmax=4), precision="float64")
+            DPSO2Conv(
+                **self._conv_kwargs(mmax=4),
+                precision="float64",
+                seed=0,
+                trainable=True,
+            )
         with pytest.raises(ValueError):  # unknown so2_attn_res token
-            DPSO2Conv(**self._conv_kwargs(so2_attn_res="depth"), precision="float64")
-        dp_mod = DPSO2Conv(**self._conv_kwargs(), precision="float64", seed=1)
-        rng = np.random.default_rng(2064)
-        _, dp_cache, radial, _, x, _ = _build_so2_edge_data(
-            rng, nloc=self.nloc, nnei=self.nnei, lmax=3, channels=4
-        )
-        with pytest.raises(ValueError):  # E not a multiple of N
-            dp_mod.call(x[:3], dp_cache, radial)
+            DPSO2Conv(
+                **self._conv_kwargs(so2_attn_res="depth"),
+                precision="float64",
+                seed=0,
+                trainable=True,
+            )
         with pytest.raises(ValueError):  # wrong class tag
             DPSO2Conv.deserialize({"@class": "NotConv", "@version": 1})
 
@@ -2496,11 +2504,6 @@ def test_type_embedding_errors(self) -> None:
             DPTypeEmbed(ntypes=3, embed_dim=0)
         with pytest.raises(ValueError):  # wrong class tag
             DPTypeEmbed.deserialize({"@class": "NotTypeEmbed", "@version": 1})
-        dp_mod = DPTypeEmbed(ntypes=3, embed_dim=4, precision="float64")
-        data = dp_mod.serialize()
-        data["@variables"]["adam_type_embedding"] = np.zeros((2, 4))
-        with pytest.raises(ValueError):  # table shape mismatch
-            DPTypeEmbed.deserialize(data)
 
     # ---------- GeometricInitialEmbedding ----------
     def _build_gie_pair(self, lmax, channels):
@@ -2604,13 +2607,6 @@ def test_gie_errors(self) -> None:
 
         with pytest.raises(ValueError):  # wrong class tag
             DPGIE.deserialize({"@class": "NotGIE", "@version": 1})
-        dp_mod = DPGIE(lmax=2, channels=4, precision="float64")
-        rng = np.random.default_rng(2074)
-        _, dp_cache, radial, _, _, _ = _build_so2_edge_data(
-            rng, nloc=self.nloc, nnei=self.nnei, lmax=2, channels=4
-        )
-        with pytest.raises(ValueError):  # E not a multiple of N
-            dp_mod.call(n_nodes=3, edge_cache=dp_cache, radial_feat=radial[:, 1:, :])
 
     # ---------- EnvironmentInitialEmbedding ----------
     n_radial = 5
@@ -2723,27 +2719,6 @@ def test_env_embedding_errors(self) -> None:
             DPEnv(**self._env_kwargs(axis_dim=12), precision="float64")
         with pytest.raises(ValueError):  # wrong class tag
             DPEnv.deserialize({"@class": "NotEnv", "@version": 1})
-        dp_mod = DPEnv(**self._env_kwargs(), precision="float64", seed=2)
-        data = dp_mod.serialize()
-        data["@variables"].pop("output_proj.matrix")
-        with pytest.raises(ValueError):  # variable key set mismatch
-            DPEnv.deserialize(data)
-        data = dp_mod.serialize()
-        data["@variables"]["output_proj.matrix"] = np.zeros((2, 2))
-        with pytest.raises(ValueError):  # variable shape mismatch
-            DPEnv.deserialize(data)
-        rng = np.random.default_rng(2078)
-        _, dp_cache, _, _, _, _ = _build_so2_edge_data(
-            rng,
-            nloc=self.nloc,
-            nnei=self.nnei,
-            lmax=1,
-            channels=4,
-            n_radial=self.n_radial,
-        )
-        atype = rng.integers(0, self.ntypes, size=(self.nloc,))
-        with pytest.raises(ValueError):  # E not a multiple of N
-            dp_mod.call(edge_cache=dp_cache, atype_flat=atype, n_nodes=3)
 
 
 def _build_real_edge_inputs(
@@ -3203,8 +3178,6 @@ def test_ffn_errors(self) -> None:
         dp_mod = DPFFN(**self._ffn_kwargs(), precision="float64", seed=3)
         with pytest.raises(KeyError):  # missing sub-module variables
             dp_mod._load_variables({"so3_linear_1.weight": dp_mod.so3_linear_1.weight})
-        with pytest.raises(KeyError):  # unknown variables rejected
-            dp_mod._load_variables({**dp_mod._variables(), "extra.weight": 0.0})
 
 
 class TestBlockParity:
@@ -3230,7 +3203,7 @@ def _block_kwargs(self, **overrides):
             "focus_dim": 0,
             "focus_compete": True,
             "so2_norm": False,
-            "so2_layers": 4,
+            "mixing_layers": 4,
             "so2_attn_res": "none",
             "radial_so2_mode": "degree_channel",
             "radial_so2_rank": 1,
@@ -3293,9 +3266,9 @@ def _assert_block_parity(self, pt_mod, dp_mod, kwargs, *, masked="slots") -> Non
         assert out_pt[1] is None and out_pt[2] is None and out_pt[3] is None
         assert_parity(out_dp[0], out_pt[0])
 
-    @pytest.mark.parametrize("so2_layers", [2, 4])  # SO(2) layer depth (core=4)
-    def test_block(self, so2_layers) -> None:
-        pt_mod, dp_mod, kwargs = self._build_block_pair(so2_layers=so2_layers)
+    @pytest.mark.parametrize("mixing_layers", [2, 4])  # SO(2) layer depth (core=4)
+    def test_block(self, mixing_layers) -> None:
+        pt_mod, dp_mod, kwargs = self._build_block_pair(mixing_layers=mixing_layers)
         self._assert_block_parity(pt_mod, dp_mod, kwargs)
 
     @pytest.mark.parametrize(
@@ -3381,8 +3354,10 @@ def test_block_roundtrip(self) -> None:
 
         pt_mod, dp_mod, kwargs = self._build_block_pair(ffn_blocks=2)
         data = dp_mod.serialize()
-        # exact pt state_dict key-set match
-        assert set(data["@variables"]) == set(pt_state_to_numpy(pt_mod))
+        # dp serialize emits exactly the learnable pt state_dict keys; the pt
+        # SO(2) convolution additionally carries derived index buffers that dp
+        # rebuilds on deserialize (see _learnable_pt_keys).
+        assert set(data["@variables"]) == _learnable_pt_keys(pt_mod)
         dp_mod2 = DPBlock.deserialize(data)
         rng = np.random.default_rng(2123)
         _, dp_cache, radial, _, _, _ = _build_so2_edge_data(
@@ -3398,43 +3373,24 @@ def test_block_roundtrip(self) -> None:
         out2 = np.asarray(dp_mod2.call(x, dp_cache, radial)[0])
         np.testing.assert_array_equal(out1, out2)
 
-    @pytest.mark.parametrize(
-        "flag,value",
-        [
-            ("full_attn_res", "independent"),  # block-level DepthAttnRes
-            ("full_attn_res", "dependent"),  # block-level DepthAttnRes
-            ("block_attn_res", "independent"),  # block-level DepthAttnRes
-            ("block_attn_res", "dependent"),  # block-level DepthAttnRes
-            ("layer_scale", True),  # block-level FFN LayerScale
-            ("so2_s2_activation", True),  # delegated to SO2Convolution
-        ],
-    )
-    def test_block_guards(self, flag, value) -> None:
-        from deepmd.dpmodel.descriptor.dpa4_nn.block import (
-            SeZMInteractionBlock as DPBlock,
-        )
-
-        match = "s2_activation" if flag == "so2_s2_activation" else flag
-        with pytest.raises(NotImplementedError, match=match):
-            DPBlock(**self._block_kwargs(**{flag: value}), precision="float64")
-
     def test_block_errors(self) -> None:
         from deepmd.dpmodel.descriptor.dpa4_nn.block import (
             SeZMInteractionBlock as DPBlock,
         )
 
+        opts = {"precision": "float64", "seed": 0, "trainable": True}
         with pytest.raises(ValueError):  # node_lmax must be >= lmax
-            DPBlock(**self._block_kwargs(node_lmax=2), precision="float64")
+            DPBlock(**self._block_kwargs(node_lmax=2), **opts)
         with pytest.raises(ValueError):  # mmax must be <= lmax
-            DPBlock(**self._block_kwargs(mmax=4), precision="float64")
+            DPBlock(**self._block_kwargs(mmax=4), **opts)
         with pytest.raises(ValueError):  # ffn_blocks must be >= 1
-            DPBlock(**self._block_kwargs(ffn_blocks=0), precision="float64")
+            DPBlock(**self._block_kwargs(ffn_blocks=0), **opts)
         with pytest.raises(ValueError):  # unknown full_attn_res token
-            DPBlock(**self._block_kwargs(full_attn_res="depth"), precision="float64")
+            DPBlock(**self._block_kwargs(full_attn_res="depth"), **opts)
         with pytest.raises(ValueError):  # unknown block_attn_res token
-            DPBlock(**self._block_kwargs(block_attn_res="depth"), precision="float64")
+            DPBlock(**self._block_kwargs(block_attn_res="depth"), **opts)
         with pytest.raises(ValueError):  # negative grid branch count
-            DPBlock(**self._block_kwargs(ffn_grid_branch=-1), precision="float64")
+            DPBlock(**self._block_kwargs(ffn_grid_branch=-1), **opts)
         with pytest.raises(ValueError):  # wrong class tag
             DPBlock.deserialize({"@class": "NotBlock", "@version": 1})
 
@@ -3665,12 +3621,22 @@ def test_descriptor_cross_deserialize(self) -> None:
         )
 
         pt_mod, dp_mod, _ = self._build_descr_pair()
-        # dp serialize emits exactly the pt state_dict key set
+        # dp serialize emits exactly the learnable pt state_dict keys; the pt
+        # SO(2) convolutions additionally carry derived index buffers that dp
+        # rebuilds on deserialize (see _learnable_pt_keys).
         data = dp_mod.serialize()
-        assert set(data["@variables"]) == set(pt_state_to_numpy(pt_mod))
+        assert set(data["@variables"]) == _learnable_pt_keys(pt_mod)
         assert data["type"] == "SeZM"
-        # pt <- dp: load the dp serialization into a fresh pt descriptor
-        pt_mod2 = DescrptSeZM.deserialize(data).double().eval()
+        # pt <- dp: load the dp serialization into a fresh pt descriptor. pt's
+        # deserialize strict-loads the full state_dict, while the dpmodel
+        # serialize omits the config-derived SO(2) index buffers; supply those
+        # from the reference pt state_dict so the dp learnable weights load in.
+        data_for_pt = dict(data)
+        data_for_pt["@variables"] = {
+            **pt_state_to_numpy(pt_mod),
+            **data["@variables"],
+        }
+        pt_mod2 = DescrptSeZM.deserialize(data_for_pt).double().eval()
         self._assert_descr_parity(pt_mod2, dp_mod)
         # dp <- dp roundtrip is bit-exact
         dp_mod2 = DescrptDPA4.deserialize(data)
diff --git a/source/tests/pt/model/test_sezm_model.py b/source/tests/pt/model/test_sezm_model.py
index 6ec058602b..fcc5ae01ea 100644
--- a/source/tests/pt/model/test_sezm_model.py
+++ b/source/tests/pt/model/test_sezm_model.py
@@ -228,7 +228,9 @@ def _randomize_params(model: torch.nn.Module, seed: int = 1234) -> None:
             for p in model.parameters():
                 p.copy_(torch.randn_like(p) * 0.1)
 
-    def _build_model_params(self, *, use_compile: bool) -> dict:
+    def _build_model_params(
+        self, *, use_compile: bool, edge_cartesian: bool = False
+    ) -> dict:
         return {
             "type": "SeZM",
             "type_map": ["A", "B"],
@@ -241,7 +243,8 @@ def _build_model_params(self, *, use_compile: bool) -> dict:
                 "n_radial": 3,
                 "radial_mlp": [6],
                 "use_env_seed": True,
-                "l_schedule": [1, 0],
+                "edge_cartesian": edge_cartesian,
+                "l_schedule": [2, 1] if edge_cartesian else [1, 0],
                 "mmax": 1,
                 "so2_norm": False,
                 "so2_layers": 1,
@@ -392,6 +395,89 @@ def _train_steps(
             name: param.detach().clone() for name, param in model.named_parameters()
         }
 
+    def _make_frame_with_natoms(
+        self, nloc: int, *, seed: int = 20240613
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Build a compact ``nloc``-atom frame with neighbours inside ``rcut``.
+
+        Atoms are placed in a tight cluster so the ``sel=[2, 2]`` neighbour list
+        is saturated and the edge count is comfortably larger than ``nloc``.
+        """
+        torch.manual_seed(seed + nloc)
+        coord = torch.rand(1, nloc, 3, device=self.device, dtype=torch.float32) * 2.5
+        atype = (
+            (torch.arange(nloc, device=self.device) % 2).view(1, nloc).to(torch.int32)
+        )
+        box = torch.tensor(
+            [[8.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0, 8.0]],
+            dtype=torch.float32,
+            device=self.device,
+        )
+        return coord, atype, box
+
+    def test_trace_pad_dim_trim_returns_contiguous(self) -> None:
+        """Trimmed trace inputs stay contiguous so strides mirror runtime layout.
+
+        A sliced (non-contiguous) trim leaks the pre-trim length into the tensor
+        stride; ``make_fx`` duck-shaping can then fuse that stale stride with the
+        edge-count symbol and corrupt the compiled shape guards.
+        """
+        from deepmd.pt.utils.compile_compat import (
+            trace_pad_dim,
+        )
+
+        base = torch.arange(5 * 13, device=self.device).view(5, 13)
+        trimmed = trace_pad_dim(base, 1, 7)
+        self.assertEqual(tuple(trimmed.shape), (5, 7))
+        self.assertTrue(trimmed.is_contiguous())
+        padded = trace_pad_dim(base, 1, 20)
+        self.assertEqual(tuple(padded.shape), (5, 20))
+        self.assertTrue(padded.is_contiguous())
+
+    @unittest.skipIf(_SKIP_OFF_COMPILE_TORCH, _SKIP_OFF_COMPILE_TORCH_REASON)
+    def test_eval_compile_first_frame_nloc_matches_trace_edge_count(self) -> None:
+        """First eval frame with ``nloc`` equal to the trace edge count compiles.
+
+        The symbolic trace pads the edge axis to ``next_safe_prime`` (13 for the
+        two-type forbidden set ``{1, 2, 3, 9}`` -> primes 5/7/11/13) and trims
+        ``atype`` to ``trace_nloc`` (7). A first frame with ``nloc == 13`` leaves
+        the trimmed ``atype`` carrying ``stride(0) == 13``; previously that stale
+        stride was duck-shaped onto the edge-count symbol, so every edge tensor
+        was guarded against ``nloc`` and ``assert_size_stride`` failed once the
+        real edge count differed. Pins the contiguous-trace + eval-only
+        duck-shape-off fix.
+        """
+        nloc = 13  # == next_safe_prime edge count for the two-type forbidden set
+        coord, atype, box = self._make_frame_with_natoms(nloc)
+
+        model_dyn = get_sezm_model(self._build_model_params(use_compile=False))
+        self._randomize_params(model_dyn)
+        with mock.patch.dict(os.environ, {"DP_COMPILE_INFER": "1"}, clear=False):
+            model_cmp = get_sezm_model(self._build_model_params(use_compile=True))
+        model_cmp.load_state_dict(model_dyn.state_dict())
+        model_dyn.eval()
+        model_cmp.eval()
+
+        out_dyn = model_dyn(coord, atype, box=box)
+        # The compiled eval path must trace, lower and run without tripping
+        # ``assert_size_stride`` on the edge tensors.
+        out_cmp = model_cmp(coord, atype, box=box)
+        self.assertIn((False, False), model_cmp.compiled_core_compute_cache)
+        _assert_close_with_strict_warning(
+            out_dyn["energy"],
+            out_cmp["energy"],
+            atol=1.0e-6,
+            rtol=1.0e-6,
+            msg="eval energy mismatch when first-frame nloc == trace edge count",
+        )
+        _assert_close_with_strict_warning(
+            out_dyn["force"],
+            out_cmp["force"],
+            atol=1.0e-6,
+            rtol=1.0e-6,
+            msg="eval force mismatch when first-frame nloc == trace edge count",
+        )
+
     @unittest.skipIf(_SKIP_OFF_COMPILE_TORCH, _SKIP_OFF_COMPILE_TORCH_REASON)
     def test_compile_cache_slots_and_eval_shape_change(self) -> None:
         """Compile cache slots should coexist while eval handles batch-size growth."""
@@ -784,6 +870,68 @@ def test_forward_backward_double_backward_matches_compile(self) -> None:
                 msg=f"force-grad mismatch at {name}",
             )
 
+    @unittest.skipIf(_SKIP_OFF_COMPILE_TORCH, _SKIP_OFF_COMPILE_TORCH_REASON)
+    def test_cartesian_forward_backward_matches_compile(self) -> None:
+        """The Cartesian path (Wigner-D skipped) matches eager and compiled runs."""
+        coord, atype, box, _, _, _ = self._make_tiny_frame()
+        model_dyn = get_sezm_model(
+            self._build_model_params(use_compile=False, edge_cartesian=True)
+        )
+        self._randomize_params(model_dyn)
+        model_cmp = get_sezm_model(
+            self._build_model_params(use_compile=True, edge_cartesian=True)
+        )
+        model_cmp.load_state_dict(model_dyn.state_dict())
+        model_dyn.train()
+        model_cmp.train()
+
+        # === Step 1. Forward output consistency ===
+        out_dyn = model_dyn(coord, atype, box=box)
+        out_cmp = model_cmp(coord, atype, box=box)
+        _assert_close_with_strict_warning(
+            out_dyn["energy"],
+            out_cmp["energy"],
+            atol=1.0e-6,
+            rtol=1.0e-6,
+            msg="cartesian energy mismatch on first compiled call",
+        )
+        _assert_close_with_strict_warning(
+            out_dyn["force"],
+            out_cmp["force"],
+            atol=1.0e-6,
+            rtol=1.0e-6,
+            msg="cartesian force mismatch on first compiled call",
+        )
+
+        # === Step 2. Energy-gradient consistency ===
+        model_dyn.zero_grad(set_to_none=True)
+        model_cmp.zero_grad(set_to_none=True)
+        out_dyn["energy"].sum().backward()
+        out_cmp["energy"].sum().backward()
+        grad_atol = 1.0e-5 if self.device == torch.device("cpu") else 2.0e-3
+        grad_rtol = 1.0e-5 if self.device == torch.device("cpu") else 3.0e-3
+        grads_dyn = {
+            name: (
+                torch.zeros_like(param) if param.grad is None else param.grad.detach()
+            )
+            for name, param in model_dyn.named_parameters()
+        }
+        grads_cmp = {
+            name: (
+                torch.zeros_like(param) if param.grad is None else param.grad.detach()
+            )
+            for name, param in model_cmp.named_parameters()
+        }
+        self.assertEqual(set(grads_dyn.keys()), set(grads_cmp.keys()))
+        for name in grads_dyn.keys():
+            _assert_close_with_strict_warning(
+                grads_dyn[name],
+                grads_cmp[name],
+                atol=grad_atol,
+                rtol=grad_rtol,
+                msg=f"cartesian energy-grad mismatch at {name}",
+            )
+
     def _assert_multitask_compile_matches_eager(
         self,
         *,
diff --git a/source/tests/pt_expt/descriptor/test_dpa4.py b/source/tests/pt_expt/descriptor/test_dpa4.py
index 67de02c2b9..ef4a10227b 100644
--- a/source/tests/pt_expt/descriptor/test_dpa4.py
+++ b/source/tests/pt_expt/descriptor/test_dpa4.py
@@ -205,35 +205,3 @@ def test_trainable_false_freezes_all_parameters(self, via_deserialize) -> None:
             f"trainable=False left parameters trainable: "
             f"{sorted(set(params) - set(frozen))}"
         )
-
-
-# `use_amp` is a pt-runtime CUDA autocast switch with no dpmodel/pt_expt effect;
-# constructing the descriptor with it truthy must emit a warn-once message.
-@pytest.mark.parametrize("use_amp", [True, False])  # truthy warns, falsy is silent
-def test_use_amp_warns_once(use_amp, caplog, monkeypatch) -> None:
-    import logging
-
-    # The descriptor module's logger name is its module path; derive it from the
-    # already-imported class to avoid a second (mixed-style) import of the module.
-    logger_name = DPDescrptDPA4.__module__
-
-    # reset the warn-once set so the assertion is deterministic regardless of
-    # test ordering (other constructions in the suite may have already warned).
-    # String target lets pytest resolve the module without an import statement.
-    monkeypatch.setattr(f"{logger_name}._WARNED_ONCE", set())
-
-    def _construct() -> None:
-        make_descriptor(2, [10, 10], 4.0, use_amp=use_amp)
-
-    with caplog.at_level(logging.WARNING, logger=logger_name):
-        _construct()
-    matches = [r for r in caplog.records if "use_amp" in r.getMessage()]
-    if use_amp:
-        assert len(matches) == 1, caplog.text
-        # second construction must NOT warn again (warn-once per process)
-        caplog.clear()
-        with caplog.at_level(logging.WARNING, logger=logger_name):
-            _construct()
-        assert not [r for r in caplog.records if "use_amp" in r.getMessage()]
-    else:
-        assert not matches, caplog.text
diff --git a/source/tests/pt_expt/descriptor/test_dpa4_ckpt_triton.py b/source/tests/pt_expt/descriptor/test_dpa4_ckpt_triton.py
new file mode 100644
index 0000000000..e4a34d4e16
--- /dev/null
+++ b/source/tests/pt_expt/descriptor/test_dpa4_ckpt_triton.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""pt_expt DPA4 runtime features: activation checkpointing and Triton fallback.
+
+These features are pt_expt-only (the array-API dpmodel cannot express
+``torch.utils.checkpoint`` or Triton kernels).  The Triton kernels themselves
+run only on CUDA; on CPU the opt-in path falls back to the eager reference,
+which must reproduce the dpmodel dense result bit-for-bit.
+"""
+
+import numpy as np
+import torch
+
+from deepmd.dpmodel.descriptor.dpa4 import DescrptDPA4 as DPDescrptDPA4
+from deepmd.pt_expt.descriptor.dpa4 import (
+    DescrptDPA4,
+)
+from deepmd.pt_expt.descriptor.dpa4_nn.block import (
+    SeZMInteractionBlock,
+)
+from deepmd.pt_expt.descriptor.dpa4_nn.so2 import (
+    DynamicRadialDegreeMixer,
+    SO2Convolution,
+)
+from deepmd.pt_expt.utils import (
+    env,
+)
+from deepmd.pt_expt.utils.env import (
+    PRECISION_DICT,
+)
+
+from ...common.test_mixins import (
+    TestCaseSingleFrameWithNlist,
+)
+
+
+def make_descriptor(nt, sel, rcut, **overrides) -> DescrptDPA4:
+    kwargs = {
+        "ntypes": nt,
+        "sel": sel,
+        "rcut": rcut,
+        "channels": 16,
+        "n_radial": 8,
+        "lmax": 2,
+        "mmax": 1,
+        "n_blocks": 2,
+        "grid_branch": [1, 1, 1],
+        "s2_activation": [False, True],
+        "random_gamma": False,
+        "precision": "float64",
+        "seed": 7,
+    }
+    kwargs.update(overrides)
+    return DescrptDPA4(**kwargs)
+
+
+class TestDPA4RuntimeFeatures(TestCaseSingleFrameWithNlist):
+    def setup_method(self) -> None:
+        TestCaseSingleFrameWithNlist.setUp(self)
+        self.device = env.DEVICE
+        self.dtype = PRECISION_DICT["float64"]
+        # one serialized reference so every descriptor shares identical weights
+        self.data = make_descriptor(self.nt, self.sel_mix, self.rcut).serialize()
+
+    def _inputs(self, requires_grad=False):
+        coord = torch.tensor(self.coord_ext, dtype=self.dtype, device=self.device)
+        if requires_grad:
+            coord = coord.detach().requires_grad_(True)
+        atype = torch.tensor(self.atype_ext, dtype=int, device=self.device)
+        nlist = torch.tensor(self.nlist, dtype=int, device=self.device)
+        return coord, atype, nlist
+
+    def test_activation_checkpoint_matches_eager(self, monkeypatch) -> None:
+        # Baseline: eval-time forward + autograd force with checkpointing OFF.
+        monkeypatch.delenv("DP_ACT_INFER", raising=False)
+        m0 = DescrptDPA4.deserialize(self.data).to(self.device).eval()
+        c0, atype, nlist = self._inputs(requires_grad=True)
+        out0 = m0(c0, atype, nlist)[0]
+        g0 = torch.autograd.grad(out0.sum(), c0)[0]
+
+        # Checkpointing ON: the block must engage recomputation and return the
+        # same value and gradient (checkpoint only trades compute for memory).
+        monkeypatch.setenv("DP_ACT_INFER", "1")
+        m1 = DescrptDPA4.deserialize(self.data).to(self.device).eval()
+        block = next(m for m in m1.modules() if isinstance(m, SeZMInteractionBlock))
+        assert block._act_infer
+        assert block._use_infer_activation_checkpoint(
+            torch.zeros(1, requires_grad=True)
+        )
+        c1, atype, nlist = self._inputs(requires_grad=True)
+        out1 = m1(c1, atype, nlist)[0]
+        g1 = torch.autograd.grad(out1.sum(), c1)[0]
+
+        np.testing.assert_allclose(
+            out1.detach().cpu().numpy(),
+            out0.detach().cpu().numpy(),
+            rtol=1e-12,
+            atol=1e-14,
+        )
+        np.testing.assert_allclose(
+            g1.detach().cpu().numpy(),
+            g0.detach().cpu().numpy(),
+            rtol=1e-12,
+            atol=1e-14,
+        )
+
+    def test_checkpoint_off_when_training(self, monkeypatch) -> None:
+        # Training mode never checkpoints (only the eval-time autograd path does).
+        monkeypatch.setenv("DP_ACT_INFER", "1")
+        m = DescrptDPA4.deserialize(self.data).to(self.device).train()
+        block = next(x for x in m.modules() if isinstance(x, SeZMInteractionBlock))
+        assert block._act_infer
+        assert not block._use_infer_activation_checkpoint(
+            torch.zeros(1, requires_grad=True)
+        )
+
+    def test_triton_eager_fallback_parity(self, monkeypatch) -> None:
+        # With the kernels bound but no CUDA/Triton present, the SO(2) rotation
+        # and radial-mix paths fall back to the eager reference, which must equal
+        # the dpmodel dense result bit-for-bit.
+        monkeypatch.setenv("DP_TRITON_INFER", "1")
+        m = DescrptDPA4.deserialize(self.data).to(self.device).eval()
+        so2 = next(x for x in m.modules() if isinstance(x, SO2Convolution))
+        assert so2.use_triton_infer
+        assert so2._rotate_to_local_fn is not None
+        assert so2._rotate_back_fn is not None
+        mixers = [x for x in m.modules() if isinstance(x, DynamicRadialDegreeMixer)]
+        assert mixers and all(x.use_triton_infer for x in mixers)
+
+        coord, atype, nlist = self._inputs()
+        out = m(coord, atype, nlist)[0]
+
+        dd = DPDescrptDPA4.deserialize(self.data)
+        ref = dd.call(self.coord_ext, self.atype_ext, self.nlist)[0]
+        np.testing.assert_allclose(
+            out.detach().cpu().numpy(),
+            np.asarray(ref),
+            rtol=1e-12,
+            atol=1e-14,
+        )