From 39af1a4f690444283cc021782d63f7de7385daf9 Mon Sep 17 00:00:00 2001
From: ChaoZheng109 <zhengchao47@huawei.com>
Date: Tue, 23 Jun 2026 20:28:10 +0800
Subject: [PATCH] docs(examples): demonstrate per-ring runtime_env sizing

#1099 added per-ring array fields (ring_task_windows / ring_heaps /
ring_dep_pools) alongside the scalar runtime_env knobs, but neither
per_task_runtime_env example exercised them.

Extend both the L2 and L3 examples to also cover the per-ring form:
each scope-depth ring (0..3) sized independently. The config helpers
now iterate a RING_FIELDS tuple so a spec dict can carry either the
scalar or the array keys, and the READMEs document the full precedence
chain and the --enable-scope-stats verification path.
---
 .../workers/l2/per_task_runtime_env/README.md | 48 ++++++++-----
 .../workers/l2/per_task_runtime_env/main.py   | 68 ++++++++++++++-----
 .../workers/l3/per_task_runtime_env/README.md | 26 ++++---
 .../workers/l3/per_task_runtime_env/main.py   | 46 ++++++++++---
 4 files changed, 137 insertions(+), 51 deletions(-)

diff --git a/examples/workers/l2/per_task_runtime_env/README.md b/examples/workers/l2/per_task_runtime_env/README.md
index f78766103..407df48db 100644
--- a/examples/workers/l2/per_task_runtime_env/README.md
+++ b/examples/workers/l2/per_task_runtime_env/README.md
@@ -1,40 +1,56 @@
 # `per_task_runtime_env/` — per-task ring sizing on one L2 Worker
 
-Runs the same vector_add kernel three times on one L2 `Worker`, each with a
-different `CallConfig.runtime_env` (ring buffer sizing). Ring sizing is a
-**per-run** knob carried on `CallConfig` — not a process-wide env export.
+Runs the same vector_add kernel several times on one L2 `Worker`, each with a
+different `CallConfig.runtime_env` (ring buffer sizing) — covering both the
+**scalar** form (one value broadcast to every ring) and the **per-ring** form
+(each scope-depth ring sized independently). Ring sizing is a **per-run** knob
+carried on `CallConfig` — not a process-wide env export.
 
 ## What it shows
 
-`CallConfig.runtime_env` groups the three ring overrides as a distinct config
-tier, separate from the top-level execution knobs (`block_dim`, …):
+`CallConfig.runtime_env` groups the ring overrides as a distinct config tier,
+separate from the top-level execution knobs (`block_dim`, …). Each resource
+comes in a scalar field and a 4-entry per-ring array:
 
-| field | unit | constraint |
-| ----- | ---- | ---------- |
-| `ring_task_window` | tasks | power of 2, >= 4 |
-| `ring_heap` | bytes / ring | >= 1024 |
-| `ring_dep_pool` | entries | 4 .. INT32_MAX |
+| scalar field | per-ring array | unit | constraint (per value) |
+| ------------ | -------------- | ---- | ---------------------- |
+| `ring_task_window` | `ring_task_windows` | tasks | power of 2 in [4, INT32_MAX] |
+| `ring_heap` | `ring_heaps` | bytes / ring | >= 1024 |
+| `ring_dep_pool` | `ring_dep_pools` | entries | 4 .. INT32_MAX |
 
-Precedence per value: **`runtime_env` field > `PTO2_RING_*` env var >
-compile-time default**. A field left at 0 (or omitted) falls back to the env
-var / default.
+The array fields must contain exactly **4 entries**, indexed by scope-depth
+ring `0..3` (depth `>=3` maps to ring 3). A `0` entry — or a field left unset —
+falls through to the next precedence tier:
+
+```text
+per-ring field > scalar field > per-ring env > scalar env > compile-time default
+```
 
 ```python
 cfg = CallConfig()
+# Scalar: one value broadcast to every ring.
 cfg.runtime_env.ring_task_window = 128
 cfg.runtime_env.ring_heap = 8 * 1024 * 1024   # bytes per ring
 cfg.runtime_env.ring_dep_pool = 256
+
+# Per-ring: size rings 0..3 independently (overrides the scalar tier per ring).
+cfg.runtime_env.ring_task_windows = [128, 64, 32, 16]
+cfg.runtime_env.ring_heaps = [8 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, 1 * 1024 * 1024]
+cfg.runtime_env.ring_dep_pools = [256, 128, 64, 64]
 worker.run(chip_handle, args, cfg)
 ```
 
-The three runs (`small_ring`, `large_ring`, `env_or_default`) compute the same
-vector add and all pass golden — only the ring footprint differs.
+The runs (`scalar_small`, `scalar_large`, `per_ring`, `env_or_default`) compute
+the same vector add and all pass golden — only the ring footprint differs.
+Confirm the effective per-ring sizes with `--enable-scope-stats` (the first line
+of `scope_stats/scope_stats.jsonl` reports `task_window_max` / `heap_max` /
+`dep_pool_max`, indexed by `ring`).
 
 ## Layout
 
 ```text
 per_task_runtime_env/
-  main.py                 # 3 runs, one CallConfig.runtime_env each
+  main.py                 # 4 runs, one CallConfig.runtime_env each
   test_per_task_runtime_env.py
 ```
 
diff --git a/examples/workers/l2/per_task_runtime_env/main.py b/examples/workers/l2/per_task_runtime_env/main.py
index fba29fe4b..232d74e64 100644
--- a/examples/workers/l2/per_task_runtime_env/main.py
+++ b/examples/workers/l2/per_task_runtime_env/main.py
@@ -9,17 +9,25 @@
 # -----------------------------------------------------------------------------------------------------------
 """L2 Worker API demo — per-task ring sizing via ``CallConfig.runtime_env``.
 
-Runs the same vector_add kernel three times on one L2 Worker, each time with a
-different ``CallConfig.runtime_env`` (ring buffer sizing). Ring sizing is a
-per-run knob carried on ``CallConfig`` — no process-wide ``PTO2_RING_*`` env
-export needed, and each ``worker.run`` binds its ring buffers from the config
-it was handed.
+Runs the same vector_add kernel several times on one L2 Worker, each time with
+a different ``CallConfig.runtime_env`` (ring buffer sizing) — covering both the
+scalar form (one value broadcast to every ring) and the per-ring form (each
+scope-depth ring sized independently). Ring sizing is a per-run knob carried on
+``CallConfig`` — no process-wide ``PTO2_RING_*`` env export needed, and each
+``worker.run`` binds its ring buffers from the config it was handed.
 
     runtime_env fields (0 / unset => fall back to env var / compile default):
-      ring_task_window   power of 2, >= 4
-      ring_heap          bytes per ring, >= 1024
-      ring_dep_pool      4 .. INT32_MAX
-    Precedence: runtime_env field > PTO2_RING_* env var > compile-time default.
+      scalar (broadcast to every ring):
+        ring_task_window    power of 2 in [4, INT32_MAX]
+        ring_heap           bytes per ring, >= 1024
+        ring_dep_pool       4 .. INT32_MAX
+      per-ring arrays (exactly 4 entries, indexed by scope-depth ring 0..3;
+      a 0 entry falls through to the scalar / env / default tier):
+        ring_task_windows   [w0, w1, w2, w3]
+        ring_heaps          [h0, h1, h2, h3]   bytes per ring
+        ring_dep_pools      [d0, d1, d2, d3]
+    Precedence per resource and ring:
+      per-ring field > scalar field > per-ring env > scalar env > default.
 
 See ../vector_add/main.py for the full L2 lifecycle walk-through; this example
 reuses that kernel verbatim and only varies the per-run ring configuration.
@@ -59,12 +67,36 @@
 N_ELEMS = N_ROWS * N_COLS
 NBYTES = N_ELEMS * 4  # float32
 
+# RuntimeEnv keys a config dict may carry. Scalar keys broadcast one value to
+# every ring; the array keys size the four scope-depth rings independently.
+RING_FIELDS = (
+    "ring_task_window",
+    "ring_heap",
+    "ring_dep_pool",
+    "ring_task_windows",
+    "ring_heaps",
+    "ring_dep_pools",
+)
+
 # (label, runtime_env dict or None). None => no override; falls back to the
 # PTO2_RING_* env var / compile-time default. Same kernel + same inputs run
-# under every sizing, so all three produce identical (correct) output.
+# under every sizing, so all of them produce identical (correct) output.
 RING_CONFIGS = [
-    ("small_ring", {"ring_task_window": 16, "ring_heap": 1 * 1024 * 1024, "ring_dep_pool": 64}),
-    ("large_ring", {"ring_task_window": 128, "ring_heap": 8 * 1024 * 1024, "ring_dep_pool": 256}),
+    # Scalar form: one value broadcast to every ring (the #1042 behavior).
+    ("scalar_small", {"ring_task_window": 16, "ring_heap": 1 * 1024 * 1024, "ring_dep_pool": 64}),
+    ("scalar_large", {"ring_task_window": 128, "ring_heap": 8 * 1024 * 1024, "ring_dep_pool": 256}),
+    # Per-ring form: each scope-depth ring (0..3) sized independently. Ring 0 is
+    # the shallow ring the kernel actually drives, so it gets the most headroom;
+    # the deeper rings taper down. Confirm the effective sizes with
+    # --enable-scope-stats (see scope_stats/scope_stats.jsonl).
+    (
+        "per_ring",
+        {
+            "ring_task_windows": [128, 64, 32, 16],
+            "ring_heaps": [8 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, 1 * 1024 * 1024],
+            "ring_dep_pools": [256, 128, 64, 64],
+        },
+    ),
     ("env_or_default", None),
 ]
 
@@ -115,12 +147,16 @@ def build_chip_callable(platform: str) -> ChipCallable:
 
 
 def _make_config(ring: Optional[dict]) -> CallConfig:
-    """Build a CallConfig, attaching this run's ring sizing under runtime_env."""
+    """Build a CallConfig, attaching this run's ring sizing under runtime_env.
+
+    Sets whichever of the scalar / per-ring keys the dict carries; the same
+    helper serves both the scalar and per-ring configs above.
+    """
     cfg = CallConfig()
     if ring is not None:
-        cfg.runtime_env.ring_task_window = ring["ring_task_window"]
-        cfg.runtime_env.ring_heap = ring["ring_heap"]
-        cfg.runtime_env.ring_dep_pool = ring["ring_dep_pool"]
+        for key in RING_FIELDS:
+            if key in ring:
+                setattr(cfg.runtime_env, key, ring[key])
     return cfg
 
 
diff --git a/examples/workers/l3/per_task_runtime_env/README.md b/examples/workers/l3/per_task_runtime_env/README.md
index bf048b3ed..937d9a7f4 100644
--- a/examples/workers/l3/per_task_runtime_env/README.md
+++ b/examples/workers/l3/per_task_runtime_env/README.md
@@ -1,6 +1,6 @@
 # `per_task_runtime_env/` — distinct ring sizes per L2 in one L3 launch
 
-One L3 orchestration dispatches two L2 tasks, each binding its **own** ring
+One L3 orchestration dispatches several L2 tasks, each binding its **own** ring
 buffers via `CallConfig.runtime_env`. This is the headline use case the
 per-task ring sizing enables: heterogeneous L2 tasks in a single launch that
 each need a different ring footprint.
@@ -11,20 +11,28 @@ Before this knob, every L2 dispatched from one L3 shared the process-wide
 `PTO2_RING_*` env and could not be sized independently. Now each
 `submit_next_level` gets its own `CallConfig`:
 
+Each spec carries one form — scalar keys (`ring_task_window`, …) *or* the
+per-ring arrays (`ring_task_windows`, …) — so the loop sets whichever keys the
+spec contains:
+
 ```python
 def orch_fn(orch, _args, _cfg):
     for spec in L2_TASKS:                      # one entry per L2 task
         cfg = CallConfig()
-        cfg.runtime_env.ring_task_window = spec["ring_task_window"]
-        cfg.runtime_env.ring_heap = spec["ring_heap"]       # bytes per ring
-        cfg.runtime_env.ring_dep_pool = spec["ring_dep_pool"]
+        for key in RING_FIELDS:                # scalar OR per-ring keys
+            if key in spec:                    # a spec carries just one form
+                setattr(cfg.runtime_env, key, spec[key])
         orch.submit_next_level(chip_handle, chip_args, cfg)  # per-task config
 ```
 
 The per-task config travels through the mailbox to the chip child, so each L2
-binds its rings from its own values. The demo dispatches `l2_small`
-(16 / 1 MiB / 64) and `l2_large` (128 / 8 MiB / 256); both run the same
-vector_add and pass golden.
+binds its rings from its own values. The demo dispatches three L2 tasks:
+`l2_scalar_small` (16 / 1 MiB / 64) and `l2_scalar_large` (128 / 8 MiB / 256)
+use the scalar form, and `l2_per_ring` sizes the four scope-depth rings
+independently (`ring_task_windows=[128, 64, 32, 16]`, etc.). All run the same
+vector_add and pass golden. The array fields take exactly four entries (one per
+scope-depth ring `0..3`); a `0` entry falls through to the scalar / env /
+default tier.
 
 ### Derive per-task config from the base, don't rebuild it
 
@@ -39,7 +47,7 @@ any fields the harness injected on the orchestration's base config — otherwise
 
 ```text
 per_task_runtime_env/
-  main.py                 # 2 submit_next_level, one runtime_env each
+  main.py                 # several submit_next_level, one runtime_env each
   test_per_task_runtime_env.py
 ```
 
@@ -51,7 +59,7 @@ The kernel is reused verbatim from `../../l2/vector_add/kernels`.
 python examples/workers/l3/per_task_runtime_env/main.py -p a2a3sim -d 0
 ```
 
-The two L2 tasks run serially on one device. See
+The L2 tasks run serially on one device. See
 [`../multi_chip_dispatch/`](../multi_chip_dispatch/) for the multi-device DAG
 primitives (`worker=i` pinning, `submit_sub`), and
 [`../../l2/per_task_runtime_env/`](../../l2/per_task_runtime_env/) for the
diff --git a/examples/workers/l3/per_task_runtime_env/main.py b/examples/workers/l3/per_task_runtime_env/main.py
index 979313426..df234972f 100644
--- a/examples/workers/l3/per_task_runtime_env/main.py
+++ b/examples/workers/l3/per_task_runtime_env/main.py
@@ -7,8 +7,8 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-"""L3 Worker API demo — one orchestration dispatches two L2 tasks, each sized
-with its OWN ring buffers.
+"""L3 Worker API demo — one orchestration dispatches several L2 tasks, each
+sized with its OWN ring buffers.
 
 This is the headline use case for ``CallConfig.runtime_env``: an L3 fans out
 several heterogeneous L2 tasks in one launch, and each L2 needs a different
@@ -16,6 +16,11 @@
 fine with the default). Before this knob, all L2 tasks in one L3 launch shared
 the process-wide ``PTO2_RING_*`` env and could not be sized independently.
 
+The demo dispatches three L2 tasks: two use the scalar form (one ring value
+broadcast to every ring), and a third uses the per-ring arrays
+(``ring_task_windows`` / ``ring_heaps`` / ``ring_dep_pools``) to size each of
+the four scope-depth rings independently.
+
 The key line is inside ``orch_fn``: each ``submit_next_level`` gets its OWN
 ``CallConfig`` whose ``runtime_env`` is set per task. That per-task config
 travels through the mailbox to the chip child, so each L2 binds its ring
@@ -56,11 +61,24 @@
 N_ROWS = 128
 N_COLS = 128
 
+# RuntimeEnv keys an L2 spec may carry. Scalar keys broadcast one value to every
+# ring; the array keys size the four scope-depth rings (0..3) independently.
+RING_FIELDS = (
+    "ring_task_window",
+    "ring_heap",
+    "ring_dep_pool",
+    "ring_task_windows",
+    "ring_heaps",
+    "ring_dep_pools",
+)
+
 # One entry per L2 task dispatched by the orchestration. Each carries its own
-# ring sizing; the inputs differ so the two golden checks are independent.
+# ring sizing; the inputs differ so the golden checks are independent. The first
+# two tasks use the scalar form (one value per ring); the third uses the
+# per-ring arrays to size each scope-depth ring independently.
 L2_TASKS = [
     {
-        "label": "l2_small",
+        "label": "l2_scalar_small",
         "a": 2.0,
         "b": 3.0,
         "ring_task_window": 16,
@@ -68,20 +86,28 @@
         "ring_dep_pool": 64,
     },
     {
-        "label": "l2_large",
+        "label": "l2_scalar_large",
         "a": 5.0,
         "b": 7.0,
         "ring_task_window": 128,
         "ring_heap": 8 * 1024 * 1024,
         "ring_dep_pool": 256,
     },
+    {
+        "label": "l2_per_ring",
+        "a": 1.0,
+        "b": 4.0,
+        "ring_task_windows": [128, 64, 32, 16],
+        "ring_heaps": [8 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, 1 * 1024 * 1024],
+        "ring_dep_pools": [256, 128, 64, 64],
+    },
 ]
 
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3"])
-    parser.add_argument("-d", "--device", type=int, default=0, help="Single device id; the two L2 tasks run serially.")
+    parser.add_argument("-d", "--device", type=int, default=0, help="Single device id; the L2 tasks run serially.")
     return parser.parse_args()
 
 
@@ -142,9 +168,9 @@ def _l2_config(base: CallConfig, spec: dict) -> CallConfig:
     cfg.enable_dep_gen = base.enable_dep_gen
     cfg.enable_scope_stats = base.enable_scope_stats
     cfg.output_prefix = base.output_prefix
-    cfg.runtime_env.ring_task_window = spec["ring_task_window"]
-    cfg.runtime_env.ring_heap = spec["ring_heap"]
-    cfg.runtime_env.ring_dep_pool = spec["ring_dep_pool"]
+    for key in RING_FIELDS:
+        if key in spec:
+            setattr(cfg.runtime_env, key, spec[key])
     return cfg
 
 
@@ -188,7 +214,7 @@ def orch_fn(orch, _args, _cfg):
                 print(f"[per_task_runtime_env] submit '{spec['label']}': runtime_env={cfg.runtime_env!r}")
                 orch.submit_next_level(chip_handle, chip_args, cfg)
 
-        print("[per_task_runtime_env] running DAG (2 L2 tasks, distinct rings)...")
+        print(f"[per_task_runtime_env] running DAG ({len(L2_TASKS)} L2 tasks, distinct rings)...")
         worker.run(orch_fn, args=None, config=CallConfig())
 
         for i, spec in enumerate(L2_TASKS):