From 39af1a4f690444283cc021782d63f7de7385daf9 Mon Sep 17 00:00:00 2001 From: ChaoZheng109 Date: Tue, 23 Jun 2026 20:28:10 +0800 Subject: [PATCH] docs(examples): demonstrate per-ring runtime_env sizing #1099 added per-ring array fields (ring_task_windows / ring_heaps / ring_dep_pools) alongside the scalar runtime_env knobs, but neither per_task_runtime_env example exercised them. Extend both the L2 and L3 examples to also cover the per-ring form: each scope-depth ring (0..3) sized independently. The config helpers now iterate a RING_FIELDS tuple so a spec dict can carry either the scalar or the array keys, and the READMEs document the full precedence chain and the --enable-scope-stats verification path. --- .../workers/l2/per_task_runtime_env/README.md | 48 ++++++++----- .../workers/l2/per_task_runtime_env/main.py | 68 ++++++++++++++----- .../workers/l3/per_task_runtime_env/README.md | 26 ++++--- .../workers/l3/per_task_runtime_env/main.py | 46 ++++++++++--- 4 files changed, 137 insertions(+), 51 deletions(-) diff --git a/examples/workers/l2/per_task_runtime_env/README.md b/examples/workers/l2/per_task_runtime_env/README.md index f78766103..407df48db 100644 --- a/examples/workers/l2/per_task_runtime_env/README.md +++ b/examples/workers/l2/per_task_runtime_env/README.md @@ -1,40 +1,56 @@ # `per_task_runtime_env/` — per-task ring sizing on one L2 Worker -Runs the same vector_add kernel three times on one L2 `Worker`, each with a -different `CallConfig.runtime_env` (ring buffer sizing). Ring sizing is a -**per-run** knob carried on `CallConfig` — not a process-wide env export. +Runs the same vector_add kernel several times on one L2 `Worker`, each with a +different `CallConfig.runtime_env` (ring buffer sizing) — covering both the +**scalar** form (one value broadcast to every ring) and the **per-ring** form +(each scope-depth ring sized independently). Ring sizing is a **per-run** knob +carried on `CallConfig` — not a process-wide env export. 
## What it shows -`CallConfig.runtime_env` groups the three ring overrides as a distinct config -tier, separate from the top-level execution knobs (`block_dim`, …): +`CallConfig.runtime_env` groups the ring overrides as a distinct config tier, +separate from the top-level execution knobs (`block_dim`, …). Each resource +comes in a scalar field and a 4-entry per-ring array: -| field | unit | constraint | -| ----- | ---- | ---------- | -| `ring_task_window` | tasks | power of 2, >= 4 | -| `ring_heap` | bytes / ring | >= 1024 | -| `ring_dep_pool` | entries | 4 .. INT32_MAX | +| scalar field | per-ring array | unit | constraint (per value) | +| ------------ | -------------- | ---- | ---------------------- | +| `ring_task_window` | `ring_task_windows` | tasks | power of 2 in [4, INT32_MAX] | +| `ring_heap` | `ring_heaps` | bytes / ring | >= 1024 | +| `ring_dep_pool` | `ring_dep_pools` | entries | 4 .. INT32_MAX | -Precedence per value: **`runtime_env` field > `PTO2_RING_*` env var > -compile-time default**. A field left at 0 (or omitted) falls back to the env -var / default. +The array fields must contain exactly **4 entries**, indexed by scope-depth +ring `0..3` (depth `>=3` maps to ring 3). A `0` entry — or a field left unset — +falls through to the next precedence tier: + +```text +per-ring field > scalar field > per-ring env > scalar env > compile-time default +``` ```python cfg = CallConfig() +# Scalar: one value broadcast to every ring. cfg.runtime_env.ring_task_window = 128 cfg.runtime_env.ring_heap = 8 * 1024 * 1024 # bytes per ring cfg.runtime_env.ring_dep_pool = 256 + +# Per-ring: size rings 0..3 independently (overrides the scalar tier per ring). 
+cfg.runtime_env.ring_task_windows = [128, 64, 32, 16] +cfg.runtime_env.ring_heaps = [8 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, 1 * 1024 * 1024] +cfg.runtime_env.ring_dep_pools = [256, 128, 64, 64] worker.run(chip_handle, args, cfg) ``` -The three runs (`small_ring`, `large_ring`, `env_or_default`) compute the same -vector add and all pass golden — only the ring footprint differs. +The runs (`scalar_small`, `scalar_large`, `per_ring`, `env_or_default`) compute +the same vector add and all pass golden — only the ring footprint differs. +Confirm the effective per-ring sizes with `--enable-scope-stats` (the first line +of `scope_stats/scope_stats.jsonl` reports `task_window_max` / `heap_max` / +`dep_pool_max`, indexed by `ring`). ## Layout ```text per_task_runtime_env/ - main.py # 3 runs, one CallConfig.runtime_env each + main.py # 4 runs, one CallConfig.runtime_env each test_per_task_runtime_env.py ``` diff --git a/examples/workers/l2/per_task_runtime_env/main.py b/examples/workers/l2/per_task_runtime_env/main.py index fba29fe4b..232d74e64 100644 --- a/examples/workers/l2/per_task_runtime_env/main.py +++ b/examples/workers/l2/per_task_runtime_env/main.py @@ -9,17 +9,25 @@ # ----------------------------------------------------------------------------------------------------------- """L2 Worker API demo — per-task ring sizing via ``CallConfig.runtime_env``. -Runs the same vector_add kernel three times on one L2 Worker, each time with a -different ``CallConfig.runtime_env`` (ring buffer sizing). Ring sizing is a -per-run knob carried on ``CallConfig`` — no process-wide ``PTO2_RING_*`` env -export needed, and each ``worker.run`` binds its ring buffers from the config -it was handed. +Runs the same vector_add kernel several times on one L2 Worker, each time with +a different ``CallConfig.runtime_env`` (ring buffer sizing) — covering both the +scalar form (one value broadcast to every ring) and the per-ring form (each +scope-depth ring sized independently). 
Ring sizing is a per-run knob carried on +``CallConfig`` — no process-wide ``PTO2_RING_*`` env export needed, and each +``worker.run`` binds its ring buffers from the config it was handed. runtime_env fields (0 / unset => fall back to env var / compile default): - ring_task_window power of 2, >= 4 - ring_heap bytes per ring, >= 1024 - ring_dep_pool 4 .. INT32_MAX - Precedence: runtime_env field > PTO2_RING_* env var > compile-time default. + scalar (broadcast to every ring): + ring_task_window power of 2 in [4, INT32_MAX] + ring_heap bytes per ring, >= 1024 + ring_dep_pool 4 .. INT32_MAX + per-ring arrays (exactly 4 entries, indexed by scope-depth ring 0..3; + a 0 entry falls through to the scalar / env / default tier): + ring_task_windows [w0, w1, w2, w3] + ring_heaps [h0, h1, h2, h3] bytes per ring + ring_dep_pools [d0, d1, d2, d3] + Precedence per resource and ring: + per-ring field > scalar field > per-ring env > scalar env > default. See ../vector_add/main.py for the full L2 lifecycle walk-through; this example reuses that kernel verbatim and only varies the per-run ring configuration. @@ -59,12 +67,36 @@ N_ELEMS = N_ROWS * N_COLS NBYTES = N_ELEMS * 4 # float32 +# RuntimeEnv keys a config dict may carry. Scalar keys broadcast one value to +# every ring; the array keys size the four scope-depth rings independently. +RING_FIELDS = ( + "ring_task_window", + "ring_heap", + "ring_dep_pool", + "ring_task_windows", + "ring_heaps", + "ring_dep_pools", +) + # (label, runtime_env dict or None). None => no override; falls back to the # PTO2_RING_* env var / compile-time default. Same kernel + same inputs run -# under every sizing, so all three produce identical (correct) output. +# under every sizing, so all of them produce identical (correct) output. 
RING_CONFIGS = [ - ("small_ring", {"ring_task_window": 16, "ring_heap": 1 * 1024 * 1024, "ring_dep_pool": 64}), - ("large_ring", {"ring_task_window": 128, "ring_heap": 8 * 1024 * 1024, "ring_dep_pool": 256}), + # Scalar form: one value broadcast to every ring (the #1042 behavior). + ("scalar_small", {"ring_task_window": 16, "ring_heap": 1 * 1024 * 1024, "ring_dep_pool": 64}), + ("scalar_large", {"ring_task_window": 128, "ring_heap": 8 * 1024 * 1024, "ring_dep_pool": 256}), + # Per-ring form: each scope-depth ring (0..3) sized independently. Ring 0 is + # the shallow ring the kernel actually drives, so it gets the most headroom; + # the deeper rings taper down. Confirm the effective sizes with + # --enable-scope-stats (see scope_stats/scope_stats.jsonl). + ( + "per_ring", + { + "ring_task_windows": [128, 64, 32, 16], + "ring_heaps": [8 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, 1 * 1024 * 1024], + "ring_dep_pools": [256, 128, 64, 64], + }, + ), ("env_or_default", None), ] @@ -115,12 +147,16 @@ def build_chip_callable(platform: str) -> ChipCallable: def _make_config(ring: Optional[dict]) -> CallConfig: - """Build a CallConfig, attaching this run's ring sizing under runtime_env.""" + """Build a CallConfig, attaching this run's ring sizing under runtime_env. + + Sets whichever of the scalar / per-ring keys the dict carries; the same + helper serves both the scalar and per-ring configs above. 
+ """ cfg = CallConfig() if ring is not None: - cfg.runtime_env.ring_task_window = ring["ring_task_window"] - cfg.runtime_env.ring_heap = ring["ring_heap"] - cfg.runtime_env.ring_dep_pool = ring["ring_dep_pool"] + for key in RING_FIELDS: + if key in ring: + setattr(cfg.runtime_env, key, ring[key]) return cfg diff --git a/examples/workers/l3/per_task_runtime_env/README.md b/examples/workers/l3/per_task_runtime_env/README.md index bf048b3ed..937d9a7f4 100644 --- a/examples/workers/l3/per_task_runtime_env/README.md +++ b/examples/workers/l3/per_task_runtime_env/README.md @@ -1,6 +1,6 @@ # `per_task_runtime_env/` — distinct ring sizes per L2 in one L3 launch -One L3 orchestration dispatches two L2 tasks, each binding its **own** ring +One L3 orchestration dispatches several L2 tasks, each binding its **own** ring buffers via `CallConfig.runtime_env`. This is the headline use case the per-task ring sizing enables: heterogeneous L2 tasks in a single launch that each need a different ring footprint. @@ -11,20 +11,28 @@ Before this knob, every L2 dispatched from one L3 shared the process-wide `PTO2_RING_*` env and could not be sized independently. 
Now each `submit_next_level` gets its own `CallConfig`: +Each spec carries one form — scalar keys (`ring_task_window`, …) *or* the +per-ring arrays (`ring_task_windows`, …) — so the loop sets whichever keys the +spec contains: + ```python def orch_fn(orch, _args, _cfg): for spec in L2_TASKS: # one entry per L2 task cfg = CallConfig() - cfg.runtime_env.ring_task_window = spec["ring_task_window"] - cfg.runtime_env.ring_heap = spec["ring_heap"] # bytes per ring - cfg.runtime_env.ring_dep_pool = spec["ring_dep_pool"] + for key in RING_FIELDS: # scalar OR per-ring keys + if key in spec: # a spec carries just one form + setattr(cfg.runtime_env, key, spec[key]) orch.submit_next_level(chip_handle, chip_args, cfg) # per-task config ``` The per-task config travels through the mailbox to the chip child, so each L2 -binds its rings from its own values. The demo dispatches `l2_small` -(16 / 1 MiB / 64) and `l2_large` (128 / 8 MiB / 256); both run the same -vector_add and pass golden. +binds its rings from its own values. The demo dispatches three L2 tasks: +`l2_scalar_small` (16 / 1 MiB / 64) and `l2_scalar_large` (128 / 8 MiB / 256) +use the scalar form, and `l2_per_ring` sizes the four scope-depth rings +independently (`ring_task_windows=[128, 64, 32, 16]`, etc.). All run the same +vector_add and pass golden. The array fields take exactly four entries (one per +scope-depth ring `0..3`); a `0` entry falls through to the scalar / env / +default tier. ### Derive per-task config from the base, don't rebuild it @@ -39,7 +47,7 @@ any fields the harness injected on the orchestration's base config — otherwise ```text per_task_runtime_env/ - main.py # 2 submit_next_level, one runtime_env each + main.py # several submit_next_level, one runtime_env each test_per_task_runtime_env.py ``` @@ -51,7 +59,7 @@ The kernel is reused verbatim from `../../l2/vector_add/kernels`. 
python examples/workers/l3/per_task_runtime_env/main.py -p a2a3sim -d 0 ``` -The two L2 tasks run serially on one device. See +The L2 tasks run serially on one device. See [`../multi_chip_dispatch/`](../multi_chip_dispatch/) for the multi-device DAG primitives (`worker=i` pinning, `submit_sub`), and [`../../l2/per_task_runtime_env/`](../../l2/per_task_runtime_env/) for the diff --git a/examples/workers/l3/per_task_runtime_env/main.py b/examples/workers/l3/per_task_runtime_env/main.py index 979313426..df234972f 100644 --- a/examples/workers/l3/per_task_runtime_env/main.py +++ b/examples/workers/l3/per_task_runtime_env/main.py @@ -7,8 +7,8 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""L3 Worker API demo — one orchestration dispatches two L2 tasks, each sized -with its OWN ring buffers. +"""L3 Worker API demo — one orchestration dispatches several L2 tasks, each +sized with its OWN ring buffers. This is the headline use case for ``CallConfig.runtime_env``: an L3 fans out several heterogeneous L2 tasks in one launch, and each L2 needs a different @@ -16,6 +16,11 @@ fine with the default). Before this knob, all L2 tasks in one L3 launch shared the process-wide ``PTO2_RING_*`` env and could not be sized independently. +The demo dispatches three L2 tasks: two use the scalar form (one ring value +broadcast to every ring), and a third uses the per-ring arrays +(``ring_task_windows`` / ``ring_heaps`` / ``ring_dep_pools``) to size each of +the four scope-depth rings independently. + The key line is inside ``orch_fn``: each ``submit_next_level`` gets its OWN ``CallConfig`` whose ``runtime_env`` is set per task. 
That per-task config travels through the mailbox to the chip child, so each L2 binds its ring @@ -56,11 +61,24 @@ N_ROWS = 128 N_COLS = 128 +# RuntimeEnv keys an L2 spec may carry. Scalar keys broadcast one value to every +# ring; the array keys size the four scope-depth rings (0..3) independently. +RING_FIELDS = ( + "ring_task_window", + "ring_heap", + "ring_dep_pool", + "ring_task_windows", + "ring_heaps", + "ring_dep_pools", +) + # One entry per L2 task dispatched by the orchestration. Each carries its own -# ring sizing; the inputs differ so the two golden checks are independent. +# ring sizing; the inputs differ so the golden checks are independent. The first +# two tasks use the scalar form (one value for all rings); the third uses the +# per-ring arrays to size each scope-depth ring independently. L2_TASKS = [ { - "label": "l2_small", + "label": "l2_scalar_small", "a": 2.0, "b": 3.0, "ring_task_window": 16, @@ -68,20 +86,28 @@ "ring_dep_pool": 64, }, { - "label": "l2_large", + "label": "l2_scalar_large", "a": 5.0, "b": 7.0, "ring_task_window": 128, "ring_heap": 8 * 1024 * 1024, "ring_dep_pool": 256, }, + { + "label": "l2_per_ring", + "a": 1.0, + "b": 4.0, + "ring_task_windows": [128, 64, 32, 16], + "ring_heaps": [8 * 1024 * 1024, 4 * 1024 * 1024, 2 * 1024 * 1024, 1 * 1024 * 1024], + "ring_dep_pools": [256, 128, 64, 64], + }, ] def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-p", "--platform", required=True, choices=["a2a3sim", "a2a3"]) - parser.add_argument("-d", "--device", type=int, default=0, help="Single device id; the two L2 tasks run serially.") + parser.add_argument("-d", "--device", type=int, default=0, help="Single device id; the L2 tasks run serially.") return parser.parse_args() @@ -142,9 +168,9 @@ def _l2_config(base: CallConfig, spec: dict) -> CallConfig: cfg.enable_dep_gen = base.enable_dep_gen cfg.enable_scope_stats = 
base.enable_scope_stats cfg.output_prefix = base.output_prefix - cfg.runtime_env.ring_task_window = spec["ring_task_window"] - cfg.runtime_env.ring_heap = spec["ring_heap"] - cfg.runtime_env.ring_dep_pool = spec["ring_dep_pool"] + for key in RING_FIELDS: + if key in spec: + setattr(cfg.runtime_env, key, spec[key]) return cfg @@ -188,7 +214,7 @@ def orch_fn(orch, _args, _cfg): print(f"[per_task_runtime_env] submit '{spec['label']}': runtime_env={cfg.runtime_env!r}") orch.submit_next_level(chip_handle, chip_args, cfg) - print("[per_task_runtime_env] running DAG (2 L2 tasks, distinct rings)...") + print(f"[per_task_runtime_env] running DAG ({len(L2_TASKS)} L2 tasks, distinct rings)...") worker.run(orch_fn, args=None, config=CallConfig()) for i, spec in enumerate(L2_TASKS):