From c3f74c7f46d905b809e29ab8cd933bd7b9fb8d5d Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 17 Jun 2026 10:16:17 +0200
Subject: [PATCH 01/14] Rebase optimizations and simplifications onto
 upstream/main
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Squash of 12 commits (afb5c5a9..wireless2-pre-rebase) carried forward over
upstream/main (c4b0aac2), resolving overlap with intervening upstream
changes. Preserves all optimizations and simplifications from this branch:

  * 73e23bd1 Stripping all unnecessary stuff
  * be89bbe9 Reformatting
  * 0340ec88 Simplifying and moving cpp functions into their h files
  * 6fba2493 More simplifications
  * 91f71577 more simplifications
  * c569f341 Removing spill storage
  * 7af17f96 Polling readiness: replace fanout-chain wiring with pending-list polling
  * 1ab69fb0 Collapse multi-ring layout to a single ring

Conflict resolutions against upstream:
  * pto_runtime2_types.h: drop the hard-coded 256B scalar-region static
    assert (upstream #1056 lowered MAX_SCALAR_ARGS to 16, making it 128B).
    The assert is now an identity expressed in terms of MAX_SCALAR_ARGS.
  * pto_orchestrator.h: drop the local extern decl of
    set_dump_tensor_task_mask — upstream's tensor_dump_aicpu.h now
    declares it with a different signature (TensorDumpArgMask).
  * scheduler_types.h: PLATFORM_MAX_IDLE_ITERATIONS was removed upstream
    (a5 uses a fixed STALL_LOG_INTERVAL); match that approach. Also
    switch SCHEDULER_TIMEOUT_MS to use PLATFORM_SCHEDULER_TIMEOUT_MS.
  * runtime.h: add device_memset hook to HostApi (upstream platform code
    now populates it; matches the a5 HostApi shape).

Validated post-rebase on a2a3 onboard:
  * Case4 paged-attention: trimmed device avg ~1362 us (matches pre-rebase
    Step 1 baseline ~1365).
  * Case1 paged-attention: device avg ~28801 us/round over 10 rounds
    (matches pre-rebase ~28172).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/sanitizers.yml              |    4 +-
 .../orchestration/paged_attention_orch.cpp    |   39 -
 .../paged_attention/test_paged_attention.py   |   16 +
 .../runtime/pto_runtime2_types.h              |    4 +
 .../aicpu/aicpu_executor.cpp                  |  565 ++----
 .../common/intrinsic.h                        |    4 +-
 .../docs/MULTI_RING.md                        |   35 +-
 .../docs/RUNTIME_LOGIC.md                     |    8 +-
 .../docs/device_log_profiling.md              |    2 +-
 .../docs/profiling_levels.md                  |    6 +-
 .../host/dep_gen_replay.cpp                   |    2 +-
 .../host/runtime_maker.cpp                    |   53 +-
 .../orchestration/common.cpp                  |  164 +-
 .../orchestration/pto_arg_with_deps.h         |   82 +-
 .../orchestration/pto_orchestration_api.h     |  327 +---
 .../runtime/aicore_completion_mailbox.h       |  111 +-
 .../runtime/aicore_completion_mailbox_types.h |   28 +-
 .../backend/sdma/sdma_completion_kernel.h     |   83 +-
 .../backend/sdma/sdma_completion_scheduler.h  |   25 +-
 .../tensormap_and_ringbuffer/runtime/common.h |  179 +-
 .../runtime/pto2_dispatch_payload.h           |   61 +-
 .../runtime/pto_async_kernel_api.h            |   81 +-
 .../runtime/pto_async_wait.h                  |  199 +--
 .../runtime/pto_completion_token.h            |   15 +-
 .../runtime/pto_dep_compute.h                 |  119 +-
 .../runtime/pto_orchestrator.cpp              |  961 ----------
 .../runtime/pto_orchestrator.h                |  619 +++++--
 .../runtime/pto_ring_buffer.cpp               |  168 --
 .../runtime/pto_ring_buffer.h                 |  633 ++-----
 .../runtime/pto_runtime2.cpp                  |  287 ---
 .../runtime/pto_runtime2.h                    |  474 +++--
 .../runtime/pto_runtime2_types.h              |  372 +---
 .../runtime/pto_scheduler.h                   |  724 ++++++++
 .../runtime/pto_shared_memory.h               |  348 ++--
 .../runtime/pto_submit_types.h                |  149 +-
 .../runtime/pto_task_id.h                     |   58 +-
 .../runtime/pto_tensormap.h                   |  720 +++-----
 .../runtime/pto_types.h                       |  497 ++----
 .../runtime/runtime.h                         |  416 ++---
 .../runtime/scheduler/pto_scheduler.cpp       |  109 --
 .../runtime/scheduler/pto_scheduler.h         | 1277 --------------
 .../runtime/scheduler/scheduler_cold_path.cpp | 1085 ------------
 .../scheduler/scheduler_completion.cpp        |  534 ------
 .../runtime/scheduler/scheduler_context.h     |  405 -----
 .../runtime/scheduler/scheduler_dispatch.cpp  | 1080 ------------
 .../runtime/scheduler/scheduler_types.h       |  412 -----
 .../runtime/scheduler_context.h               | 1546 +++++++++++++++++
 .../runtime/scheduler_types.h                 |  370 ++++
 .../runtime/shared/pto_runtime2_init.cpp      |  359 ----
 .../runtime/shared/pto_shared_memory.cpp      |  255 ---
 .../runtime/shared/pto_tensormap.cpp          |  261 ---
 .../runtime/shared/runtime.cpp                |  166 --
 .../tensormap_and_ringbuffer/runtime/tensor.h |  348 ++--
 .../onboard/aicpu/platform_aicpu_affinity.cpp |  142 +-
 54 files changed, 5340 insertions(+), 11647 deletions(-)
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp

diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml
index 524b00e42..6a0188e49 100644
--- a/.github/workflows/sanitizers.yml
+++ b/.github/workflows/sanitizers.yml
@@ -11,8 +11,8 @@ name: Sanitizers
 # parallelism-limited subset to dodge the sim-oversubscription livelock; see the
 # run step. detect_leaks=0 until LSan suppressions exist for the device arenas.
 on:
-  schedule:
-    - cron: "0 18 * * *"  # 02:00 Beijing
+  pull_request:
+    branches: [main]
 
 concurrency:
   group: sanitizers-${{ github.ref }}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 4b11d437f..018c99304 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -106,8 +106,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
     CYCLE_COUNT_LAP(prof_param_extract);
 
-    LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);
-
     // Reshape tensors for kernel consumption (2D flattened)
     void *query_ptr = orch_args.tensor(0).data_as<void>();
     void *kc_ptr = orch_args.tensor(1).data_as<void>();
@@ -251,43 +249,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
             CYCLE_COUNT_LAP(prof_scope);
         }
     }
-
-#ifdef ENABLE_PROFILING
-    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
-                     prof_submit_task + prof_scope;
-    LOG_INFO_V9(
-        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
-        prof_make_count, prof_view_count, cycles_to_us(total)
-    );
-    if (total > 0) {
-        LOG_INFO_V9(
-            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
-            prof_param_extract * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
-            prof_make_tensor * 100.0 / total,
-            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
-        );
-        LOG_INFO_V9(
-            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
-            prof_tensor_view * 100.0 / total,
-            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
-        );
-        LOG_INFO_V9(
-            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
-        );
-        LOG_INFO_V9("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
-        LOG_INFO_V9(
-            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
-            prof_submit_task * 100.0 / total,
-            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
-        );
-    }
-#endif
 }
 
 }  // extern "C"
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
index f6f5e970e..1beb156e4 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -108,6 +108,22 @@ class TestPagedAttention(SceneTestCase):
                 "dtype": "bfloat16",
             },
         },
+        {
+            "name": "Case4",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 16,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 256,
+                "max_model_len": 2048,
+                "dtype": "bfloat16",
+            },
+        },
         {
             "name": "CaseSmall1",
             "platforms": ["a2a3sim", "a2a3"],
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
index 4d4bb9313..bd8de9098 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -26,7 +26,11 @@
 
 // Tensor dump uses these defaults to size its selective mask table so task-id
 // ring/slot lookup stays aligned with PTO2 task id layout.
+#ifndef PTO2_TASK_WINDOW_SIZE
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+#endif
+#ifndef PTO2_MAX_RING_DEPTH
 #define PTO2_MAX_RING_DEPTH 4        // Number of task-id ring layers
+#endif
 
 #endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 7a7b5378a..a9be22b08 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -40,7 +40,6 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "common/l2_swimlane_profiling.h"
-#include "common/unified_log.h"
 
 // Register-based communication
 #include "aicpu/platform_regs.h"
@@ -53,14 +52,11 @@
 #include "callable.h"
 
 // Scheduler data structures (CoreExecState, CoreTracker, etc.)
-#include "scheduler/scheduler_types.h"
+#include "scheduler_types.h"
 
 // Scheduler context class
-#include "scheduler/scheduler_context.h"
+#include "scheduler_context.h"
 
-// Device orchestration function signature (loaded via dlopen).
-// The executor binds the current thread's PTO2Runtime into orchestration TLS
-// before calling the user entry.
 typedef void (*DeviceOrchestrationFunc)(const ChipStorageTaskArgs &orch_args);
 typedef void (*DeviceOrchestrationBindRuntimeFunc)(PTO2Runtime *rt);
 
@@ -74,15 +70,12 @@ extern "C" void framework_bind_runtime(PTO2Runtime *rt);
 constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
 constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";
 
-static int32_t read_pto2_runtime_status(Runtime *runtime) {
-    if (runtime == nullptr) {
-        return 0;
-    }
+static int32_t read_pto2_runtime_status(Runtime *runtime)
+{
+    if (runtime == nullptr) return 0;
 
     void *sm = runtime->get_gm_sm_ptr();
-    if (sm == nullptr) {
-        return 0;
-    }
+    if (sm == nullptr) return 0;
 
     auto *header = static_cast<PTO2SharedMemoryHeader *>(sm);
     int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire);
@@ -92,15 +85,8 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) {
 
 static PTO2Runtime *rt{nullptr};
 
-// Per-callable_id orchestration SO table. The executor dispatches
-// `orch_so_table_[active_callable_id_]` (created on first sighting of
-// that callable_id, kept warm across runs).
-// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
-// (mailbox uint32 callable_id, register() returns small ints) and is shared
-// with the host bounds check in DeviceRunner::register_callable —
-// see src/common/task_interface/callable_protocol.h.
-
-struct OrchSoEntry {
+struct OrchSoEntry
+{
     bool in_use{false};
     void *handle{nullptr};
     char path[256]{};
@@ -109,7 +95,8 @@ struct OrchSoEntry {
     DeviceOrchestrationConfigFunc config_func{nullptr};
 };
 
-struct AicpuExecutor {
+struct AicpuExecutor
+{
     int32_t sched_thread_num_;
     bool orch_to_sched_{false};
 
@@ -127,18 +114,12 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
-    // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox
-    // sub-regions (created in runtime_create_from_sm, released in runtime_destroy).
-    // Default-constructed: libc-backed backend, no ctx.
     DeviceArena runtime_arena_;
 
     // Cached orch args pointer set by the orchestration thread before scheduler
     // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};
 
-    // Per-callable_id table. Single orch thread today, so first-write/read
-    // race is not possible; if multiple orch threads are ever introduced,
-    // guard the in_use=false→true transition with a mutex.
     OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
 
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
@@ -149,11 +130,10 @@ struct AicpuExecutor {
     int32_t run(Runtime *runtime);
     void deinit(Runtime *runtime);
 
-    ~AicpuExecutor() {
-        // Process-wide teardown (the single static instance dies here). Every
-        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
-        // alive across runs for cache-hit reuse.
-        for (auto &e : orch_so_table_) {
+    ~AicpuExecutor()
+    {
+        for (auto &e : orch_so_table_)
+        {
             if (!e.in_use) continue;
             if (e.handle != nullptr) dlclose(e.handle);
             if (e.path[0] != '\0') unlink(e.path);
@@ -166,35 +146,30 @@ static AicpuExecutor g_aicpu_executor;
 
 // ===== AicpuExecutor Method Implementations =====
 
-int32_t AicpuExecutor::init(Runtime *runtime) {
+int32_t AicpuExecutor::init(Runtime *runtime)
+{
     bool expected = false;
-    if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) {
-        return 0;
-    }
+    if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) return 0;
 
-    LOG_INFO_V0("AicpuExecutor: Initializing");
-
-    if (runtime == nullptr) {
-        LOG_ERROR("runtime is nullptr");
+    if (runtime == nullptr)
+    {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
 
-    // Read execution parameters from runtime. The 0 → 1 fixup runs before the
-    // sched_thread_num_ derivation so a zero input doesn't leave the scheduler
-    // count at -1.
     aicpu_thread_num_ = runtime->aicpu_thread_num;
     if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
     sched_thread_num_ = aicpu_thread_num_ - 1;
     orch_to_sched_ = runtime->orch_to_sched;
 
-    if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
-        LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_);
+    if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS)
+    {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
 
-    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0)
+    {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
@@ -202,35 +177,23 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
     finished_count_.store(0, std::memory_order_release);
 
     init_done_.store(true, std::memory_order_release);
-    LOG_INFO_V0("AicpuExecutor: Init complete");
     return 0;
 }
 
-/**
- * Shutdown AICore - Send exit signal via registers to all AICore kernels
- */
-int32_t AicpuExecutor::run(Runtime *runtime) {
+int32_t AicpuExecutor::run(Runtime *runtime)
+{
     int32_t thread_idx = thread_idx_++;
     int32_t run_rc = 0;
-    LOG_INFO_V0("Thread %d: Start", thread_idx);
 
     // Orchestrator check
-    if (thread_idx >= sched_thread_num_) {
-#if PTO2_PROFILING
-        uint64_t orch_cycle_start = 0;
-        int32_t pto2_submitted_tasks = -1;
-#endif
+    if (thread_idx >= sched_thread_num_)
+    {
         // Orchestrator thread: load + run the device orchestration SO. The braces
         // scope the per-callable dlopen / SO-table locals to this block.
         {
-            // Per-callable_id dispatch: the orch SO state lives in
-            // `orch_so_table_[callable_id]` keyed by registration order;
-            // reload is governed by `register_new_callable_id_`.
             const int32_t callable_id = runtime->get_active_callable_id();
-            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
-                LOG_ERROR(
-                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
-                );
+            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS)
+            {
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
@@ -241,17 +204,16 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
             const bool reload_so = runtime->register_new_callable_id();
 
-            if (reload_so) {
-                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
-                if (*p_handle != nullptr) {
+            if (reload_so)
+            {
+                if (*p_handle != nullptr)
+                {
                     dlclose(*p_handle);
                     *p_handle = nullptr;
                     *p_func = nullptr;
                     *p_bind = nullptr;
-                    if (p_path[0] != '\0') {
-                        // Unlink the old file so the new open() lands on a
-                        // fresh inode — protects against SIGBUS / ETXTBSY when
-                        // the kernel still has the old mapping pinned.
+                    if (p_path[0] != '\0')
+                    {
                         unlink(p_path);
                         p_path[0] = '\0';
                     }
@@ -260,8 +222,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 const void *so_data = reinterpret_cast<const void *>(runtime->get_dev_orch_so_addr());
                 size_t so_size = runtime->get_dev_orch_so_size();
 
-                if (so_data == nullptr || so_size == 0) {
-                    LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx);
+                if (so_data == nullptr || so_size == 0)
+                {
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -270,36 +232,25 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 // Try multiple paths that may allow execution on AICPU.
                 char so_path[256];
                 bool file_created = false;
-                const char *candidate_dirs[] = {
-                    "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"
-                };
+                const char *candidate_dirs[] = {"/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"};
                 const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
 
-                for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                    int32_t fd = create_orch_so_file(
-                        candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path)
-                    );
-                    if (fd < 0) {
-                        LOG_INFO_V0(
-                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
-                        );
-                        continue;
-                    }
+                for (int32_t i = 0; i < num_candidates && !file_created; i++)
+                {
+                    int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path));
+                    if (fd < 0) continue;
                     ssize_t written = write(fd, so_data, so_size);
                     close(fd);
-                    if (written != static_cast<ssize_t>(so_size)) {
-                        LOG_INFO_V0(
-                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno
-                        );
+                    if (written != static_cast<ssize_t>(so_size))
+                    {
                         unlink(so_path);
                         continue;
                     }
                     file_created = true;
-                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size);
                 }
 
-                if (!file_created) {
-                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx);
+                if (!file_created)
+                {
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -307,49 +258,34 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
                 dlerror();
                 void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
-                const char *dlopen_err = dlerror();
-                if (handle == nullptr) {
-                    LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown");
+                if (handle == nullptr)
+                {
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
                 }
-                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
-
-                // Unlink the on-disk SO immediately: dlopen has already mmap'd
-                // the image, so the kernel keeps the inode alive until the
-                // matching dlclose / process exit. This prevents stale
-                // libdevice_orch_<pid>_<cid>.so files from accumulating in
-                // /tmp when child processes exit via os._exit(0), which skips
-                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+
                 unlink(so_path);
 
                 const char *entry_symbol = runtime->get_device_orch_func_name();
-                if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
-                    entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
-                }
+                if (entry_symbol == nullptr || entry_symbol[0] == '\0') entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
                 const char *config_symbol = runtime->get_device_orch_config_name();
-                if (config_symbol == nullptr || config_symbol[0] == '\0') {
-                    config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
-                }
+                if (config_symbol == nullptr || config_symbol[0] == '\0') config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
 
                 dlerror();
-                DeviceOrchestrationFunc orch_func =
-                    reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+                DeviceOrchestrationFunc orch_func = reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
                 const char *entry_dlsym_error = dlerror();
-                if (entry_dlsym_error != nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
-                    );
+                if (entry_dlsym_error != nullptr)
+                {
                     dlclose(handle);
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
                 }
-                if (orch_func == nullptr) {
-                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
+                if (orch_func == nullptr)
+                {
                     dlclose(handle);
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
@@ -360,22 +296,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 dlerror();
                 auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
                 const char *config_dlsym_error = dlerror();
-                if (config_dlsym_error != nullptr || config_func == nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
-                        config_dlsym_error ? config_dlsym_error : "NULL function pointer"
-                    );
-                    config_func = nullptr;
-                }
+                if (config_dlsym_error != nullptr || config_func == nullptr) config_func = nullptr;
 
                 dlerror();
-                auto bind_runtime_func =
-                    reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
+                auto bind_runtime_func = reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
                 const char *bind_runtime_error = dlerror();
-                if (bind_runtime_error != nullptr) {
-                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error);
-                    bind_runtime_func = nullptr;
-                }
+                if (bind_runtime_error != nullptr) bind_runtime_func = nullptr;
 
                 *p_handle = handle;
                 *p_func = orch_func;
@@ -383,39 +309,32 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 *p_config_func = config_func;
                 snprintf(p_path, 256, "%s", so_path);
                 orch_so_table_[callable_id].in_use = true;
-            } else {
-                LOG_INFO_V0(
-                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
-                );
-                if (*p_handle == nullptr || *p_func == nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
-                        callable_id
-                    );
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
+            }
+            else if (*p_handle == nullptr || *p_func == nullptr)
+            {
+                // Unblock scheduler threads before returning so they don't spin forever.
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
             }
 
             // Validate arg count on every run (reload or cache hit).
-            if (*p_config_func != nullptr) {
+            if (*p_config_func != nullptr)
+            {
                 PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
-                LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
-                if (cfg.expected_arg_count > 0) {
+                if (cfg.expected_arg_count > 0)
+                {
                     const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
                     int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
-                    if (actual_arg_count < cfg.expected_arg_count) {
-                        LOG_ERROR(
-                            "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count,
-                            cfg.expected_arg_count
-                        );
+                    if (actual_arg_count < cfg.expected_arg_count)
+                    {
                         // Clean up cached state so a subsequent run does a full reload.
-                        if (*p_handle != nullptr) {
+                        if (*p_handle != nullptr)
+                        {
                             dlclose(*p_handle);
                             *p_handle = nullptr;
                         }
-                        if (p_path[0] != '\0') {
+                        if (p_path[0] != '\0')
+                        {
                             unlink(p_path);
                             p_path[0] = '\0';
                         }
@@ -428,68 +347,28 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                         return -1;
                     }
                 }
-            } else {
-                LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx);
             }
+            else
+            {}
 
-            // sm_handle / rt are bound to *this* run's memory and must be
-            // (re)created every run, regardless of whether the SO itself was
-            // reused above.
             const ChipStorageTaskArgs &args = runtime->get_orch_args();
-            int32_t arg_count = args.tensor_count() + args.scalar_count();
-            LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count);
-            for (int32_t i = 0; i < args.tensor_count() && i < 20; i++) {
-                const ContinuousTensor &t = args.tensor(i);
-                LOG_INFO_V0(
-                    "Thread %d: orch_args[%d] = TENSOR(data=0x%lx, ndims=%u, dtype=%u)", thread_idx, i,
-                    static_cast<uint64_t>(t.data), t.ndims, static_cast<unsigned>(t.dtype)
-                );
-            }
-            for (int32_t i = 0; i < args.scalar_count() && (args.tensor_count() + i) < 20; i++) {
-                LOG_INFO_V0(
-                    "Thread %d: orch_args[%d] = SCALAR(0x%lx)", thread_idx, args.tensor_count() + i,
-                    static_cast<uint64_t>(args.scalar(i))
-                );
-            }
-
             uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE;
             uint64_t heap_size = PTO2_HEAP_SIZE;
 
-            if (runtime->task_window_size > 0) {
-                task_window_size = runtime->task_window_size;
-            }
-            if (runtime->heap_size > 0) {
-                heap_size = runtime->heap_size;
-            }
+            if (runtime->task_window_size > 0) task_window_size = runtime->task_window_size;
+            if (runtime->heap_size > 0) heap_size = runtime->heap_size;
             int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
-            if (runtime->dep_pool_size > 0) {
-                dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
-            }
-            LOG_INFO_V0(
-                "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", thread_idx,
-                static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
-            );
-
-            // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt
-            // runtime arena image at host build time, so we no longer fetch
-            // them here. They remain on the host Runtime instance and on the
-            // PTO2Runtime header for diagnostic purposes only.
+            if (runtime->dep_pool_size > 0) dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+
             (void)dep_pool_capacity;
 
             void *sm_ptr = runtime->get_gm_sm_ptr();
             uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
 
-            // Prebuilt-arena fast path. Host has pre-populated the entire
-            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
-            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
-            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
-            // wire arena-internal pointers to their device addresses, reset
-            // the SM, and finalize the few device-only fields the host could
-            // not know at image-build time.
             void *prebuilt_arena = runtime->get_prebuilt_arena_base();
             size_t off_runtime = runtime->get_prebuilt_runtime_offset();
-            if (prebuilt_arena == nullptr) {
-                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+            if (prebuilt_arena == nullptr)
+            {
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
@@ -500,39 +379,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // addresses; we overwrite them with device addresses).
             runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
 
-            // Reset SM state. setup_pointers + init_header_per_ring restore
-            // ring flow-control counters, layout metadata, error flags, and
-            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
-            // fanin_count/active_mask zero — previously done inside
-            // RingSchedState::init).
             memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
-            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) {
-                LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx);
+            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size))
+            {
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
 
-            // AICore completion mailbox lives in the arena; reset it each
-            // boot so stale completion notifications from a previous run do
-            // not leak.
             memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
 
             // Fill ops / core counts (host can't resolve s_runtime_ops's
             // device address nor know the SchedulerContext's core fan-out).
             runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
-#if PTO2_PROFILING
-            rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level();
-            {
-                auto &orch = rt->orchestrator;
-                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-                    auto &alloc = orch.rings[r].task_allocator;
-                    scope_stats_set_ring_capacity(
-                        r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacity
-                    );
-                }
-                scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity());
-            }
-#endif
 
             // With multi-ring, slot_states are per-ring inside the scheduler.
             runtime->set_slot_states_ptr(nullptr);
@@ -548,207 +406,74 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // Wait for scheduler's one-time init to complete
             sched_ctx_.wait_pto2_init_complete();
 
-#if PTO2_PROFILING
-            if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) {
-                l2_swimlane_aicpu_set_orch_thread_idx(thread_idx);
-            }
-#endif
-
-            // dep_gen plugs into the orchestrator thread (single-instance subsystem):
-            // set the per-thread queue index and pop the initial buffer before any
-            // submit_task can fire inside orch_func_.
-            if (is_dep_gen_enabled()) {
+            if (is_dep_gen_enabled())
+            {
                 dep_gen_aicpu_set_orch_thread_idx(thread_idx);
                 dep_gen_aicpu_init();
             }
 
-#if PTO2_PROFILING
-            // scope_stats streams scope_end records off the orchestrator thread:
-            // record the per-thread ready_queue index. No-op (writer shared
-            // state null) when scope_stats is disabled; the current buffer is
-            // popped lazily on the first scope_end append.
-            scope_stats_aicpu_set_orch_thread_idx(thread_idx);
-#endif
-
-#if PTO2_PROFILING
-            orch_cycle_start = get_sys_cnt_aicpu();
-#endif
             framework_bind_runtime(rt);
-            if (*p_bind != nullptr) {
-                (*p_bind)(rt);
-            }
+            if (*p_bind != nullptr) (*p_bind)(rt);
             rt_scope_begin(rt);
             (*p_func)(*orch_args_cached_);
             rt_scope_end(rt);
 
             // Flush the (potentially partially-filled) DepGenBuffer so the host
             // collector can pick it up before this orchestrator thread joins.
-            if (is_dep_gen_enabled()) {
-                dep_gen_aicpu_flush();
-            }
-#if PTO2_PROFILING
-            // Push the partially-filled scope_stats buffer so the host gets the
-            // final scope_end records. Idempotent / no-op when disabled.
-            scope_stats_aicpu_flush_buffers();
-#endif
-#if PTO2_PROFILING
-            uint64_t orch_cycle_end = get_sys_cnt_aicpu();
-            (void)orch_cycle_end;
-#endif
+            if (is_dep_gen_enabled()) dep_gen_aicpu_flush();
 
             // Print orchestrator profiling data
-#if PTO2_ORCH_PROFILING
-            PTO2OrchProfilingData p = orchestrator_get_profiling();
-            uint64_t total =
-                p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
-            if (total == 0) total = 1;  // avoid div-by-zero
-            LOG_INFO_V9(
-                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx,
-                static_cast<int64_t>(p.submit_count), cycles_to_us(total)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
-                thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
-                cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
-                static_cast<uint64_t>(p.alloc_atomic_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle),
-                p.sync_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle),
-                p.lookup_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle),
-                p.insert_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-                cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", thread_idx,
-                cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
-                cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   avg/task       : %.3fus", thread_idx,
-                p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
-            );
-
-#if PTO2_TENSORMAP_PROFILING
-            PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
-            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx);
-            LOG_INFO_V9(
-                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx,
-                static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx,
-                static_cast<uint64_t>(tp.lookup_chain_total),
-                tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
-                tp.lookup_chain_max
-            );
-            LOG_INFO_V9(
-                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx,
-                static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
-                tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
-            );
-#endif
-#endif  // PTO2_ORCH_PROFILING
-
-            // Latch task count from PTO2 shared memory to hand off to the
-            // scheduler. The orchestrator's run window (start_time / end_time /
-            // submit_count) is no longer published to shared memory — the
-            // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
-            // below carries the same envelope info for debugging, and
-            // host-side swimlane derives per-phase timing from the per-event
-            // L2SwimlaneAicpuSchedPhaseRecord[] + L2SwimlaneAicpuOrchPhaseRecord[]
-            // streams that already cover everything inside submit_task().
-            int32_t total_tasks = 0;
-            if (rt->orchestrator.sm_header) {
-                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-                    total_tasks +=
-                        rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-                }
-            }
 
-#if PTO2_PROFILING
-            pto2_submitted_tasks = total_tasks;
-#endif
+            int32_t total_tasks = 0;
+            if (rt->orchestrator.sm_header)
+                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) total_tasks += rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
 
             // Signal completion to the orchestrator state machine
             rt_orchestration_done(rt);
 
-            sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks);
-        }
-#if PTO2_PROFILING
-        uint64_t orch_end_ts = get_sys_cnt_aicpu();
-        LOG_INFO_V9(
-            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx,
-            static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
-            cycles_to_us(orch_end_ts - orch_cycle_start)
-        );
-        if (pto2_submitted_tasks >= 0) {
-            LOG_INFO_V9(
-                "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks,
-                sched_ctx_.completed_tasks_count()
-            );
+            sched_ctx_.on_orchestration_done(runtime, rt, total_tasks);
         }
-#endif
-        LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
     }
 
     // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_))
+    {
         // Device orchestration: wait for the primary orchestrator to initialize the SM header
-        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
-            SPIN_WAIT_HINT();
-        }
-        if (rt == nullptr) {
-            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
-        } else {
+        while (!runtime_init_ready_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+        if (rt == nullptr)
+        {}
+        else
+        {
             sched_ctx_.bind_runtime(rt);
             int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx);
-            if (completed < 0) {
-                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed);
+            if (completed < 0)
+            {
                 run_rc = completed;
-            } else {
-                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
             }
+            else
+            {}
         }
     }
 
-    // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
-    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
-    // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
     int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
-    if (shutdown_rc != 0 && run_rc == 0) {
-        run_rc = shutdown_rc;
-    }
-
-    LOG_INFO_V0("Thread %d: Completed", thread_idx);
+    if (shutdown_rc != 0 && run_rc == 0) run_rc = shutdown_rc;
 
     // Check if this is the last thread to finish
     int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if (prev_finished + 1 == aicpu_thread_num_) {
+    if (prev_finished + 1 == aicpu_thread_num_)
+    {
         finished_.store(true, std::memory_order_release);
-        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
-        // always tear them down here, but we keep the per-cid orch SO entries
-        // alive for the next run's cache-hit reuse (see run() reload_so branch).
-        if (rt != nullptr) {
+        if (rt != nullptr)
+        {
             // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
             const int32_t callable_id = runtime->get_active_callable_id();
             framework_bind_runtime(nullptr);
-            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS)
+            {
                 DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
-                if (bind != nullptr) {
-                    bind(nullptr);
-                }
+                if (bind != nullptr) bind(nullptr);
             }
-            runtime_destroy(rt, runtime_arena_);
+            runtime_destroy(rt);
             rt = nullptr;
         }
     }
@@ -756,10 +481,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     return run_rc;
 }
 
-void AicpuExecutor::deinit(Runtime *runtime) {
-    // 1. Invalidate AICPU cache for Runtime address range.
-    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
-    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
+void AicpuExecutor::deinit(Runtime *runtime)
+{
     cache_invalidate_range(runtime, sizeof(Runtime));
 
     // Reset all SchedulerContext-owned state in one place.
@@ -773,9 +496,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     orch_to_sched_ = false;
 
     orch_args_cached_ = nullptr;
-    // orch_so_table_ entries are intentionally preserved across deinit: the
-    // next run reuses cached handles when register_new_callable_id() returns
-    // false. The destructor releases them at process teardown.
 
     // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
     rt = nullptr;
@@ -783,71 +503,36 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
     dep_gen_aicpu_finalize();
 
-    LOG_INFO_V0("DeInit: Runtime execution state reset");
-
     initialized_.store(false, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
     init_failed_.store(false, std::memory_order_release);
     thread_idx_.store(0, std::memory_order_release);
     finished_.store(false, std::memory_order_release);
-
-    LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
 }
 
 // ===== Public Entry Point =====
 
-/**
- * aicpu_execute - Main AICPU kernel execution entry point
- *
- * This is called by DynTileFwkBackendKernelServer in kernel.cpp.
- * Orchestrates the complete task runtime execution:
- * 1. Initialize executor (thread-safe, first thread only)
- * 2. Wait for initialization to complete
- * 3. Execute tasks on managed cores
- * 4. Cleanup when last thread finishes
- *
- * @param runtime Pointer to Runtime structure
- * @return 0 on success, non-zero on error
- */
-extern "C" int32_t aicpu_execute(Runtime *runtime) {
-    if (runtime == nullptr) {
-        LOG_ERROR("%s", "Invalid argument: null Runtime pointer");
-        return -1;
-    }
-
-    LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
+extern "C" int32_t aicpu_execute(Runtime *runtime)
+{
+    if (runtime == nullptr) return -1;
 
     g_aicpu_executor.init(runtime);
 
-    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
-        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) {
-            LOG_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution");
-            return -1;
-        }
-    }
+    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire))
+        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) return -1;
 
     int32_t rc = g_aicpu_executor.run(runtime);
-    if (rc != 0) {
-        LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
-    }
+    if (rc != 0)
+    {}
 
     int32_t runtime_rc = read_pto2_runtime_status(runtime);
 
     // Last thread cleans up
-    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
-        LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
-        g_aicpu_executor.deinit(runtime);
-    }
+    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) g_aicpu_executor.deinit(runtime);
 
-    if (runtime_rc != 0) {
-        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
-        return runtime_rc;
-    }
+    if (runtime_rc != 0) return runtime_rc;
 
-    if (rc != 0) {
-        return rc;
-    }
+    if (rc != 0) return rc;
 
-    LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
     return 0;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h b/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
index 768e6a612..ba83a8b5c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
@@ -63,7 +63,7 @@
  *     compiled, ran without error, and produced wrong output. Use
  *     `get_sub_block_id(args)` instead, which reads from the runtime's
  *     `GlobalContext.sub_block_id` that the scheduler initializes per
- *     AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`.
+ *     AIV core in `scheduler_context.h::SchedulerContext::init`.
  *
  *   - `get_block_idx()` and `get_block_num()` are not redirected to
  *     simpler's LocalContext either — use the `(args)` variants below
@@ -97,7 +97,7 @@ static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2;
 
 /**
  * Args[] suffix indices for context pointers.
- * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16).
+ * Derived from MAX_TENSOR_ARGS(16) + MAX_SCALAR_ARGS(32).
  * Users should not depend on these values; use the Get* functions below.
  */
 static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
index c32a73dc0..ff8f8a531 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
@@ -235,30 +235,9 @@ AICore uses `last_reg_val` to detect new dispatches — identical values cause s
 | `PTO2_HEAP_SIZE` | 256 MB | 1 GB |
 | `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 |
 
-### 7.2 Runtime Overrides
+### 7.2 Runtime Environment Overrides
 
-Precedence per value: **per-task `CallConfig` field > `PTO2_RING_*` env var
-> compile-time default**. Uniform across all rings of that task's runtime.
-
-Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can
-each carry their own sizes. Invalid values raise at submit time (`validate()`):
-
-```python
-cfg = CallConfig()
-cfg.runtime_env.ring_task_window = 128   # power of 2, >= 4
-cfg.runtime_env.ring_heap = 262144       # bytes/ring, power of 2, >= 1024
-cfg.runtime_env.ring_dep_pool = 256      # 4 .. INT32_MAX
-orchestrator.submit_next_level(handle, args, cfg)
-```
-
-Scene tests set the same keys under a nested `runtime_env` block in the
-per-case `config` dict:
-
-```python
-"config": {"runtime_env": {"ring_task_window": 128, "ring_heap": 262144, "ring_dep_pool": 256}}
-```
-
-Process-wide env fallback (invalid values are silently ignored):
+Uniform (applies to all rings):
 
 ```bash
 PTO2_RING_TASK_WINDOW=1024
@@ -266,6 +245,16 @@ PTO2_RING_HEAP=1048576
 PTO2_RING_DEP_POOL=1024
 ```
 
+In `kernel_config.py`:
+
+```python
+RUNTIME_ENV = {
+    "PTO2_RING_TASK_WINDOW": "128",
+    "PTO2_RING_HEAP": "262144",
+    "PTO2_RING_DEP_POOL": "256",
+}
+```
+
 ### 7.3 Sizing Guidelines
 
 - `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index 38bbf0d53..316963c38 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -538,7 +538,7 @@ This is protected by a per-ring try-lock (`advance_lock`) in `RingSchedState`, e
 
 ### 8.5 SchedulerContext
 
-All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and the orchestration SO handle, and delegates everything else to `SchedulerContext`.
+All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and the orchestration SO handle, and delegates everything else to `SchedulerContext`.
 
 Public surface (called from `AicpuExecutor::init/run/deinit`):
 
@@ -552,11 +552,7 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):
 | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
 | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` / `wait_pto2_init_complete()` |
 
-Private internals are split across three .cpp files by responsibility:
-
-- `scheduler_completion.cpp` — completion polling, drain protocol
-- `scheduler_dispatch.cpp` — task dispatch loop and helpers
-- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
+Private internals all live inline in `scheduler_context.h`, covering completion polling, drain protocol, task dispatch loop and helpers, exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`.
 
 `AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
index f2bd0aaf6..c81efce84 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
@@ -52,7 +52,7 @@ Thread 3: PTO2 total submitted tasks = 16704
 
 ### Field Reference
 
-| Field | Source (`pto_orchestrator.cpp`) | Description |
+| Field | Source (`pto_orchestrator.h`) | Description |
 | ----- | ------------------------------- | ----------- |
 | **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead |
 | **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks |
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index b74a2fa6a..ffca3efe4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -43,7 +43,7 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - Debug/diagnostic logs (always present)
 - Progress tracking (`PTO2 progress: completed=...`)
-- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget)
+- Stall detection and dump (triggered only after `MAX_IDLE_ITERATIONS` idle loops)
 - Deadlock/livelock detection (`diagnose_stuck_state`, called on stall)
 
 **What's NOT compiled:**
@@ -273,7 +273,7 @@ Identity fields the AICPU side used to write at level 1 (`func_id`,
   collector (`L2SwimlaneCollector::set_core_types`).
 
 AICore buffer rotation no longer piggy-backs on `complete_task`. AICPU
-counts dispatches per core in the dispatch path (scheduler_dispatch in
+counts dispatches per core in the dispatch path (scheduler_context in
 tensormap_and_ringbuffer; aicpu_executor in host_build_graph) and rotates
 the AICore buffer when the count is about to cross a
 `PLATFORM_AICORE_BUFFER_SIZE` boundary — strictly before
@@ -436,7 +436,7 @@ add_definitions(-DPTO2_ORCH_PROFILING=1)
 ### Code Locations
 
 - Macro definitions: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h`
-- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
+- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h`
 - Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
 - TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 47f2ef2ca..dfe5ba59b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -555,7 +555,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         // `explicit_dep_count` / `over->dep_count` originate from device
         // shared memory and are bounded by the writer to the array sizes, but
         // we clamp on read too so a corrupted record never drives an OOB read
-        // off the end of rec.explicit_deps[64] / over->deps[582].
+        // off the end of rec.explicit_deps[64] / over->deps[326].
         const uint64_t *deps_data;
         int32_t dc;
         if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 08712402d..a24fa8174 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -15,14 +15,12 @@
  * Supports device orchestration where AICPU thread 3 runs the orchestrator.
  *
  * init_runtime_impl:
- *   - Converts host tensor pointers to device pointers (all inputs copied H2D;
- *     only OUTPUT/INOUT tensors are copied back D2H)
+ *   - Converts host tensor pointers to device pointers (all tensors copied both directions)
  *   - Copies orchestration SO to device memory
  *   - Sets up runtime state for device orchestration
  *
  * validate_runtime_impl:
- *   - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs
- *     are skipped)
+ *   - Copies recorded tensors back from device to host
  *   - Frees device memory
  */
 
@@ -163,8 +161,8 @@ prepare_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
  * @return 0 on success, -1 on failure
  */
 extern "C" int bind_callable_to_runtime_impl(
-    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
-    int sig_count, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr,
+    const ArgDirection * /*signature*/, int /*sig_count*/
 ) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
@@ -210,32 +208,13 @@ extern "C" int bind_callable_to_runtime_impl(
             return -1;
         }
 
-        // Pure write-only OUTPUT buffers carry no meaningful host content, so
-        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
-        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
-        // rather than pooled-allocator garbage. INOUT (read-before-write)
-        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
-        // did not wire device_memset.
-        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
-        int rc;
-        if (is_pure_output && runtime->host_api.device_memset != nullptr) {
-            rc = runtime->host_api.device_memset(dev_ptr, 0, size);
-        } else {
-            rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
-        }
+        int rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
         if (rc != 0) {
-            LOG_ERROR("Failed to stage tensor %d to device", i);
+            LOG_ERROR("Failed to copy tensor %d to device", i);
             runtime->host_api.device_free(dev_ptr);
             return -1;
         }
-        // Read-only INPUT tensors are never written by the kernel, so there is
-        // no point copying them back D2H at the end. Index the signature
-        // by the orch tensor index `i` (child_memory tensors are skipped above
-        // but do not consume a separate signature slot — scalars follow the
-        // tensor entries). Anything not provably IN keeps the safe default of
-        // copying back.
-        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
-        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size});
         LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
 
         t.data = reinterpret_cast<uint64_t>(dev_ptr);
@@ -274,13 +253,11 @@ extern "C" int bind_callable_to_runtime_impl(
         LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
     }
 
-    // Ring buffer size overrides: per-task CallConfig value wins over the
-    // env var; both fall back to the compile-time default when zero.
+    // Read ring buffer size overrides from environment
     {
-        runtime->task_window_size =
-            ring_task_window ? ring_task_window : parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true);
-        runtime->heap_size = ring_heap ? ring_heap : parse_env_uint64("PTO2_RING_HEAP", 1024, true);
-        runtime->dep_pool_size = ring_dep_pool ? ring_dep_pool : parse_env_uint64("PTO2_RING_DEP_POOL", 4, false);
+        runtime->task_window_size = parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true);
+        runtime->heap_size = parse_env_uint64("PTO2_RING_HEAP", 1024, true);
+        runtime->dep_pool_size = parse_env_uint64("PTO2_RING_DEP_POOL", 4, false);
         if (runtime->task_window_size || runtime->heap_size || runtime->dep_pool_size) {
             LOG_INFO_V0(
                 "Ring buffer overrides: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%" PRIu64,
@@ -473,14 +450,6 @@ extern "C" int validate_runtime_impl(Runtime *runtime) {
                 continue;
             }
 
-            // Read-only INPUT tensors were uploaded H2D but the kernel never
-            // wrote them — copying them back (potentially ~GB) is pure waste.
-            // They are still device_free'd in the cleanup loop below.
-            if (!pair.needs_copy_back) {
-                LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i);
-                continue;
-            }
-
             void *src_ptr = pair.dev_ptr;
             size_t copy_size = pair.size;
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
index 0a6ab5664..13b4af4fb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
@@ -11,174 +11,20 @@
 #include "common.h"
 #include "pto_orchestration_api.h"
 
-#ifdef __linux__
-#include <cxxabi.h>
-#include <dlfcn.h>
-#include <execinfo.h>
-#include <unistd.h>
-
-#include <array>
-#include <cstring>
-#include <vector>
-#endif
-
 struct PTO2Runtime;
 
 namespace {
-// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
-// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
-// between execution rounds.  All orchestrator threads bind the same rt
-// value, so per-thread storage is unnecessary.
 PTO2Runtime *g_current_runtime = nullptr;
 }  // namespace
 
-extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt)
+{
     g_current_runtime = rt;
 }
 
 // Keep current_runtime local to this .so so orchestration helpers do not
 // accidentally bind to the AICPU binary's same-named symbol.
-extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
-
-/**
- * Use addr2line to convert an address to file:line information.
- * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
- * If inlining is present, also returns the outer call chain via inline_chain.
- */
-#ifdef __linux__
-static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
-    char cmd[512];
-    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
-
-    std::array<char, 256> buffer;
-    std::string raw_output;
-
-    FILE *pipe = popen(cmd, "r");
-    if (pipe) {
-        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
-            raw_output += buffer.data();
-        }
-        pclose(pipe);
-    }
-
-    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
-        return "";
-    }
-
-    // Split by lines
-    std::vector<std::string> lines;
-    size_t pos = 0;
-    while (pos < raw_output.size()) {
-        size_t nl = raw_output.find('\n', pos);
-        if (nl == std::string::npos) nl = raw_output.size();
-        std::string line = raw_output.substr(pos, nl - pos);
-        while (!line.empty() && line.back() == '\r')
-            line.pop_back();
-        if (!line.empty()) lines.push_back(line);
-        pos = nl + 1;
-    }
-
-    if (lines.empty()) return "";
-
-    // First line is the innermost actual code location; subsequent lines are outer inline callers
-    if (inline_chain && lines.size() > 1) {
-        *inline_chain = "";
-        for (size_t j = 1; j < lines.size(); j++) {
-            *inline_chain += "    [inlined by] " + lines[j] + "\n";
-        }
-    }
-
-    return lines.front();
-}
-#endif
-
-/**
- * Get current stack trace information (including file paths and line numbers).
- * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
- */
-std::string get_stacktrace(int skip_frames) {
-    (void)skip_frames;  // May be unused on non-Linux platforms
-    std::string result;
-#ifdef __linux__
-    const int max_frames = 64;
-    void *buffer[max_frames];
-    int nframes = backtrace(buffer, max_frames);
-    char **symbols = backtrace_symbols(buffer, nframes);
-
-    if (symbols) {
-        result = "Stack trace:\n";
-        for (int i = skip_frames; i < nframes; i++) {
-            std::string frame_info;
-
-            void *addr = (void *)((char *)buffer[i] - 1);
-
-            Dl_info dl_info;
-            std::string inline_chain;
-            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
-                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
-                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
-
-                if (addr2line_result.empty()) {
-                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
-                }
-
-                if (!addr2line_result.empty()) {
-                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
-                }
-            }
-
-            if (frame_info.empty()) {
-                std::string frame(symbols[i]);
-
-                size_t start = frame.find('(');
-                size_t end = frame.find('+', start);
-                if (start != std::string::npos && end != std::string::npos) {
-                    std::string mangled = frame.substr(start + 1, end - start - 1);
-                    int status;
-                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
-                    if (status == 0 && demangled) {
-                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
-                        free(demangled);
-                    }
-                }
-                frame_info = frame;
-            }
-
-            char buf[16];
-            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
-            result += buf + frame_info + "\n";
-            if (!inline_chain.empty()) {
-                result += inline_chain;
-            }
-        }
-        free(symbols);
-    }
-#else
-    result = "(Stack trace is only available on Linux)\n";
-#endif
-    return result;
-}
-
-// AssertionError constructor
-static std::string build_assert_message(const char *condition, const char *file, int line) {
-    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
-    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
-    msg += get_stacktrace(3);
-    return msg;
-}
-
-AssertionError::AssertionError(const char *condition, const char *file, int line) :
-    std::runtime_error(build_assert_message(condition, file, line)),
-    condition_(condition),
-    file_(file),
-    line_(line) {}
-
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
-    LOG_ERROR("\n========================================");
-    LOG_ERROR("Assertion failed: %s", condition);
-    LOG_ERROR("Location: %s:%d", file, line);
-    LOG_ERROR("%s", get_stacktrace(2).c_str());
-    LOG_ERROR("========================================\n");
-
-    throw AssertionError(condition, file, line);
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime()
+{
+    return g_current_runtime;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h
index 376db0c32..c664b6e11 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h
@@ -8,31 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Convenience layer over Arg: bundles a fixed-capacity dependency buffer with
- * an Arg and exposes an incremental add_dep(...) API on top of the runtime
- * primitive Arg::set_dependencies(ptr, count).
- *
- * Layering:
- *   - Primitive:   Arg + set_dependencies(ptr, count) in pto_types.h.
- *                  No cap, caller owns the deps buffer.
- *   - Convenience: ArgWithDeps<N> in this header. Owns a stack-sized dep
- *                  buffer of capacity N (default 16); provides add_dep().
- *                  Submitted via the rt_submit_*_task overloads below, which
- *                  forward the bundled deps into the underlying Arg.
- *
- * This file is auto-included at the bottom of pto_orchestration_api.h so
- * orchestration sources see ArgWithDeps after a single `#include
- * "pto_orchestration_api.h"`. The split is purely organizational —
- * orchestration code should not include this header directly. Code generated
- * from pypto can ignore the convenience layer entirely and target Arg +
- * set_dependencies(ptr, count) directly.
- *
- * ArgWithDeps uses private inheritance from Arg so that set_dependencies and
- * the explicit_dep* accessors are NOT reachable on a wrapper instance — users
- * who pick the convenience layer cannot accidentally mix it with the
- * primitive layer's dep API on the same object.
- */
 
 #pragma once
 
@@ -44,7 +19,8 @@
 #include "pto_orchestration_api.h"  // Arg, MixedKernels, rt_submit_* primitives
 
 template <size_t MAX_DEP_COUNT = 16>
-class ArgWithDeps : private Arg {
+class ArgWithDeps : private Arg
+{
 public:
     // Tensor / scalar setters — forward to Arg
     using Arg::add_inout;
@@ -62,50 +38,27 @@ class ArgWithDeps : private Arg {
     using Arg::launch_spec;
     using Arg::set_error;
 
-    // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep,
-    // explicit_deps_data — these are the primitive-layer dep API. Users of
-    // the convenience layer reach dependencies only through add_dep() below.
-
-    /**
-     * Append one or more dependencies to the bundled buffer. May be called
-     * multiple times; deps accumulate. Variadic accepts any non-zero number
-     * of PTO2TaskId arguments.
-     *
-     * Overflow (more than MAX_DEP_COUNT total) records an error on the
-     * underlying Arg; the error surfaces at submit time.
-     */
     template <typename... Ids>
-    void add_dep(Ids... ids) {
+    void add_dep(Ids... ids)
+    {
         static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required");
-        static_assert(
-            (std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId"
-        );
-        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) {
+        static_assert((std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId");
+        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT)
+        {
             Arg::set_error("ArgWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)");
             return;
         }
         ((deps_[count_++] = ids), ...);
     }
 
-    /**
-     * Clear the bundled dep buffer and reset the underlying Arg.
-     * Use this to recycle an ArgWithDeps across loop iterations.
-     */
-    void reset() {
+    void reset()
+    {
         Arg::reset();
         count_ = 0;
     }
 
-    /**
-     * Submit-only hook: bind the bundled deps onto the underlying Arg and
-     * return it as Arg&. Called by the rt_submit_*_task overloads below;
-     * orchestration code does not invoke this directly.
-     *
-     * Idempotent: explicitly clears any prior dep binding before re-setting,
-     * so a wrapper can be re-finalized (e.g. resubmitted) without tripping
-     * the primitive layer's single-shot check.
-     */
-    Arg &finalize_for_submit() {
+    Arg &finalize_for_submit()
+    {
         Arg::set_dependencies(nullptr, 0);
         Arg::set_dependencies(deps_, count_);
         return *this;
@@ -116,21 +69,20 @@ class ArgWithDeps : private Arg {
     uint32_t count_ = 0;
 };
 
-// =============================================================================
-// Submit overloads — accept ArgWithDeps<N> transparently
-// =============================================================================
-
 template <size_t N>
-static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, ArgWithDeps<N> &awd) {
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, ArgWithDeps<N> &awd)
+{
     return rt_submit_task(mixed_kernels, awd.finalize_for_submit());
 }
 
 template <size_t N>
-static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, ArgWithDeps<N> &awd) {
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, ArgWithDeps<N> &awd)
+{
     return rt_submit_aic_task(kernel_id, awd.finalize_for_submit());
 }
 
 template <size_t N>
-static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, ArgWithDeps<N> &awd) {
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, ArgWithDeps<N> &awd)
+{
     return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit());
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index 9ad097a8c..8551b9e5c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -8,21 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Orchestration API - Slim header for orchestration .so files
- *
- * This header provides everything an orchestration source needs without
- * pulling in runtime implementation headers.  The orchestration .so has
- * zero link dependencies on runtime .cpp files; all runtime calls go
- * through the PTO2RuntimeOps function-pointer table embedded in
- * PTO2Runtime.
- *
- * Orchestration sources include ONLY this header:
- *   #include "pto_orchestration_api.h"
- *
- * Runtime sources continue to use pto_runtime2.h (which defines the
- * full PTO2Runtime struct with all internal fields).
- */
 
 #pragma once
 
@@ -39,56 +24,26 @@
 #include "task_args.h"           // ChipStorageTaskArgs, ContinuousTensor
 #include "tensor.h"              // Tensor, TensorCreateInfo
 
-// =============================================================================
-// Tensor Factory Helpers
-// =============================================================================
-
-/**
- * Create a Tensor for pre-allocated external memory.
- */
-inline Tensor make_tensor_external(
-    void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false,
-    int32_t version = 0
-) {
+inline Tensor make_tensor_external(void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype = DataType::FLOAT32, bool manual_dep = false, int32_t version = 0)
+{
     uint64_t total = 1;
-    for (uint32_t i = 0; i < ndims; i++) {
-        total *= shapes[i];
-    }
+    for (uint32_t i = 0; i < ndims; i++) total *= shapes[i];
     return {addr, total * get_element_size(dtype), shapes, ndims, dtype, version, manual_dep};
 }
 
 // Convert ContinuousTensor to Tensor
-static_assert(
-    CONTINUOUS_TENSOR_MAX_DIMS == RUNTIME_MAX_TENSOR_DIMS, "ContinuousTensor and runtime max dims must match"
-);
-inline Tensor from_tensor_arg(const ContinuousTensor &t, bool manual_dep = false, int32_t version = 0) {
-    return make_tensor_external(
-        reinterpret_cast<void *>(static_cast<uintptr_t>(t.data)), t.shapes, t.ndims, t.dtype, manual_dep, version
-    );
+static_assert(CONTINUOUS_TENSOR_MAX_DIMS == RUNTIME_MAX_TENSOR_DIMS, "ContinuousTensor and runtime max dims must match");
+inline Tensor from_tensor_arg(const ContinuousTensor &t, bool manual_dep = false, int32_t version = 0)
+{
+    return make_tensor_external(reinterpret_cast<void *>(static_cast<uintptr_t>(t.data)), t.shapes, t.ndims, t.dtype, manual_dep, version);
 }
 
-// =============================================================================
-// Ops Table and Opaque Runtime
-// =============================================================================
-
-/**
- * Forward declaration — the orchestration sees PTO2Runtime as a partial
- * struct whose first field is the ops pointer.  The full definition
- * lives in pto_runtime2.h (used only by runtime .cpp files).
- */
 typedef struct PTO2Runtime PTO2Runtime;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/**
- * Framework-internal TLS bridge.
- *
- * The executor binds the current thread's runtime before invoking
- * aicpu_orchestration_entry(), so orchestration helpers can fetch the
- * current PTO2Runtime without explicit parameter threading.
- */
 PTO2Runtime *framework_current_runtime(void);
 void framework_bind_runtime(PTO2Runtime *rt);
 
@@ -96,11 +51,8 @@ void framework_bind_runtime(PTO2Runtime *rt);
 }
 #endif
 
-/**
- * Function-pointer table for runtime operations.
- * Populated by the runtime; called by orchestration through inline wrappers.
- */
-typedef struct PTO2RuntimeOps {
+typedef struct PTO2RuntimeOps
+{
     TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args);
     void (*scope_begin)(PTO2Runtime *rt);
     void (*scope_end)(PTO2Runtime *rt);
@@ -109,160 +61,118 @@ typedef struct PTO2RuntimeOps {
     void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
 
     // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
-    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
     uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
-    void (*set_tensor_data)(
-        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
-    );
+    void (*set_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
 
-    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
-    // collector can log it. Always present to keep ops-table layout stable
-    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
     void (*scope_set_site)(const char *file, int line);
 } PTO2RuntimeOps;
 
-/**
- * Partial PTO2Runtime definition for orchestration.
- *
- * Exposes the ops pointer (for runtime calls) and pending_scope_mode
- * (read directly by inline scope wrappers).  The real struct (in
- * pto_runtime2.h) has the same first fields, so accessing them through
- * this definition is well-defined (C struct layout guarantee).
- */
-struct PTO2Runtime {
+struct PTO2Runtime
+{
     const PTO2RuntimeOps *ops;
     PTO2ScopeMode pending_scope_mode;
 };
 
-// =============================================================================
-// Inline Convenience Wrappers (call through ops table)
-// =============================================================================
-
-static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); }
+static inline PTO2Runtime *current_runtime()
+{
+    return framework_current_runtime();
+}
 
-static inline TaskOutputTensors alloc_tensors(const Arg &args) {
+static inline TaskOutputTensors alloc_tensors(const Arg &args)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     return rt->ops->alloc_tensors(rt, args);
 }
 
-static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) {
+static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     Arg args;
-    for (uint32_t i = 0; i < count; i++) {
-        args.add_output(create_infos[i]);
-    }
-    if (args.has_error) {
-        rt->ops->report_fatal(
-            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
+    for (uint32_t i = 0; i < count; i++) args.add_output(create_infos[i]);
+    if (args.has_error)
+    {
+        rt->ops->report_fatal(rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
         return TaskOutputTensors{};
     }
     return alloc_tensors(args);
 }
 
 template <typename... CIs>
-static inline TaskOutputTensors alloc_tensors(const CIs &...cis) {
+static inline TaskOutputTensors alloc_tensors(const CIs &...cis)
+{
     static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo");
-    static_assert(
-        (std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...),
-        "alloc_tensors only accepts TensorCreateInfo arguments"
-    );
+    static_assert((std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...), "alloc_tensors only accepts TensorCreateInfo arguments");
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     Arg args;
     (args.add_output(cis), ...);
-    if (args.has_error) {
-        rt->ops->report_fatal(
-            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
+    if (args.has_error)
+    {
+        rt->ops->report_fatal(rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
         return TaskOutputTensors{};
     }
     return alloc_tensors(args);
 }
 
-static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const Arg &args) {
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const Arg &args)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     return rt->ops->submit_task(rt, mixed_kernels, args);
 }
 
-/**
- * Convenience wrapper: submit an AIC-only task.
- */
-static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const Arg &args) {
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const Arg &args)
+{
     MixedKernels mk;
     mk.aic_kernel_id = kernel_id;
     return rt_submit_task(mk, args);
 }
 
-/**
- * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
- */
-static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const Arg &args) {
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const Arg &args)
+{
     MixedKernels mk;
     mk.aiv0_kernel_id = kernel_id;
     return rt_submit_task(mk, args);
 }
 
-/**
- * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task
- * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any
- * AICore kernel. The task still participates in the dependency graph: it
- * waits on its fanin and notifies its fanout. Useful as a synchronization
- * barrier or as a placeholder producer for tests / dep-graph wiring.
- */
-static inline TaskOutputTensors rt_submit_dummy_task(const Arg &args) {
+static inline TaskOutputTensors rt_submit_dummy_task(const Arg &args)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     return rt->ops->submit_dummy_task(rt, args);
 }
 
-static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) {
+static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return;
-    }
+    if (rt->ops->is_fatal(rt)) return;
     rt->pending_scope_mode = mode;
     rt->ops->scope_begin(rt);
 }
 
-static inline void rt_scope_end() {
+static inline void rt_scope_end()
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return;
-    }
+    if (rt->ops->is_fatal(rt)) return;
     rt->ops->scope_end(rt);
 }
 
-static inline void rt_orchestration_done() {
+static inline void rt_orchestration_done()
+{
     PTO2Runtime *rt = current_runtime();
     rt->ops->orchestration_done(rt);
 }
 
-static inline bool rt_is_fatal() {
+static inline bool rt_is_fatal()
+{
     PTO2Runtime *rt = current_runtime();
     return rt->ops->is_fatal(rt);
 }
@@ -273,111 +183,40 @@ static inline bool rt_is_fatal() {
         _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \
     } while (0)
 
-// =============================================================================
-// Logging Macros for Orchestration (call through ops table)
-// =============================================================================
-
-#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
-
 // INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default.
-#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
-
-// =============================================================================
-// Cross-Layer Data Access
-// =============================================================================
-
-/**
- * Read a value from a tensor at the given multi-dimensional indices.
- *
- * Default T = uint64_t preserves old behavior (raw bits).
- * Specify T to get automatic type conversion:
- *
- *   uint64_t raw = get_tensor_data(tensor, 1, idx);       // old usage unchanged
- *   float val = get_tensor_data<float>(tensor, 1, idx);   // typed read
- *
- * If the tensor has a producer in TensorMap, spin-waits until the producer
- * task completes before reading. External tensors (make_tensor_external)
- * are read immediately without waiting.
- */
+
 template <typename T = uint64_t>
-static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[])
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return from_u64<T>(0);
-    }
+    if (rt->ops->is_fatal(rt)) return from_u64<T>(0);
     return from_u64<T>(rt->ops->get_tensor_data(rt, tensor, ndims, indices));
 }
 
-/**
- * Write a value to a tensor at the given multi-dimensional indices.
- *
- * Type is deduced from value argument; uint64_t by default:
- *
- *   set_tensor_data(tensor, 1, idx, raw_u64);     // old usage unchanged
- *   set_tensor_data(tensor, 1, idx, 42.0f);       // typed write (T = float)
- *
- * If the tensor has a producer in TensorMap, spin-waits until the producer
- * and all its consumers complete before writing (WAW + WAR safety).
- * External tensors (make_tensor_external) with no TensorMap entry are
- * written immediately without waiting.
- *
- * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers
- * that used the tensor as INPUT. If a kernel reads this tensor as INPUT
- * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data
- * cannot detect the reader and may cause a data race.
- *
- * To ensure WAR safety for all access patterns, use add_inout() instead of
- * add_input() for kernel parameters that may later be written via
- * set_tensor_data. INOUT creates a TensorMap entry that enables automatic
- * consumer tracking via fanout_refcount.
- *
- * The tensor must already have an allocated buffer (addr != 0).
- * For runtime-created outputs, call this only on the Tensor returned by
- * add_output(TensorCreateInfo) after submit returns.
- */
 template <typename T = uint64_t>
-static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) {
+static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return;
-    }
+    if (rt->ops->is_fatal(rt)) return;
     rt->ops->set_tensor_data(rt, tensor, ndims, indices, to_u64(value));
 }
 
-// =============================================================================
-// C++ Scope Guards and Macros
-// =============================================================================
-
-/**
- * RAII Scope Guard (calls through ops table)
- */
-class PTO2ScopeGuard {
+class PTO2ScopeGuard
+{
 public:
-    explicit PTO2ScopeGuard(
-        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
-    ) :
-        rt_(current_runtime()) {
-        if (!rt_->ops->is_fatal(rt_)) {
+    explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()) :
+        rt_(current_runtime())
+    {
+        if (!rt_->ops->is_fatal(rt_))
+        {
             rt_->pending_scope_mode = mode;
             if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);
             rt_->ops->scope_begin(rt_);
         }
     }
-    ~PTO2ScopeGuard() {
-        if (!rt_->ops->is_fatal(rt_)) {
-            rt_->ops->scope_end(rt_);
-        }
+    ~PTO2ScopeGuard()
+    {
+        if (!rt_->ops->is_fatal(rt_)) rt_->ops->scope_end(rt_);
     }
 
 private:
@@ -389,34 +228,14 @@ class PTO2ScopeGuard {
 
 #define PTO2_SCOPE_GUARD() [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)
 
-/**
- * Scoped block macro:
- *   PTO2_SCOPE() {
- *       rt_submit_task(...);
- *   }
- */
 #define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true)
 
-// =============================================================================
-// Orchestration Config
-// =============================================================================
-
-/**
- * Configuration exported by orchestration .so via aicpu_orchestration_config().
- * The executor reads these values to set up shared memory and runtime.
- *
- * This struct is defined identically in pto_runtime2.h (with an include
- * guard) so the executor can use the same type without including this header.
- */
 #ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
 #define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
+struct PTO2OrchestrationConfig
+{
     int expected_arg_count;
 };
 #endif
 
-// Convenience layer (ArgWithDeps<N> + matching rt_submit_*_task overloads).
-// Pulled in at the bottom so the wrapper sees Arg, MixedKernels, and the
-// rt_submit_*_task primitives defined above. Orchestration sources include
-// only this single header to access both the primitive and convenience APIs.
 #include "pto_arg_with_deps.h"  // NOLINT(build/include_subdir)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
index d67626662..ca3d084e9 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
@@ -19,42 +19,18 @@
 #include "pto_constants.h"
 #include "pto_task_id.h"
 
-// AICPU-only MPSC ring used to convey deferred-completion observations from
-// FIN-handling scheduler threads to the dispatch thread. Producers push under
-// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList::
-// busy) drains in seq order. Kernel-side code never touches this struct —
-// AICore writes go into DeferredCompletionSlab (see
-// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens
-// into messages here, and forwards.
-
 #define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u
 #define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)
 
-static_assert(
-    (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0,
-    "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two"
-);
-
-// Mailbox message discriminator. CONDITION carries one deferred-completion
-// observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE
-// carries the slot_state pointer in `addr` so the consumer can finalize the
-// AsyncWaitEntry.slot_state binding for tasks whose conditions arrived
-// before the FIN thread saw mixed_complete. New kinds may be added in future
-// without growing the message — the `_pad[5]` slack is reserved for
-// kind-specific payload extension.
+static_assert((AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0, "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two");
+
 #define MSG_KIND_CONDITION 0u
 #define MSG_KIND_TASK_NORMAL_DONE 1u
 
-struct AICoreCompletionMailboxMessage {
-    // Per-slot ready flag. Producer publishes `tail+1` after filling the rest
-    // of the slot with a release store; consumer waits for the matching seq
-    // value with an acquire load. The release-acquire pair publishes all
-    // other fields below as a side effect, so they stay plain.
+struct AICoreCompletionMailboxMessage
+{
     std::atomic<uint64_t> seq;
     PTO2TaskId task_token;
-    // CONDITION: completion observation addr (counter / SDMA event record).
-    // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer
-    //   so it can finalize the AsyncWaitEntry.slot_state binding.
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -64,19 +40,11 @@ struct AICoreCompletionMailboxMessage {
 };
 
 static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift");
-static_assert(
-    sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
-    "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold"
-);
-static_assert(
-    std::atomic<uint64_t>::is_always_lock_free,
-    "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target"
-);
-
-// POD view of a drained message. `seq` is the ring's publication flag, not
-// payload, so try_pop copies out only the fields below (and seq is not even
-// copyable — it is a std::atomic).
-struct AICoreCompletionMsgView {
+static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold");
+static_assert(std::atomic<uint64_t>::is_always_lock_free, "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target");
+
+struct AICoreCompletionMsgView
+{
     PTO2TaskId task_token{PTO2TaskId::invalid()};
     uint64_t addr{0};
     uint32_t expected_value{0};
@@ -85,7 +53,8 @@ struct AICoreCompletionMsgView {
     uint32_t kind{0};
 };
 
-struct AICoreCompletionMailbox {
+struct AICoreCompletionMailbox
+{
     // head and tail live on their own cache lines so producer CAS contention
     // on head can't false-share with the consumer's tail updates.
     alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> head;
@@ -96,32 +65,21 @@ struct AICoreCompletionMailbox {
 
     // Cheap, lock-free pending hint. Callers may invoke this outside the
     // consumer lock; a stale answer only over/under-triggers a drain attempt.
-    bool has_pending() { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); }
-
-    // MPSC push for a CONDITION message. Returns false when the ring is full
-    // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry.
-    // Lock-free: CAS the shared head to claim a slot, write the fields, then
-    // release-store seq so the single consumer observes the publication.
-    //
-    // The head CAS is relaxed: head is a pure ticket counter and carries no
-    // data to the consumer — publication is solely the seq release-store, and
-    // slot-reuse safety rests on the acquire load of tail. The relaxed failure
-    // order is likewise sufficient since a lost CAS just re-reads head and
-    // retries. compare_exchange_weak is used because this loop already re-reads
-    // head and re-checks fullness, so masking LL/SC spurious failures (what
-    // _strong adds on aarch64) would only be a redundant inner retry.
-    //
-    // Safe to call concurrently from any number of producers; structurally
-    // independent of the AsyncWaitList::busy lock.
-    bool try_push_condition(
-        PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type
-    ) {
-        while (true) {
+    bool has_pending()
+    {
+        return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire);
+    }
+
+    bool try_push_condition(PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type)
+    {
+        while (true)
+        {
             uint64_t h = head.load(std::memory_order_relaxed);
             uint64_t t = tail.load(std::memory_order_acquire);
             if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
             uint64_t new_head = h + 1;
-            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed))
+            {
                 AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
                 slot->task_token.raw = task_token.raw;
                 slot->addr = addr;
@@ -136,16 +94,16 @@ struct AICoreCompletionMailbox {
         }
     }
 
-    // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState
-    // pointer in the `addr` field so the consumer can finish binding the
-    // AsyncWaitEntry.slot_state without going back to the FIN-handling thread.
-    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) {
-        while (true) {
+    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr)
+    {
+        while (true)
+        {
             uint64_t h = head.load(std::memory_order_relaxed);
             uint64_t t = tail.load(std::memory_order_acquire);
             if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
             uint64_t new_head = h + 1;
-            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed))
+            {
                 AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
                 slot->task_token.raw = task_token.raw;
                 slot->addr = slot_state_addr;
@@ -159,13 +117,8 @@ struct AICoreCompletionMailbox {
         }
     }
 
-    // Single-consumer transport-level dequeue (caller holds the consumer lock).
-    // Returns false at the first not-yet-published slot (gap) or when empty;
-    // otherwise copies the next message in tail order into `out`, advances
-    // tail, and returns true. tail is consumer-only-written (relaxed read);
-    // head bounds the scan (relaxed); the seq acquire is the real publication
-    // gate; the tail release publishes "slot free" to reusing producers.
-    bool try_pop(AICoreCompletionMsgView &out) {
+    bool try_pop(AICoreCompletionMsgView &out)
+    {
         uint64_t t = tail.load(std::memory_order_relaxed);
         uint64_t h = head.load(std::memory_order_relaxed);
         if (t >= h) return false;
@@ -182,8 +135,6 @@ struct AICoreCompletionMailbox {
     }
 };
 
-static_assert(
-    sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned"
-);
+static_assert(sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
index da0d89ad7..5617cd6d4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
@@ -16,16 +16,6 @@
 
 #include "pto_constants.h"
 
-// Types shared across the AICore↔AICPU boundary.
-//
-// This header is reachable from AICore-side translation units (via
-// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h)
-// and must stay parseable by every AICore toolchain configuration: no
-// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct.
-//
-// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in
-// aicore_completion_mailbox.h, which is AICPU-only.
-
 inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
 
 #define COMPLETION_ENGINE_SDMA 0u
@@ -36,14 +26,8 @@ inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
 #define COMPLETION_TYPE_COUNTER 0
 #define COMPLETION_TYPE_SDMA_EVENT_RECORD 1
 
-// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch
-// area that AICore writes into to record "this completion has to be observed
-// before the task can retire." The FIN-handling scheduler thread reads the
-// slab, flattens entries into AICoreCompletionMailbox messages, and forwards
-// them to the dispatch thread. `volatile` here is load-bearing: writers live
-// on AICore and readers on AICPU, so the qualifier is the correct way to
-// pin the compiler against caching / reordering on either side.
-struct DeferredCompletionEntry {
+struct DeferredCompletionEntry
+{
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -53,15 +37,13 @@ struct DeferredCompletionEntry {
 
 static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift");
 
-struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab {
+struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab
+{
     volatile uint32_t count;
     volatile int32_t error_code;
     DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK];
 };
 
-static_assert(
-    sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0,
-    "DeferredCompletionSlab size must preserve array element cache-line boundaries"
-);
+static_assert(sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0, "DeferredCompletionSlab size must preserve array element cache-line boundaries");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
index 49ee7cc11..c83bb475e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
@@ -31,24 +31,15 @@
 // <pto/npu/comm/async/sdma/sdma_types.hpp> just to spell their scratch tile.
 inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE;
 
-enum class SdmaOp : uint8_t {
+enum class SdmaOp : uint8_t
+{
     TGET = 0,
     TPUT = 1,
 };
 
-// SdmaRequestDescriptor bundles everything send_request_entry needs to drive
-// one SDMA transfer + completion registration. It is a template because the
-// destination / source / scratch types carry tensor shape & stride at compile
-// time; the SdmaTget() / SdmaTput() helpers below let callers skip the
-// template arguments.
-//
-// sync_id selects which event-record slot inside the workspace the engine
-// writes into. Concurrent dispatches must use distinct sync_ids; today every
-// caller submits one request per kernel invocation so passing 0 is safe.
-// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2)
-// will fold sync_id allocation into the adapter.
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-struct SdmaRequestDescriptor {
+struct SdmaRequestDescriptor
+{
     SdmaOp op;
     DstTensor dst;
     SrcTensor src;
@@ -58,45 +49,38 @@ struct SdmaRequestDescriptor {
 };
 
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(
-    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
-    uint32_t sync_id = 0
-) {
-    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst,       src,
-                                                                     scratch,      workspace, sync_id};
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, uint32_t sync_id = 0)
+{
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst, src, scratch, workspace, sync_id};
 }
 
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(
-    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
-    uint32_t sync_id = 0
-) {
-    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst,       src,
-                                                                     scratch,      workspace, sync_id};
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, uint32_t sync_id = 0)
+{
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst, src, scratch, workspace, sync_id};
 }
 
 namespace pto2::detail {
 
-inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) {
-    CompletionToken token{
-        reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0
-    };
+inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr)
+{
+    CompletionToken token{reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0};
     (void)register_completion_condition(ctx, token);
 }
 
 template <typename PtoAsyncEvent, typename PtoAsyncSession>
-inline __aicore__ void
-register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) {
-    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+inline __aicore__ void register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session)
+{
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr)
+    {
         (void)event.Wait(session);
         return;
     }
-    if (event.handle == 0) {
-        return;
-    }
+    if (event.handle == 0) return;
 
     const uint32_t engine = static_cast<uint32_t>(event.engine);
-    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA)) {
+    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA))
+    {
         defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return;
     }
@@ -105,38 +89,29 @@ register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsy
     uint32_t sync_id = 0;
     __gm__ uint8_t *recv_workspace = nullptr;
     uint32_t queue_num = 0;
-    if (!::pto::comm::sdma::detail::PrepareEventCheck(
-            session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num
-        )) {
+    if (!::pto::comm::sdma::detail::PrepareEventCheck(session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num))
+    {
         defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return;
     }
-    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) {
-        register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
-    }
+    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
 }
 
 }  // namespace pto2::detail
 
-// SDMA overload of the runtime's send_request_entry. Submits the descriptor
-// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the
-// AsyncCtx deferred-wait slab and flushes. Returns false on submit/session
-// failure (also records the error in ctx.completion_error_code).
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ bool
-send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc) {
+inline __aicore__ bool send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc)
+{
     pto::comm::AsyncSession session;
-    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) {
+    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id))
+    {
         pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return false;
     }
 
     pto::comm::AsyncEvent event;
-    if (desc.op == SdmaOp::TGET) {
-        event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
-    } else {
-        event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
-    }
+    if (desc.op == SdmaOp::TGET) event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
+    else event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
     pto2::detail::register_pto_async_event(ctx, event, session);
     pto2::detail::defer_flush(ctx);
     return true;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
index 689219c35..577e5138d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
@@ -19,10 +19,8 @@
 #include "pto_completion_token.h"
 #include "pto_runtime_status.h"
 
-// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only
-// allowed holder of this ABI knowledge; the generic scheduler dispatches into
-// the helpers below through the completion ops table.
-struct SdmaEventRecord {
+struct SdmaEventRecord
+{
     uint32_t flag;
     uint32_t sq_tail;
     uint64_t channel_info;
@@ -31,25 +29,24 @@ struct SdmaEventRecord {
 static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift");
 static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift");
 
-inline uintptr_t sdma_completion_cache_line(const volatile void *addr) {
+inline uintptr_t sdma_completion_cache_line(const volatile void *addr)
+{
     return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
 }
 
-inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) {
-    if (record_addr == 0) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
-    volatile SdmaEventRecord *record =
-        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr)
+{
+    if (record_addr == 0) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    volatile SdmaEventRecord *record = reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
     cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
     uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE);
     return {flag != 0 ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
 }
 
-inline void retire_sdma_event_record(uint64_t record_addr) {
+inline void retire_sdma_event_record(uint64_t record_addr)
+{
     if (record_addr == 0) return;
-    volatile SdmaEventRecord *record =
-        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    volatile SdmaEventRecord *record = reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
     cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
     uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE);
     uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/common.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/common.h
index b87412e74..89a8d64ce 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/common.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/common.h
@@ -17,22 +17,151 @@
 #include <stdexcept>
 #include <string>
 
-/**
- * Get the current stack trace, including file paths and line numbers.
- * Implemented in common.cpp.
- */
-std::string get_stacktrace(int skip_frames = 1);
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
 
-/**
- * Assertion failure exception with condition, file, line, and stack trace.
- */
-class AssertionError : public std::runtime_error {
+#include <array>
+#include <cstring>
+#include <vector>
+
+inline std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr)
+{
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+
+    FILE *pipe = popen(cmd, "r");
+    if (pipe)
+    {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) raw_output += buffer.data();
+        pclose(pipe);
+    }
+
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) return "";
+
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size())
+    {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r') line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    if (inline_chain && lines.size() > 1)
+    {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) *inline_chain += "    [inlined by] " + lines[j] + "\n";
+    }
+
+    return lines.front();
+}
+#endif
+
+inline std::string get_stacktrace(int skip_frames)
+{
+    (void)skip_frames;  // May be unused on non-Linux platforms
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;
+    void *buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char **symbols = backtrace_symbols(buffer, nframes);
+
+    if (symbols)
+    {
+        result = "Stack trace:\n";
+        for (int i = skip_frames; i < nframes; i++)
+        {
+            std::string frame_info;
+
+            void *addr = (void *)((char *)buffer[i] - 1);
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname)
+            {
+                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+
+                if (addr2line_result.empty()) addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+
+                if (!addr2line_result.empty()) frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+            }
+
+            if (frame_info.empty())
+            {
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos)
+                {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled)
+                    {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) result += inline_chain;
+        }
+        free(symbols);
+    }
+#else
+    result = "(Stack trace is only available on Linux)\n";
+#endif
+    return result;
+}
+
+inline std::string build_assert_message(const char *condition, const char *file, int line)
+{
+    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
+    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
+    msg += get_stacktrace(3);
+    return msg;
+}
+
+class AssertionError : public std::runtime_error
+{
 public:
-    AssertionError(const char *condition, const char *file, int line);
+    AssertionError(const char *condition, const char *file, int line) :
+        std::runtime_error(build_assert_message(condition, file, line)),
+        condition_(condition),
+        file_(file),
+        line_(line)
+    {}
 
-    const char *condition() const { return condition_; }
-    const char *file() const { return file_; }
-    int line() const { return line_; }
+    const char *condition() const
+    {
+        return condition_;
+    }
+    const char *file() const
+    {
+        return file_;
+    }
+    int line() const
+    {
+        return line_;
+    }
 
 private:
     const char *condition_;
@@ -40,35 +169,27 @@ class AssertionError : public std::runtime_error {
     int line_;
 };
 
-/**
- * Assertion failure handler.
- * Implemented in common.cpp.
- */
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line);
+[[noreturn]] inline void assert_impl(const char *condition, const char *file, int line)
+{
+    throw AssertionError(condition, file, line);
+}
 
-/**
- * debug_assert macro:
- * checks the condition in debug builds and throws with a stack trace on failure.
- * It is a no-op in release builds (NDEBUG).
- */
 #ifdef NDEBUG
 #define debug_assert(cond) ((void)0)
 #else
 #define debug_assert(cond)                          \
     do {                                            \
-        if (!(cond)) {                              \
+        if (!(cond))                                \
+        {                                           \
             assert_impl(#cond, __FILE__, __LINE__); \
         }                                           \
     } while (0)
 #endif
 
-/**
- * always_assert macro:
- * checks the condition in both debug and release builds.
- */
 #define always_assert(cond)                         \
     do {                                            \
-        if (!(cond)) {                              \
+        if (!(cond))                                \
+        {                                           \
             assert_impl(#cond, __FILE__, __LINE__); \
         }                                           \
     } while (0)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
index b9d757117..a6d13e754 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
@@ -9,29 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * @file pto2_dispatch_payload.h
- * @brief Per-core dispatch payload for AICore kernel execution
- *
- * PTO2DispatchPayload holds the kernel function address, a per-core args[]
- * array, and embedded SPMD context (LocalContext + GlobalContext).  AICPU
- * maintains a static array of these (one per core).
- *
- * GlobalContext (sub_block_id) is initialized once at runtime startup via
- * init_global_context() and never modified afterwards.
- *
- * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload()
- * before each dispatch.  Both context struct pointers are written into the
- * args[] suffix on every dispatch (since args[] is rebuilt entirely each time).
- *
- * AICore caches a pointer to its per-core slot at startup and reads from
- * it on each dispatch.  The struct is cache-line aligned to avoid false
- * sharing across concurrently dispatched cores.
- *
- * The DATA_MAIN_BASE register protocol is unchanged from the base runtime:
- * a monotonically increasing reg_task_id signals new work to AICore.
- */
-
 #pragma once
 
 #include <stdint.h>
@@ -39,7 +16,6 @@
 #include "intrinsic.h"
 #include "pto_types.h"
 
-/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */
 #ifndef PTO2_DISPATCH_MAX_ARGS
 #define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT)
 #endif
@@ -49,45 +25,22 @@
 #endif
 
 // Verify hardcoded indices in intrinsic.h match the computed values.
-static_assert(
-    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h"
-);
-static_assert(
-    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX,
-    "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h"
-);
+static_assert((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h");
+static_assert((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX, "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h");
 
-/**
- * Per-core dispatch payload: function address + args[] + SPMD context.
- *
- * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER].
- * AICore caches a pointer to its per-core slot at startup (via Handshake.task)
- * and reads from it on each dispatch.
- *
- * The struct is cache-line aligned to prevent false sharing across
- * concurrently dispatched cores.
- */
-struct alignas(64) PTO2DispatchPayload {
-    uint64_t function_bin_addr;            /**< Kernel entry address in GM (set by Scheduler) */
-    uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */
+struct alignas(64) PTO2DispatchPayload
+{
+    uint64_t function_bin_addr;
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS];
 
-    /** Per-dispatch context: block_idx and block_num.
-     *  Written by build_payload() before each dispatch.
-     *  args[SPMD_LOCAL_CONTEXT_INDEX] points here. */
     LocalContext local_context;
 
-    /** Per-core global context: sub_block_id (AIV lane identity).
-     *  Initialized once by init_global_context() at runtime startup.
-     *  args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */
     GlobalContext global_context;
 
     uint8_t reserved_payload_abi_pad[8];
 
     static_assert(sizeof(args[0]) == 8);
-    static_assert(
-        PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) ==
-        (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0])
-    );
+    static_assert(PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) == (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]));
 };
 
 static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift");
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
index cf6eb4790..357a1fdcf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
@@ -29,13 +29,10 @@
 #define __gm__
 #endif
 
-// Public surface: get_async_ctx, async_ctx_is_deferred,
-// register_completion_condition, send_notification,
-// save_expected_notification_counter. Everything else lives in
-// pto2::detail and is reserved for backend adapters / internal use.
 namespace pto2::detail {
 
-inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
+inline __aicore__ void defer_load_slab(AsyncCtx &ctx)
+{
     if (ctx.completion_count == nullptr) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
@@ -45,41 +42,33 @@ inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
 #endif
 }
 
-inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) {
-    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) {
-        *ctx.completion_error_code = error_code;
-    }
+inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code)
+{
+    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) *ctx.completion_error_code = error_code;
 }
 
-inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) {
+inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes)
+{
     if (addr == nullptr || size_bytes == 0) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
-    uintptr_t end =
-        (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
-    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) {
-        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
-    }
+    uintptr_t end = (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
 #else
     (void)addr;
     (void)size_bytes;
 #endif
 }
 
-inline __aicore__ void defer_flush(AsyncCtx &ctx) {
+inline __aicore__ void defer_flush(AsyncCtx &ctx)
+{
     if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uint32_t count = *ctx.completion_count;
-    if (count > ctx.completion_capacity) {
-        count = ctx.completion_capacity;
-    }
+    if (count > ctx.completion_capacity) count = ctx.completion_capacity;
     uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count));
-    if (ctx.completion_error_code != nullptr) {
-        flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
-    }
-    if (ctx.completion_entries != nullptr) {
-        flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
-    }
+    if (ctx.completion_error_code != nullptr) flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
+    if (ctx.completion_entries != nullptr) flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
     defer_flush_range(ctx.completion_count, flush_bytes);
 #if defined(__CPU_SIM)
     dsb(0);
@@ -95,9 +84,9 @@ inline __aicore__ void defer_flush(AsyncCtx &ctx) {
 
 }  // namespace pto2::detail
 
-inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
-    __gm__ LocalContext *lc =
-        reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
+inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args)
+{
+    __gm__ LocalContext *lc = reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
     AsyncCtx ctx{};
     ctx.completion_count = lc->async_ctx.completion_count;
     ctx.completion_error_code = lc->async_ctx.completion_error_code;
@@ -108,23 +97,19 @@ inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
     return ctx;
 }
 
-inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); }
+inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx)
+{
+    return ctx.task_token.is_valid();
+}
 
-// Canonical writer: backend submit handlers build a CompletionToken and pass
-// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and
-// bumps completion_count. Returns false on overflow (also stores
-// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is
-// not currently a deferred context.
-inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) {
-    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
-        return false;
-    }
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token)
+{
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) return false;
 
     uint32_t idx = *ctx.completion_count;
-    if (idx >= ctx.completion_capacity) {
-        if (ctx.completion_error_code != nullptr) {
-            *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
-        }
+    if (idx >= ctx.completion_capacity)
+    {
+        if (ctx.completion_error_code != nullptr) *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
         return false;
     }
 
@@ -138,18 +123,16 @@ inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const Comple
     return true;
 }
 
-inline __aicore__ void
-send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) {
+inline __aicore__ void send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op)
+{
     __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr));
     pto::comm::Signal signal(counter);
     pto::comm::TNOTIFY(signal, value, notify_op);
 }
 
-inline __aicore__ void
-save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) {
-    CompletionToken token{
-        reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0
-    };
+inline __aicore__ void save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value)
+{
+    CompletionToken token{reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0};
     (void)register_completion_condition(ctx, token);
     pto2::detail::defer_flush(ctx);
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
index 65608ad2f..429dd65b4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
@@ -29,12 +29,8 @@ struct CompletionStats;
 
 inline constexpr int32_t MAX_ASYNC_WAITS = 64;
 
-// The mailbox transport (has_pending / try_push_condition /
-// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member
-// functions in aicore_completion_mailbox.h. This file only holds the
-// application layer: translating drained messages into wait-list state.
-
-inline uintptr_t mailbox_cache_line(const volatile void *addr) {
+inline uintptr_t mailbox_cache_line(const volatile void *addr)
+{
     return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
 }
 
@@ -43,12 +39,14 @@ struct CompletionCondition;
 using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &);
 using CompletionRetireFn = void (*)(CompletionCondition &);
 
-struct CompletionBackendOps {
+struct CompletionBackendOps
+{
     CompletionPollFn poll;
     CompletionRetireFn retire;
 };
 
-struct CompletionCondition {
+struct CompletionCondition
+{
     AsyncEngine engine{ASYNC_ENGINE_SDMA};
     int32_t completion_type{COMPLETION_TYPE_COUNTER};
     bool satisfied{false};
@@ -61,28 +59,27 @@ struct CompletionCondition {
     void retire();
 };
 
-// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in
-// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin
-// glue mapping CompletionCondition.addr into the backend's raw-addr helpers.
-inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) {
-    if (cond.counter_addr == nullptr) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
-    return {
-        *cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING,
-        PTO2_ERROR_NONE
-    };
+inline CompletionPollResult counter_poll_op(const CompletionCondition &cond)
+{
+    if (cond.counter_addr == nullptr) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    return {*cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
 }
 
-inline void counter_retire_op(CompletionCondition & /*cond*/) {}
+inline void counter_retire_op(CompletionCondition &)
+{}
 
-inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) {
+inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond)
+{
     return poll_sdma_event_record(cond.addr);
 }
 
-inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); }
+inline void sdma_event_record_retire_op(CompletionCondition &cond)
+{
+    retire_sdma_event_record(cond.addr);
+}
 
-inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) {
+inline const CompletionBackendOps *completion_backend_ops_for(int completion_type)
+{
     static const CompletionBackendOps kOps[] = {
         {counter_poll_op, counter_retire_op},                      // COMPLETION_TYPE_COUNTER = 0
         {sdma_event_record_poll_op, sdma_event_record_retire_op},  // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1
@@ -92,27 +89,24 @@ inline const CompletionBackendOps *completion_backend_ops_for(int completion_typ
     return &kOps[completion_type];
 }
 
-inline CompletionPollResult CompletionCondition::test() const {
-    if (satisfied) {
-        return {CompletionPollState::READY, PTO2_ERROR_NONE};
-    }
+inline CompletionPollResult CompletionCondition::test() const
+{
+    if (satisfied) return {CompletionPollState::READY, PTO2_ERROR_NONE};
     const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
-    if (ops == nullptr || ops->poll == nullptr) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
+    if (ops == nullptr || ops->poll == nullptr) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
     return ops->poll(*this);
 }
 
-inline void CompletionCondition::retire() {
+inline void CompletionCondition::retire()
+{
     if (retired) return;
     const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
-    if (ops != nullptr && ops->retire != nullptr) {
-        ops->retire(*this);
-    }
+    if (ops != nullptr && ops->retire != nullptr) ops->retire(*this);
     retired = true;
 }
 
-struct AsyncWaitEntry {
+struct AsyncWaitEntry
+{
     PTO2TaskSlotState *slot_state{nullptr};
     PTO2TaskId task_token{PTO2TaskId::invalid()};
     CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK];
@@ -121,14 +115,17 @@ struct AsyncWaitEntry {
     bool normal_done{false};
 };
 
-struct AsyncPollResult {
+struct AsyncPollResult
+{
     int32_t completed{0};
     int32_t error_code{PTO2_ERROR_NONE};
     PTO2TaskSlotState *failed_slot_state{nullptr};
 };
 
-inline const char *async_engine_name(AsyncEngine engine) {
-    switch (engine) {
+inline const char *async_engine_name(AsyncEngine engine)
+{
+    switch (engine)
+    {
     case ASYNC_ENGINE_SDMA:
         return "SDMA";
     case ASYNC_ENGINE_ROCE:
@@ -142,75 +139,67 @@ inline const char *async_engine_name(AsyncEngine engine) {
     }
 }
 
-struct AsyncWaitList {
+struct AsyncWaitList
+{
     std::atomic<int32_t> busy{0};
     AsyncWaitEntry entries[MAX_ASYNC_WAITS];
     int32_t count{0};
-    // Diagnostic: counts every FIN-side try_push that hit a full mailbox.
-    // Expected to stay zero on real workloads (ring is 4096 entries); a
-    // non-zero value means consumers are too slow or the ring is undersized.
-    // Read by scheduler shutdown / l2 perf summary; not on the hot path.
     std::atomic<uint64_t> mpsc_skipped_count{0};
 
-    bool try_lock() {
+    bool try_lock()
+    {
         int32_t expected = 0;
         return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed);
     }
 
-    void unlock() { busy.store(0, std::memory_order_release); }
+    void unlock()
+    {
+        busy.store(0, std::memory_order_release);
+    }
 
-    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) {
-        for (int32_t i = 0; i < count; i++) {
+    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token)
+    {
+        for (int32_t i = 0; i < count; i++)
             if (entries[i].task_token == token) return &entries[i];
-        }
         return nullptr;
     }
 
-    // Captures the side-channel a scheduler-aware drain needs to complete
-    // NotDeferred tasks inline (without storing a transient entry in
-    // entries[]).
-    struct DrainCompletionSink {
+    struct DrainCompletionSink
+    {
         PTO2SchedulerState *sched{nullptr};
         PTO2LocalReadyBuffer *local_bufs{nullptr};
         PTO2TaskSlotState **deferred_release_slot_states{nullptr};
         int32_t *deferred_release_count{nullptr};
         int32_t deferred_release_capacity{0};
         int32_t inline_completed{0};
-#if PTO2_SCHED_PROFILING
-        int32_t thread_idx{0};
-#endif
 
-        bool can_inline_complete() const { return sched != nullptr; }
+        bool can_inline_complete() const
+        {
+            return sched != nullptr;
+        }
     };
 
     // Inline-complete a NotDeferred task during drain. Returns false on
     // deferred_release_slot_states overflow.
     bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
 
-    // Single-consumer drain: pop each published message in tail order and
-    // translate it into wait-list state. An empty sink (sched == nullptr) just
-    // materializes entries; a sched-aware sink additionally inline-completes
-    // lonely NotDeferred NORMAL_DONEs without ever growing entries[].
-    int32_t drain_aicore_completion_mailbox_locked(
-        AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code
-    ) {
+    int32_t drain_aicore_completion_mailbox_locked(AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code)
+    {
         error_code = PTO2_ERROR_NONE;
         if (aicore_mailbox == nullptr) return 0;
 
         int32_t drained = 0;
         AICoreCompletionMsgView msg;
-        // try_pop is the transport layer (seq-gated, in-order dequeue); this
-        // loop is the application layer (translate each message into wait-list
-        // state). try_pop returns false at the first gap or when empty.
-        while (aicore_mailbox->try_pop(msg)) {
+        while (aicore_mailbox->try_pop(msg))
+        {
             drained++;
-            if (msg.kind == MSG_KIND_CONDITION) {
+            if (msg.kind == MSG_KIND_CONDITION)
+            {
                 AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
-                if (entry == nullptr) {
-                    // First message for this task — materialize the entry here.
-                    // slot_state stays null until the matching TASK_NORMAL_DONE
-                    // sentinel arrives.
-                    if (count >= MAX_ASYNC_WAITS) {
+                if (entry == nullptr)
+                {
+                    if (count >= MAX_ASYNC_WAITS)
+                    {
                         error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
                         return drained;
                     }
@@ -221,28 +210,21 @@ struct AsyncWaitList {
                     entry->waiting_completion_count = 0;
                     entry->normal_done = false;
                 }
-                if (!append_condition_locked(
-                        *entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type,
-                        error_code
-                    )) {
-                    return drained;
-                }
-            } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) {
-                PTO2TaskSlotState *slot_state_ptr =
-                    reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
+                if (!append_condition_locked(*entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type, error_code)) return drained;
+            }
+            else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE)
+            {
+                PTO2TaskSlotState *slot_state_ptr = reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
                 AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
-                if (entry == nullptr) {
-                    // Producers strictly order: all CONDITIONs for token T are
-                    // pushed before the matching NORMAL_DONE (the acq_rel on
-                    // on_subtask_complete enforces this across producers). So
-                    // observing NORMAL_DONE first => the task registered no
-                    // conditions => NotDeferred. Complete it inline when the
-                    // sink allows; otherwise fall back to the entry-store path.
-                    if (sink.can_inline_complete()) {
+                if (entry == nullptr)
+                {
+                    if (sink.can_inline_complete())
+                    {
                         (void)try_inline_complete_locked(sink, *slot_state_ptr);
                         continue;
                     }
-                    if (count >= MAX_ASYNC_WAITS) {
+                    if (count >= MAX_ASYNC_WAITS)
+                    {
                         error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
                         return drained;
                     }
@@ -252,13 +234,15 @@ struct AsyncWaitList {
                     entry->condition_count = 0;
                     entry->waiting_completion_count = 0;
                     entry->normal_done = true;
-                } else {
-                    if (entry->slot_state == nullptr) {
-                        entry->slot_state = slot_state_ptr;
-                    }
+                }
+                else
+                {
+                    if (entry->slot_state == nullptr) entry->slot_state = slot_state_ptr;
                     entry->normal_done = true;
                 }
-            } else {
+            }
+            else
+            {
                 error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
                 return drained;
             }
@@ -266,11 +250,10 @@ struct AsyncWaitList {
         return drained;
     }
 
-    bool append_condition_locked(
-        AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type,
-        int32_t &error_code
-    ) {
-        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) {
+    bool append_condition_locked(AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type, int32_t &error_code)
+    {
+        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK)
+        {
             error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
             return false;
         }
@@ -280,24 +263,14 @@ struct AsyncWaitList {
         cond.satisfied = false;
         cond.retired = false;
         cond.addr = addr;
-        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ?
-                                reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) :
-                                nullptr;
+        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ? reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) : nullptr;
         cond.expected_value = expected_value;
         entry.waiting_completion_count++;
         return true;
     }
 
     template <bool Profiling>
-    AsyncPollResult poll_and_complete(
-        AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
-        PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count,
-        int32_t deferred_release_capacity
-#if PTO2_SCHED_PROFILING
-        ,
-        int thread_idx
-#endif
-    );
+    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity);
 };
 
 #endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
index c5a8c345f..d017f8597 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
@@ -17,13 +17,8 @@
 #include "aicore_completion_mailbox_types.h"
 #include "pto_runtime_status.h"
 
-// CompletionToken is the runtime-internal POD that backend submit handlers
-// produce and the generic register_completion_condition() consumes. It is the
-// ABI contract for "this is one completion to wait on" — independent of which
-// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's
-// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by
-// completion_type.
-struct CompletionToken {
+struct CompletionToken
+{
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -31,13 +26,15 @@ struct CompletionToken {
     uint64_t backend_cookie;
 };
 
-enum class CompletionPollState : uint8_t {
+enum class CompletionPollState : uint8_t
+{
     PENDING = 0,
     READY = 1,
     FAILED = 2,
 };
 
-struct CompletionPollResult {
+struct CompletionPollResult
+{
     CompletionPollState state{CompletionPollState::PENDING};
     int32_t error_code{PTO2_ERROR_NONE};
 };
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
index 6078fd757..feaef7961 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
@@ -9,37 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * @file pto_dep_compute.h
- * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay.
- *
- * Two header-only template entry points:
- *
- *   compute_task_fanin     — STEP 3 in submit_task: per-tensor creator retention (Step A)
- *                            + tensormap.lookup for INPUT/INOUT (Step B). Calls back into
- *                            user-supplied `emit` for each producer it identifies.
- *
- *   register_task_outputs  — STEP 4 in submit_task: tensormap.insert for INOUT and
- *                            OUTPUT_EXISTING tensors. No callbacks.
- *
- * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its
- * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the
- * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would
- * require two emit semantics or a marginal behavior change in transients — not worth
- * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own.
- *
- * The Emit callback contract:
- *   bool emit(PTO2TaskId producer);
- *     - return true to continue (whether or not the producer was actually recorded —
- *       producer-not-alive / dedup-hit / etc. all return true silently)
- *     - return false to signal fatal (e.g. fanin spill overflow); caller bails
- *
- * Performance: Emit is a template parameter, not std::function. Both runtime
- * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge
- * vector) instantiate at the call site and inline through. Do NOT replace with
- * std::function — it would break the inlining and add ~5 ns/call to the orch hot path.
- */
-
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
 
@@ -51,14 +20,8 @@
 #include "tensor.h"
 #include "tensor_arg.h"  // TensorArgType
 
-/**
- * View struct for inputs to compute_task_fanin / register_task_outputs.
- *
- * Both runtime and replay assemble one of these from their own data sources
- * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All
- * pointer arrays must remain valid for the duration of the call.
- */
-struct DepInputs {
+struct DepInputs
+{
     int32_t tensor_count;
     const TensorRef *tensors;        // length = tensor_count (union; OUTPUT slots' .ptr is unused)
     const TensorArgType *arg_types;  // length = tensor_count
@@ -66,28 +29,16 @@ struct DepInputs {
     const PTO2TaskId *explicit_deps;  // length = explicit_dep_count (validity checked by caller)
 };
 
-/**
- * Compute fanin for a task being submitted (STEP 3: Step A creator retention +
- * Step B tensormap modifier lookup).
- *
- * For each non-OUTPUT tensor:
- *   - If owner_task_id is valid, emit(owner)
- *   - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit
- *     each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry).
- *
- * @return true on success (or producer-skipped-silently); false if emit signaled
- *         fatal — caller should propagate (after any fatal bookkeeping done by emit).
- */
 template <typename Emit>
-[[nodiscard]] inline bool
-compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) {
-    if (in_manual_scope) {
-        return true;
-    }
+[[nodiscard]] inline bool compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit)
+{
+    if (in_manual_scope) return true;
 
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+    for (int32_t i = 0; i < inputs.tensor_count; i++)
+    {
         TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::OUTPUT) {
+        if (ptype == TensorArgType::OUTPUT)
+        {
             // Runtime-created OUTPUT tensors are not looked up in the TensorMap since
             // they have no dependencies.
             continue;
@@ -97,58 +48,40 @@ compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_m
 
         // Step A: creator retention — all existing tensors extend their creator lifetime.
         PTO2TaskId owner = tensor->owner_task_id;
-        if (owner.is_valid()) {
-            if (!emit(owner)) {
-                return false;
-            }
+        if (owner.is_valid())
+        {
+            if (!emit(owner)) return false;
         }
 
         // Step B: only INPUT/INOUT need modifier dependency lookup.
-        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
-            continue;
-        }
-        if (tensor->manual_dep) {
-            continue;
-        }
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) continue;
+        if (tensor->manual_dep) continue;
 
         bool fatal = false;
         tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
-            if (!emit(entry.producer_task_id)) {
+            if (!emit(entry.producer_task_id))
+            {
                 fatal = true;
                 return false;  // stop iteration
             }
-            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
-                tensor_map.remove_entry(entry);
-            }
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) tensor_map.remove_entry(entry);
             return true;
         });
-        if (fatal) {
-            return false;
-        }
+        if (fatal) return false;
     }
     return true;
 }
 
-/**
- * Register a task's outputs in the tensormap (STEP 4 in submit_task).
- *
- * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the
- * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer.
- *
- * No-op when in_manual_scope.
- */
-inline void
-register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) {
-    if (in_manual_scope) {
-        return;
-    }
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+inline void register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope)
+{
+    if (in_manual_scope) return;
+    for (int32_t i = 0; i < inputs.tensor_count; i++)
+    {
         TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
+        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING)
+        {
             const Tensor *tensor = inputs.tensors[i].ptr;
-            if (!tensor->manual_dep) {
-                tensor_map.insert(*tensor, task_id);
-            }
+            if (!tensor->manual_dep) tensor_map.insert(*tensor, task_id);
         }
     }
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
deleted file mode 100644
index 87e4027d2..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ /dev/null
@@ -1,961 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Orchestrator Implementation
- *
- * Implements orchestrator state management, scope handling, and task submission.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_orchestrator.h"
-
-#include <assert.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aicpu/dep_gen_collector_aicpu.h"
-#include "common/dep_gen.h"
-#include "common/unified_log.h"
-#include "pto_dep_compute.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-#include "pto_tensormap.h"
-#include "pto_types.h"
-#include "tensor.h"
-
-#if PTO2_PROFILING
-#include "aicpu/scope_stats_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-#endif
-
-// Verify the captured Tensor blob size in DepGenRecord matches the runtime
-// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
-// including runtime/tensor.h, so this check lives at the orch callsite.
-static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)");
-// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime
-// imposes no hard cap on explicit dep count. If a submit exceeds this cap,
-// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is
-// unaffected, only the captured replay record is truncated.
-
-// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in
-// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay)
-// link these no-op stubs so the runtime translation unit is self-contained.
-// Visibility is hidden so the HOST .so doesn't export them into the global
-// dynamic symbol table where they'd shadow the AICPU .so's strong symbols
-// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }
-__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(
-    uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, const int32_t[3]
-) {}
-
-// Scope_stats enable gate, queried via the same predicate idiom as
-// is_dep_gen_enabled above. The AICPU collector links the strong definition;
-// host builds fall back to this weak `false`. Gating here still skips the
-// cross-agent occupancy reads that feed the sample when scope_stats is disabled.
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
-
-// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each
-// wrap. Strong definition lives in the AICPU collector; host builds fall back to
-// this weak no-op so the runtime translation unit stays self-contained.
-extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
-
-// =============================================================================
-// Orchestrator Profiling (compile-time toggle)
-// =============================================================================
-#if PTO2_ORCH_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-// Weak fallback for builds that don't link device_time.cpp (e.g. host).
-// The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
-//
-// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from
-// exporting this weak fallback into the global dynamic symbol table via
-// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry
-// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's
-// weak definition first (already in global table) and uses it — returning 0.
-// With hidden visibility, the HOST .so does not export this symbol globally,
-// so the AICPU .so's PLT resolves to its own strong definition from
-// device_time.cpp.
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
-// The strong symbol from the AICPU build wins when profiling is available.
-// Also hidden to prevent HOST .so from polluting the global symbol table.
-__attribute__((weak, visibility("hidden"))) void
-l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
-// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
-static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
-static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
-static uint64_t g_orch_args_cycle = 0;       // param copy
-static uint64_t g_orch_lookup_cycle = 0;     // tensormap lookup + dep building
-static uint64_t g_orch_insert_cycle = 0;     // tensormap insert
-static uint64_t g_orch_fanin_cycle = 0;      // fanin list + early-return check
-static uint64_t g_orch_scope_end_cycle = 0;  // scope_end overhead
-static int64_t g_orch_submit_count = 0;
-static uint32_t g_orch_submit_idx = 0;
-uint64_t g_orch_alloc_wait_cycle = 0;
-uint64_t g_orch_fanin_wait_cycle = 0;
-uint64_t g_orch_alloc_atomic_count = 0;
-uint64_t g_orch_args_atomic_count = 0;
-uint64_t g_orch_scope_end_atomic_count = 0;
-// Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what
-// the flag is for) and feeds the per-sub-step `g_orch_*_cycle` cumulatives
-// printed in the cold-path log.
-//
-// Per-submit ORCH_SUBMIT record is the only swim-lane emit on the orch
-// path — one record per submit_task() / alloc_tensors() call spanning
-// the entire [start, end] window. Per-sub-step phase records were dropped
-// in favour of the cumulatives + per-submit envelope; the dispatcher
-// already inserts one record at the end of each submit path via
-// CYCLE_COUNT_ORCH_SUBMIT_RECORD.
-#define CYCLE_COUNT_START()                                                        \
-    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
-    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                                       \
-    uint64_t _submit_start_ts = _t0
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
-    do {                                                                                          \
-        if (_prof_active) {                                                                       \
-            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
-        }                                                                                         \
-    } while (0)
-#elif PTO2_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
-// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
-static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START()                                                        \
-    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
-    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
-    uint64_t _submit_start_ts = _t0
-#define CYCLE_COUNT_LAP(acc) \
-    do {                     \
-    } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
-    do {                                                                                          \
-        if (_prof_active) {                                                                       \
-            _t1 = get_sys_cnt_aicpu();                                                            \
-            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
-        }                                                                                         \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)
-#endif
-
-static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) {
-    always_assert(orch != nullptr);
-    orch->fatal = true;
-    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) {
-        return PTO2_ERROR_NONE;
-    }
-
-    int32_t expected = PTO2_ERROR_NONE;
-    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
-    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
-        return error_code;
-    }
-    return expected;
-}
-
-static void
-orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
-    int32_t latched_code = orch_mark_fatal(orch, error_code);
-
-#if PTO2_PROFILING
-    // Flush the current scope's peaks BEFORE the FATAL log line, so the
-    // diagnostic context (which pool/window filled up) appears right next to
-    // the failure reason. on_fatal is latched, so duplicate fatals from
-    // different layers don't print multiple stats lines.
-    scope_stats_on_fatal();
-#endif
-
-    if (fmt == nullptr || fmt[0] == '\0') {
-        if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
-            unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
-        } else {
-            unified_log_error(func, "FATAL(code=%d)", error_code);
-        }
-        return;
-    }
-
-    char message[1024];
-    vsnprintf(message, sizeof(message), fmt, args);
-    if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
-        unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message);
-        return;
-    }
-    unified_log_error(func, "FATAL(code=%d): %s", error_code, message);
-}
-
-void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) {
-    auto *orch = this;
-    va_list args;
-    va_start(args, fmt);
-    orch_report_fatal_v(orch, error_code, func, fmt, args);
-    va_end(args);
-}
-
-struct PTO2FaninBuilder {
-    PTO2FaninBuilder(PTO2FaninPool &spill_pool) :
-        count(0),
-        spill_start(0),
-        spill_pool(spill_pool) {}
-    int32_t count{0};
-    int32_t spill_start{0};
-    PTO2FaninPool &spill_pool;
-    PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP];
-
-    template <typename Fn>
-    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const {
-        return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast<Fn &&>(fn));
-    }
-
-    bool contains(PTO2TaskSlotState *prod_state) const {
-        bool found = false;
-        for_each([&](PTO2TaskSlotState *slot_state) {
-            if (slot_state == prod_state) {
-                found = true;
-                return false;
-            }
-            return true;
-        });
-        if (found) {
-            return true;
-        }
-        return false;
-    }
-};
-
-static bool append_fanin_or_fail(
-    PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder, uint8_t ring_id
-) {
-    if (fanin_builder->contains(prod_state)) {
-        return true;
-    }
-
-    if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) {
-        fanin_builder->inline_slots[fanin_builder->count++] = prod_state;
-        return true;
-    }
-
-    PTO2FaninPool &fanin_pool = fanin_builder->spill_pool;
-    if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) {
-        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
-        return false;
-    }
-    int32_t spill_idx = fanin_pool.top;
-    PTO2FaninSpillEntry *entry = fanin_pool.alloc();
-    if (entry == nullptr) {
-        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
-        return false;
-    }
-    if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) {
-        fanin_builder->spill_start = spill_idx;
-    }
-    entry->slot_state = prod_state;
-    fanin_builder->count++;
-    return true;
-}
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
-
-struct PTO2PreparedTask {
-    PTO2TaskId task_id = PTO2TaskId::invalid();
-    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
-    PTO2TaskDescriptor *task = nullptr;
-    PTO2TaskPayload *payload = nullptr;
-    PTO2TaskSlotState *slot_state = nullptr;
-};
-
-static PTO2OutputLayout calculate_output_layout(const Arg &args) {
-    PTO2OutputLayout layout;
-    for (int32_t i = 0; i < args.tensor_count(); i++) {
-        if (args.tag(i) != TensorArgType::OUTPUT) {
-            continue;
-        }
-        layout.offsets[i] = layout.total_output_size;
-        layout.buffer_sizes[i] =
-            PTO2_ALIGN_UP(args.tensor(i).create_info->buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
-        layout.total_output_size += layout.buffer_sizes[i];
-    }
-    return layout;
-}
-
-static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) {
-    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
-
-    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
-    if (scope_task_count < allocator.window_size() - 1) {
-        return true;
-    }
-
-    int32_t active_count = allocator.active_count();
-
-    LOG_ERROR("========================================");
-    LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id);
-    LOG_ERROR("========================================");
-    LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size());
-    LOG_ERROR("  scope_depth:        %d", orch->scope_stack_top + 1);
-    LOG_ERROR("  ring_id:            %d", ring_id);
-    LOG_ERROR("  scope_task_count:   %d", scope_task_count);
-    LOG_ERROR("  active_tasks:       %d / %d", active_count, allocator.window_size());
-    LOG_ERROR("Root Cause:");
-    LOG_ERROR("  Tasks within a scope hold a fanout_count reference that is only");
-    LOG_ERROR("  released at scope_end. When scope task count >= window_size,");
-    LOG_ERROR("  no slots can be reclaimed -> deadlock.");
-    LOG_ERROR("Solution:");
-    LOG_ERROR("  1. Reduce tasks per scope (use batching/unroll)");
-    LOG_ERROR("  2. Increase task window (current: %d)", allocator.window_size());
-    LOG_ERROR("     Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
-    LOG_ERROR("     Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2>");
-    LOG_ERROR("  3. Split work across multiple scopes");
-    LOG_ERROR("========================================");
-    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
-    return false;
-}
-
-static void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count) {
-    for (int32_t i = 0; i < tensor_count; i++) {
-        __builtin_prefetch(&payload->tensors[i], 1, 3);
-        __builtin_prefetch(reinterpret_cast<char *>(&payload->tensors[i]) + 64, 1, 3);
-    }
-    for (int32_t i = 0; i < scalar_count; i += 8) {
-        __builtin_prefetch(&payload->scalars[i], 1, 3);
-    }
-    __builtin_prefetch(payload, 1, 3);
-    __builtin_prefetch(reinterpret_cast<char *>(payload) + 64, 1, 3);
-    __builtin_prefetch(reinterpret_cast<char *>(payload) + 128, 1, 3);
-}
-
-static bool prepare_task(
-    PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask,
-    PTO2PreparedTask *out
-) {
-    uint8_t ring_id = orch->current_ring_id();
-    auto &allocator = orch->rings[ring_id].task_allocator;
-
-    if (!check_scope_can_accept_task(orch, allocator, ring_id)) {
-        return false;
-    }
-
-    out->alloc_result = allocator.alloc(total_output_size);
-    if (out->alloc_result.failed()) {
-        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
-        return false;
-    }
-
-    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
-    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
-    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
-    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
-
-    prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
-
-    // Re-bind payload/task pointers each submit. Value is per-slot constant
-    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
-    // here lets RingSchedState::init() skip the O(window_size) bind loop.
-    // Both writes hit the same 64B slot_state cache line we're about to
-    // dirty below, so the extra cost is two stores on an already-hot line.
-    // Must precede the scheduler wiring.queue.push at the end of
-    // submit_task_common — that push is the first read of slot_state->task /
-    // slot_state->payload by another thread.
-    out->slot_state->bind_buffers(out->payload, out->task);
-
-    // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
-    //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
-    //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
-    // Fields immutable after RingSchedState::init():
-    //   ring_id
-    // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
-    // observers); set to PENDING here when orchestrator actually reuses the slot.
-    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
-    int16_t block_num = args.launch_spec.block_num();
-    out->slot_state->total_required_subtasks =
-        static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
-    out->slot_state->logical_block_num = block_num;
-    out->slot_state->active_mask = active_mask;
-    // fanin_count is set by scheduler during wiring
-    scope_tasks_push(orch, out->slot_state);
-
-    return true;
-}
-
-// =============================================================================
-// Scope Management
-// =============================================================================
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
-    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
-        // scope_tasks lives in the per-Worker arena (single backing allocation),
-        // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP ==
-        // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot
-        // budget — hitting it means every ring is saturated, so no further push
-        // could succeed regardless of buffer growth.
-        orch->report_fatal(
-            PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__,
-            "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity
-        );
-        return;
-    }
-    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
-}
-
-void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
-    auto *orch = this;
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
-    if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
-        return;
-    }
-
-    bool already_in_manual_scope = orch->in_manual_scope();
-    ++orch->scope_stack_top;
-    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
-    if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
-        orch->manual_begin_depth = orch->scope_stack_top;
-    }
-#if PTO2_PROFILING
-    // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the
-    // collector call: when disabled we pay nothing. Sample the current ring's
-    // task/heap start-end and tensormap usage at the scope boundary.
-    if (is_scope_stats_enabled()) {
-        uint8_t ring_id = orch->current_ring_id();
-        auto &alloc = orch->rings[ring_id].task_allocator;
-        int32_t dep_pool_tail = 0;
-        int32_t dep_pool_top = 0;
-        if (orch->scheduler) {
-            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
-        }
-        scope_stats_begin(
-            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
-            dep_pool_top, orch->tensor_map.current_used()
-        );
-    }
-#endif
-}
-
-void PTO2OrchestratorState::end_scope() {
-    auto *orch = this;
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
-
-    // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks
-    // via scheduler->on_scope_end, so the end record reflects the scope's
-    // occupancy at close, not the residual after teardown.
-#if PTO2_PROFILING
-    // Gate via is_scope_stats_enabled() (see begin_scope). One collector call
-    // emits the end-boundary record and tears down bookkeeping.
-    if (is_scope_stats_enabled()) {
-        uint8_t ring_id = orch->current_ring_id();
-        auto &alloc = orch->rings[ring_id].task_allocator;
-        int32_t dep_pool_tail = 0;
-        int32_t dep_pool_top = 0;
-        if (orch->scheduler) {
-            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
-        }
-        scope_stats_end(
-            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
-            dep_pool_top, orch->tensor_map.current_used()
-        );
-    }
-#endif
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se0 = get_sys_cnt_aicpu();
-#endif
-
-    bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
-    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
-    int32_t count = orch->scope_tasks_size - begin;
-    if (ending_manual_scope) {
-        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-    }
-
-    if (orch->scheduler && count > 0) {
-        orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
-    }
-
-    // Rewind the task buffer — these entries are no longer needed
-    orch->scope_tasks_size = begin;
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se1 = get_sys_cnt_aicpu();
-    g_orch_scope_end_cycle += (_se1 - _se0);
-#endif
-}
-
-// =============================================================================
-// Task Submission
-// =============================================================================
-
-// Shared body for submit_task / submit_dummy_task. Caller has already validated
-// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot
-// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin
-// computation (explicit_deps + auto), output registration, slot init, and pushes
-// to the scheduler wiring queue.
-static TaskOutputTensors submit_task_common(
-    PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id,
-    int32_t aiv1_kernel_id
-) {
-    CYCLE_COUNT_START();
-    TaskOutputTensors result;
-    PTO2OutputLayout layout = calculate_output_layout(args);
-    PTO2PreparedTask prepared;
-    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) {
-        return result;
-    }
-    uint8_t ring_id = prepared.task_id.ring();
-    PTO2SchedulerState *sched = orch->scheduler;
-    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
-    PTO2TaskId task_id = prepared.task_id;
-    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
-    PTO2TaskDescriptor &task = *prepared.task;
-    PTO2TaskPayload &payload = *prepared.payload;
-    result.set_task_id(task_id);
-
-    // dep_gen capture point: snapshot the orch submit_task inputs while the
-    // tensormap is still in its pre-lookup state for this task. Replay reads
-    // these records offline to reconstruct the complete dep graph — the sole
-    // source of truth for fanout now that the swimlane hot path no longer
-    // records it.
-    if (is_dep_gen_enabled()) {
-        const void *tensor_ptrs[MAX_TENSOR_ARGS];
-        // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record
-        // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow
-        // each tag here rather than letting the AICPU writer reinterpret a
-        // 4×-wider array as bytes — that path silently lost two of every three
-        // tags on little-endian and synthesized phantom self-edges in replay.
-        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
-        // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at
-        // MAX_TENSOR_ARGS: defensive against any future builder bypass /
-        // shared-memory bit-flip that could otherwise overrun the two
-        // MAX_TENSOR_ARGS-sized stack buffers above.
-        const int tc_raw = args.tensor_count();
-        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
-        for (int i = 0; i < tc; i++) {
-            // OUTPUT slots carry create_info (not yet a Tensor); skip them —
-            // they have no producer to look up and replay's per-tensor loop
-            // also skips OUTPUT.
-            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : args.tensor(i).ptr;
-            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
-        }
-        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
-        dep_gen_aicpu_record_submit(
-            task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8,
-            static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()),
-            kernel_ids_capture
-        );
-    }
-
-    PTO2FaninBuilder fanin_builder(orch->rings[ring_id].fanin_pool);
-
-    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
-
-#if PTO2_PROFILING
-    if (layout.total_output_size > 0) {
-        orch->buffers_allocated++;
-        orch->bytes_allocated += layout.total_output_size;
-    }
-#endif
-
-    // === STEP 2: Sync TensorMap validity and optional cleanup ===
-    // Read current last_task_alive from shared memory for this ring
-    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
-
-    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
-
-    CYCLE_COUNT_LAP(g_orch_sync_cycle);
-
-    for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
-        PTO2TaskId dep_task_id = args.explicit_dep(i);
-        if (!dep_task_id.is_valid()) {
-            orch->report_fatal(
-                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids"
-            );
-            return result;
-        }
-        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_task_id.ring()];
-        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
-        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (dep_local_task_id < dep_last_task_alive) {
-            continue;
-        }
-        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_task_id(dep_local_task_id);
-        if (!append_fanin_or_fail(orch, producer_slot_state, &fanin_builder, ring_id)) {
-            return result;
-        }
-    }
-
-    // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) ===
-    DepInputs dep_inputs{
-        args.tensor_count(),       args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()),
-        args.explicit_deps_data(),
-    };
-
-    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
-        PTO2TaskSlotState *prod_state =
-            &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local());
-        return append_fanin_or_fail(orch, prod_state, &fanin_builder, ring_id);
-    };
-
-    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) {
-        return result;
-    }
-
-    CYCLE_COUNT_LAP(g_orch_lookup_cycle);
-
-    // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
-    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
-
-    CYCLE_COUNT_LAP(g_orch_insert_cycle);
-
-    // === STEP 5: Batch-write to GM (single cache line burst) ===
-    // Deferred from allocation phase to avoid scattered GM writes that get
-    // evicted by TensorMap lookup/insert cache pressure.
-    __builtin_prefetch(&task, 1, 1);
-    task.task_id = task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
-    task.packed_buffer_base = prepared.alloc_result.packed_base;
-    task.packed_buffer_end = prepared.alloc_result.packed_end;
-
-    // Increment fanout_count on each producer (no lock — only orch writes this field).
-    // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count.
-    for_each_fanin_storage(
-        fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool,
-        [](PTO2TaskSlotState *producer) {
-            producer->fanout_count++;
-        }
-    );
-
-    int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP);
-    // Store fanin metadata in payload for scheduler to iterate
-    payload.fanin_actual_count = fanin_builder.count;
-    payload.fanin_spill_start = fanin_builder.spill_start;
-    payload.fanin_spill_pool = &fanin_builder.spill_pool;
-    for (int i = 0; i < inline_count; i++) {
-        payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i];
-    }
-
-    payload.init(args, result, prepared.alloc_result, layout);
-#if PTO2_PROFILING
-    if (is_dump_tensor_enabled()) {
-        if (args.scalar_count() > 0) {
-            set_dump_tensor_task_scalar_dtypes(
-                task_id.raw, static_cast<uint32_t>(args.scalar_count()), args.scalar_dtypes()
-            );
-        }
-        // Selective vs full dump is latched at dump_tensor_init from DumpDataHeader
-        // (host-decided before any dispatch), so it is race-free regardless of
-        // submission order. Here we only record each marked task's arg mask and
-        // metadata flags, which selective collection consults.
-        if (args.tensor_dump_arg_mask() != 0) {
-            set_dump_tensor_task_mask(
-                task_id.raw, args.tensor_dump_arg_mask(), args.tensor_dump_arg_index_ambiguous_mask()
-            );
-        }
-    }
-#endif
-
-    CYCLE_COUNT_LAP(g_orch_args_cycle);
-#if PTO2_ORCH_PROFILING
-    g_orch_args_atomic_count += 2;  // fanout_lock.store + fanout_count.store
-#endif
-
-    // === STEP 6: push to wiring queue ===
-    // Deferred wiring: orchestrator only stores dependency metadata and increments
-    // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished)
-    // is handled asynchronously by scheduler thread 0 via the wiring queue.
-    // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness
-    while (!sched->wiring.queue.push(&cur_slot_state)) {
-        SPIN_WAIT_HINT();
-    }
-
-    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
-    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
-
-#if PTO2_PROFILING
-    orch->tasks_submitted++;
-#if PTO2_ORCH_PROFILING
-    g_orch_submit_count++;
-#endif
-    g_orch_submit_idx++;
-#endif
-    return result;
-}
-
-TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const Arg &args) {
-    auto *orch = this;
-
-    // Orchestration API should short-circuit after fatal, but keep this entry
-    // robust as a no-op in case a caller reaches it directly.
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
-    if (args.has_error) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Invalid Arg Detected!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
-        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
-        LOG_ERROR("This is a bug in the orchestration code.");
-        LOG_ERROR("========================================");
-        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
-        return TaskOutputTensors{};
-    }
-    always_assert(orch->scheduler != nullptr);
-    // === Validate submit inputs ===
-    ActiveMask active_mask = mixed_kernels.to_active_mask();
-    always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
-
-    int16_t block_num = args.launch_spec.block_num();
-    always_assert(block_num >= 1 && "block_num must be >= 1");
-
-    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
-    // it to the aiv0 slot.  This guarantees the dispatch path can always use
-    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
-    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
-    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
-    MixedKernels normalized = mixed_kernels;
-    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
-    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
-    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
-    if (!has_aic && has_aiv1 && !has_aiv0) {
-        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
-        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
-        active_mask = normalized.to_active_mask();
-    }
-
-    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
-    if (block_num > 1 && args.launch_spec.require_sync_start()) {
-        // Deadlock check: block_num >= total available slots of the required type.
-        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
-        // For AIV:     limit is total_aiv_count.
-        PTO2ResourceShape shape = active_mask.to_shape();
-        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
-        if (limit > 0 && block_num > limit) {
-            report_fatal(
-                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
-                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
-            );
-            return TaskOutputTensors{};
-        }
-        active_mask.set_sync_start();
-    }
-
-    return submit_task_common(
-        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
-    );
-}
-
-// Submit a dependency-only task: full dependency graph participation
-// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
-// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
-// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
-// shape as submit_task; scalars are permitted but never consumed.
-TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const Arg &args) {
-    auto *orch = this;
-
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    if (args.has_error) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
-        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
-        LOG_ERROR("========================================");
-        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
-        return TaskOutputTensors{};
-    }
-    always_assert(orch->scheduler != nullptr);
-
-    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
-}
-
-TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) {
-    auto *orch = this;
-    // Orchestration API should short-circuit after fatal, but keep this entry
-    // robust as a no-op in case a caller reaches it directly.
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    if (args.tensor_count() <= 0) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
-        return TaskOutputTensors{};
-    }
-    if (args.scalar_count() != 0) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
-        return TaskOutputTensors{};
-    }
-    for (int32_t i = 0; i < args.tensor_count(); i++) {
-        if (args.tag(i) != TensorArgType::OUTPUT) {
-            report_fatal(
-                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
-            );
-            return TaskOutputTensors{};
-        }
-    }
-
-    CYCLE_COUNT_START();
-
-    if (args.has_error) {
-        report_fatal(
-            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
-        return TaskOutputTensors{};
-    }
-
-    PTO2OutputLayout layout = calculate_output_layout(args);
-    PTO2PreparedTask prepared;
-    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
-        return TaskOutputTensors{};
-    }
-
-    PTO2TaskDescriptor &task = *prepared.task;
-    PTO2TaskPayload &payload = *prepared.payload;
-
-    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
-
-#if PTO2_PROFILING
-    if (layout.total_output_size > 0) {
-        orch->buffers_allocated++;
-        orch->bytes_allocated += layout.total_output_size;
-    }
-#endif
-
-    task.task_id = prepared.task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
-    task.packed_buffer_base = prepared.alloc_result.packed_base;
-    task.packed_buffer_end = prepared.alloc_result.packed_end;
-
-    TaskOutputTensors outputs;
-    outputs.set_task_id(prepared.task_id);
-    payload.init(args, outputs, prepared.alloc_result, layout);
-    payload.fanin_actual_count = 0;
-    payload.fanin_spill_start = 0;
-    payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
-    CYCLE_COUNT_LAP(g_orch_args_cycle);
-
-    if (prepared.slot_state != nullptr) {
-        // Hidden alloc tasks complete inline in the orchestrator before any
-        // consumer can exist, so they have no fanout to notify and no worker
-        // subtasks to retire. Running the full on_mixed_task_complete path
-        // would only pay unnecessary fanout_lock / traversal overhead here.
-        // The generic slot initialization done in prepare_task() is still
-        // required so scope_end can release the producer-side reference and
-        // drive the slot to CONSUMED, but worker dispatch fields are never
-        // observed for hidden alloc tasks.
-        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-    }
-    orch->inline_completed_tasks++;
-
-    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
-    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
-
-#if PTO2_PROFILING
-    orch->tasks_submitted++;
-#if PTO2_ORCH_PROFILING
-    g_orch_submit_count++;
-#endif
-    g_orch_submit_idx++;
-#endif
-
-    return outputs;
-}
-
-// =============================================================================
-// Flow Control
-// =============================================================================
-
-void PTO2OrchestratorState::mark_done() {
-    auto *orch = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        int32_t total_tasks = orch->rings[r].task_allocator.active_count();
-        if (total_tasks > 0) {
-            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
-        }
-        auto &fanin_pool = orch->rings[r].fanin_pool;
-        if (fanin_pool.top > 1) {
-            LOG_INFO_V0(
-                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
-                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
-            );
-        }
-    }
-    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
-    orch->scope_tasks_size = 0;
-    orch->scope_stack_top = -1;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
-    g_orch_submit_idx = 0;
-#endif
-}
-
-#if PTO2_ORCH_PROFILING
-PTO2OrchProfilingData orchestrator_get_profiling() {
-    PTO2OrchProfilingData d;
-    d.sync_cycle = g_orch_sync_cycle;
-    d.alloc_cycle = g_orch_alloc_cycle;
-    d.args_cycle = g_orch_args_cycle;
-    d.lookup_cycle = g_orch_lookup_cycle;
-    d.insert_cycle = g_orch_insert_cycle;
-    d.fanin_cycle = g_orch_fanin_cycle;
-    d.scope_end_cycle = g_orch_scope_end_cycle;
-    d.submit_count = g_orch_submit_count;
-    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
-    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
-    d.alloc_atomic_count = g_orch_alloc_atomic_count;
-    d.args_atomic_count = g_orch_args_atomic_count;
-    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
-
-    // Reset
-    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
-    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
-    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
-    g_orch_submit_count = 0;
-    g_orch_submit_idx = 0;
-    g_orch_alloc_wait_cycle = 0;
-    g_orch_fanin_wait_cycle = 0;
-    g_orch_alloc_atomic_count = 0;
-    g_orch_args_atomic_count = 0;
-    g_orch_scope_end_atomic_count = 0;
-    return d;
-}
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index f45ff4897..081d97bf8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -8,22 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Orchestrator Interface
- *
- * The Orchestrator is responsible for:
- * 1. Executing the orchestration function (Turing-complete control flow)
- * 2. Allocating intermediate buffers from the heap
- * 3. Submitting tasks via async InCore function calls
- * 4. Building the dependency graph using TensorMap
- * 5. Managing buffer scopes for lifecycle control
- *
- * The Orchestrator can run on either:
- * - Host CPU (lower latency for complex control, easier debugging)
- * - Device AI_CPU (lower latency for task submission)
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #ifndef PTO_ORCHESTRATOR_H
 #define PTO_ORCHESTRATOR_H
@@ -33,18 +17,64 @@
 #include "pto_ring_buffer.h"
 #include "pto_runtime2_types.h"
 #include "pto_submit_types.h"
-#include "scheduler/pto_scheduler.h"
+#include "pto_scheduler.h"
 #include "pto_shared_memory.h"
 #include "pto_tensormap.h"
 #include "pto_types.h"
 
-/**
- * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds
- * arena offsets for every sub-region the orchestrator owns (per-ring fanin
- * pools, scope arrays, plus the nested PTO2TensorMap layout).
- */
-struct PTO2OrchestratorLayout {
-    size_t off_fanin_pool[PTO2_MAX_RING_DEPTH];
+#include <stdarg.h>
+#include <stdio.h>
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/dep_gen.h"
+#include "pto_dep_compute.h"
+#include "tensor.h"
+
+struct PTO2OrchestratorState;
+
+// Full definitions of helper aggregate types that the inline methods on
+// PTO2OrchestratorState (and the helpers below) construct by value.
+struct PTO2PreparedTask
+{
+    PTO2TaskId task_id = PTO2TaskId::invalid();
+    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
+    PTO2TaskDescriptor *task = nullptr;
+    PTO2TaskPayload *payload = nullptr;
+    PTO2TaskSlotState *slot_state = nullptr;
+};
+
+struct PTO2FaninBuilder
+{
+    int32_t count{0};
+    PTO2TaskSlotState *slots[PTO2_MAX_FANIN];
+
+    template <typename Fn>
+    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const
+    {
+        return for_each_fanin_in(slots, count, static_cast<Fn &&>(fn));
+    }
+
+    bool contains(PTO2TaskSlotState *prod_state) const
+    {
+        for (int32_t i = 0; i < count; i++)
+            if (slots[i] == prod_state) return true;
+        return false;
+    }
+};
+
+// Forward declarations of helpers defined below — needed because the inline
+// methods on PTO2OrchestratorState reference them.
+inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code);
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *fmt, va_list args);
+inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
+inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out);
+inline PTO2OutputLayout calculate_output_layout(const Arg &args);
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder);
+inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator);
+inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count);
+inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id);
+
+struct PTO2OrchestratorLayout
+{
     size_t off_scope_tasks;
     size_t off_scope_begins;
     PTO2TensorMapLayout tensor_map;
@@ -53,16 +83,8 @@ struct PTO2OrchestratorLayout {
     uint64_t scope_stack_capacity;
 };
 
-// =============================================================================
-// Orchestrator State
-// =============================================================================
-
-/**
- * Orchestrator state structure (private to Orchestrator)
- *
- * Contains all state needed for task graph construction and buffer management.
- */
-struct PTO2OrchestratorState {
+struct PTO2OrchestratorState
+{
     // === SHARED MEMORY ACCESS ===
     PTO2SharedMemoryHeader *sm_header;
 
@@ -72,10 +94,6 @@ struct PTO2OrchestratorState {
     // === TENSOR MAP (Private) ===
     PTO2TensorMap tensor_map;  // Producer lookup
 
-    // === SCOPE STACK (Private) ===
-    // Single contiguous buffer of task IDs, partitioned by scope level.
-    // scope_begins[i] is the index into scope_tasks where scope i starts.
-    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
     PTO2TaskSlotState **scope_tasks;  // Flat buffer of taskSlotState (all scopes concatenated)
     int32_t scope_tasks_size;         // Number of task IDs currently in the buffer
     int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
@@ -84,115 +102,478 @@ struct PTO2OrchestratorState {
     uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
     int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
 
-    // === SCHEDULER REFERENCE ===
-    // Note: In simulated mode, orchestrator and scheduler share address space
-    // In real mode, they communicate via shared memory only
     PTO2SchedulerState *scheduler;  // For simulated mode only
 
     // Total core counts set once at executor init; used for submit-time deadlock detection.
     int32_t total_cluster_count{0};  // AIC cores = MIX clusters
     int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
-#if PTO2_PROFILING
-    // L2 swimlane_level copied from get_l2_swimlane_level().
-    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
-#endif
 
     // === GM HEAP (for output buffers) ===
     void *gm_heap_base;     // Base address of GM heap
     uint64_t gm_heap_size;  // Total size of GM heap (all rings)
 
-    // === FATAL ERROR ===
-    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
-    // Cross-thread notification uses shared memory orch_error_code (atomic)
     bool fatal;
 
-    // Hidden alloc tasks complete synchronously inside the orchestrator and
-    // therefore bypass the executor's normal worker-completion counter path.
-    // The executor adds this count into its completed_tasks_ progress counter
-    // after orchestration finishes so shutdown/profiling totals remain closed.
     int64_t inline_completed_tasks{0};
 
     // === STATISTICS ===
-#if PTO2_PROFILING
-    int64_t tasks_submitted;
-    int64_t buffers_allocated;
-    int64_t bytes_allocated;
-#endif
-
-    /**
-     * Get current ring index from scope depth.
-     * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
-     */
-    uint8_t current_ring_id() const {
+
+    uint8_t current_ring_id() const
+    {
         int32_t depth = scope_stack_top;
         if (depth < 0) depth = 0;
         return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1;
     }
 
-    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
+    bool in_manual_scope() const
+    {
+        return scope_stack_top >= manual_begin_depth;
+    }
+
+    // === Cold-path API ===
+
+    static PTO2OrchestratorLayout reserve_layout(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity)
+    {
+        PTO2OrchestratorLayout layout{};
+        layout.dep_pool_capacity = dep_pool_capacity;
+        layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+        layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+
+        layout.off_scope_tasks = arena.reserve(static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *));
+        layout.off_scope_begins = arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
+        layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+        return layout;
+    }
+
+    bool init_data_from_layout(const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, uint64_t task_window_size)
+    {
+        auto *orch = this;
+        *orch = PTO2OrchestratorState{};
+
+        orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+        orch->gm_heap_base = gm_heap;
+        orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
+        orch->fatal = false;
+
+        // Mirror the SM API's per-ring window-size shape so a future per-ring
+        // SM layout cannot silently disagree with the addresses we compute here.
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+
+        auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
+            auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+            auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+            auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
 
-    // === Cold-path API (defined in pto_orchestrator.cpp) ===
+            orch->rings[r].task_allocator.init(task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, heap_size, orch_err);
+        }
 
-    // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
-    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
-    // the nested tensor_map layout. Returned layout is consumed by
-    // init_from_layout.
-    static PTO2OrchestratorLayout reserve_layout(
-        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
-    );
+        if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) return false;
 
-    // Phase 3a: write everything *except* arena-internal pointer fields.
-    // sm_dev_base is the SM device address (only stored, never dereferenced);
-    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
-    // on a host arena that holds the prebuilt image.
-    bool init_data_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
-        uint64_t task_window_size
-    );
+        orch->scope_tasks_size = 0;
+        orch->scope_tasks_capacity = layout.scope_tasks_cap;
+        orch->scope_stack_top = -1;
+        orch->scope_stack_capacity = layout.scope_stack_capacity;
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
 
-    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
-    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
-    // free_entry_list,task_entry_heads}, scheduler reference).
-    // Idempotent — host runs once on the image, AICPU runs once after attach.
-    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+        return true;
+    }
+
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg)
+    {
+        auto *orch = this;
+        orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+        orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+        orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+        orch->scheduler = scheduler_arg;
+    }
 
     // Forget pointers; arena owns the backing buffers.
-    void destroy();
-    void set_scheduler(PTO2SchedulerState *scheduler);
-    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
-    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
-    void end_scope();
-    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const Arg &args);
-    TaskOutputTensors submit_dummy_task(const Arg &args);
-    TaskOutputTensors alloc_tensors(const Arg &args);
-    void mark_done();
-};
+    void destroy()
+    {
+        auto *orch = this;
+        orch->tensor_map.destroy();
+        orch->scope_tasks = nullptr;
+        orch->scope_begins = nullptr;
+    }
+    void set_scheduler(PTO2SchedulerState *scheduler)
+    {
+        this->scheduler = scheduler;
+    }
+    void report_fatal(int32_t error_code, [[maybe_unused]] const char *func, const char *fmt, ...)
+    {
+        auto *orch = this;
+        va_list args;
+        va_start(args, fmt);
+        orch_report_fatal_v(orch, error_code, fmt, args);
+        va_end(args);
+    }
+    void begin_scope(PTO2ScopeMode mode)
+    {
+        auto *orch = this;
+        if (orch->fatal) return;
+        assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
+        if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope())
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
+            return;
+        }
+
+        bool already_in_manual_scope = orch->in_manual_scope();
+        ++orch->scope_stack_top;
+        orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
+        if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) orch->manual_begin_depth = orch->scope_stack_top;
+    }
+    void end_scope()
+    {
+        auto *orch = this;
+        if (orch->fatal) return;
+        assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
+
+        bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
+        int32_t begin = orch->scope_begins[orch->scope_stack_top--];
+        int32_t count = orch->scope_tasks_size - begin;
+        if (ending_manual_scope) orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+        if (orch->scheduler && count > 0) orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
+
+        // Rewind the task buffer — these entries are no longer needed
+        orch->scope_tasks_size = begin;
+    }
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const Arg &args)
+    {
+        auto *orch = this;
+
+        // Orchestration API should short-circuit after fatal, but keep this entry
+        // robust as a no-op in case a caller reaches it directly.
+        if (orch->fatal) return TaskOutputTensors{};
+
+        // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+        if (args.has_error)
+        {
+            orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+            return TaskOutputTensors{};
+        }
+        always_assert(orch->scheduler != nullptr);
+        // === Validate submit inputs ===
+        ActiveMask active_mask = mixed_kernels.to_active_mask();
+        always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
+
+        int16_t block_num = args.launch_spec.block_num();
+        always_assert(block_num >= 1 && "block_num must be >= 1");
+
+        MixedKernels normalized = mixed_kernels;
+        bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+        bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+        bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+        if (!has_aic && has_aiv1 && !has_aiv0)
+        {
+            normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+            normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+            active_mask = normalized.to_active_mask();
+        }
+
+        // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+        if (block_num > 1 && args.launch_spec.require_sync_start())
+        {
+            PTO2ResourceShape shape = active_mask.to_shape();
+            int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+            if (limit > 0 && block_num > limit)
+            {
+                report_fatal(PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__, "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit);
+                return TaskOutputTensors{};
+            }
+            active_mask.set_sync_start();
+        }
+
+        return submit_task_common(orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id);
+    }
+    TaskOutputTensors submit_dummy_task(const Arg &args)
+    {
+        auto *orch = this;
+
+        if (orch->fatal) return TaskOutputTensors{};
+
+        if (args.has_error)
+        {
+            orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+            return TaskOutputTensors{};
+        }
+        always_assert(orch->scheduler != nullptr);
+
+        return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+    }
+    TaskOutputTensors alloc_tensors(const Arg &args)
+    {
+        auto *orch = this;
+        // Orchestration API should short-circuit after fatal, but keep this entry
+        // robust as a no-op in case a caller reaches it directly.
+        if (orch->fatal) return TaskOutputTensors{};
+
+        if (args.tensor_count() <= 0)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+            return TaskOutputTensors{};
+        }
+        if (args.scalar_count() != 0)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+            return TaskOutputTensors{};
+        }
+        for (int32_t i = 0; i < args.tensor_count(); i++)
+        {
+            if (args.tag(i) != TensorArgType::OUTPUT)
+            {
+                report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+                return TaskOutputTensors{};
+            }
+        }
 
-// =============================================================================
-// Orchestrator Profiling Data
-// =============================================================================
-
-#if PTO2_ORCH_PROFILING
-struct PTO2OrchProfilingData {
-    uint64_t sync_cycle;
-    uint64_t alloc_cycle;  // Combined task slot + heap allocation
-    uint64_t args_cycle;
-    uint64_t lookup_cycle;
-    uint64_t insert_cycle;
-    uint64_t fanin_cycle;
-    uint64_t scope_end_cycle;
-    int64_t submit_count;
-    // Wait time tracking for blocking phases
-    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
-    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
-    // Atomic operation counts per phase
-    uint64_t alloc_atomic_count;
-    uint64_t args_atomic_count;
-    uint64_t scope_end_atomic_count;
+        if (args.has_error)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
+            return TaskOutputTensors{};
+        }
+
+        PTO2OutputLayout layout = calculate_output_layout(args);
+        PTO2PreparedTask prepared;
+        if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) return TaskOutputTensors{};
+
+        PTO2TaskDescriptor &task = *prepared.task;
+        PTO2TaskPayload &payload = *prepared.payload;
+
+        task.task_id = prepared.task_id;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+        task.packed_buffer_base = prepared.alloc_result.packed_base;
+        task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+        TaskOutputTensors outputs;
+        outputs.set_task_id(prepared.task_id);
+        payload.init(args, outputs, prepared.alloc_result, layout);
+        payload.fanin_count = 0;
+
+        if (prepared.slot_state != nullptr) prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+        orch->inline_completed_tasks++;
+
+        return outputs;
+    }
+    void mark_done()
+    {
+        auto *orch = this;
+        orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
+        orch->scope_tasks_size = 0;
+        orch->scope_stack_top = -1;
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    }
 };
 
-PTO2OrchProfilingData orchestrator_get_profiling();
-#endif
+// -----------------------------------------------------------------------------
+// Helpers
+// -----------------------------------------------------------------------------
+
+inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code)
+{
+    always_assert(orch != nullptr);
+    orch->fatal = true;
+    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) return PTO2_ERROR_NONE;
+
+    int32_t expected = PTO2_ERROR_NONE;
+    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
+    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) return error_code;
+    return expected;
+}
+
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *fmt, va_list args)
+{
+    int32_t latched_code = orch_mark_fatal(orch, error_code);
+
+    if (fmt == nullptr || fmt[0] == '\0') return;
+
+    char message[1024];
+    vsnprintf(message, sizeof(message), fmt, args);
+    (void)latched_code;
+    (void)message;
+}
+
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder)
+{
+    if (fanin_builder->contains(prod_state)) return true;
+    if (fanin_builder->count >= PTO2_MAX_FANIN)
+    {
+        orch_mark_fatal(orch, PTO2_ERROR_DEPENDENCY_OVERFLOW);
+        return false;
+    }
+    fanin_builder->slots[fanin_builder->count++] = prod_state;
+    return true;
+}
+
+inline PTO2OutputLayout calculate_output_layout(const Arg &args)
+{
+    PTO2OutputLayout layout;
+    for (int32_t i = 0; i < args.tensor_count(); i++)
+    {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        layout.offsets[i] = layout.total_output_size;
+        layout.buffer_sizes[i] = PTO2_ALIGN_UP(args.tensor(i).create_info->buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+        layout.total_output_size += layout.buffer_sizes[i];
+    }
+    return layout;
+}
+
+inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator)
+{
+    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
+
+    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
+    if (scope_task_count < allocator.window_size() - 1) return true;
+
+    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
+    return false;
+}
+
+inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count)
+{
+    for (int32_t i = 0; i < tensor_count; i++)
+    {
+        __builtin_prefetch(&payload->tensors[i], 1, 3);
+        __builtin_prefetch(reinterpret_cast<char *>(&payload->tensors[i]) + 64, 1, 3);
+    }
+    for (int32_t i = 0; i < scalar_count; i += 8) __builtin_prefetch(&payload->scalars[i], 1, 3);
+    __builtin_prefetch(payload, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 64, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 128, 1, 3);
+}
+
+inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out)
+{
+    uint8_t ring_id = orch->current_ring_id();
+    auto &allocator = orch->rings[ring_id].task_allocator;
+
+    if (!check_scope_can_accept_task(orch, allocator)) return false;
+
+    out->alloc_result = allocator.alloc(total_output_size);
+    if (out->alloc_result.failed())
+    {
+        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
+        return false;
+    }
+
+    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
+    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
+    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
+    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
+
+    prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
+
+    out->slot_state->bind_buffers(out->payload, out->task);
+
+    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    int16_t block_num = args.launch_spec.block_num();
+    out->slot_state->total_required_subtasks = static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
+    out->slot_state->logical_block_num = block_num;
+    out->slot_state->active_mask = active_mask;
+    scope_tasks_push(orch, out->slot_state);
+
+    return true;
+}
+
+inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state)
+{
+    if (orch->scope_tasks_size >= orch->scope_tasks_capacity)
+    {
+        orch->report_fatal(PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__, "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity);
+        return;
+    }
+    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
+}
+
+inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id)
+{
+    TaskOutputTensors result;
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) return result;
+    uint8_t ring_id = prepared.task_id.ring();
+    PTO2SchedulerState *sched = orch->scheduler;
+    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
+    PTO2TaskId task_id = prepared.task_id;
+    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+    result.set_task_id(task_id);
+
+    if (is_dep_gen_enabled())
+    {
+        const void *tensor_ptrs[MAX_TENSOR_ARGS];
+        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
+        const int tc_raw = args.tensor_count();
+        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
+        for (int i = 0; i < tc; i++)
+        {
+            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : args.tensor(i).ptr;
+            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
+        }
+        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
+        dep_gen_aicpu_record_submit(task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8, static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()), kernel_ids_capture);
+    }
+
+    PTO2FaninBuilder fanin_builder;
+
+    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
+    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
+
+    for (uint32_t i = 0; i < args.explicit_dep_count(); i++)
+    {
+        PTO2TaskId dep_task_id = args.explicit_dep(i);
+        if (!dep_task_id.is_valid())
+        {
+            orch->report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids");
+            return result;
+        }
+        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_task_id.ring()];
+        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
+        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (dep_local_task_id < dep_last_task_alive) continue;
+        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_task_id(dep_local_task_id);
+        if (!append_fanin_or_fail(orch, producer_slot_state, &fanin_builder)) return result;
+    }
+
+    DepInputs dep_inputs{
+        args.tensor_count(), args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()), args.explicit_deps_data(),
+    };
+
+    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
+        PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local());
+        return append_fanin_or_fail(orch, prod_state, &fanin_builder);
+    };
+
+    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) return result;
+
+    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
+
+    __builtin_prefetch(&task, 1, 1);
+    task.task_id = task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    for (int32_t i = 0; i < fanin_builder.count; i++) fanin_builder.slots[i]->fanout_count++;
+
+    payload.fanin_count = fanin_builder.count;
+    for (int32_t i = 0; i < fanin_builder.count; i++) payload.fanin_slot_states[i] = fanin_builder.slots[i];
+
+    payload.init(args, result, prepared.alloc_result, layout);
+
+    while (!sched->wiring.queue.push(&cur_slot_state)) SPIN_WAIT_HINT();
+
+    return result;
+}
 
 #endif  // PTO_ORCHESTRATOR_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
deleted file mode 100644
index f6009dc57..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - Ring Buffer Implementation
- *
- * Implements DepListPool ring buffer for zero-overhead dependency management.
- * TaskAllocator methods are defined inline in pto_ring_buffer.h.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_ring_buffer.h"
-#include <inttypes.h>
-#include <string.h>
-#include "common/unified_log.h"
-#include "scheduler/pto_scheduler.h"
-
-static void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code) {
-    if (error_code_ptr == nullptr) {
-        return;
-    }
-    int32_t expected = PTO2_ERROR_NONE;
-    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
-}
-
-// =============================================================================
-// Fanin Spill Pool Implementation
-// =============================================================================
-void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
-    if (sm_last_task_alive <= reclaim_task_cursor) return;
-
-    int32_t scan_end = sm_last_task_alive;
-    for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) {
-        PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id);
-        if (payload.fanin_spill_pool != this) {
-            continue;
-        }
-
-        int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
-        int32_t spill_edge_count = payload.fanin_actual_count - inline_count;
-        if (spill_edge_count > 0) {
-            advance_tail(payload.fanin_spill_start + spill_edge_count);
-        }
-    }
-    reclaim_task_cursor = scan_end;
-}
-
-bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
-    if (available() >= needed) return true;
-
-    int spin_count = 0;
-    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-    while (available() < needed) {
-        reclaim(ring, prev_last_alive);
-        if (available() >= needed) return true;
-
-        spin_count++;
-
-        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (cur_last_alive > prev_last_alive) {
-            spin_count = 0;
-            prev_last_alive = cur_last_alive;
-        }
-
-        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
-            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count);
-            LOG_ERROR(
-                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
-                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
-            );
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("  - Needed:        %d entries", needed);
-            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
-            LOG_ERROR("  - current_task:    %d", current);
-            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
-            LOG_ERROR("Diagnosis:");
-            LOG_ERROR("  last_task_alive is not advancing, so fanin spill pool tail");
-            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
-            LOG_ERROR("========================================");
-            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
-            return false;
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
-
-// =============================================================================
-// Dependency List Pool Implementation
-// =============================================================================
-void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
-    if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
-        int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
-        if (mark > 0) {
-            advance_tail(mark);
-        }
-        last_reclaimed = sm_last_task_alive;
-    }
-}
-
-bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
-    if (available() >= needed) return true;
-
-    int spin_count = 0;
-    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-    while (available() < needed) {
-        reclaim(ring, prev_last_alive);
-        if (available() >= needed) return true;
-
-        spin_count++;
-
-        // Progress detection: reset spin counter if last_task_alive advances
-        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (cur_last_alive > prev_last_alive) {
-            spin_count = 0;
-            prev_last_alive = cur_last_alive;
-        }
-
-        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
-            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
-            LOG_ERROR(
-                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
-                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
-            );
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("  - Needed:        %d entries", needed);
-            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
-            LOG_ERROR("  - current_task:    %d", current);
-            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
-            LOG_ERROR("Diagnosis:");
-            LOG_ERROR("  last_task_alive is not advancing, so dep pool tail");
-            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
-            LOG_ERROR("========================================");
-            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
-            return false;
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 4e04dc832..ebc91f324 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -8,28 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Ring Buffer Data Structures
- *
- * Implements ring buffer designs for zero-overhead memory management:
- *
- * 1. TaskAllocator - Unified task slot + output buffer allocation
- *    - Combines task ring (slot allocation) and heap ring (output buffer allocation)
- *    - Single spin-wait loop with unified back-pressure and deadlock detection
- *    - O(1) bump allocation for both task slots and heap buffers
- *
- * 2. FaninPool - Fanin spill entry allocation
- *    - Ring buffer for spilled fanin entries
- *    - O(1) append allocation
- *    - Implicit reclamation with task ring
- *
- * 3. DepListPool - Dependency list entry allocation
- *    - Ring buffer for linked list entries
- *    - O(1) prepend operation
- *    - Implicit reclamation with task ring
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #ifndef PTO_RING_BUFFER_H
 #define PTO_RING_BUFFER_H
@@ -40,14 +18,6 @@
 
 #include "pto_runtime2_types.h"
 #include "pto_shared_memory.h"
-#include "common/unified_log.h"
-
-#if PTO2_PROFILING
-// Heap-ring wrap reporting — the allocator is the only place each individual
-// wrap is observable, so it notifies the scope_stats collector here. Gated:
-// pays nothing (no include, no call) when profiling is compiled out.
-#include "aicpu/scope_stats_collector_aicpu.h"
-#endif
 
 // Block notification interval (in spin counts)
 #define PTO2_BLOCK_NOTIFY_INTERVAL 10000
@@ -57,41 +27,18 @@
 // Dep pool spin limit - if exceeded, dep pool capacity too small for workload
 #define PTO2_DEP_POOL_SPIN_LIMIT 100000
 
-// =============================================================================
-// Task Allocator (unified task slot + heap buffer allocation)
-// =============================================================================
-
-/**
- * Unified task slot + heap buffer allocator.
- *
- * Since task and heap are always allocated together and the orchestrator is
- * single-threaded, both pointers (task index, heap top) are tracked locally
- * and published to shared memory via plain store — no fetch_add or CAS needed.
- *
- * The alloc() method checks both resources BEFORE committing to either,
- * eliminating the need for rollback on partial failure.
- */
-class PTO2TaskAllocator {
+inline void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code)
+{
+    if (error_code_ptr == nullptr) return;
+    int32_t expected = PTO2_ERROR_NONE;
+    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
+}
+
+class PTO2TaskAllocator
+{
 public:
-    /**
-     * Initialize the allocator with task ring and heap ring resources.
-     *
-     * All pointer arguments are device addresses (live in SM / GM heap); this
-     * function only stores them, no dereferences, so it is safe to invoke
-     * from host code that constructs a prebuilt arena image.
-     *
-     * Production callers leave `initial_local_task_id` at 0: the SM ring
-     * flow-control counters that current_index_ptr / last_alive_ptr point at
-     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
-     * reset), so we keep local_task_id_ aligned with that without reading the
-     * SM. Tests that drive SM state directly may pass a non-zero seed to
-     * exercise corner cases like task IDs near INT32_MAX.
-     */
-    void init(
-        PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
-        int32_t initial_local_task_id = 0
-    ) {
+    void init(PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr, std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr, int32_t initial_local_task_id = 0)
+    {
         descriptors_ = descriptors;
         window_size_ = window_size;
         window_mask_ = window_size - 1;
@@ -106,69 +53,50 @@ class PTO2TaskAllocator {
         last_alive_seen_ = 0;
     }
 
-    /**
-     * Allocate a task slot and its associated output buffer in one call.
-     *
-     * Both task index and heap top are maintained as local counters and
-     * published to shared memory only on success. Since the orchestrator is
-     * single-threaded, no CAS or fetch_add is needed — just check-then-commit.
-     *
-     * @param output_size  Total packed output size in bytes (0 = no heap needed)
-     * @return Allocation result; check failed() for errors
-     */
-    PTO2TaskAllocResult alloc(int32_t output_size) {
-        uint64_t aligned_size =
-            output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
+    PTO2TaskAllocResult alloc(int32_t output_size)
+    {
+        uint64_t aligned_size = output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
 
         int spin_count = 0;
         int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire);
         int32_t last_alive = prev_last_alive;
         update_heap_tail(last_alive);
         bool blocked_on_heap = false;
-#if PTO2_ORCH_PROFILING
-        uint64_t wait_start = 0;
-        bool waiting = false;
-#endif
 
-        while (true) {
+        while (true)
+        {
             // Check both resources; commit only if both available
-            if (local_task_id_ - last_alive + 1 < window_size_) {
+            if (local_task_id_ - last_alive + 1 < window_size_)
+            {
                 void *heap_ptr = try_bump_heap(aligned_size);
-                if (heap_ptr) {
+                if (heap_ptr)
+                {
                     int32_t task_id = commit_task();
-#if PTO2_ORCH_PROFILING
-                    record_wait(spin_count, wait_start, waiting);
-#endif
                     return {task_id, task_id & window_mask_, heap_ptr, static_cast<char *>(heap_ptr) + aligned_size};
                 }
                 blocked_on_heap = true;
-            } else {
+            }
+            else
+            {
                 blocked_on_heap = false;
             }
 
             // Spin: wait for scheduler to advance last_task_alive
             spin_count++;
-#if PTO2_ORCH_PROFILING
-            if (!waiting) {
-                wait_start = get_sys_cnt_aicpu();
-                waiting = true;
-            }
-#endif
             last_alive = last_alive_ptr_->load(std::memory_order_acquire);
             update_heap_tail(last_alive);
-            if (last_alive > prev_last_alive) {
+            if (last_alive > prev_last_alive)
+            {
                 spin_count = 0;
                 prev_last_alive = last_alive;
-            } else {
-                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) {
-                    LOG_WARN(
-                        "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d",
-                        local_task_id_ - last_alive, window_size_, heap_top_, heap_size_,
-                        blocked_on_heap ? "heap" : "task", spin_count
-                    );
-                }
-                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT) {
-                    report_deadlock(output_size, blocked_on_heap);
+            }
+            else
+            {
+                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0)
+                {}
+                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT)
+                {
+                    report_deadlock(blocked_on_heap);
                     return {-1, -1, nullptr, nullptr};
                 }
             }
@@ -176,25 +104,33 @@ class PTO2TaskAllocator {
         }
     }
 
-    // =========================================================================
-    // State queries
-    // =========================================================================
-
-    int32_t active_count() const {
+    int32_t active_count() const
+    {
         int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
         return local_task_id_ - last_alive;
     }
 
     // Task ring start/end: tail = oldest live task (last_task_alive), head =
     // next task id to allocate. head - tail == active_count().
-    int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); }
-    int32_t task_head() const { return local_task_id_; }
+    int32_t task_tail() const
+    {
+        return last_alive_ptr_->load(std::memory_order_acquire);
+    }
+    int32_t task_head() const
+    {
+        return local_task_id_;
+    }
 
-    int32_t window_size() const { return window_size_; }
+    int32_t window_size() const
+    {
+        return window_size_;
+    }
 
-    uint64_t heap_available() const {
+    uint64_t heap_available() const
+    {
         uint64_t tail = heap_tail_;
-        if (heap_top_ >= tail) {
+        if (heap_top_ >= tail)
+        {
             uint64_t at_end = heap_size_ - heap_top_;
             uint64_t at_begin = tail;
             return at_end > at_begin ? at_end : at_begin;
@@ -202,12 +138,22 @@ class PTO2TaskAllocator {
         return tail - heap_top_;
     }
 
-    uint64_t heap_top() const { return heap_top_; }
+    uint64_t heap_top() const
+    {
+        return heap_top_;
+    }
     // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is
     // the end (next allocation). heap_top - heap_tail == heap_used_bytes().
-    uint64_t heap_tail() const { return heap_tail_; }
-    uint64_t heap_capacity() const { return heap_size_; }
-    uint64_t heap_used_bytes() const {
+    uint64_t heap_tail() const
+    {
+        return heap_tail_;
+    }
+    uint64_t heap_capacity() const
+    {
+        return heap_size_;
+    }
+    uint64_t heap_used_bytes() const
+    {
         if (heap_size_ == 0) return 0;
         return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
     }
@@ -233,461 +179,104 @@ class PTO2TaskAllocator {
     // --- Shared ---
     std::atomic<int32_t> *error_code_ptr_ = nullptr;
 
-    // =========================================================================
-    // Internal helpers
-    // =========================================================================
-
-    /**
-     * Commit a task slot: bump local counter and publish to shared memory.
-     * Must only be called after space check has passed.
-     */
-    int32_t commit_task() {
+    int32_t commit_task()
+    {
         int32_t task_id = local_task_id_++;
         current_index_ptr_->store(local_task_id_, std::memory_order_release);
         return task_id;
     }
 
-    /**
-     * Derive heap_tail_ from the last consumed task's packed_buffer_end.
-     *
-     * Every task has a valid packed_buffer_end (equal to packed_buffer_base
-     * for zero-size allocations), so the last consumed task always determines
-     * the correct heap_tail — no backward scan needed.
-     */
-    void update_heap_tail(int32_t last_alive) {
+    void update_heap_tail(int32_t last_alive)
+    {
         if (last_alive <= last_alive_seen_) return;
         last_alive_seen_ = last_alive;
 
         PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_];
-        uint64_t old_tail = heap_tail_;
-        heap_tail_ =
-            static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
-#if PTO2_PROFILING
-        // Reclaim pointer moves forward monotonically in ring order; a decrease
-        // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at
-        // most one wrap per call). Report it so scope_stats can unroll.
-        if (is_scope_stats_enabled() && heap_tail_ < old_tail) {
-            scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM);
-        }
-#else
-        (void)old_tail;
-#endif
+        heap_tail_ = static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
     }
 
-    /**
-     * Bump the heap pointer for the given allocation size.
-     * Returns the allocated pointer, or nullptr if insufficient space.
-     * When alloc_size == 0, returns current position without advancing.
-     */
-    void *try_bump_heap(uint64_t alloc_size) {
+    void *try_bump_heap(uint64_t alloc_size)
+    {
         uint64_t top = heap_top_;
-        if (alloc_size == 0) {
-            return static_cast<char *>(heap_base_) + top;
-        }
+        if (alloc_size == 0) return static_cast<char *>(heap_base_) + top;
         uint64_t tail = heap_tail_;
         void *result;
 
-        if (top >= tail) {
+        if (top >= tail)
+        {
             uint64_t space_at_end = heap_size_ - top;
-            if (space_at_end >= alloc_size) {
+            if (space_at_end >= alloc_size)
+            {
                 result = static_cast<char *>(heap_base_) + top;
                 heap_top_ = top + alloc_size;
-            } else if (tail > alloc_size) {
-                LOG_DEBUG(
-                    "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail,
-                    alloc_size
-                );
+            }
+            else if (tail > alloc_size)
+            {
                 result = heap_base_;
                 heap_top_ = alloc_size;
-#if PTO2_PROFILING
-                // Allocation pointer just wrapped past heap_size_; report it so
-                // scope_stats can unroll the wrapping offset into a monotonic value.
-                // The collector attributes the wrap to the current scope's ring.
-                if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC);
-#endif
-            } else {
-                LOG_DEBUG(
-                    "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
-                    ", heap_size=%" PRIu64,
-                    top, tail, alloc_size, heap_size_
-                );
-                return nullptr;
             }
-        } else {
-            if (tail - top > alloc_size) {
-                result = static_cast<char *>(heap_base_) + top;
-                heap_top_ = top + alloc_size;
-            } else {
-                LOG_DEBUG(
-                    "try_bump_heap failed (top<tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
-                    ", free_gap=%" PRIu64,
-                    top, tail, alloc_size, tail - top
-                );
+            else
+            {
                 return nullptr;
             }
         }
-
-        return result;
-    }
-
-#if PTO2_ORCH_PROFILING
-    void record_wait(int spin_count, uint64_t wait_start, bool waiting) {
-        if (waiting) {
-            extern uint64_t g_orch_alloc_wait_cycle;
-            g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start);
+        else if (tail - top > alloc_size)
+        {
+            result = static_cast<char *>(heap_base_) + top;
+            heap_top_ = top + alloc_size;
         }
+        else
         {
-            extern uint64_t g_orch_alloc_atomic_count;
-            g_orch_alloc_atomic_count += spin_count + 1;
+            return nullptr;
         }
+
+        return result;
     }
-#endif
 
-    /**
-     * Report deadlock with targeted diagnostics.
-     */
-    void report_deadlock(int32_t requested_output_size, bool heap_blocked) {
-        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
-        int32_t active_tasks = local_task_id_ - last_alive;
-        uint64_t htail = heap_tail_;
-
-        LOG_ERROR("========================================");
-        if (heap_blocked) {
-            LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!");
-        } else {
-            LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!");
-        }
-        LOG_ERROR("========================================");
-        LOG_ERROR("No progress after %d spins.", PTO2_ALLOC_SPIN_LIMIT);
-        LOG_ERROR(
-            "  Task ring:  current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks,
-            window_size_, 100.0 * active_tasks / window_size_
-        );
-        LOG_ERROR(
-            "  Heap ring:  top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail,
-            heap_size_, heap_available()
-        );
-        if (heap_blocked) {
-            LOG_ERROR("  Requested:  %d bytes", requested_output_size);
-        }
-        LOG_ERROR("Diagnosis:");
-        LOG_ERROR("  last_task_alive is stuck at %d, meaning task %d", last_alive, last_alive);
-        LOG_ERROR("  cannot transition to CONSUMED. Possible causes:");
-        LOG_ERROR("  1. Task %d still executing (subtasks not complete)", last_alive);
-        LOG_ERROR("  2. Task %d fanout not fully released (downstream not done)", last_alive);
-        LOG_ERROR("  3. Scope reference not released (scope_end not called)");
-        LOG_ERROR("  4. Orchestrator blocked here -> can't call scope_end -> circular wait");
-        LOG_ERROR("Solution:");
-        if (heap_blocked) {
-            LOG_ERROR(
-                "  Increase heap size (current: %" PRIu64 ", recommended: %" PRIu64 ")", heap_size_, heap_size_ * 2
-            );
-            LOG_ERROR("  Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_HEAP=<power-of-2 bytes> (e.g. %" PRIu64 ")", heap_size_ * 2);
-        } else {
-            LOG_ERROR("  Increase task window size (current: %d, recommended: %d)", window_size_, active_tasks * 2);
-            LOG_ERROR("  Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2> (e.g. %d)", active_tasks * 2);
-        }
-        LOG_ERROR("========================================");
-        if (error_code_ptr_) {
+    void report_deadlock(bool heap_blocked)
+    {
+        if (error_code_ptr_)
+        {
             int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
             error_code_ptr_->store(code, std::memory_order_release);
         }
     }
 };
 
-// =============================================================================
-// Fanin Spill Pool
-// =============================================================================
-
-/**
- * Fanin spill pool structure
- *
- * True ring buffer for allocating spilled fanin entries.
- * Entries are reclaimed when their consumer tasks become CONSUMED.
- *
- * Linear counters (top, tail) grow monotonically; the physical index
- * is obtained via modulo: base[linear_index % capacity].
- */
-struct PTO2FaninPool {
-    PTO2FaninSpillEntry *base;       // Pool base address
-    int32_t capacity;                // Total number of entries
-    int32_t top;                     // Linear next-allocation counter (starts from 1)
-    int32_t tail;                    // Linear first-alive counter (entries before this are dead)
-    int32_t high_water;              // Peak concurrent usage (top - tail)
-    int32_t reclaim_task_cursor{0};  // Last task id scanned for reclaim on this pool
-
-    std::atomic<int32_t> *error_code_ptr = nullptr;
-
-    void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
-        base = in_base;
-        capacity = in_capacity;
-        top = 1;
-        tail = 1;
-        high_water = 0;
-        reclaim_task_cursor = 0;
-        base[0].slot_state = nullptr;
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
-
-    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
-
-    PTO2FaninSpillEntry *alloc() {
-        int32_t used = top - tail;
-        if (used >= capacity) {
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Fanin Spill Pool Overflow!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity);
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
-            LOG_ERROR("========================================");
-            if (error_code_ptr) {
-                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
-            }
-            return nullptr;
-        }
-        int32_t idx = top % capacity;
-        top++;
-        used++;
-        if (used > high_water) high_water = used;
-        return &base[idx];
-    }
-
-    void advance_tail(int32_t new_tail) {
-        if (new_tail > tail) {
-            tail = new_tail;
-        }
-    }
-
-    int32_t used() const { return top - tail; }
-
-    int32_t available() const { return capacity - used(); }
-};
-
 template <typename Fn>
 using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
 
 template <typename Fn>
 using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
 
-template <typename InlineSlots, typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_storage(
-    InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn
-) {
+template <typename Slots, typename Fn>
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_in(Slots &&slot_states, int32_t fanin_count, Fn &&fn)
+{
     using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
-    static_assert(
-        std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>,
-        "fanin callback must return void or bool"
-    );
-
-    if constexpr (std::is_void_v<FaninCallbackResult>) {
-        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
-        for (int32_t i = 0; i < inline_count; i++) {
-            fn(inline_slot_states[i]);
-        }
-
-        int32_t spill_count = fanin_count - inline_count;
-        if (spill_count <= 0) {
-            return;
-        }
-
-        int32_t start_idx = spill_start % spill_pool.capacity;
-        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
-        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
-        for (int32_t i = 0; i < first_count; i++) {
-            fn(first[i].slot_state);
-        }
+    static_assert(std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>, "fanin callback must return void or bool");
 
-        int32_t second_count = spill_count - first_count;
-        for (int32_t i = 0; i < second_count; i++) {
-            fn(spill_pool.base[i].slot_state);
-        }
-        return;
-    } else {
-        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
-        for (int32_t i = 0; i < inline_count; i++) {
-            if (!fn(inline_slot_states[i])) {
-                return false;
-            }
-        }
-
-        int32_t spill_count = fanin_count - inline_count;
-        if (spill_count <= 0) {
-            return true;
-        }
-
-        int32_t start_idx = spill_start % spill_pool.capacity;
-        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
-        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
-        for (int32_t i = 0; i < first_count; i++) {
-            if (!fn(first[i].slot_state)) {
-                return false;
-            }
-        }
-
-        int32_t second_count = spill_count - first_count;
-        for (int32_t i = 0; i < second_count; i++) {
-            if (!fn(spill_pool.base[i].slot_state)) {
-                return false;
-            }
-        }
+    if constexpr (std::is_void_v<FaninCallbackResult>)
+    {
+        for (int32_t i = 0; i < fanin_count; i++) fn(slot_states[i]);
+    }
+    else
+    {
+        for (int32_t i = 0; i < fanin_count; i++)
+            if (!fn(slot_states[i])) return false;
         return true;
     }
 }
 
 template <typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) {
-    return for_each_fanin_storage(
-        payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start,
-        *payload.fanin_spill_pool, static_cast<Fn &&>(fn)
-    );
+inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn)
+{
+    return for_each_fanin_in(payload.fanin_slot_states, payload.fanin_count, static_cast<Fn &&>(fn));
 }
 
-// =============================================================================
-// Dependency List Pool
-// =============================================================================
-
-/**
- * Dependency list pool structure
- *
- * True ring buffer for allocating linked list entries.
- * Entries are reclaimed when their producer tasks become CONSUMED,
- * as tracked by the orchestrator via dep_pool_mark per task.
- *
- * Linear counters (top, tail) grow monotonically; the physical index
- * is obtained via modulo: base[linear_index % capacity].
- */
-struct PTO2DepListPool {
-    PTO2DepListEntry *base;     // Pool base address
-    int32_t capacity;           // Total number of entries
-    int32_t top;                // Linear next-allocation counter (starts from 1)
-    int32_t tail;               // Linear first-alive counter (entries before this are dead)
-    int32_t high_water;         // Peak concurrent usage (top - tail)
-    int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation
-
-    // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
-    std::atomic<int32_t> *error_code_ptr = nullptr;
-
-    /**
-     *
-     * Initialize dependency list pool
-     * @param base      Pool base address from shared memory
-     * @param capacity  Total number of entries
-     */
-    void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
-        base = in_base;
-        capacity = in_capacity;
-        top = 1;   // Start from 1, 0 means NULL/empty
-        tail = 1;  // Match initial top (no reclaimable entries yet)
-        high_water = 0;
-        last_reclaimed = 0;
-
-        // Initialize entry 0 as NULL marker
-        base[0].slot_state = nullptr;
-        base[0].next = nullptr;
-
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    /**
-     * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
-     * Safe to call multiple times — only advances tail forward.
-     *
-     * @param ring             Ring header (for reading slot dep_pool_mark)
-     * @param sm_last_task_alive Current last_task_alive from shared memory
-     */
-    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
-
-    /**
-     * Ensure dep pool for a specific ring has at least `needed` entries available.
-     * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
-     */
-    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
-
-    /**
-     * Allocate a single entry from the pool (single-thread per pool instance)
-     *
-     * @return Pointer to allocated entry, or nullptr on fatal error
-     */
-    PTO2DepListEntry *alloc() {
-        int32_t used = top - tail;
-        if (used >= capacity) {
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Dependency Pool Overflow!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity);
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
-            LOG_ERROR("========================================");
-            if (error_code_ptr) {
-                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
-            }
-            return nullptr;
-        }
-        int32_t idx = top % capacity;
-        top++;
-        used++;
-        if (used > high_water) high_water = used;
-        return &base[idx];
-    }
 
-    /**
-     * Advance the tail pointer, reclaiming dead entries.
-     * Called by the orchestrator based on last_task_alive advancement.
-     */
-    void advance_tail(int32_t new_tail) {
-        if (new_tail > tail) {
-            tail = new_tail;
-        }
-    }
-
-    /**
-     * Prepend a task ID to a dependency list
-     *
-     * O(1) operation: allocates new entry and links to current head.
-     *
-     * @param current_head  Current list head offset (0 = empty list)
-     * @param task_slot     Task slot to prepend
-     * @return New head offset
-     */
-    PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) {
-        PTO2DepListEntry *new_entry = alloc();
-        if (!new_entry) return nullptr;
-        new_entry->slot_state = slot_state;
-        new_entry->next = cur;
-        return new_entry;
-    }
-
-    int32_t used() const { return top - tail; }
-
-    int32_t available() const { return capacity - used(); }
-};
-
-// =============================================================================
-// Ring Set (per-depth aggregate)
-// =============================================================================
-
-/**
- * Groups a TaskAllocator and DepPool into one per-depth unit.
- * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth.
- */
-struct PTO2RingSet {
+struct PTO2RingSet
+{
     PTO2TaskAllocator task_allocator;
-    PTO2FaninPool fanin_pool;
 };
 
 #endif  // PTO_RING_BUFFER_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
deleted file mode 100644
index 8aee802b1..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Main Implementation
- *
- * Implements the unified runtime API that combines orchestrator and scheduler.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_runtime2.h"
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <algorithm>
-
-#include "aicpu/device_time.h"
-#include "common/unified_log.h"
-#if PTO2_PROFILING
-#include "aicpu/scope_stats_collector_aicpu.h"
-#endif
-
-// Weak fallback for HOST .so builds (never called, but satisfies linker).
-// The AICPU build links the strong symbol from platform/.../device_time.cpp.
-// Hidden visibility prevents HOST .so from polluting global symbol table.
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-
-// =============================================================================
-// Orchestration Ops Table (function-pointer dispatch for orchestration .so)
-// =============================================================================
-
-static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) {
-    return rt->orchestrator.submit_task(mixed_kernels, args);
-}
-
-static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const Arg &args) {
-    return rt->orchestrator.alloc_tensors(args);
-}
-
-static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const Arg &args) {
-    return rt->orchestrator.submit_dummy_task(args);
-}
-
-void rt_scope_begin(PTO2Runtime *rt) {
-    PTO2ScopeMode mode = rt->pending_scope_mode;
-    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
-    rt->orchestrator.begin_scope(mode);
-}
-
-void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); }
-
-void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); }
-
-static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
-
-void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    if (fmt == nullptr || fmt[0] == '\0') {
-        rt->orchestrator.report_fatal(error_code, func, nullptr);
-    } else {
-        char message[1024];
-        vsnprintf(message, sizeof(message), fmt, args);
-        rt->orchestrator.report_fatal(error_code, func, "%s", message);
-    }
-    va_end(args);
-}
-
-// Wait for all producers of this tensor to be safe for data access.
-// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers).
-// For reads: wait until each producer COMPLETED (done writing).
-// For writes: also wait until all consumers done reading
-//   (fanout_refcount >= fanout_count - 1, excluding scope reference).
-// Uses cycle-based timeout (checked every 1024 spins).
-// Returns false on timeout (sets orch.fatal).
-MAYBE_UNINITIALIZED_BEGIN
-static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) {
-    PTO2TaskId owner = tensor.owner_task_id;
-    PTO2OrchestratorState &orch = rt->orchestrator;
-
-    // Segmented wait: collect up to kSegmentCap producer slots, then flush by
-    // spinning on each. When the segment fills, we wait for the accumulated
-    // batch before continuing to gather more. Dedup is per-segment only; a
-    // producer that appears in two segments is waited on twice, which is
-    // idempotent (task_state is monotonic) and only adds one atomic load on
-    // the second encounter.
-    constexpr int kSegmentCap = 64;
-    const PTO2TaskSlotState *seg[kSegmentCap];
-    int seg_count = 0;
-    bool signaled = false;
-    bool failed = false;
-
-    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
-        uint8_t ring_id = slot.ring_id;
-        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
-        uint64_t t0 = get_sys_cnt_aicpu();
-        int32_t spin_count = 0;
-        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) {
-            SPIN_WAIT_HINT();
-            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
-                orch.report_fatal(
-                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
-                    "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed",
-                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
-                );
-                failed = true;
-                return;
-            }
-        }
-    };
-
-    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
-        uint8_t ring_id = slot.ring_id;
-        int32_t local_id = slot.task->task_id.local();
-        uint64_t t0 = get_sys_cnt_aicpu();
-        int32_t spin_count = 0;
-        while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1) {
-            SPIN_WAIT_HINT();
-            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
-                orch.report_fatal(
-                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
-                    "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done",
-                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
-                );
-                failed = true;
-                return;
-            }
-        }
-    };
-
-    auto flush_segment = [&]() {
-        for (int i = 0; i < seg_count; i++) {
-            wait_one_producer(*seg[i]);
-            if (failed) return;
-            if (!wait_for_consumers) continue;
-            wait_one_consumers(*seg[i]);
-            if (failed) return;
-        }
-        seg_count = 0;
-    };
-
-    auto try_push = [&](const PTO2TaskSlotState &s) {
-        for (int j = 0; j < seg_count; j++) {
-            if (seg[j] == &s) return;  // per-segment dedup
-        }
-        if (seg_count == kSegmentCap) {
-            flush_segment();
-            if (failed) return;
-        }
-        seg[seg_count++] = &s;
-        if (!signaled) {
-            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
-            signaled = true;
-        }
-    };
-
-    auto do_wait = [&]() {
-        // Step A: creator retention — read owner directly from tensor metadata
-        if (owner.is_valid()) {
-            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
-            try_push(s);
-            if (failed) return;
-        }
-
-        // Step B: modifier writer lookup (OverlapMap), direct callback
-        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
-            PTO2TaskId pid = entry.producer_task_id;
-            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
-            try_push(s);
-            return !failed;
-        });
-        if (failed) return;
-        flush_segment();
-    };
-
-    do_wait();
-    if (signaled) {
-        orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
-    }
-    return !failed;
-}
-MAYBE_UNINITIALIZED_END
-
-uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
-    if (tensor.buffer.addr == 0) {
-        unified_log_error(
-            __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). "
-                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
-        );
-        return 0;
-    }
-
-    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) {
-        return 0;
-    }
-
-    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
-    uint64_t elem_size = get_element_size(tensor.dtype);
-    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
-    uint64_t result = 0;
-    memcpy(&result, ptr, elem_size);
-    return result;
-}
-
-void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) {
-    if (tensor.buffer.addr == 0) {
-        unified_log_error(
-            __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). "
-                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
-        );
-        return;
-    }
-
-    // Wait for producer + all consumers before writing (WAW + WAR safety)
-    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) {
-        return;
-    }
-
-    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
-    uint64_t elem_size = get_element_size(tensor.dtype);
-    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
-    memcpy(ptr, &value, elem_size);
-}
-
-// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
-// [ScopeStats] collector. The slot is always present in the struct to keep
-// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
-// .so's null-check skips it.
-#if PTO2_PROFILING
-static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
-#endif
-
-static const PTO2RuntimeOps s_runtime_ops = {
-    .submit_task = submit_task_impl,
-    .scope_begin = rt_scope_begin,
-    .scope_end = rt_scope_end,
-    .orchestration_done = rt_orchestration_done,
-    .is_fatal = is_fatal_impl,
-    .report_fatal = rt_report_fatal,
-    .log_error = unified_log_error,
-    .log_warn = unified_log_warn,
-    .log_debug = unified_log_debug,
-    .log_info_v = unified_log_info_v,
-    .get_tensor_data = get_tensor_data,
-    .set_tensor_data = set_tensor_data,
-    .alloc_tensors = alloc_tensors_impl,
-    .submit_dummy_task = submit_dummy_task_impl,
-#if PTO2_PROFILING
-    .scope_set_site = scope_set_site_impl,
-#else
-    .scope_set_site = nullptr,
-#endif
-};
-
-// =============================================================================
-// Runtime Lifecycle (AICPU-only fixup)
-// =============================================================================
-//
-// Layout / init_data / wire / destroy live in
-// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
-// prebuilt arena image. The pieces below — wiring the ops table and the
-// SPMD core counts — depend on the device-side s_runtime_ops global and the
-// AICPU SchedulerContext respectively, so they remain in the AICPU build.
-
-void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
-    rt->ops = &s_runtime_ops;
-    rt->orchestrator.total_cluster_count = aic_count;
-    rt->orchestrator.total_aiv_count = aiv_count;
-}
-
-void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
-    if (rt) {
-        rt->mode = mode;
-    }
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 155809365..004a386c5 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -8,29 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Main Interface
- *
- * This is the main header for the PTO Runtime2 system.
- * It provides a unified API for task graph construction and execution.
- *
- * Key Features:
- * - Ring buffer based memory management (zero allocation overhead)
- * - Lazy invalidation TensorMap for dependency discovery
- * - Scope-based buffer lifecycle management
- * - Per-task spinlocks for concurrent fanout updates
- * - Orchestrator-Scheduler decoupling via shared memory
- *
- * Usage:
- *   1. Create runtime: PTO2Runtime create methods
- *   2. Build task graph in orchestration function:
- *      - begin_scope() / end_scope()
- *      - submit_task()
- *   3. Mark orchestration complete: mark_done()
- *   4. Destroy runtime
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #pragma once
 
@@ -40,33 +17,28 @@
 #include "pto_shared_memory.h"
 #include "pto_ring_buffer.h"
 #include "pto_tensormap.h"
-#include "scheduler/pto_scheduler.h"
+#include "pto_scheduler.h"
 #include "pto_orchestrator.h"
 #include "aicore_completion_mailbox.h"
 
-// =============================================================================
-// Runtime Context
-// =============================================================================
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include "aicpu/device_time.h"
 
-/**
- * Runtime execution mode
- */
-enum PTO2RuntimeMode {
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu();
+
+enum PTO2RuntimeMode
+{
     PTO2_MODE_EXECUTE = 0,    // Execute tasks on workers
     PTO2_MODE_SIMULATE = 1,   // Simulate task execution with cycle counting
     PTO2_MODE_GRAPH_ONLY = 2  // Build graph only, no execution
 };
 
-/**
- * Function-pointer ops table for runtime operations.
- *
- * The orchestration .so calls runtime functions through this table
- * (via pto_orchestration_api.h inline wrappers), so it has zero link
- * dependencies on runtime .cpp files.
- */
 typedef struct PTO2Runtime PTO2Runtime;  // forward declare for ops signatures
 
-struct PTO2RuntimeOps {
+struct PTO2RuntimeOps
+{
     TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args);
     void (*scope_begin)(PTO2Runtime *rt);
     void (*scope_end)(PTO2Runtime *rt);
@@ -75,34 +47,19 @@ struct PTO2RuntimeOps {
     void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
 
     // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
-    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
+    // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
     uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
-    void (*set_tensor_data)(
-        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
-    );
+    void (*set_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
-    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
-    // collector. Always present in the struct to keep ops-table layout stable
-    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
     void (*scope_set_site)(const char *file, int line);
 };
 
-/**
- * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
- * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
- * AICore mailbox) plus the layout-defining capacities. Produced once on the
- * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
- * and runtime_wire_arena_pointers.
- */
-struct PTO2RuntimeArenaLayout {
+struct PTO2RuntimeArenaLayout
+{
     size_t off_sm_handle{0};
     PTO2OrchestratorLayout orch;
     PTO2SchedulerLayout sched;
@@ -119,13 +76,8 @@ struct PTO2RuntimeArenaLayout {
     size_t arena_size{0};
 };
 
-/**
- * PTO Runtime2 context
- *
- * Contains all state for orchestration and scheduling.
- * In simulated mode, runs in single process with shared address space.
- */
-struct PTO2Runtime {
+struct PTO2Runtime
+{
     // Ops table (first field — used by orchestration .so via function pointers)
     const PTO2RuntimeOps *ops;
     PTO2ScopeMode pending_scope_mode;
@@ -147,136 +99,282 @@ struct PTO2Runtime {
     // Statistics
     int64_t total_cycles;
 
-    // Prebuilt-arena fast path metadata. Carries every offset
-    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
-    // all arena-internal pointer fields without re-running init_data. The
-    // device base of the runtime arena travels separately on the host-side
-    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
-    // *before* dereferencing this image. Populated on host by
-    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
-    // aicpu_executor.cpp.
     PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
-// =============================================================================
-// Runtime Lifecycle API
-// =============================================================================
-
-/**
- * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
- * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
- * arena. Pure arithmetic; does not touch device memory and may run on host.
- * Returns the layout descriptor; caller commits/attaches the arena before
- * Phase 2/3.
- */
-PTO2RuntimeArenaLayout runtime_reserve_layout(
-    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
-);
-
-/**
- * Phase 2 — write the data half of the runtime arena: standalone fields,
- * memset'd arena regions, sub-structure initializers, and SM-side device
- * pointers. The arena must already be committed (or attached); writes go
- * into arena.base() + sub-region offsets.
- *
- * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
- * them (never dereference). Safe to run on a host arena that owns a host
- * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
- *
- * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
- * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
- * AICore-side count fields are left untouched and must be filled by the
- * AICPU at boot.
- */
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
-    void *gm_heap_dev_base, uint64_t heap_size
-);
-
-/**
- * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
- * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
- * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
- * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
- * both host (writing host-mirror addresses) and AICPU (writing device
- * addresses) sides.
- */
-void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
-
-/**
- * AICPU-only Phase 4 — fill in the few fields the host could not know at
- * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
- * file-local global, host cannot resolve its device address) and the
- * orchestrator's core counts (depend on the executor's scheduler context).
- * Call once per boot after runtime_wire_arena_pointers.
- */
-void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
-
-/**
- * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
- * pooled across runs by DeviceRunner, so we never call arena.release()
- * here — the destructor only forgets sub-structure pointers (idempotent
- * cleanup).
- */
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
-
-/**
- * Set execution mode
- */
-void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode);
-
-// =============================================================================
-// Orchestration API (called by orchestration function)
-// =============================================================================
-
-/**
- * Begin a new scope
- *
- * All tasks submitted within this scope will have their lifetime
- * bounded by the scope. When scope_end() is called, the scope
- * releases its reference to all enclosed tasks.
- */
-void rt_scope_begin(PTO2Runtime *rt);
-
-/**
- * End current scope
- *
- * Releases scope reference for all tasks submitted since scope_begin().
- * Tasks whose refcount reaches zero will have their buffers released.
- */
-void rt_scope_end(PTO2Runtime *rt);
-
-/**
- * Mark orchestration as complete
- *
- * Signals that no more tasks will be submitted.
- */
-void rt_orchestration_done(PTO2Runtime *rt);
-
-/**
- * Enter fatal state explicitly from orchestration.
- */
-void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
-
-/**
- * Cross-layer data access: read a tensor value by waiting for its producer.
- */
-uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+inline PTO2RuntimeArenaLayout runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity)
+{
+    PTO2RuntimeArenaLayout layout{};
+    layout.task_window_size = task_window_size;
+    layout.dep_pool_capacity = dep_pool_capacity;
+
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = static_cast<int32_t>(task_window_size);
+
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+
+    layout.arena_size = arena.total_size();
+    return layout;
+}
+
+inline PTO2Runtime *runtime_init_data_from_layout(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t, void *gm_heap_dev_base, uint64_t heap_size)
+{
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+
+    if (!rt->orchestrator.init_data_from_layout(layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size)) return nullptr;
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) return nullptr;
+
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+inline void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt)
+{
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+inline void runtime_destroy(PTO2Runtime *rt)
+{
+    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
+    if (!rt) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;
+    rt->sm_handle = nullptr;
+}
+
+inline void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode)
+{
+    if (rt) rt->mode = mode;
+}
+
+inline void rt_scope_begin(PTO2Runtime *rt)
+{
+    PTO2ScopeMode mode = rt->pending_scope_mode;
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
+    rt->orchestrator.begin_scope(mode);
+}
+
+inline void rt_scope_end(PTO2Runtime *rt)
+{
+    rt->orchestrator.end_scope();
+}
+
+inline void rt_orchestration_done(PTO2Runtime *rt)
+{
+    rt->orchestrator.mark_done();
+}
+
+inline void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+    if (fmt == nullptr || fmt[0] == '\0')
+    {
+        rt->orchestrator.report_fatal(error_code, func, nullptr);
+    }
+    else
+    {
+        char message[1024];
+        vsnprintf(message, sizeof(message), fmt, args);
+        rt->orchestrator.report_fatal(error_code, func, "%s", message);
+    }
+    va_end(args);
+}
+
+MAYBE_UNINITIALIZED_BEGIN
+inline bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller)
+{
+    PTO2TaskId owner = tensor.owner_task_id;
+    PTO2OrchestratorState &orch = rt->orchestrator;
+
+    constexpr int kSegmentCap = 64;
+    const PTO2TaskSlotState *seg[kSegmentCap];
+    int seg_count = 0;
+    bool signaled = false;
+    bool failed = false;
+
+    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED)
+        {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
+            {
+                orch.report_fatal(PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed", (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id);
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = slot.task->task_id.local();
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1)
+        {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
+            {
+                orch.report_fatal(PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done", (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id);
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto flush_segment = [&]() {
+        for (int i = 0; i < seg_count; i++)
+        {
+            wait_one_producer(*seg[i]);
+            if (failed) return;
+            if (!wait_for_consumers) continue;
+            wait_one_consumers(*seg[i]);
+            if (failed) return;
+        }
+        seg_count = 0;
+    };
+
+    auto try_push = [&](const PTO2TaskSlotState &s) {
+        for (int j = 0; j < seg_count; j++)
+            if (seg[j] == &s) return;
+        if (seg_count == kSegmentCap)
+        {
+            flush_segment();
+            if (failed) return;
+        }
+        seg[seg_count++] = &s;
+        if (!signaled)
+        {
+            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
+            signaled = true;
+        }
+    };
+
+    auto do_wait = [&]() {
+        if (owner.is_valid())
+        {
+            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
+            try_push(s);
+            if (failed) return;
+        }
+
+        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
+            PTO2TaskId pid = entry.producer_task_id;
+            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
+            try_push(s);
+            return !failed;
+        });
+        if (failed) return;
+        flush_segment();
+    };
+
+    do_wait();
+    if (signaled) orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
+    return !failed;
+}
+MAYBE_UNINITIALIZED_END
+
+inline uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[])
+{
+    if (tensor.buffer.addr == 0) return 0;
+
+    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) return 0;
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
+    uint64_t result = 0;
+    memcpy(&result, ptr, elem_size);
+    return result;
+}
+
+inline void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value)
+{
+    if (tensor.buffer.addr == 0) return;
+
+    // Wait for producer + all consumers before writing (WAW + WAR safety)
+    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) return;
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(ptr, &value, elem_size);
+}
+
+// Function-pointer ops table backing — moved from pto_runtime2.cpp so that
+// the inline runtime_finalize_after_wire above can refer to it.
+
+inline TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args)
+{
+    return rt->orchestrator.submit_task(mixed_kernels, args);
+}
+
+inline TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const Arg &args)
+{
+    return rt->orchestrator.alloc_tensors(args);
+}
+
+inline TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const Arg &args)
+{
+    return rt->orchestrator.submit_dummy_task(args);
+}
+
+inline bool is_fatal_impl(PTO2Runtime *rt)
+{
+    return rt->orchestrator.fatal;
+}
+
+inline const PTO2RuntimeOps s_runtime_ops = {
+    .submit_task = submit_task_impl,
+    .scope_begin = rt_scope_begin,
+    .scope_end = rt_scope_end,
+    .orchestration_done = rt_orchestration_done,
+    .is_fatal = is_fatal_impl,
+    .report_fatal = rt_report_fatal,
+    .get_tensor_data = get_tensor_data,
+    .set_tensor_data = set_tensor_data,
+    .alloc_tensors = alloc_tensors_impl,
+    .submit_dummy_task = submit_dummy_task_impl,
+    .scope_set_site = nullptr,
+};
 
-/**
- * Cross-layer data access: write a value to a tensor at given indices.
- * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap.
- * See set_tensor_data in pto_orchestration_api.h for full documentation.
- */
-void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
+inline void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count)
+{
+    rt->ops = &s_runtime_ops;
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
+}
 
-/**
- * Slim config struct exported by orchestration .so via aicpu_orchestration_config().
- * Shared definition with pto_orchestration_api.h (same layout, guarded).
- */
 #ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
 #define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
+struct PTO2OrchestrationConfig
+{
     int expected_arg_count;
 };
 #endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index ecda02555..a22825088 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -9,19 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - Core Type Definitions
- *
- * This header defines all fundamental types used by the PTO Runtime2 system:
- * - Configuration constants
- * - Worker types and task states
- * - Tensor regions and task parameters
- * - Task descriptors with fanin/fanout tracking
- * - Dependency list entries
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
 
@@ -39,66 +26,16 @@
 #include "pto_task_id.h"
 #include "pto_types.h"
 
-// Spin-wait hint for AICPU threads.  On real hardware the AICPU has dedicated
-// ARM A55 cores — no OS yield is needed, so the hint is a no-op.  In simulation
-// all threads share host CPU cores, so we yield to prevent starvation.
-// This header is also compiled into the Host .so (for struct definitions only),
-// where the hint is never called — the fallback no-op keeps Host builds clean.
 #if __has_include("spin_hint.h")
 #include "spin_hint.h"
 #else
 #define SPIN_WAIT_HINT() ((void)0)
 #endif
 
-// =============================================================================
-// Profiling Configuration
-// =============================================================================
-
-#ifndef PTO2_PROFILING
-#define PTO2_PROFILING 1
-#endif
-
-#ifndef PTO2_ORCH_PROFILING
-#define PTO2_ORCH_PROFILING 0
-#endif
-
-#ifndef PTO2_SCHED_PROFILING
-#define PTO2_SCHED_PROFILING 0
-#endif
-
-#ifndef PTO2_TENSORMAP_PROFILING
-#define PTO2_TENSORMAP_PROFILING 0
-#endif
-
-#if PTO2_ORCH_PROFILING && !PTO2_PROFILING
-#error "PTO2_ORCH_PROFILING requires PTO2_PROFILING=1"
-#endif
-
-#if PTO2_SCHED_PROFILING && !PTO2_PROFILING
-#error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1"
-#endif
-
-#if PTO2_TENSORMAP_PROFILING && !PTO2_ORCH_PROFILING
-#error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1"
-#endif
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-#include "aicpu/device_time.h"
-#endif
-
-// =============================================================================
-// Configuration Constants
-// =============================================================================
-
-// Task management
-// NOTE: PTO2_TASK_WINDOW_SIZE is now a per-ring default value.
-// Actual window size is passed at runtime to runtime_create_from_sm().
-// Use pto2_task_slot(sched, task_id) for slot calculation.
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
 
-// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
-// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
-#define PTO2_MAX_RING_DEPTH 4
+// Step 1 of static-N migration: single-ring layout. All scopes map to ring 0.
+#define PTO2_MAX_RING_DEPTH 1
 
 // Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH)
 #define PTO2_HEAP_SIZE (256 * 1024 * 1024)  // 256MB per ring (1GB total)
@@ -108,11 +45,6 @@
 
 // Scope management
 #define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
-// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot
-// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot
-// is in flight, no more tasks can ever be pushed regardless of buffer size.
-// scope_tasks_push fatals on overflow rather than growing the arena-owned
-// buffer (which would be UB on the arena's malloc'd backing).
 #define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH)
 
 // Ready queue
@@ -121,8 +53,8 @@
 // Wiring queue
 #define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size
 
-// Fanin storage
-#define PTO2_FANIN_INLINE_CAP 64
+// Fanin storage — absolute max number of unique fanin dependencies per task.
+#define PTO2_MAX_FANIN 16
 
 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
@@ -132,87 +64,37 @@
 // ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based).
 constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL;
 
-// =============================================================================
-// Task States
-// =============================================================================
-
-/**
- * Task state enumeration
- *
- * State transitions:
- *   PENDING -> COMPLETED -> CONSUMED
- *
- * The slot stays in PENDING from submit through "ready in queue" and "running
- * on a worker"; readiness and running-vs-idle are derived from fanin_refcount
- * and per-core running_slot_state respectively, not from task_state itself.
- *
- * Conditions:
- *   PENDING->COMPLETED:   all subtasks finish (set by scheduler) or task is a
- *                         hidden alloc completed inline by the orchestrator
- *   COMPLETED->CONSUMED:  fanout_refcount == fanout_count && state == COMPLETED
- */
-typedef enum {
+typedef enum
+{
     PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
     PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
     PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
 } PTO2TaskState;
 
-/**
- * Result of a unified task allocation.
- */
-struct PTO2TaskAllocResult {
+struct PTO2TaskAllocResult
+{
     int32_t task_id;    // Absolute task ID (not wrapped)
     int32_t slot;       // task_id & (window_size - 1)
     void *packed_base;  // Heap allocation result (nullptr if failure)
     void *packed_end;   // packed_base + aligned output_size
 
-    bool failed() const { return task_id < 0; }
+    bool failed() const
+    {
+        return task_id < 0;
+    }
 };
 
-struct PTO2OutputLayout {
+struct PTO2OutputLayout
+{
     uint64_t offsets[MAX_TENSOR_ARGS] = {};
     uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {};
     int32_t total_output_size = 0;
 };
 
-// =============================================================================
-// Dependency List Entry
-// =============================================================================
-
-/**
- * Fanin spill entry
- * Stored in the dedicated fanin spill ring buffer.
- */
 struct PTO2TaskSlotState;  // Forward declaration
-struct PTO2FaninPool;      // Forward declaration
-struct PTO2FaninSpillEntry {
-    PTO2TaskSlotState *slot_state;
-};
-static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(PTO2TaskSlotState *));
-
-/**
- * Dependency list entry (singly-linked list node)
- * Stored in DepListPool ring buffer.
- */
-struct PTO2DepListEntry {
-    PTO2TaskSlotState *slot_state;  // Consumer slot state (direct pointer)
-    PTO2DepListEntry *next;         // next entry
-};
 
-// =============================================================================
-// Task Descriptor
-// =============================================================================
-
-/**
- * Task descriptor structure (shared memory)
- *
- * Stored in the TaskDescriptor ring buffer in shared memory.
- * Contains static identification and buffer pointers only.
- * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState.
- *
- * Fields set by Orchestrator at submission, read by Scheduler for dispatch.
- */
-struct PTO2TaskDescriptor {
+struct PTO2TaskDescriptor
+{
     // Mixed-task identification (encodes ring_id in upper 32 bits)
     PTO2TaskId task_id;  // raw: (ring_id << 32) | local_id
 
@@ -224,58 +106,36 @@ struct PTO2TaskDescriptor {
     void *packed_buffer_end;   // End of packed buffer (for heap reclamation)
 };
 
-// =============================================================================
-// Per-Slot Scheduling State
-// =============================================================================
-
-/**
- * Task payload data (cold path - only accessed during orchestration and dispatch)
- *
- * Layout: metadata + inline fanin packed in the first 9 cache lines, followed
- * by bulk tensor and scalar data. Small fanins stay fully inline; larger
- * fanins spill into a per-ring ring buffer slice.
- */
-struct PTO2TaskPayload {
-    // === Cache lines 0-8 (576B) — metadata + inline fanin ===
+struct PTO2TaskPayload
+{
+    // === Cache lines 0-2 (192B) — metadata + fanin ===
     int32_t tensor_count{0};
     int32_t scalar_count{0};
-    int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
-    int32_t fanin_spill_start{0};   // Linear start index in fanin spill pool (0 = no spill)
-    PTO2FaninPool *fanin_spill_pool{nullptr};
-    PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
-    // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) ===
+    int32_t fanin_count{0};  // Number of valid entries in fanin_slot_states
+    PTO2TaskSlotState *fanin_slot_states[PTO2_MAX_FANIN];
+    // === Tensors (Tensor is alignas(64); array is naturally aligned) ===
     Tensor tensors[MAX_TENSOR_ARGS];
-    // === Cache lines 73-74 (128B) — scalars ===
+    // === Scalars ===
     uint64_t scalars[MAX_SCALAR_ARGS];
 
-    // Layout verification (size checks that don't need offsetof).
     static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines");
-    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)");
-
-    /**
-     * Initialize payload: copy tensors, store scalars.
-     *
-     * For each param slot, the tensor source is determined by TensorArgType:
-     * - OUTPUT -> use materialized_outputs.output_ptr(out_idx++)
-     * - INPUT / INOUT -> use refs[i].tensor
-     *
-     * @param args                Task arguments (tensors + scalars)
-     * @param result  Materialized output tensors (from TensorCreateInfo path)
-     */
-    void init(const Arg &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout) {
+    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == MAX_SCALAR_ARGS * 8, "scalar region size matches MAX_SCALAR_ARGS");
+
+    void init(const Arg &args, TaskOutputTensors &result, PTO2TaskAllocResult &alloc_result, PTO2OutputLayout &layout)
+    {
         tensor_count = args.tensor_count();
         scalar_count = args.scalar_count();
 
         // int32_t out_idx = 0;
-        for (int32_t i = 0; i < args.tensor_count(); i++) {
-            if (args.tag(i) != TensorArgType::OUTPUT) {
+        for (int32_t i = 0; i < args.tensor_count(); i++)
+        {
+            if (args.tag(i) != TensorArgType::OUTPUT)
+            {
                 tensors[i].copy(*args.tensor(i).ptr);
-            } else {
-                tensors[i].init_from_create_info(
-                    *args.tensor(i).create_info,
-                    reinterpret_cast<void *>(reinterpret_cast<char *>(alloc_result.packed_base) + layout.offsets[i]),
-                    layout.buffer_sizes[i]
-                );
+            }
+            else
+            {
+                tensors[i].init_from_create_info(*args.tensor(i).create_info, reinterpret_cast<void *>(reinterpret_cast<char *>(alloc_result.packed_base) + layout.offsets[i]), layout.buffer_sizes[i]);
                 tensors[i].owner_task_id = result.task_id();
                 result.materialize_output(tensors[i]);
             }
@@ -287,165 +147,61 @@ struct PTO2TaskPayload {
 };
 
 // PTO2TaskPayload layout verification (offsetof requires complete type).
-static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift");
-static_assert(
-    offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata"
-);
-static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)");
-static_assert(
-    offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor),
-    "scalars must immediately follow tensors"
-);
-static_assert(
-    sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t),
-    "PTO2TaskPayload size must stay on the baseline cache-line footprint"
-);
-
-/**
- * Per-task slot scheduling state (scheduler-private, NOT in shared memory)
- *
- * Consolidates all hot-path scheduling fields into a single cache-friendly
- * structure (32 bytes = half a cache line). Accessing any field of a task's
- * slot state brings all related fields into the same cache line.
- *
- * Concurrency notes:
- * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
- * - fanin_count set once at submission, read-only after (hot path for ready check)
- * - task_state, fanin_refcount, fanout_refcount updated atomically
- */
-struct alignas(64) PTO2TaskSlotState {
-    // Fanout lock + list (accessed together under lock in on_task_complete)
-    std::atomic<int32_t> fanout_lock;  // Per-task spinlock (0=unlocked, 1=locked)
-    int32_t fanout_count;              // 1 (owning scope) + number of consumers
-
-    PTO2DepListEntry *fanout_head;  // Pointer to first fanout entry (nullptr = empty)
-
-    // Task state (completion, consumed check, ready check)
-    std::atomic<PTO2TaskState> task_state;  // PENDING/COMPLETED/CONSUMED
-
-    // Fanin (accessed together in release_fanin_and_check_ready)
-    std::atomic<int32_t> fanin_refcount;  // Dynamic: counts completed producers
-    int32_t fanin_count;                  // Number of producer dependencies (set once by wiring)
+static_assert(offsetof(PTO2TaskPayload, fanin_slot_states) == 16, "fanin array must follow metadata words");
+static_assert(offsetof(PTO2TaskPayload, scalars) == offsetof(PTO2TaskPayload, tensors) + MAX_TENSOR_ARGS * sizeof(Tensor), "scalars must immediately follow tensors");
+static_assert(sizeof(PTO2TaskPayload) == offsetof(PTO2TaskPayload, scalars) + MAX_SCALAR_ARGS * sizeof(uint64_t), "no trailing padding after scalars");
+
+struct alignas(64) PTO2TaskSlotState
+{
+    // Fanout: tracks producer->CONSUMED transition. Incremented by the
+    // orchestrator (+1 sentinel and once per consumer of this slot) and
+    // matched by release_producer in on_task_release.
+    int32_t fanout_count;
+    std::atomic<int32_t> fanout_refcount;
+
+    // Task state (PENDING/COMPLETED/CONSUMED). Polling readiness reads
+    // task_state on producer slots.
+    std::atomic<PTO2TaskState> task_state;
 
-    // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
-    std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
-
-    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
-    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
-    // but written here per-submit instead of in an O(window_size) init loop —
-    // these are the only "scale-dependent" pointers in this struct, so moving
-    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
+    // Intrusive linkage for the thread-0 pending-readiness queue.
+    PTO2TaskSlotState *next_pending{nullptr};
+
     // --- Set per-submit (depend on task inputs) ---
     ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
     uint8_t ring_id;         // Ring layer (immutable after init)
-    // Set by any subtask FIN that pushed deferred-completion CONDITIONs to
-    // the runtime mailbox; read by the last subtask FIN to decide whether
-    // the task needs MPSC-deferred completion or can complete inline on this
-    // thread. Carved out of the otherwise-padding byte between ring_id and
-    // dep_pool_mark to keep PTO2TaskSlotState at 64 bytes. The write is
-    // sequenced before on_subtask_complete's acq_rel fetch_add and the read
-    // after, so all earlier subtasks' writes are visible to the last subtask.
     std::atomic<bool> any_subtask_deferred{false};
     uint8_t _async_pad{0};
-    int32_t dep_pool_mark{0};  // Dep pool top after wiring (thread-0-only)
 
     std::atomic<int16_t> completed_subtasks{0};  // Each core completion increments by 1
     int16_t total_required_subtasks{0};          // = logical_block_num * popcount(active_mask)
     int16_t logical_block_num{1};                // Total logical blocks (set by orchestrator)
     int16_t next_block_idx{0};                   // Next block to dispatch (scheduler state)
 
-    /**
-     * Bind the slot-invariant ring id. Called once per slot during
-     * RingSchedState::init(); ring_id never changes across reuses.
-     */
-    void bind_ring(uint8_t rid) { ring_id = rid; }
-
-    /**
-     * Re-bind the per-slot payload/task pointers. Called by
-     * orch::prepare_task on every submit. Value is constant for a given
-     * slot, but we pay the cheap re-write each submit (both fields land on
-     * the same 64B slot_state cache line that prepare_task is already
-     * dirtying) to avoid the init-time per-slot loop.
-     */
-    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
+    void bind_ring(uint8_t rid)
+    {
+        ring_id = rid;
+    }
+
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t)
+    {
         payload = p;
         task = t;
     }
 
-    /**
-     * Reset dynamic scheduling fields for slot reuse.
-     * Called by advance_ring_pointers() after a slot transitions to CONSUMED
-     * and last_task_alive advances past it, but before sync_to_sm() publishes
-     * the new last_task_alive to the orchestrator.
-     *
-     * Skips payload, task, ring_id (immutable, bound once at init).
-     * Skips task_state: left as CONSUMED so that wait_for_tensor_ready()
-     * callers holding stale owner_task_id still observe a completed state.
-     * task_state is set to PENDING by the orchestrator when it reuses the slot.
-     */
-    void reset_for_reuse() {
-        fanout_lock.store(0, std::memory_order_relaxed);
+    void reset_for_reuse()
+    {
         fanout_count = 1;
-        fanout_head = nullptr;
-        fanin_refcount.store(0, std::memory_order_relaxed);
         fanout_refcount.store(0, std::memory_order_relaxed);
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx = 0;
         any_subtask_deferred.store(false, std::memory_order_relaxed);
+        next_pending = nullptr;
     }
-
-    // === Per-task fanout spinlock ===
-    //
-    // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST
-    // be held whenever reading or writing fanout_head / fanout_count, because
-    // the orchestrator adds consumers concurrently with the scheduler
-    // traversing the list after task completion.
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-
-        for (;;) {
-            while (fanout_lock.load(std::memory_order_acquire) != 0) {
-                contended = true;
-                atomic_ops++;
-                SPIN_WAIT_HINT();
-            }
-            int32_t expected = 0;
-            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
-                atomic_ops++;
-                atomic_count += atomic_ops;
-                if (contended) {
-                    wait_cycle += (get_sys_cnt_aicpu() - t0);
-                }
-                return;
-            }
-            contended = true;
-            atomic_ops++;
-        }
-    }
-#endif
-
-    void lock_fanout() {
-        for (;;) {
-            while (fanout_lock.load(std::memory_order_acquire) != 0) {
-                SPIN_WAIT_HINT();
-            }
-            int32_t expected = 0;
-            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
-                return;
-            }
-        }
-    }
-
-    void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); }
 };
 
-static_assert(sizeof(PTO2TaskSlotState) == 64);
+static_assert(sizeof(PTO2TaskSlotState) <= 128, "slot state should fit in two cache lines");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
new file mode 100644
index 000000000..98a7f7c26
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "common/core_type.h"
+#include "utils/device_arena.h"
+#include "pto_async_wait.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+struct PTO2ReadyQueueSlot
+{
+    std::atomic<int64_t> sequence;
+    PTO2TaskSlotState *slot_state;
+};
+
+// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
+static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
+
+struct PTO2LocalReadyBuffer
+{
+    PTO2TaskSlotState **slot_states = nullptr;
+    int count = 0;
+    int capacity = 0;
+
+    void reset(PTO2TaskSlotState **buf, int cap)
+    {
+        slot_states = buf;
+        count = 0;
+        capacity = cap;
+    }
+
+    bool try_push(PTO2TaskSlotState *s)
+    {
+        if (slot_states && count < capacity)
+        {
+            slot_states[count++] = s;
+            return true;
+        }
+        return false;
+    }
+
+    PTO2TaskSlotState *pop()
+    {
+        return (count > 0) ? slot_states[--count] : nullptr;
+    }
+};
+
+struct alignas(64) PTO2ReadyQueue
+{
+    PTO2ReadyQueueSlot *slots;
+    uint64_t capacity;
+    uint64_t mask;        // capacity - 1
+    char _pad0[64 - 24];  // Pad to own cache line
+
+    std::atomic<uint64_t> enqueue_pos;
+    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    std::atomic<uint64_t> dequeue_pos;
+    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    uint64_t size()
+    {
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        return (e >= d) ? (e - d) : 0;
+    }
+
+    bool push(PTO2TaskSlotState *slot_state)
+    {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true)
+        {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            if (diff == 0)
+            {
+                if (enqueue_pos.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+            }
+            else if (diff < 0)
+            {
+                return false;  // Queue full
+            }
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+
+    // Batch push: reserve count slots with a single CAS after confirming
+    // every target slot is available under the usual Vyukov sequence check.
+    void push_batch(PTO2TaskSlotState **items, int count)
+    {
+        if (count == 0) return;
+
+        uint64_t pos;
+        while (true)
+        {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            bool ready = true;
+            for (int i = 0; i < count; i++)
+            {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + i);
+                if (diff != 0)
+                {
+                    ready = false;
+                    break;
+                }
+            }
+            if (!ready) continue;
+            if (enqueue_pos.compare_exchange_weak(pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+        }
+
+        for (int i = 0; i < count; i++)
+        {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            slot->slot_state = items[i];
+            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
+        }
+    }
+
+    PTO2TaskSlotState *pop()
+    {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        if (d >= e) return nullptr;
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true)
+        {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            if (diff == 0)
+            {
+                if (dequeue_pos.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+            }
+            else if (diff < 0)
+            {
+                return nullptr;  // Queue empty
+            }
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+
+    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
+    // Returns actual number of items popped (may be less than max_count).
+    int pop_batch(PTO2TaskSlotState **out, int max_count)
+    {
+        uint64_t pos;
+        int count;
+        while (true)
+        {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            count = 0;
+            while (count < max_count)
+            {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                if (diff == 0)
+                {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) break;
+                count = -1;
+                break;
+            }
+            if (count == 0) return 0;
+            if (count < 0) continue;
+            if (dequeue_pos.compare_exchange_weak(pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+        }
+
+        for (int i = 0; i < count; i++)
+        {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+        }
+        return count;
+    }
+};
+
+inline size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity)
+{
+    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
+}
+inline bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity)
+{
+    // Address the slots region for data writes without storing the pointer in
+    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+
+    for (uint64_t i = 0; i < capacity; i++)
+    {
+        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+
+    return true;
+}
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+inline void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off)
+{
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+inline void ready_queue_destroy(PTO2ReadyQueue *queue)
+{
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
+
+struct alignas(64) PTO2SpscQueue
+{
+    // --- Producer cache lines (orchestrator thread) ---
+    alignas(64) std::atomic<uint64_t> head_{0};
+    alignas(64) uint64_t tail_cached_{0};
+
+    // --- Consumer cache lines (scheduler thread 0) ---
+    alignas(64) std::atomic<uint64_t> tail_{0};
+    alignas(64) uint64_t head_cached_{0};
+
+    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
+    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
+    uint64_t mask_{0};
+
+    // Padding to exactly 5 cache lines
+    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
+
+    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity)
+    {
+        return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
+    }
+
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity)
+    {
+        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
+        for (uint64_t i = 0; i < capacity; i++) buf[i] = nullptr;
+        mask_ = capacity - 1;
+        head_.store(0, std::memory_order_relaxed);
+        tail_.store(0, std::memory_order_relaxed);
+        tail_cached_ = 0;
+        head_cached_ = 0;
+        return true;
+    }
+
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off)
+    {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
+    // Arena owns the buffer; here we only forget our pointer.
+    void destroy()
+    {
+        buffer_ = nullptr;
+    }
+
+    bool push(PTO2TaskSlotState *item)
+    {
+        uint64_t h = head_.load(std::memory_order_relaxed);
+        uint64_t next_h = h + 1;
+        if (next_h - tail_cached_ > mask_)
+        {
+            tail_cached_ = tail_.load(std::memory_order_acquire);
+            if (next_h - tail_cached_ > mask_) return false;
+        }
+        buffer_[h & mask_] = item;
+        head_.store(next_h, std::memory_order_release);
+        return true;
+    }
+
+    // Pop up to max_count items (consumer only). Returns actual count.
+    int pop_batch(PTO2TaskSlotState **out, int max_count)
+    {
+        uint64_t t = tail_.load(std::memory_order_relaxed);
+        uint64_t avail = head_cached_ - t;
+        if (avail < static_cast<uint64_t>(max_count))
+        {
+            head_cached_ = head_.load(std::memory_order_acquire);
+            avail = head_cached_ - t;
+            if (avail == 0) return 0;
+        }
+        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
+        for (int i = 0; i < count; i++) out[i] = buffer_[(t + i) & mask_];
+        tail_.store(t + count, std::memory_order_release);
+        return count;
+    }
+
+    // Approximate size (used for backoff decisions, not exact).
+    uint64_t size() const
+    {
+        uint64_t h = head_.load(std::memory_order_acquire);
+        uint64_t t = tail_.load(std::memory_order_acquire);
+        return h - t;
+    }
+};
+
+static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
+// =============================================================================
+
+struct CompletionStats
+{
+    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
+    int32_t tasks_enqueued;     // Number of consumers that became READY
+    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
+    bool mixed_task_completed;  // True only when this callback completed a mixed task
+};
+
+struct PTO2SchedulerLayout
+{
+    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
+    size_t off_dummy_ready_queue_slots;
+    size_t off_pending_spsc_buffer;
+    uint64_t ready_queue_capacity;
+    uint64_t spsc_capacity;
+};
+
+struct PTO2SchedulerState
+{
+    // Shared memory access
+    PTO2SharedMemoryHeader *sm_header;
+
+    // Per-ring state
+    struct alignas(64) RingSchedState
+    {
+        PTO2SharedMemoryRingHeader *ring;
+        int32_t last_task_alive;
+        std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id)
+        {
+            ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+            last_task_alive = 0;
+            advance_lock.store(0, std::memory_order_relaxed);
+            return true;
+        }
+
+        void destroy() { ring = nullptr; }
+
+        void sync_to_sm()
+        {
+            ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release);
+        }
+
+        void advance_ring_pointers()
+        {
+            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
+            int32_t old_last_task_alive = last_task_alive;
+
+            while (last_task_alive < current_task_index)
+            {
+                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
+                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) break;
+                last_task_alive++;
+            }
+
+            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) ring->get_slot_state_by_task_id(id).reset_for_reuse();
+
+            sync_to_sm();
+        }
+    } ring_sched_states[PTO2_MAX_RING_DEPTH];
+
+    // Ready queues remain global (scheduling is ring-agnostic)
+    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
+
+    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
+    // the dispatch loop and completed inline -- never goes to AICore.
+    PTO2ReadyQueue dummy_ready_queue;
+
+    // Thread 0 exclusive: intrusive pending list of tasks awaiting fanin
+    // readiness. SPSC queue receives slot_states from the orchestrator; thread 0
+    // drains them into the pending list and polls fanin producers' task_state.
+    struct alignas(64) PendingState
+    {
+        static constexpr int BACKOFF_LIMIT = 32;
+        static constexpr int DRAIN_BATCH = 30;
+        static constexpr int POLL_MAX_PER_ITER = 128;
+
+        // --- Thread 0 exclusive ---
+        PTO2TaskSlotState *pending_head{nullptr};
+        PTO2TaskSlotState *pending_tail{nullptr};
+        int32_t pending_count{0};
+        int backoff_counter{0};
+        PTO2TaskSlotState *drain_buf[DRAIN_BATCH];
+
+        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
+        PTO2SpscQueue queue;
+
+        // --- Orchestrator write, thread 0 read ---
+        alignas(64) std::atomic<bool> orch_needs_drain{false};
+    } wiring;
+
+    alignas(64) AsyncWaitList async_wait_list;
+
+    void push_ready_routed(PTO2TaskSlotState *slot_state)
+    {
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        if (shape == PTO2ResourceShape::DUMMY) dummy_ready_queue.push(slot_state);
+        else ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+    }
+
+    // Append slot to the tail of the intrusive pending list.
+    void pending_push_back(PTO2TaskSlotState *s)
+    {
+        s->next_pending = nullptr;
+        if (wiring.pending_tail) wiring.pending_tail->next_pending = s;
+        else wiring.pending_head = s;
+        wiring.pending_tail = s;
+        wiring.pending_count++;
+    }
+
+    // Pop the head of the pending list (or nullptr).
+    PTO2TaskSlotState *pending_pop_front()
+    {
+        PTO2TaskSlotState *s = wiring.pending_head;
+        if (s == nullptr) return nullptr;
+        wiring.pending_head = s->next_pending;
+        if (wiring.pending_head == nullptr) wiring.pending_tail = nullptr;
+        s->next_pending = nullptr;
+        wiring.pending_count--;
+        return s;
+    }
+
+    bool fanin_satisfied(PTO2TaskSlotState *s) const
+    {
+        const PTO2TaskPayload &p = *s->payload;
+        for (int32_t i = 0; i < p.fanin_count; i++)
+            if (p.fanin_slot_states[i]->task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) return false;
+        return true;
+    }
+
+    // Thread 0 entry point: drain SPSC into pending list, then poll pending
+    // for newly-ready tasks. Not-ready tasks rotate to the tail.
+    // Returns >0 if anything moved (SPSC drained OR tasks routed to ready);
+    // 0 signals no productive work.
+    int drain_wiring_queue(bool force_drain = false)
+    {
+        // Stage 1: drain SPSC → pending list tail
+        int drained = wiring.queue.pop_batch(wiring.drain_buf, PendingState::DRAIN_BATCH);
+        for (int i = 0; i < drained; i++) pending_push_back(wiring.drain_buf[i]);
+
+        // Backoff when nothing to do and orchestrator isn't pressing
+        if (drained == 0 && wiring.pending_head == nullptr)
+        {
+            if (!force_drain && !wiring.orch_needs_drain.load(std::memory_order_acquire) && wiring.backoff_counter < PendingState::BACKOFF_LIMIT)
+            {
+                wiring.backoff_counter++;
+                return 0;
+            }
+        }
+        wiring.backoff_counter = 0;
+
+        // Stage 2: poll pending list, route ready tasks
+        int routed = 0;
+        int to_visit = wiring.pending_count;
+        if (to_visit > PendingState::POLL_MAX_PER_ITER) to_visit = PendingState::POLL_MAX_PER_ITER;
+        for (int i = 0; i < to_visit; i++)
+        {
+            PTO2TaskSlotState *s = pending_pop_front();
+            if (s == nullptr) break;
+            if (fanin_satisfied(s))
+            {
+                push_ready_routed(s);
+                routed++;
+            }
+            else
+            {
+                pending_push_back(s);
+            }
+        }
+
+        return drained + routed;
+    }
+
+    void check_and_handle_consumed(PTO2TaskSlotState &slot_state)
+    {
+        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
+
+        PTO2TaskState expected = PTO2_TASK_COMPLETED;
+        if (!slot_state.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire)) return;
+
+        int32_t ring_id = slot_state.ring_id;
+        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
+        int32_t expected_lock = 0;
+        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed))
+        {
+            ring_sched_states[ring_id].advance_ring_pointers();
+            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
+        }
+    }
+
+    void release_producer(PTO2TaskSlotState &slot_state)
+    {
+        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
+        check_and_handle_consumed(slot_state);
+    }
+
+    int get_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
+    {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) out[count++] = local_buf.slot_states[--local_buf.count];
+        int remaining = max_count - count;
+        if (remaining > 0) count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
+        return count;
+    }
+
+    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count)
+    {
+        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
+        for (int32_t i = 0; i < count; i++)
+        {
+            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
+            release_producer(*task_slot_states[i]);
+        }
+    }
+
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state)
+    {
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return (prev + 1) == slot_state.total_required_subtasks;
+    }
+
+    void on_mixed_task_complete(
+        PTO2TaskSlotState &slot_state,
+
+        [[maybe_unused]] PTO2LocalReadyBuffer *local_bufs = nullptr
+    )
+    {
+        // Polling model: just publish COMPLETED. Thread 0's pending-poll loop
+        // observes producer task_state and routes consumers when their fanin
+        // is satisfied.
+        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+    }
+
+    int32_t on_task_release(PTO2TaskSlotState &slot_state)
+    {
+        PTO2TaskPayload *payload = slot_state.payload;
+        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
+            release_producer(*producer_slot_state);
+        });
+
+        // Self consumed check
+        check_and_handle_consumed(slot_state);
+        return payload->fanin_count;
+    }
+
+    // === Cold-path API ===
+
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t /*dep_pool_capacity*/)
+    {
+        PTO2SchedulerLayout layout{};
+        layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+        layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+        layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+        layout.off_pending_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+        return layout;
+    }
+
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base)
+    {
+        PTO2SchedulerState *sched = this;
+        sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) return false;
+
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++)
+            if (!ready_queue_init_data_from_layout(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity)) return false;
+        if (!ready_queue_init_data_from_layout(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity)) return false;
+
+        if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_pending_spsc_buffer, layout.spsc_capacity)) return false;
+        sched->wiring.pending_head = nullptr;
+        sched->wiring.pending_tail = nullptr;
+        sched->wiring.pending_count = 0;
+        sched->wiring.backoff_counter = 0;
+
+        return true;
+    }
+
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena)
+    {
+        PTO2SchedulerState *sched = this;
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
+        ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+        sched->wiring.queue.wire_arena_pointers(arena, layout.off_pending_spsc_buffer);
+    }
+
+    // Forget per-region pointers; arena owns the backing memory.
+    void destroy()
+    {
+        PTO2SchedulerState *sched = this;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) sched->ring_sched_states[r].destroy();
+        sched->wiring.queue.destroy();
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
+        ready_queue_destroy(&sched->dummy_ready_queue);
+    }
+    void print_stats()
+    {}
+    void print_queues()
+    {}
+};
+
+// Scheduler cold-path API is declared as PTO2SchedulerState member functions.
+// See init()/destroy()/print_stats()/print_queues() below the struct definition.
+
+inline bool AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state)
+{
+    sink.sched->on_mixed_task_complete(slot_state, sink.local_bufs);
+    if (*sink.deferred_release_count >= sink.deferred_release_capacity)
+        while (*sink.deferred_release_count > 0) sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
+    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
+    sink.inline_completed++;
+    return true;
+}
+
+template <bool Profiling>
+inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity)
+{
+    AsyncPollResult result;
+    if (!try_lock()) return result;
+
+    AsyncWaitList::DrainCompletionSink sink{};
+    sink.sched = sched;
+    sink.local_bufs = local_bufs;
+    sink.deferred_release_slot_states = deferred_release_slot_states;
+    sink.deferred_release_count = &deferred_release_count;
+    sink.deferred_release_capacity = deferred_release_capacity;
+
+    int32_t drain_err = PTO2_ERROR_NONE;
+    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
+    if (drain_err != PTO2_ERROR_NONE)
+    {
+        result.error_code = drain_err;
+        unlock();
+        return result;
+    }
+    result.completed += sink.inline_completed;
+
+    for (int32_t i = count - 1; i >= 0; --i)
+    {
+        AsyncWaitEntry &entry = entries[i];
+        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
+        for (int32_t c = 0; c < entry.condition_count; c++)
+        {
+            CompletionCondition &cond = entry.conditions[c];
+            if (cond.satisfied) continue;
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr)
+            {
+                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
+                if (counter_line != last_invalidated_counter_line)
+                {
+                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
+                    last_invalidated_counter_line = counter_line;
+                }
+            }
+            CompletionPollResult poll = cond.test();
+            if (poll.state == CompletionPollState::FAILED)
+            {
+                result.error_code = poll.error_code;
+                result.failed_slot_state = entry.slot_state;
+                unlock();
+                return result;
+            }
+            if (poll.state == CompletionPollState::READY)
+            {
+                cond.satisfied = true;
+                cond.retire();
+                entry.waiting_completion_count--;
+            }
+        }
+
+        if (entry.normal_done && entry.waiting_completion_count <= 0)
+        {
+            sched->on_mixed_task_complete(*entry.slot_state, local_bufs);
+            if (deferred_release_count >= deferred_release_capacity)
+                while (deferred_release_count > 0) sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
+            result.completed++;
+
+            int32_t last = count - 1;
+            if (i != last) entries[i] = entries[last];
+            count = last;
+        }
+    }
+
+    unlock();
+    return result;
+}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index 6d2275f21..47c2115be 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -8,64 +8,24 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Shared Memory Layout
- *
- * Defines the shared memory structure for Orchestrator-Scheduler communication.
- *
- * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1):
- *   +---------------------------+
- *   | SharedMemoryHeader        |  (per-ring flow control + sync)
- *   +---------------------------+
- *   | Ring 0: TaskDescriptor[]  |
- *   | Ring 0: TaskPayload[]     |
- *   | Ring 0: TaskSlotState[]   |
- *   +---------------------------+
- *   | Ring 1: TaskDescriptor[]  |
- *   | Ring 1: TaskPayload[]     |
- *   | Ring 1: TaskSlotState[]   |
- *   +---------------------------+
- *   | ...                       |
- *   +---------------------------+
- *
- * Design principles:
- * - Only data needed for Orchestrator<->Scheduler communication is here
- * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory
- * - Flow control via atomic counters/flags (no locks needed for single-word R/W)
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #pragma once
 
 #include "utils/device_arena.h"
 #include "pto_runtime2_types.h"
 
-// =============================================================================
-// Shared Memory Header
-// =============================================================================
-
 struct PTO2SharedMemoryHandle;
 
-/**
- * Per-ring flow control state in shared memory.
- * Written/read by Orchestrator and Scheduler for synchronization.
- */
-struct alignas(64) PTO2RingFlowControl {
+struct alignas(64) PTO2RingFlowControl
+{
     // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
     alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
 
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
-    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
-    // local_task_id_ from initial_local_task_id (default 0 in production)
-    // *without* dereferencing current_task_index — it relies on this reset
-    // running on every AICPU boot so 0 stays in sync. If you ever change
-    // the initial fc value or the boot ordering, update the default in
-    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
-    // submit IDs will be off by the divergence.
-    void init() {
+    void init()
+    {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
     }
@@ -75,13 +35,8 @@ struct alignas(64) PTO2RingFlowControl {
 
 static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)");
 
-/**
- * Per-ring shared memory header section.
- *
- * Groups flow-control, layout info, and per-ring data pointers for a single ring.
- * Pointers are host-side only (set by setup_pointers, invalid on device).
- */
-struct alignas(64) PTO2SharedMemoryRingHeader {
+struct alignas(64) PTO2SharedMemoryRingHeader
+{
     PTO2RingFlowControl fc;
 
     // Layout metadata (set once at init)
@@ -95,25 +50,39 @@ struct alignas(64) PTO2SharedMemoryRingHeader {
     PTO2TaskPayload *task_payloads;
     PTO2TaskSlotState *slot_states;
 
-    PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; }
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot)
+    {
+        return task_descriptors[slot];
+    }
 
-    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) { return task_descriptors[local_id & task_window_mask]; }
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id)
+    {
+        return task_descriptors[local_id & task_window_mask];
+    }
 
-    PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; }
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot)
+    {
+        return task_payloads[slot];
+    }
 
-    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[local_id & task_window_mask]; }
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id)
+    {
+        return task_payloads[local_id & task_window_mask];
+    }
 
-    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; }
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot)
+    {
+        return slot_states[slot];
+    }
 
-    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) { return slot_states[local_id & task_window_mask]; }
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id)
+    {
+        return slot_states[local_id & task_window_mask];
+    }
 };
 
-/**
- * Shared memory header structure
- *
- * Contains per-ring flow control and global layout information.
- */
-struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
+struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader
+{
     // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) ===
     PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
 
@@ -141,20 +110,10 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
     std::atomic<int32_t> sched_error_thread;   // Thread index of last error writer
 };
 
-static_assert(
-    (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096),
-    "PTO2SharedMemoryHeader should be reasonably sized"
-);
+static_assert((sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), "PTO2SharedMemoryHeader should be reasonably sized");
 
-// =============================================================================
-// Shared Memory Handle
-// =============================================================================
-
-/**
- * Handle for shared memory lifecycle management (create/destroy).
- * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly.
- */
-struct PTO2SharedMemoryHandle {
+struct PTO2SharedMemoryHandle
+{
     void *sm_base;     // Base address of shared memory
     uint64_t sm_size;  // Total size of shared memory
 
@@ -165,91 +124,202 @@ struct PTO2SharedMemoryHandle {
 
     // === Static helpers ===
 
-    static uint64_t calculate_size(uint64_t task_window_size);
-    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+    static uint64_t calculate_size(uint64_t task_window_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+        return calculate_size_per_ring(task_window_sizes);
+    }
+    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        uint64_t size = 0;
+
+        // Header (aligned to cache line)
+        size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+        // Per-ring task descriptors and payloads
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+        }
+
+        return size;
+    }
 
-    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
-    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
-    // arena is otherwise empty (the call performs the single commit). All
-    // memory is owned by the arena — caller must not call destroy().
-    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);
+    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena)
+    {
+        const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
+        const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+        const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
+        if (arena.commit() == nullptr) return nullptr;
+
+        auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
+        memset(handle, 0, sizeof(*handle));
+        void *buffer = arena.region_ptr(off_buffer);
+        memset(buffer, 0, static_cast<size_t>(buffer_size));
+        if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
+        return handle;
+    }
 
     // === Instance methods ===
 
-    // In-place init for caller-provided wrapper storage (e.g. a region carved
-    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
-    // init_header. Returns false when `sm_size` is too small for the requested
-    // `task_window_size`.
-    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);
+    bool init(void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size)
+    {
+        if (!sm_base_arg || sm_size_arg == 0) return false;
+        if (sm_size_arg < calculate_size(task_window_size)) return false;
+
+        sm_base = sm_base_arg;
+        sm_size = sm_size_arg;
+        is_owner = false;
+        setup_pointers(task_window_size);
+        init_header(task_window_size, heap_size);
+        return true;
+    }
+
+    void destroy()
+    {
+        // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
+        // calling destroy on them is a no-op so existing callers stay safe.
+        if (is_owner && sm_base)
+        {
+            free(sm_base);
+            free(this);
+        }
+    }
+    void print_layout()
+    {
+        if (!header) return;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {}
+    }
+    bool validate()
+    {
+        if (!sm_base) return false;
+        if (!header) return false;
+
+        PTO2SharedMemoryHeader *h = header;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            if (!h->rings[r].fc.validate(this, r)) return false;
 
-    void destroy();
-    void print_layout();
-    bool validate();
+        return true;
+    }
 
 private:
-    void init_header(uint64_t task_window_size, uint64_t heap_size);
-    void init_header_per_ring(
-        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-    );
-    void setup_pointers(uint64_t task_window_size);
-    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+    void init_header(uint64_t task_window_size, uint64_t heap_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            task_window_sizes[r] = task_window_size;
+            heap_sizes[r] = heap_size;
+        }
+        init_header_per_ring(task_window_sizes, heap_sizes);
+    }
+    void init_header_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        // Per-ring flow control (start at 0)
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) header->rings[r].fc.init();
+
+        header->orchestrator_done.store(0, std::memory_order_relaxed);
+
+        // Per-ring layout info
+        uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            header->rings[r].task_window_size = task_window_sizes[r];
+            header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
+            header->rings[r].heap_size = heap_sizes[r];
+            header->rings[r].task_descriptors_offset = offset;
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+        }
+
+        header->total_size = sm_size;
+        header->graph_output_ptr.store(0, std::memory_order_relaxed);
+        header->graph_output_size.store(0, std::memory_order_relaxed);
+
+        // Error reporting
+        header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        header->sched_error_bitmap.store(0, std::memory_order_relaxed);
+        header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto &ring = header->rings[r];
+            for (uint64_t i = 0; i < task_window_sizes[r]; i++)
+            {
+                ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+                ring.slot_states[i].reset_for_reuse();
+                ring.slot_states[i].active_mask = ActiveMask{};
+            }
+        }
+    }
+    void setup_pointers(uint64_t task_window_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+        setup_pointers_per_ring(task_window_sizes);
+    }
+    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        char *ptr = (char *)sm_base;
+
+        // Header
+        header = (PTO2SharedMemoryHeader *)ptr;
+        ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+        // Per-ring task descriptors, payloads, and slot states
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto &ring = header->rings[r];
+            ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+
+            ring.task_payloads = (PTO2TaskPayload *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+
+            ring.slot_states = (PTO2TaskSlotState *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+        }
+    }
 };
 
-// =============================================================================
-// SM Device Layout Helpers
-// =============================================================================
-//
-// When the host pre-builds a runtime-arena image, it needs the device-side
-// addresses of several SM sub-fields (ring flow-control counters,
-// task_descriptors arrays, orch_error_code) so it can wire them into the
-// orchestrator / scheduler init_data path without dereferencing the SM —
-// the SM lives in device memory and cannot be touched from host.
-//
-// These helpers compute those addresses by offset arithmetic on the SM
-// device base. Pure pointer math, no loads/stores; safe to call from host.
-// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
-// own setup_pointers), so values are guaranteed consistent across sides.
 namespace pto2_sm_layout {
 
-inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
-    );
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code));
 }
 
-inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
-        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
-        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
-    );
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader));
 }
 
-inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
-        offsetof(PTO2RingFlowControl, current_task_index)
-    );
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + offsetof(PTO2RingFlowControl, current_task_index));
 }
 
-inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
-        offsetof(PTO2RingFlowControl, last_task_alive)
-    );
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + offsetof(PTO2RingFlowControl, last_task_alive));
 }
 
-// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
-// to compute ring `ring_id`'s task_descriptors device address. Accepts a
-// per-ring window-size array so the helper's signature mirrors
-// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
-// disagree with the SM layout when (hypothetically) ring sizes diverge.
-inline PTO2TaskDescriptor *ring_task_descriptors_addr(
-    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
-) noexcept {
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id) noexcept
+{
     assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
     char *p = static_cast<char *>(sm_dev_base);
     p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-    for (int r = 0; r < ring_id; r++) {
+    for (int r = 0; r < ring_id; r++)
+    {
         p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
         p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
         p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
index 7f7e735c3..79b878e4d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
@@ -9,83 +9,66 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Submit Types - Shared submit-contract definitions
- *
- * Header-only definitions shared by orchestration-facing and runtime-facing
- * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h).
- */
-
 #pragma once
 
 #include <stdint.h>
 
 inline constexpr int32_t INVALID_KERNEL_ID = -1;
 
-/**
- * Subtask slot count: AIC, AIV0, AIV1
- */
 inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
 
-/**
- * Subtask slot indices
- */
-enum class PTO2SubtaskSlot : uint8_t {
+enum class PTO2SubtaskSlot : uint8_t
+{
     AIC = 0,
     AIV0 = 1,
     AIV1 = 2,
 };
 
-/**
- * Subtask mask bits (for ActiveMask)
- */
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);         // 0x1
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);        // 0x2
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);        // 0x4
 inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3);  // 0x8: all blocks must launch atomically
 
-/**
- * Resource shape — classifies a MixedKernels into one of 3 scheduling buckets.
- *
- * Multi-subtask tasks (2+ active slots) are all scheduled as MIX, which
- * requires a fully-idle cluster (1 AIC + 2 AIV).  The actual cores used
- * are determined at dispatch time by active_mask — unused cores in the
- * cluster remain idle and available for single-core tasks.
- *
- * DUMMY is a synthetic shape for dep-only tasks (no AICore dispatch). Tasks
- * with an empty core_mask route to a dedicated DUMMY ready queue and are
- * completed inline by the scheduler dispatch loop, bypassing core allocation.
- */
-enum class PTO2ResourceShape : uint8_t {
+enum class PTO2ResourceShape : uint8_t
+{
     AIC = 0,    // Single AIC
     AIV = 1,    // Single AIV
     MIX = 2,    // Full cluster (dispatch uses active_mask)
     DUMMY = 3,  // Dependency-only (no AICore dispatch)
 };
 
-// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not
-// allocate a per-shape ready_queue entry / local buffer — it lives in a
-// dedicated queue inside PTO2SchedulerState.
 inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
 
-/**
- * Bitmask of active subtask slots + flags, sizeof == 1.
- */
-class ActiveMask {
+class ActiveMask
+{
 public:
     constexpr ActiveMask() = default;
     constexpr explicit ActiveMask(uint8_t raw) :
-        raw_(raw) {}
+        raw_(raw)
+    {}
 
-    uint8_t raw() const { return raw_; }
+    uint8_t raw() const
+    {
+        return raw_;
+    }
 
-    bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0; }
+    bool subtask_active(PTO2SubtaskSlot slot) const
+    {
+        return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0;
+    }
 
-    uint8_t core_mask() const { return raw_ & 0x07u; }
+    uint8_t core_mask() const
+    {
+        return raw_ & 0x07u;
+    }
 
-    bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; }
+    bool requires_sync_start() const
+    {
+        return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0;
+    }
 
-    PTO2ResourceShape to_shape() const {
+    PTO2ResourceShape to_shape() const
+    {
         uint8_t cmask = core_mask();
         if (cmask == 0) return PTO2ResourceShape::DUMMY;
         int bit_count = __builtin_popcount(cmask);
@@ -94,22 +77,44 @@ class ActiveMask {
         return PTO2ResourceShape::AIV;
     }
 
-    void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; }
+    void set_sync_start()
+    {
+        raw_ |= PTO2_SUBTASK_FLAG_SYNC_START;
+    }
 
-    bool operator==(ActiveMask other) const { return raw_ == other.raw_; }
-    bool operator!=(ActiveMask other) const { return raw_ != other.raw_; }
+    bool operator==(ActiveMask other) const
+    {
+        return raw_ == other.raw_;
+    }
+    bool operator!=(ActiveMask other) const
+    {
+        return raw_ != other.raw_;
+    }
 
-    ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); }
-    ActiveMask &operator|=(ActiveMask other) {
+    ActiveMask operator|(ActiveMask other) const
+    {
+        return ActiveMask(raw_ | other.raw_);
+    }
+    ActiveMask &operator|=(ActiveMask other)
+    {
         raw_ |= other.raw_;
         return *this;
     }
 
-    ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); }
+    ActiveMask operator&(uint8_t mask) const
+    {
+        return ActiveMask(raw_ & mask);
+    }
 
-    bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; }
+    bool has_mask(uint8_t mask) const
+    {
+        return (raw_ & mask) != 0;
+    }
 
-    explicit operator bool() const { return raw_ != 0; }
+    explicit operator bool() const
+    {
+        return raw_ != 0;
+    }
 
 private:
     uint8_t raw_{0};
@@ -117,18 +122,14 @@ class ActiveMask {
 
 static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte");
 
-/**
- * Mixed-task submit contract.
- *
- * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive).
- * At least one slot must be valid.
- */
-struct MixedKernels {
+struct MixedKernels
+{
     int32_t aic_kernel_id{INVALID_KERNEL_ID};
     int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
     int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
 
-    ActiveMask to_active_mask() const {
+    ActiveMask to_active_mask() const
+    {
         uint8_t mask = 0;
         if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
         if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
@@ -137,22 +138,28 @@ struct MixedKernels {
     }
 };
 
-/**
- * SPMD launch parameters carried inside Arg.
- *
- * Controls how many logical blocks (SPMD dimension) a single task
- * is expanded into at dispatch time.  Each block receives a unique
- * block_idx in [0, block_num) via the per-dispatch LocalContext.
- */
-class PTO2LaunchSpec {
+class PTO2LaunchSpec
+{
 public:
     constexpr PTO2LaunchSpec() = default;
 
-    int16_t block_num() const { return block_num_; }
-    void set_block_num(int16_t n) { block_num_ = n; }
+    int16_t block_num() const
+    {
+        return block_num_;
+    }
+    void set_block_num(int16_t n)
+    {
+        block_num_ = n;
+    }
 
-    bool require_sync_start() const { return require_sync_start_; }
-    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+    bool require_sync_start() const
+    {
+        return require_sync_start_;
+    }
+    void set_require_sync_start(bool v)
+    {
+        require_sync_start_ = v;
+    }
 
 private:
     int16_t block_num_{1};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_task_id.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_task_id.h
index 0996ce5d8..f3040998c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_task_id.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_task_id.h
@@ -9,43 +9,49 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO2TaskId — minimal standalone header.
- *
- * Factored out of pto_runtime2_types.h so that tensor.h can include it
- * without pulling in scheduler-internal constants (heap sizes, timeouts, etc.).
- */
-
 #pragma once
 
 #include <cstdint>
 
-/**
- * TaskId: 64-bit encoding used across Runtime2.
- *
- * raw encoding: (ring_id << 32) | local_id
- *
- * ring_id:  which ring layer (0..PTO2_MAX_RING_DEPTH-1)
- * local_id: per-ring monotonic counter
- *
- * Invalid sentinel: raw == UINT64_MAX (no valid task has this encoding).
- */
-struct PTO2TaskId {
+struct PTO2TaskId
+{
     uint64_t raw;
 
-    static constexpr PTO2TaskId make(uint8_t ring_id, uint32_t local_id) {
+    static constexpr PTO2TaskId make(uint8_t ring_id, uint32_t local_id)
+    {
         return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)};
     }
 
-    static constexpr PTO2TaskId invalid() { return PTO2TaskId{UINT64_MAX}; }
+    static constexpr PTO2TaskId invalid()
+    {
+        return PTO2TaskId{UINT64_MAX};
+    }
 
-    constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
-    constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }
-    constexpr bool is_valid() const { return raw != UINT64_MAX; }
-    constexpr bool is_invalid() const { return raw == UINT64_MAX; }
+    constexpr uint8_t ring() const
+    {
+        return static_cast<uint8_t>(raw >> 32);
+    }
+    constexpr uint32_t local() const
+    {
+        return static_cast<uint32_t>(raw & 0xFFFFFFFFu);
+    }
+    constexpr bool is_valid() const
+    {
+        return raw != UINT64_MAX;
+    }
+    constexpr bool is_invalid() const
+    {
+        return raw == UINT64_MAX;
+    }
 
-    constexpr bool operator==(const PTO2TaskId &other) const { return raw == other.raw; }
-    constexpr bool operator!=(const PTO2TaskId &other) const { return raw != other.raw; }
+    constexpr bool operator==(const PTO2TaskId &other) const
+    {
+        return raw == other.raw;
+    }
+    constexpr bool operator!=(const PTO2TaskId &other) const
+    {
+        return raw != other.raw;
+    }
 };
 
 static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)");
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 875b79bbe..e9e29e2d5 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -9,37 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - TensorMap Interface
- *
- * TensorMap provides producer lookup for dependency discovery:
- * - Maps Tensor -> producer task ID
- * - Used by pto_submit_task() to find dependencies
- *
- * Key design features:
- * 1. Ring buffer pool for entries (no malloc/free)
- * 2. Lazy invalidation (entries become stale when producer retires)
- * 3. Per-task per-ring entry tracking for efficient cleanup
- * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions
- *
- * Hash table with chaining:
- * - buckets[] array of head offsets
- * - Entries linked via next_in_bucket
- * - Insert at head (newest first) for sorted chains
- *
- * CRITICAL: Hash only by base_ptr
- * ==============================
- * For overlap detection to work, ALL sub-regions of the same base tensor
- * MUST be in the SAME hash bucket. This allows lookup to compare all
- * potentially overlapping regions.
- *
- * Overlap detection: Two regions create a dependency if:
- *   1. Same base_ptr (raw tensor pointer)
- *   2. Byte ranges [offset, offset+size) intersect
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #pragma once
 
 #include "common.h"
@@ -47,14 +16,8 @@
 #include "pto_runtime2_types.h"
 #include "tensor.h"
 
-/**
- * Layout descriptor produced by PTO2TensorMap::reserve_layout(). Stores the
- * region offsets returned by DeviceArena::reserve() so init_from_layout()
- * can fetch the matching pointers after the arena is committed.
- *
- * All offsets are relative to the arena's base.
- */
-struct PTO2TensorMapLayout {
+struct PTO2TensorMapLayout
+{
     size_t off_buckets;
     size_t off_entry_pool;
     size_t off_free_entry_list;
@@ -64,65 +27,20 @@ struct PTO2TensorMapLayout {
     int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
 };
 
-// =============================================================================
-// TensorMap Lookup Profiling (must precede inline lookup/insert methods)
-// =============================================================================
-#ifndef PTO2_TENSORMAP_PROFILING
-#define PTO2_TENSORMAP_PROFILING 0
-#endif
-
-#if PTO2_TENSORMAP_PROFILING
-extern uint64_t g_lookup_chain_total;
-extern uint64_t g_lookup_count;
-extern int32_t g_lookup_chain_max;
-extern uint64_t g_lookup_overlap_checks;
-extern uint64_t g_lookup_overlap_hits;
-extern uint64_t g_insert_count;
-#endif
-
-// =============================================================================
-// TensorMap Structure
-// =============================================================================
-
-/**
- * TensorMap entry structure — cache-line optimized for lookup
- *
- * Cache line 1 (64B, lookup hot path) mirrors Tensor cache line 1 byte-for-byte
- * from byte 16 onward, so that `memcpy(this, &tensor, 64)` populates everything
- * we need for overlap checks. Bytes [0, 16) carry entry-only fields (hash
- * bucket head + chain pointer) that overlap Tensor::buffer (addr in [0, 8) is
- * the hash key, size in [8, 16) is unused by the entry — we repurpose it for
- * `next_in_bucket`).
- *
- *   buffer_addr / next_in_bucket / producer_task_id   — chain traversal + match
- *   start_offset                                       — overlap byte range begin
- *   version, ndims, dtype, manual_dep, is_contiguous   — overlap fast path
- *   shapes[5]                                          — overlap comparison (line 1)
- *
- * Cache line 2 (64B, slow-path / non-contiguous overlap):
- *   prev_in_bucket / next_in_task / prev_in_task       — chain manipulation
- *   bucket_index                                       — bookkeeping
- *   extent_elem_cache                                  — overlap byte range end
- *   strides[5]                                          — reserved for L2 overlap (PR-2)
- *
- * When both entry & probe are `is_contiguous && start_offset == 0`, the overlap
- * check derives `extent_elem = prod(shapes)` from cache line 1 alone.
- *
- * Entry size: 128B (2 cache lines), matches Tensor.
- */
-struct alignas(64) PTO2TensorMapEntry {
+struct alignas(64) PTO2TensorMapEntry
+{
     // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 ===
-    uint64_t buffer_addr;                // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
-    PTO2TensorMapEntry *next_in_bucket;  // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
-    PTO2TaskId producer_task_id;         // 8B [16,24):  mirrors Tensor::owner_task_id slot
-    uint64_t start_offset;               // 8B [24,32):  mirrors Tensor::start_offset (element offset)
-    int32_t version;                     // 4B [32,36):  mirrors Tensor::version
-    uint32_t ndims;                      // 4B [36,40):  mirrors Tensor::ndims
-    DataType dtype;                      // 1B [40,41):  mirrors Tensor::dtype
-    bool manual_dep;                     // 1B [41,42):  mirrors Tensor::manual_dep
-    bool is_contiguous;                  // 1B [42,43):  mirrors Tensor::is_contiguous
-    uint8_t __padding1__;                // 1B [43,44):  mirrors Tensor padding
-    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS];  // 20B [44,64): mirrors Tensor::shapes
+    uint64_t buffer_addr;                      // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
+    PTO2TensorMapEntry *next_in_bucket;        // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
+    PTO2TaskId producer_task_id;               // 8B [16, 24):  mirrors Tensor::owner_task_id slot
+    uint64_t start_offset;                     // 8B [24, 32):  mirrors Tensor::start_offset (element offset)
+    int32_t version;                           // 4B [32, 36):  mirrors Tensor::version
+    uint32_t ndims;                            // 4B [36, 40):  mirrors Tensor::ndims
+    DataType dtype;                            // 1B [40, 41):  mirrors Tensor::dtype
+    bool manual_dep;                           // 1B [41, 42):  mirrors Tensor::manual_dep
+    bool is_contiguous;                        // 1B [42, 43):  mirrors Tensor::is_contiguous
+    uint8_t __padding1__;                      // 1B [43, 44):  mirrors Tensor padding
+    uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS];  // 20B [44, 64): mirrors Tensor::shapes
 
     // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data ===
     PTO2TensorMapEntry *prev_in_bucket;         // 8B [64, 72)
@@ -130,99 +48,65 @@ struct alignas(64) PTO2TensorMapEntry {
     PTO2TensorMapEntry *prev_in_task;           // 8B [80, 88)
     int32_t bucket_index;                       // 4B [88, 92): -1 when unlinked
     uint32_t __padding2__;                      // 4B [92, 96)
-    uint64_t extent_elem_cache;                 // 8B [96,104): non-contiguous extent (mirrors Tensor)
-    uint32_t strides[RUNTIME_MAX_TENSOR_DIMS];  // 20B [104,124): element strides, mirrors Tensor::strides
-    uint8_t __padding3__[4];                    // 4B [124,128)
-
-    /**
-     * Copy overlap-relevant fields from a Tensor into this entry.
-     *
-     * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)),
-     * producer_task_id, start_offset, version, ndims, dtype, manual_dep,
-     * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in
-     * the source and gets written into next_in_bucket; that's harmless
-     * because link_entry() overwrites next_in_bucket immediately after.
-     *
-     * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when
-     * the source is canonically contiguous (is_contiguous && start_offset==0),
-     * so the producer Tensor's cache line 2 stays cold during insert. Only
-     * non-contiguous producers pay one extra line 2 read.
-     */
-    void copy_from_tensor(const Tensor &tensor) {
+    uint64_t extent_elem_cache;                 // 8B [96, 104): non-contiguous extent (mirrors Tensor)
+    uint32_t strides[RUNTIME_MAX_TENSOR_DIMS];  // 20B [104, 124): element strides, mirrors Tensor::strides
+    uint8_t __padding3__[4];                    // 4B [124, 128)
+
+    void copy_from_tensor(const Tensor &tensor)
+    {
         memcpy(this, &tensor, 64);
-        if (tensor.is_contiguous && tensor.start_offset == 0) {
+        if (tensor.is_contiguous && tensor.start_offset == 0)
+        {
             uint64_t numel = 1;
-            for (uint32_t i = 0; i < tensor.ndims; i++)
-                numel *= tensor.shapes[i];
+            for (uint32_t i = 0; i < tensor.ndims; i++) numel *= tensor.shapes[i];
             extent_elem_cache = numel;
             uint32_t s = 1;
-            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--) {
+            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--)
+            {
                 strides[i] = s;
                 s *= tensor.shapes[i];
             }
-        } else {
+        }
+        else
+        {
             extent_elem_cache = tensor.extent_elem_cache;
-            for (uint32_t i = 0; i < tensor.ndims; i++) {
-                strides[i] = tensor.strides[i];
-            }
+            for (uint32_t i = 0; i < tensor.ndims; i++) strides[i] = tensor.strides[i];
         }
     }
 
-    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) {
+    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr)
+    {
         memcpy(this, &tensor_create_info, 64);
         buffer_addr = addr;
         // Create-info outputs are always contiguous with start_offset = 0;
         // extent_elem = prod(shapes); stride is row-major.
         uint64_t numel = 1;
-        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) {
-            numel *= tensor_create_info.shapes[i];
-        }
+        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) numel *= tensor_create_info.shapes[i];
         extent_elem_cache = numel;
         uint32_t s = 1;
-        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--) {
+        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--)
+        {
             strides[i] = s;
             s *= tensor_create_info.shapes[i];
         }
     }
 
-    /**
-     * Effective element extent of this entry.
-     * Contiguous-aligned views compute it from shapes alone (line 1 hit only);
-     * non-contiguous views read the cached value from line 2.
-     */
-    uint64_t effective_extent_elem() const {
-        if (is_contiguous) {
+    uint64_t effective_extent_elem() const
+    {
+        if (is_contiguous)
+        {
             uint64_t n = 1;
-            for (uint32_t i = 0; i < ndims; i++)
-                n *= shapes[i];
+            for (uint32_t i = 0; i < ndims; i++) n *= shapes[i];
             return n;
         }
         return extent_elem_cache;
     }
 
-    /**
-     * Check overlap between input tensor and this entry (the producer output).
-     *
-     * Three-level cascade:
-     *   L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP.
-     *   L2 — O(ndims) hyper-rectangle precise check, eligible only when both
-     *        sides share the same canonical row-major axis layout (same
-     *        dtype/ndims/strides[], stride descends as integer multiples,
-     *        start_offset decomposes cleanly under the reference shape).
-     *        Yields NO_OVERLAP / COVERED / OTHER per-dim.
-     *   L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice
-     *        with step, etc): conservative OTHER. Exact enumeration via
-     *        contiguous-segment merge is scheduled for a follow-up.
-     *
-     * COVERED is returned when `input` completely contains `entry` per-dim
-     * — dep_compute uses this to retire the now-redundant entry.
-     */
-    OverlapStatus check_overlap(const Tensor &input) const {
+    OverlapStatus check_overlap(const Tensor &input) const
+    {
         debug_assert(input.buffer.addr == buffer_addr);
         debug_assert(input.version >= version);
-        if (input.version > version) {
-            return OverlapStatus::OTHER;
-        }
+        if (input.version > version) return OverlapStatus::OTHER;
 
         // -------- L1: byte-range intersection (O(1) fast reject) --------
         const uint64_t in_begin = input.start_offset;
@@ -231,50 +115,18 @@ struct alignas(64) PTO2TensorMapEntry {
         const uint64_t ent_end = start_offset + effective_extent_elem();
         Segment in_range_bytes{in_begin, in_end};
         Segment ent_range_bytes{ent_begin, ent_end};
-        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) {
-            return OverlapStatus::NO_OVERLAP;
-        }
+        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) return OverlapStatus::NO_OVERLAP;
 
         // -------- L2 prereqs: same axis layout? --------
-        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) {
-            return OverlapStatus::OTHER;
-        }
-        for (uint32_t i = 0; i < ndims; i++) {
+        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) return OverlapStatus::OTHER;
+        for (uint32_t i = 0; i < ndims; i++)
             if (input.strides[i] != strides[i]) return OverlapStatus::OTHER;
-        }
-        // strides[ndims-1] must be 1 and strides[i-1] must be an integer
-        // multiple of strides[i] for the row-major reference-shape derivation
-        // below to hold. This rejects slice-with-step (strides[d] != prev factor)
-        // and any view chain that scrambles the axis order. (strides is
-        // uint32_t with the > 0 invariant enforced at construction, so no
-        // sign check needed.)
         if (strides[ndims - 1] != 1) return OverlapStatus::OTHER;
-        for (uint32_t i = 1; i < ndims; i++) {
+        for (uint32_t i = 1; i < ndims; i++)
             if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER;
-        }
 
-        // Derive reference shape A from stride. By construction stride is
-        // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So
-        //   A[i] = strides[i-1] / strides[i]   for i >= 1
-        //   A[0] = (buffer.size / dtype_bytes) / strides[0]
-        // input.buffer.size is the storage size; entry shares the same buffer
-        // (debug-asserted by buffer.addr equality at the top), so we read it
-        // from input rather than mirroring buffer.size into the entry.
-        //
-        // Note on buffer padding: runtime allocators may over-allocate
-        // `buffer.size` (cache-line / 1024B alignment, ring-buffer slot
-        // rounding, etc). When that happens, `numel_storage` is larger than
-        // the true logical extent and `ref_shapes[0]` ends up generously over-
-        // sized. This is intentional: ref_shapes is only used as an *upper
-        // bound* in the in-bounds checks below; the actual overlap test (the
-        // per-dim line-segment intersection on the real start_offset /
-        // shapes / stride further down) is unaffected. A larger-than-truth
-        // ref_shapes[0] simply makes the bounds check more permissive — it
-        // can never cause a false NO_OVERLAP nor a false COVERED.
         uint32_t ref_shapes[RUNTIME_MAX_TENSOR_DIMS] = {};
-        for (uint32_t i = 1; i < ndims; i++) {
-            ref_shapes[i] = strides[i - 1] / strides[i];
-        }
+        for (uint32_t i = 1; i < ndims; i++) ref_shapes[i] = strides[i - 1] / strides[i];
         const uint64_t elem_size = get_element_size(dtype);
         if (elem_size == 0) return OverlapStatus::OTHER;
         const uint64_t numel_storage = input.buffer.size / elem_size;
@@ -282,14 +134,12 @@ struct alignas(64) PTO2TensorMapEntry {
         if (numel_storage % stride0 != 0) return OverlapStatus::OTHER;
         ref_shapes[0] = static_cast<uint32_t>(numel_storage / stride0);
 
-        // Decompose start_offset into row-major multi-dim offsets. By the same
-        // relation strides[i] = prod(ref_shapes[i+1..]) so dividing by strides[i]
-        // (no inner loop) yields each axis offset directly.
         uint32_t in_offsets[RUNTIME_MAX_TENSOR_DIMS] = {};
         uint32_t ent_offsets[RUNTIME_MAX_TENSOR_DIMS] = {};
         uint64_t in_remain = input.start_offset;
         uint64_t ent_remain = start_offset;
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             const uint32_t s = strides[i];
             in_offsets[i] = static_cast<uint32_t>(in_remain / s);
             ent_offsets[i] = static_cast<uint32_t>(ent_remain / s);
@@ -300,22 +150,20 @@ struct alignas(64) PTO2TensorMapEntry {
 
         // Validate that each side fits within ref_shapes (defense in depth —
         // a well-formed view always satisfies this).
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             if (static_cast<uint64_t>(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
             if (static_cast<uint64_t>(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
         }
 
         // -------- L2 core: per-dim line-segment intersection --------
         bool input_contains_entry = true;
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             Segment in_seg{in_offsets[i], static_cast<uint64_t>(in_offsets[i]) + input.shapes[i]};
             Segment ent_seg{ent_offsets[i], static_cast<uint64_t>(ent_offsets[i]) + shapes[i]};
-            if (!in_seg.line_segment_intersection(ent_seg)) {
-                return OverlapStatus::NO_OVERLAP;
-            }
-            if (!in_seg.contains(ent_seg)) {
-                input_contains_entry = false;
-            }
+            if (!in_seg.line_segment_intersection(ent_seg)) return OverlapStatus::NO_OVERLAP;
+            if (!in_seg.contains(ent_seg)) input_contains_entry = false;
         }
         return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER;
     }
@@ -331,20 +179,10 @@ static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype));
 static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep));
 static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous));
 static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes));
-static_assert(
-    offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"
-);
-
-// =============================================================================
-// TensorMap Lookup Chain Length Statistics (compile-time toggle)
-// =============================================================================
-
-/**
- * TensorMap structure
- *
- * Hash table with ring buffer entry pool and lazy invalidation.
- */
-struct PTO2TensorMap {
+static_assert(offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)");
+
+struct PTO2TensorMap
+{
     // Hash table buckets (fixed size, power of 2)
     PTO2TensorMapEntry **buckets;  // Array of offsets into entry_pool (-1 = empty)
     int32_t num_buckets;           // Must be power of 2 for fast modulo
@@ -367,20 +205,25 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
+    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const
+    {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
 
-    // Accessors read by scope_stats_collector. Declared unconditionally so the
-    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
-    // setter symbols must export for host dlsym; the probe call sites that use
-    // these accessors stay gated by PTO2_PROFILING).
-    int32_t current_used() const { return next_entry_idx - free_num; }
-    int32_t pool_capacity() const { return pool_size; }
+    int32_t current_used() const
+    {
+        return next_entry_idx - free_num;
+    }
+    int32_t pool_capacity() const
+    {
+        return pool_size;
+    }
 
     // new_entry only allocates memory, does not assign attributes
-    PTO2TensorMapEntry *new_entry() {
-        if (free_num > 0) {
+    PTO2TensorMapEntry *new_entry()
+    {
+        if (free_num > 0)
+        {
             PTO2TensorMapEntry *res = free_entry_list[--free_num];
             debug_assert(res->bucket_index == -1);
             return res;
@@ -391,22 +234,24 @@ struct PTO2TensorMap {
         return res;
     }
 
-    void free_entry(PTO2TensorMapEntry &entry) {
+    void free_entry(PTO2TensorMapEntry &entry)
+    {
         always_assert(entry.bucket_index != -1);  // must still be in a bucket
 
         // Update predecessor's next pointer (O(1) via prev_in_bucket)
-        if (entry.prev_in_bucket == nullptr) {
+        if (entry.prev_in_bucket == nullptr)
+        {
             // Entry is the head of its bucket chain, update bucket head
             // Must compute hash BEFORE clearing tensor
             buckets[entry.bucket_index] = entry.next_in_bucket;
-        } else {
+        }
+        else
+        {
             entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket;
         }
 
         // Update successor's prev pointer
-        if (entry.next_in_bucket != nullptr) {
-            entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
-        }
+        if (entry.next_in_bucket != nullptr) entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
 
         free_entry_list[free_num++] = &entry;
         entry.bucket_index = -1;
@@ -416,164 +261,144 @@ struct PTO2TensorMap {
         entry.prev_in_task = nullptr;
     }
 
-    // =============================================================================
-    // TensorMap API
-    // =============================================================================
-
-    /**
-     * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring
-     * task_entry_heads) on the supplied arena. Records the resulting offsets in
-     * the returned layout descriptor. Must be called before the arena is
-     * committed.
-     */
-    static PTO2TensorMapLayout reserve_layout(
-        DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-    );
-
-    /**
-     * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS,
-     * PTO2_TENSORMAP_POOL_SIZE).
-     */
-    static PTO2TensorMapLayout
-    reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
-
-    /**
-     * Phase 3a: write everything *except* arena-internal pointer fields
-     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
-     * Uses arena.region_ptr to address the arena regions for data writes,
-     * but does not store those addresses in struct fields. Safe to call on
-     * a host arena that holds the prebuilt image.
-     */
-    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
-
-    /**
-     * Phase 3b: write the arena-internal pointer fields. Idempotent;
-     * called once on the host arena and once on the AICPU after attach.
-     */
-    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
-
-    /**
-     * Tear down state. Does not free memory — the arena owns the backing
-     * buffer. Pointers are set to nullptr so accidental reuse traps.
-     */
-    void destroy();
-
-    /**
-     * Update validity threshold from shared memory
-     * Called periodically to refresh the lazy invalidation threshold.
-     *
-     * @param last_task_alive  Current value from shared memory
-     */
-    void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; }
-
-    /**
-     * Lookup producer for a tensor region
-     *
-     * Searches the hash table for matching regions and invokes the callback
-     * for each overlapping valid entry.
-     * Stale entries from different rings are skipped (not truncated).
-     *
-     * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should
-     * return true to continue iteration, false to stop early. It is safe for
-     * the callback to call remove_entry() on the current entry: next_in_bucket
-     * is latched before invocation.
-     *
-     * @param tensor    Tensor to look up
-     * @param on_match  Callback invoked for each overlapping entry
-     */
+    static PTO2TensorMapLayout reserve_layout(DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        // num_buckets must be a power of two for the hash truncation to work.
+        always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
+
+        PTO2TensorMapLayout layout{};
+        layout.num_buckets = new_num_buckets;
+        layout.pool_size = new_pool_size;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.task_window_sizes[r] = new_task_window_sizes[r];
+
+        layout.off_buckets = arena.reserve(static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        layout.off_entry_pool = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
+        layout.off_free_entry_list = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.off_task_entry_heads[r] = arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        return layout;
+    }
+
+    static PTO2TensorMapLayout reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
+    }
+
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena)
+    {
+        num_buckets = layout.num_buckets;
+        pool_size = layout.pool_size;
+
+        // Address arena regions for data writes; do not store these in struct
+        // fields (wire_arena_pointers does that).
+        auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+        auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+
+        // buckets[]: empty == nullptr.
+        for (int32_t i = 0; i < num_buckets; i++) buckets_arena[i] = nullptr;
+
+        memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+        for (int32_t i = 0; i < pool_size; i++)
+        {
+            entry_pool_arena[i].bucket_index = -1;
+            entry_pool_arena[i].next_in_bucket = nullptr;
+            entry_pool_arena[i].prev_in_bucket = nullptr;
+            entry_pool_arena[i].next_in_task = nullptr;
+            entry_pool_arena[i].prev_in_task = nullptr;
+            entry_pool_arena[i].producer_task_id = PTO2TaskId{};
+        }
+
+        // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+        // only after entries are freed back, so the body of the array stays as 0.
+        memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+
+        next_entry_idx = 0;
+        free_num = 0;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+            for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) heads_arena[i] = nullptr;
+            task_window_sizes[r] = layout.task_window_sizes[r];
+            last_task_alives[r] = 0;
+            last_cleanup[r] = 0;
+        }
+
+        return true;
+    }
+
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena)
+    {
+        buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+        free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+
+    void destroy()
+    {
+        buckets = nullptr;
+        entry_pool = nullptr;
+        free_entry_list = nullptr;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = nullptr;
+    }
+
+    void sync_validity(int32_t ring_id, int32_t last_task_alive)
+    {
+        this->last_task_alives[ring_id] = last_task_alive;
+    }
+
     template <typename Fn>
-    void lookup(const Tensor &tensor, Fn &&on_match) {
+    void lookup(const Tensor &tensor, Fn &&on_match)
+    {
         uint32_t bucket_index = hash(tensor.buffer.addr);
         PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
 
-#if PTO2_TENSORMAP_PROFILING
-        g_lookup_count++;
-        int32_t chain_len = 0;
-#endif
-
-        while (cur_entry != nullptr) {
+        while (cur_entry != nullptr)
+        {
             PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
 
-#if PTO2_TENSORMAP_PROFILING
-            chain_len++;
-#endif
-            // Skip stale entries (no chain truncation — entries from different
-            // rings can be interleaved, so a stale entry from one ring does NOT
-            // imply subsequent entries from other rings are also stale)
-            if (!entry_valid(*cur_entry)) {
+            if (!entry_valid(*cur_entry))
+            {
                 cur_entry = next_entry;
                 continue;
             }
 
-            // Entry is valid - check if regions OVERLAP (not just exact match)
-            // Since we hash only by base_ptr, all entries in this bucket have
-            // potential to overlap. We must check actual byte-range overlap.
-            if (tensor.buffer.addr == cur_entry->buffer_addr) {
-#if PTO2_TENSORMAP_PROFILING
-                g_lookup_overlap_checks++;
-#endif
+            if (tensor.buffer.addr == cur_entry->buffer_addr)
+            {
                 auto overlap_status = cur_entry->check_overlap(tensor);
-                if (overlap_status != OverlapStatus::NO_OVERLAP) {
-#if PTO2_TENSORMAP_PROFILING
-                    g_lookup_overlap_hits++;
-#endif
-                    if (!on_match(*cur_entry, overlap_status)) {
-#if PTO2_TENSORMAP_PROFILING
-                        g_lookup_chain_total += chain_len;
-                        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
-#endif
-                        return;
-                    }
+                if (overlap_status != OverlapStatus::NO_OVERLAP)
+                {
+                    if (!on_match(*cur_entry, overlap_status)) return;
                 }
             }
 
             // Move to next entry
             cur_entry = next_entry;
         }
-#if PTO2_TENSORMAP_PROFILING
-        g_lookup_chain_total += chain_len;
-        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
-#endif
     }
 
-    /**
-     * Insert a new entry (called when task produces output)
-     *
-     * Allocates from ring buffer pool, may overwrite stale entries.
-     * Inserts at head of hash bucket chain (maintains task_id ordering).
-     *
-     * @param tensor            Tensor produced
-     * @param producer_task_id  Task ID of producer
-     */
-    void insert(const Tensor &tensor, PTO2TaskId producer_task_id) {
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id)
+    {
         PTO2TensorMapEntry *entry = new_entry();
         entry->copy_from_tensor(tensor);
         link_entry(entry, tensor.buffer.addr, producer_task_id);
     }
 
-    /**
-     * Cleanup stale entries for retired tasks
-     *
-     * Called periodically by Orchestrator when last_task_alive advances.
-     * Removes entries from bucket chains for tasks in [old, new) range.
-     *
-     * @param old_last_task_alive  Previous threshold
-     * @param new_last_task_alive  New threshold
-     */
-    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) {
+    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive)
+    {
         // Iterate through retired tasks on this ring and remove their entries
-        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) {
+        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++)
+        {
             int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
             PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot];
 
-            while (cur_entry != nullptr) {
+            while (cur_entry != nullptr)
+            {
                 PTO2TensorMapEntry *next_entry = cur_entry->next_in_task;  // Save before clearing
                 // Only remove if this entry belongs to the retiring task
                 // (slot may have been reused by a newer task)
-                debug_assert(
-                    cur_entry->producer_task_id ==
-                    PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id))
-                );
+                debug_assert(cur_entry->producer_task_id == PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id)));
                 free_entry(*cur_entry);
                 cur_entry = next_entry;
             }
@@ -583,30 +408,14 @@ struct PTO2TensorMap {
         }
     }
 
-    // =============================================================================
-    // Internal Helpers (exposed for testing)
-    // =============================================================================
-
-    /**
-     * Compute hash for tensor addr
-     *
-     * Multiplicative hash using the golden-ratio constant.  Multiplication
-     * mixes ALL input bits into the high bits of the product, so aligned
-     * addresses (low bits all-zero) still distribute evenly.  We extract
-     * the top log2(num_buckets) bits which carry the most entropy.
-     */
-    uint32_t hash(uint64_t key) {
+    uint32_t hash(uint64_t key)
+    {
         key *= 0x9E3779B97F4A7C15ULL;
         return static_cast<uint32_t>(key >> (64 - __builtin_ctz(num_buckets)));
     }
 
-    /**
-     * Link an initialized entry into bucket and task chains.
-     */
-    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
-#if PTO2_TENSORMAP_PROFILING
-        g_insert_count++;
-#endif
+    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id)
+    {
         uint32_t bucket_index = hash(addr);
         auto ring_id = producer_task_id.ring();
         auto local_id = producer_task_id.local();
@@ -617,95 +426,122 @@ struct PTO2TensorMap {
         // Insert at head of hash bucket
         entry->bucket_index = bucket_index;
         entry->next_in_bucket = buckets[bucket_index];
-        if (entry->next_in_bucket != nullptr) {
-            entry->next_in_bucket->prev_in_bucket = entry;
-        }
+        if (entry->next_in_bucket != nullptr) entry->next_in_bucket->prev_in_bucket = entry;
         buckets[bucket_index] = entry;
         entry->prev_in_bucket = nullptr;
 
         // Link to task's entry list
         entry->next_in_task = task_entry_heads[ring_id][task_slot];
         entry->prev_in_task = nullptr;
-        if (entry->next_in_task != nullptr) {
-            entry->next_in_task->prev_in_task = entry;
-        }
+        if (entry->next_in_task != nullptr) entry->next_in_task->prev_in_task = entry;
         task_entry_heads[ring_id][task_slot] = entry;
     }
 
-    /**
-     * Check if entry is valid (producer has not retired)
-     */
-    bool entry_valid(const PTO2TensorMapEntry &entry) const {
+    bool entry_valid(const PTO2TensorMapEntry &entry) const
+    {
         return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
     }
 
-    void remove_entry(PTO2TensorMapEntry &entry) {
+    void remove_entry(PTO2TensorMapEntry &entry)
+    {
         remove_from_task(entry);
         free_entry(entry);
     }
 
-    /**
-     * Remove entry from its task chain (O(1) with prev pointer)
-     * Called during pool wrap-around to unlink reused entries.
-     */
-    void remove_from_task(PTO2TensorMapEntry &entry) {
+    void remove_from_task(PTO2TensorMapEntry &entry)
+    {
         always_assert(entry.bucket_index != -1);  // must still be in a bucket
         // Update predecessor's next pointer (O(1) via prev_in_task)
-        if (entry.prev_in_task == nullptr) {
+        if (entry.prev_in_task == nullptr)
+        {
             // Entry is the head of its task chain, update task_entry_heads
             int32_t ring_id = entry.producer_task_id.ring();
             int32_t local_id = static_cast<int32_t>(entry.producer_task_id.local());
             int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
             task_entry_heads[ring_id][task_slot] = entry.next_in_task;
-        } else {
+        }
+        else
+        {
             entry.prev_in_task->next_in_task = entry.next_in_task;
         }
 
         // Update successor's prev pointer
-        if (entry.next_in_task != nullptr) {
-            entry.next_in_task->prev_in_task = entry.prev_in_task;
-        }
+        if (entry.next_in_task != nullptr) entry.next_in_task->prev_in_task = entry.prev_in_task;
 
         entry.next_in_task = nullptr;
         entry.prev_in_task = nullptr;
     }
 
-    // =============================================================================
-    // Debug Utilities
-    // =============================================================================
-
-    /**
-     * Print TensorMap statistics
-     */
-    void print_stats();
-
-    /**
-     * Get count of valid entries
-     */
-    int32_t valid_count();
-
-    // =============================================================================
-    // TensorMap Synchronization
-    // =============================================================================
-
-    /**
-     * Sync TensorMap validity threshold from shared memory
-     *
-     * Called periodically to refresh the lazy invalidation threshold.
-     * Also triggers cleanup if threshold has advanced significantly.
-     */
-    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive);
-};
+    void print_stats()
+    {
+        int32_t valid = 0;
+        int32_t stale = 0;
+        int32_t empty_buckets = 0;
+        int32_t max_chain = 0;
+        int64_t total_chain = 0;
+        int32_t non_empty_buckets = 0;
+
+        // Count entries
+        for (int32_t i = 0; i < pool_size; i++)
+        {
+            if (entry_pool[i].bucket_index != -1)
+            {
+                if (entry_valid(entry_pool[i])) valid++;
+                else stale++;
+            }
+        }
 
-#if PTO2_TENSORMAP_PROFILING
-struct PTO2TensorMapProfilingData {
-    uint64_t lookup_chain_total;
-    uint64_t lookup_count;
-    int32_t lookup_chain_max;
-    uint64_t overlap_checks;
-    uint64_t overlap_hits;
-    uint64_t insert_count;
-};
+        // Count bucket stats
+        for (int32_t b = 0; b < num_buckets; b++)
+        {
+            int32_t chain_len = 0;
+            auto cur_entry = buckets[b];
+
+            while (cur_entry != nullptr)
+            {
+                chain_len++;
+                cur_entry = cur_entry->next_in_bucket;
+            }
+
+            if (chain_len == 0)
+            {
+                empty_buckets++;
+            }
+            else
+            {
+                non_empty_buckets++;
+                total_chain += chain_len;
+                if (chain_len > max_chain) max_chain = chain_len;
+            }
+        }
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {}
+    }
+
+    int32_t valid_count()
+    {
+        int32_t count = 0;
+
+        for (int32_t i = 0; i < pool_size; i++)
+            if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) count++;
+
+        return count;
+    }
 
-PTO2TensorMapProfilingData pto2_tensormap_get_profiling();
-#endif
+    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive)
+    {
+        auto ring_id = task_id.ring();
+        auto local_id = task_id.local();
+        sync_validity(ring_id, sm_last_task_alive);
+
+        // Only attempt cleanup when last_task_alive has actually advanced;
+        // otherwise cleanup_retired would empty-loop and we'd spin forever.
+        auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
+        if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap)
+        {
+            cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
+            last_cleanup[ring_id] = sm_last_task_alive;
+        }
+    }
+};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
index 16dc796ea..9f7f671c5 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h
@@ -8,19 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Orchestration Build Graph Types - Data structures for orchestration runtime extensions
- *
- * Standalone header defining orchestration-specific types for:
- * - TaskOutputTensors: Return value from submit containing materialized output Tensors
- * - Arg: Aggregated argument container for pto_submit_task API
- *
- * Tensor descriptor types (Tensor, PTOBufferHandle, TensorCreateInfo) are
- * defined in tensor.h.
- *
- * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h
- * without type conflicts (Handshake, TensorPair, HostApi).
- */
 
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_
@@ -28,32 +15,20 @@
 #include <stdint.h>
 #include <string.h>
 
-#include <type_traits>
-#include <utility>
-
 #if defined(__aarch64__)
 #include <arm_neon.h>
 #endif
 
-#include "data_type.h"
 #include "pto_submit_types.h"
 #include "task_args.h"
 #include "tensor.h"
 #include "tensor_arg.h"
 
-#ifndef PTO2_PROFILING
-#define PTO2_PROFILING 1
-#endif
-
-// Task arguments — alias the common CORE_MAX_* constants (single source of
-// truth in src/common/task_interface/arg_direction.h, transitively included
-// via task_args.h above). Keeping the MAX_TENSOR_ARGS / MAX_SCALAR_ARGS names
-// because they are referenced widely in this runtime (pto_runtime2_types.h,
-// pto2_dispatch_payload.h, intrinsic.h comments).
 #define MAX_TENSOR_ARGS CORE_MAX_TENSOR_ARGS
 #define MAX_SCALAR_ARGS CORE_MAX_SCALAR_ARGS
 
-typedef enum {
+typedef enum
+{
     ASYNC_ENGINE_SDMA = 0,
     ASYNC_ENGINE_ROCE = 1,
     ASYNC_ENGINE_URMA = 2,
@@ -61,73 +36,58 @@ typedef enum {
     NUM_ASYNC_ENGINES = 4,
 } AsyncEngine;
 
-enum class CompletionType : int32_t {
+enum class CompletionType : int32_t
+{
     COUNTER = 0,
 };
 
-// =============================================================================
-// Task Output Tensors (return value from submit)
-// =============================================================================
-
-enum class PTO2ScopeMode : uint8_t {
+enum class PTO2ScopeMode : uint8_t
+{
     AUTO = 0,
     MANUAL = 1,
 };
 
-/**
- * TaskOutputTensors — returned by submit, holds materialized output Tensors.
- *
- * Only runtime-created outputs are stored here, indexed in add_output order.
- *
- * The underlying storage is uninitialized; only output_count elements are
- * valid after submit returns.  This avoids default-constructing Tensor[]
- * on the hot path (2 KB of unnecessary zeroing per submit).
- *
- * Users must hold a named TaskOutputTensors variable and borrow via get_ref();
- * binding get_ref() on an rvalue is compile-time rejected to prevent dangling.
- *
- * LIFETIME — single-scope only:
- *   Internally this class stores pointers into the submitting task's payload
- *   (PTO2TaskPayload::tensors[]), which lives in a ring-buffer slot. After
- *   scope_end the slot becomes eligible for reuse, and a later submit will
- *   overwrite the same Tensor storage in place. Therefore the
- *   TaskOutputTensors instance, the const Tensor& returned by get_ref(), and
- *   any pointer derived from either MUST NOT outlive the PTO2_SCOPE in which
- *   submit was called — do not move/copy them to outer-scope variables, do
- *   not capture references by std::reference_wrapper or raw pointers across
- *   scope boundaries.
- *
- *   This invariant is intentionally not enforced at runtime: a reused slot
- *   simply carries a different but valid owner_task_id, so checking
- *   owner_task_id cannot distinguish "still mine" from "silently aliased to
- *   an unrelated task". Misuse manifests as a wrong-tensor read with no
- *   diagnostic.
- */
-class TaskOutputTensors {
+class TaskOutputTensors
+{
 public:
     TaskOutputTensors() :
         task_id_(PTO2TaskId::invalid()),
-        output_count_(0) {}
+        output_count_(0)
+    {}
 
-    bool empty() const { return output_count_ == 0; }
-    uint32_t size() const { return output_count_; }
+    bool empty() const
+    {
+        return output_count_ == 0;
+    }
+    uint32_t size() const
+    {
+        return output_count_;
+    }
 
     /// Borrow a materialized output tensor by index (lvalue only).
-    const Tensor &get_ref(uint32_t index) const & {
+    const Tensor &get_ref(uint32_t index) const &
+    {
         always_assert(index < output_count_);
         return *tensors_[index];
     }
     const Tensor &get_ref(uint32_t index) const && = delete;
 
     /// Runtime-internal: append one materialized output Tensor.
-    void materialize_output(const Tensor &tensor) {
+    void materialize_output(const Tensor &tensor)
+    {
         always_assert(output_count_ < MAX_TENSOR_ARGS);
         tensors_[output_count_++] = &tensor;
     }
 
-    void set_task_id(PTO2TaskId id) { task_id_ = id; }
+    void set_task_id(PTO2TaskId id)
+    {
+        task_id_ = id;
+    }
 
-    PTO2TaskId task_id() const { return task_id_; }
+    PTO2TaskId task_id() const
+    {
+        return task_id_;
+    }
 
 private:
     PTO2TaskId task_id_;
@@ -139,174 +99,101 @@ class TaskOutputTensors {
 
 using TaskSubmitResult = TaskOutputTensors;
 
-// =============================================================================
-// Argument Types (for pto_submit_task API)
-// =============================================================================
-
 // TensorArgType is defined in tensor_arg.h (included above)
 
-/**
- * Tagged union for a single Arg slot — either a Tensor* or a TensorCreateInfo value.
- * The active member is determined by TensorArgType (OUTPUT → create_info, else → ptr).
- */
-union TensorRef {
+union TensorRef
+{
     const Tensor *ptr;
     const TensorCreateInfo *create_info;
     TensorRef() :
-        ptr(nullptr) {}
+        ptr(nullptr)
+    {}
 };
 
-/**
- * Aggregated argument container for pto_submit_task
- *
- * Inherits storage from TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>.
- * Each tensor slot stores a TensorRef union (Tensor* or TensorCreateInfo)
- * discriminated by the corresponding tag().
- * Tensors are dispatched first in kernel args, followed by scalars.
- *
- * Output arguments follow two distinct ownership models:
- * - add_output(const TensorCreateInfo&): OUTPUT — runtime allocates buffer
- *   and materializes a new Tensor, returned via TaskOutputTensors.
- * - add_inout(const Tensor&): INOUT — reuses an existing Tensor as the write target.
- *
- * Example:
- *   Tensor x = make_tensor_external(dev_a, shapes, 2);
- *   TensorCreateInfo ci(shapes, 2);  // must outlive submit
- *   Arg args;
- *   args.add_input(x);
- *   args.add_output(ci);
- *   args.add_scalar(some_value);
- *   TaskOutputTensors outs = rt_submit_aic_task(kernel_id, args);
- *   const Tensor& y = outs.get_ref(0);
- */
-struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType> {
+struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>
+{
     bool has_error{false};
     const char *error_msg{nullptr};
     PTO2LaunchSpec launch_spec;  // SPMD launch parameters (block_num, etc.)
 
-    void clear() {
-        TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS, TensorArgType>::clear();
-#if PTO2_PROFILING
-        dump_arg_mask_ = 0;
-        dump_arg_index_ambiguous_mask_ = 0;
-        clear_scalar_sources();
-        memset(scalar_dtypes_, 0, sizeof(scalar_dtypes_));
-#endif
-        explicit_deps_ = nullptr;
-        explicit_dep_count_ = 0;
-    }
-
-    void reset() {
+    void reset()
+    {
         clear();
         has_error = false;
         error_msg = nullptr;
+        tensor_dump_arg_mask_ = 0;
+        explicit_deps_ = nullptr;
+        explicit_dep_count_ = 0;
     }
 
-    void set_error(const char *msg) {
-        if (!has_error) {
+    void set_error(const char *msg)
+    {
+        if (!has_error)
+        {
             has_error = true;
             error_msg = msg;
         }
     }
 
     template <typename... Args>
-    void dump(Args &&...args) {
-#if PTO2_PROFILING
-        static_assert(
-            (std::is_lvalue_reference_v<Args> && ...),
-            "dump: temporaries are not allowed — pass tensors/scalars already added to this Arg"
-        );
-        static_assert(
-            (is_supported_dump_arg_v<Args> && ...),
-            "dump: all arguments must be Tensor, TensorCreateInfo, or scalar lvalues"
-        );
-        if constexpr (sizeof...(Args) == 0) {
-            mark_all_dump_args();
-        } else {
-            (mark_dump_arg(args), ...);
-        }
-#else
-        ((void)args, ...);
-#endif
+    void dump(Args &&...args)
+    {
+        static_assert((std::is_lvalue_reference_v<Args> && ...), "dump: temporaries are not allowed — pass tensors already added to this Arg");
+        static_assert(((std::is_same_v<std::decay_t<Args>, Tensor> || std::is_same_v<std::decay_t<Args>, TensorCreateInfo>) && ...), "dump: all arguments must be Tensor or TensorCreateInfo");
+        if constexpr (sizeof...(Args) == 0) mark_all_tensor_dump_arg();
+        else (mark_tensor_dump_arg(args), ...);
     }
 
-#if PTO2_PROFILING
-    uint64_t tensor_dump_arg_mask() const { return dump_arg_mask_; }
-    uint64_t tensor_dump_arg_index_ambiguous_mask() const { return dump_arg_index_ambiguous_mask_; }
-#else
-    uint64_t tensor_dump_arg_mask() const { return 0; }
-    uint64_t tensor_dump_arg_index_ambiguous_mask() const { return 0; }
-#endif
+    uint64_t tensor_dump_arg_mask() const
+    {
+        return tensor_dump_arg_mask_;
+    }
 
     template <typename... Args>
-    void add_input(Args &&...args) {
-        if (!check_add_tensor_valid<false>(args...)) {
-            return;
-        }
+    void add_input(Args &&...args)
+    {
+        if (!check_add_tensor_valid<false>(args...)) return;
         ((tensors_[tensor_count_].ptr = &args, tags_[tensor_count_] = TensorArgType::INPUT, tensor_count_++), ...);
     }
 
-    /// Batch add outputs — all Tensor or all TensorCreateInfo:
-    ///   add_output(ci1, ci2)         — runtime allocates buffers (OUTPUT)
-    ///   add_output(t1, t2)           — write-only existing tensors (OUTPUT_EXISTING)
     template <typename... Args>
-    void add_output(Args &&...args) {
+    void add_output(Args &&...args)
+    {
         if (!check_add_tensor_valid<true>(args...)) return;
-        if constexpr ((std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...)) {
-            ((tensors_[tensor_count_].create_info = &args, tags_[tensor_count_] = TensorArgType::OUTPUT,
-              tensor_count_++),
-             ...);
-        } else {
-            ((tensors_[tensor_count_].ptr = &args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING,
-              tensor_count_++),
-             ...);
-        }
+        if constexpr ((std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...)) ((tensors_[tensor_count_].create_info = &args, tags_[tensor_count_] = TensorArgType::OUTPUT, tensor_count_++), ...);
+        else ((tensors_[tensor_count_].ptr = &args, tags_[tensor_count_] = TensorArgType::OUTPUT_EXISTING, tensor_count_++), ...);
     }
 
     template <typename... Args>
-    void add_inout(Args &&...args) {
-        if (!check_add_tensor_valid<false>(args...)) {
-            return;
-        }
+    void add_inout(Args &&...args)
+    {
+        if (!check_add_tensor_valid<false>(args...)) return;
         ((tensors_[tensor_count_].ptr = &args, tags_[tensor_count_] = TensorArgType::INOUT, tensor_count_++), ...);
     }
 
     /// No-dependency existing tensor: skips OverlapMap lookup, depends on creator only.
     template <typename... Args>
-    void add_no_dep(Args &&...args) {
+    void add_no_dep(Args &&...args)
+    {
         if (!check_add_tensor_valid<false>(args...)) return;
         ((tensors_[tensor_count_].ptr = &args, tags_[tensor_count_] = TensorArgType::NO_DEP, tensor_count_++), ...);
     }
 
-    /**
-     * Attach an explicit dependency array. The Arg stores (ptr, count) without
-     * copying — the caller's array must outlive the submit (same lifetime rule
-     * as add_input/add_output, which also store pointers).
-     *
-     * count == 0 is a valid "set empty" — it clears any previously stored deps
-     * and returns. This lets callers that build the dep set conditionally pass
-     * the result through unguarded, including in the no-dep branch:
-     *   PTO2TaskId deps[3];
-     *   uint32_t n = 0;
-     *   if (have_prev) deps[n++] = prev;
-     *   if (is_last)   deps[n++] = alloc;
-     *   args.set_dependencies(deps, n);    // safe even if n == 0
-     *
-     * For count > 0, the call is single-shot: a second non-empty call after
-     * deps are already set will fail with set_error(). Use count == 0 first
-     * if you need to re-set.
-     */
-    void set_dependencies(const PTO2TaskId *deps, uint32_t count) {
-        if (count == 0) {
+    void set_dependencies(const PTO2TaskId *deps, uint32_t count)
+    {
+        if (count == 0)
+        {
             explicit_deps_ = nullptr;
             explicit_dep_count_ = 0;
             return;
         }
-        if (deps == nullptr) {
+        if (deps == nullptr)
+        {
             set_error("set_dependencies: deps must not be null when count > 0");
             return;
         }
-        if (explicit_deps_ != nullptr) {
+        if (explicit_deps_ != nullptr)
+        {
             set_error("set_dependencies: may be called at most once per Arg");
             return;
         }
@@ -314,238 +201,146 @@ struct Arg : TaskArgsTpl<TensorRef, uint64_t, MAX_TENSOR_ARGS, MAX_SCALAR_ARGS,
         explicit_dep_count_ = count;
     }
 
-    uint32_t explicit_dep_count() const { return explicit_dep_count_; }
+    uint32_t explicit_dep_count() const
+    {
+        return explicit_dep_count_;
+    }
 
-    PTO2TaskId explicit_dep(uint32_t index) const {
+    PTO2TaskId explicit_dep(uint32_t index) const
+    {
         always_assert(index < explicit_dep_count_);
         return explicit_deps_[index];
     }
 
-    const PTO2TaskId *explicit_deps_data() const { return explicit_deps_; }
+    const PTO2TaskId *explicit_deps_data() const
+    {
+        return explicit_deps_;
+    }
 
-    /**
-     * Add scalar values. Types are deduced per argument; each value is
-     * bit-cast to uint64_t for storage. Mixed types are allowed:
-     *
-     *   args.add_scalar(uint64_val);                  // single
-     *   args.add_scalar(3.14f, int32_t(42), 7u);     // mixed batch
-     */
     template <typename... Args>
-    void add_scalar(Args &&...args) {
+    void add_scalar(Args... args)
+    {
         static_assert(sizeof...(Args) >= 1, "add_scalar: at least one argument required");
         static_assert((is_supported_scalar_arg_v<Args> && ...), "add_scalar: all types must be arithmetic or enum");
-        if (scalar_count_ + sizeof...(Args) > MAX_SCALAR_ARGS) {
-            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=16)");
+        if (scalar_count_ + sizeof...(Args) > MAX_SCALAR_ARGS)
+        {
+            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)");
             return;
         }
-        (add_scalar_one(std::forward<Args>(args)), ...);
+        ((scalars_[scalar_count_++] = to_u64(args)), ...);
     }
 
-    void add_scalars(const uint64_t *values, int count) {
-        if (count < 0 || scalar_count_ + count > MAX_SCALAR_ARGS) {
-            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=16)");
+    void add_scalars(const uint64_t *values, int count)
+    {
+        if (scalar_count_ + count > MAX_SCALAR_ARGS)
+        {
+            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)");
             return;
         }
         memcpy(&scalars_[scalar_count_], values, count * sizeof(uint64_t));
-#if PTO2_PROFILING
-        memset(&scalar_dtypes_[scalar_count_], 0, count * sizeof(uint8_t));
-        clear_scalar_sources(scalar_count_, count);
-#endif
         scalar_count_ += count;
     }
 
-    /**
-     * Zero-extend int32 bit patterns into uint64 scalar slots.
-     * Negative values are treated as their unsigned 32-bit representation
-     * (e.g., -1 → 0x00000000FFFFFFFF, not 0xFFFFFFFFFFFFFFFF).
-     * Uses NEON to process 4 elements per iteration on aarch64.
-     */
-    void add_scalars_i32(const int32_t *values, int count) {
-        if (count < 0 || scalar_count_ + count > MAX_SCALAR_ARGS) {
-            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=16)");
+    void add_scalars_i32(const int32_t *values, int count)
+    {
+        if (scalar_count_ + count > MAX_SCALAR_ARGS)
+        {
+            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)");
             return;
         }
         uint64_t *dst = &scalars_[scalar_count_];
 #if defined(__aarch64__)
         int i = 0;
-        for (; i + 4 <= count; i += 4) {
+        for (; i + 4 <= count; i += 4)
+        {
             uint32x4_t v = vld1q_u32(reinterpret_cast<const uint32_t *>(values + i));
             uint64x2_t lo = vmovl_u32(vget_low_u32(v));
             uint64x2_t hi = vmovl_u32(vget_high_u32(v));
             vst1q_u64(dst + i, lo);
             vst1q_u64(dst + i + 2, hi);
         }
-        for (; i < count; i++) {
-            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
-        }
+        for (; i < count; i++) dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
 #else
-        for (int i = 0; i < count; i++) {
-            dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
-        }
-#endif
-#if PTO2_PROFILING
-        memset(&scalar_dtypes_[scalar_count_], 0, count * sizeof(uint8_t));
-        clear_scalar_sources(scalar_count_, count);
+        for (int i = 0; i < count; i++) dst[i] = static_cast<uint64_t>(static_cast<uint32_t>(values[i]));
 #endif
         scalar_count_ += count;
     }
 
-    /**
-     * Copy scalars from another Arg's scalar array.
-     * Useful when multiple tasks share the same scalar data (e.g., block indices).
-     */
-    void copy_scalars_from(const Arg &src, int src_offset, int count) {
-        if (count < 0 || src_offset + count > src.scalar_count_) {
+    void copy_scalars_from(const Arg &src, int src_offset, int count)
+    {
+        if (src_offset + count > src.scalar_count_)
+        {
             set_error("Source scalar range out of bounds in copy_scalars_from");
             return;
         }
-        if (scalar_count_ + count > MAX_SCALAR_ARGS) {
-            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=16)");
+        if (scalar_count_ + count > MAX_SCALAR_ARGS)
+        {
+            set_error("Too many scalar args (exceeds MAX_SCALAR_ARGS=32)");
             return;
         }
         memcpy(&scalars_[scalar_count_], &src.scalars_[src_offset], count * sizeof(uint64_t));
-#if PTO2_PROFILING
-        memcpy(&scalar_dtypes_[scalar_count_], &src.scalar_dtypes_[src_offset], count * sizeof(uint8_t));
-        clear_scalar_sources(scalar_count_, count);
-#endif
         scalar_count_ += count;
     }
 
-#if PTO2_PROFILING
-    const uint8_t *scalar_dtypes() const { return scalar_dtypes_; }
-#else
-    const uint8_t *scalar_dtypes() const { return nullptr; }
-#endif
-
 private:
     // Caller-owned dependency array; lifetime must extend through submit.
-#if PTO2_PROFILING
-    static_assert(MAX_TENSOR_ARGS + MAX_SCALAR_ARGS <= 64, "dump arg mask assumes at most 64 arguments");
-    uint64_t dump_arg_mask_{0};
-    uint64_t dump_arg_index_ambiguous_mask_{0};
-    uintptr_t scalar_source_ptrs_[MAX_SCALAR_ARGS]{};
-#endif
+    static_assert(MAX_TENSOR_ARGS <= 64, "tensor dump arg mask assumes at most 64 tensor arguments");
+    uint64_t tensor_dump_arg_mask_{0};
     const PTO2TaskId *explicit_deps_{nullptr};
     uint32_t explicit_dep_count_{0};
-#if PTO2_PROFILING
-    uint8_t scalar_dtypes_[MAX_SCALAR_ARGS] = {};
-
-    template <typename T>
-    static constexpr bool is_supported_dump_arg_v =
-        std::is_same_v<std::decay_t<T>, Tensor> || std::is_same_v<std::decay_t<T>, TensorCreateInfo> ||
-        is_supported_scalar_arg_v<T>;
-
-    void mark_arg_index(int32_t index) { dump_arg_mask_ |= (uint64_t{1} << index); }
-    void mark_arg_index_ambiguous(int32_t index) { dump_arg_index_ambiguous_mask_ |= (uint64_t{1} << index); }
-
-    void clear_scalar_sources() { clear_scalar_sources(0, MAX_SCALAR_ARGS); }
 
-    void clear_scalar_sources(int32_t start, int32_t count) {
-        for (int32_t i = 0; i < count; i++) {
-            scalar_source_ptrs_[start + i] = 0;
-        }
-    }
-
-#endif
-
-    template <typename T>
-    void add_scalar_one(T &&value) {
-        scalars_[scalar_count_] = to_u64(value);
-#if PTO2_PROFILING
-        scalar_dtypes_[scalar_count_] = dtype_of<std::remove_cv_t<std::remove_reference_t<T>>>();
-        if constexpr (std::is_lvalue_reference_v<T>) {
-            scalar_source_ptrs_[scalar_count_] = reinterpret_cast<uintptr_t>(&value);
-        } else {
-            scalar_source_ptrs_[scalar_count_] = 0;
-        }
-#endif
-        scalar_count_++;
-    }
-
-#if PTO2_PROFILING
-    // No-arg dump(): mark every arg already added to this Arg.
-    void mark_all_dump_args() {
-        if (tensor_count_ == 0 && scalar_count_ == 0) {
-            set_error("dump: no arguments added to this Arg");
+    // No-arg dump(): mark every tensor arg already added to this Arg.
+    void mark_all_tensor_dump_arg()
+    {
+        if (tensor_count_ == 0)
+        {
+            set_error("dump: no tensor arguments added to this Arg");
             return;
         }
-        for (int32_t i = 0; i < tensor_count_; i++) {
-            mark_arg_index(i);
-        }
-        for (int32_t i = 0; i < scalar_count_; i++) {
-            mark_arg_index(tensor_count_ + i);
-        }
+        for (int32_t i = 0; i < tensor_count_; i++) tensor_dump_arg_mask_ |= (uint64_t{1} << i);
     }
 
-    void mark_dump_arg(const Tensor &tensor) {
-        for (int32_t i = 0; i < tensor_count_; i++) {
-            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].ptr == &tensor) {
-                mark_arg_index(i);
+    void mark_tensor_dump_arg(const Tensor &tensor)
+    {
+        for (int32_t i = 0; i < tensor_count_; i++)
+        {
+            if (tags_[i] != TensorArgType::OUTPUT && tensors_[i].ptr == &tensor)
+            {
+                tensor_dump_arg_mask_ |= (uint64_t{1} << i);
                 return;
             }
         }
         set_error("dump: tensor is not part of this Arg");
     }
 
-    void mark_dump_arg(const TensorCreateInfo &create_info) {
-        for (int32_t i = 0; i < tensor_count_; i++) {
-            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].create_info == &create_info) {
-                mark_arg_index(i);
+    void mark_tensor_dump_arg(const TensorCreateInfo &create_info)
+    {
+        for (int32_t i = 0; i < tensor_count_; i++)
+        {
+            if (tags_[i] == TensorArgType::OUTPUT && tensors_[i].create_info == &create_info)
+            {
+                tensor_dump_arg_mask_ |= (uint64_t{1} << i);
                 return;
             }
         }
         set_error("dump: TensorCreateInfo is not part of this Arg");
     }
 
-    template <typename T>
-    std::enable_if_t<is_supported_scalar_arg_v<T>, void> mark_dump_arg(const T &scalar) {
-        uintptr_t ptr = reinterpret_cast<uintptr_t>(&scalar);
-        int32_t first_match = -1;
-        int32_t match_count = 0;
-        for (int32_t i = 0; i < scalar_count_; i++) {
-            if (scalar_source_ptrs_[i] == ptr) {
-                if (first_match < 0) {
-                    first_match = i;
-                }
-                match_count++;
-            }
-        }
-        if (first_match >= 0) {
-            int32_t arg_index = tensor_count_ + first_match;
-            mark_arg_index(arg_index);
-            if (match_count > 1) {
-                mark_arg_index_ambiguous(arg_index);
-            }
-            return;
-        }
-        set_error("dump: scalar is not part of this Arg");
-    }
-#endif
-
     template <bool is_output, typename... Args>
-    bool check_add_tensor_valid(Args &&...) {
+    bool check_add_tensor_valid(Args &&...)
+    {
         static_assert(sizeof...(Args) >= 1, "at least one argument required");
-        static_assert(
-            (std::is_lvalue_reference_v<Args> && ...),
-            "temporaries are not allowed — stored pointers would dangle after the call"
-        );
-        if constexpr (is_output) {
-            static_assert(
-                (std::is_same_v<std::decay_t<Args>, Tensor> && ...) ||
-                    (std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...),
-                "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)"
-            );
-        } else {
-            static_assert((std::is_same_v<std::decay_t<Args>, Tensor> && ...), "all arguments must be Tensor");
-        }
-        if (scalar_count_ != 0) {
-            set_error(
-                "add_input/add_output/add_inout called after add_scalar: "
-                "all tensors must be added before any scalars"
-            );
+        static_assert((std::is_lvalue_reference_v<Args> && ...), "temporaries are not allowed — stored pointers would dangle after the call");
+        if constexpr (is_output) static_assert((std::is_same_v<std::decay_t<Args>, Tensor> && ...) || (std::is_same_v<std::decay_t<Args>, TensorCreateInfo> && ...), "add_output: all arguments must be the same type (all Tensor or all TensorCreateInfo)");
+        else static_assert((std::is_same_v<std::decay_t<Args>, Tensor> && ...), "all arguments must be Tensor");
+        if (scalar_count_ != 0)
+        {
+            set_error("add_input/add_output/add_inout called after add_scalar: "
+                      "all tensors must be added before any scalars");
             return false;
         }
-        if (tensor_count_ + sizeof...(Args) > MAX_TENSOR_ARGS) {
+        if (tensor_count_ + sizeof...(Args) > MAX_TENSOR_ARGS)
+        {
             set_error("Too many tensor args (exceeds MAX_TENSOR_ARGS=16)");
             return false;
         }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 635b893f3..6fd795702 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -8,23 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Runtime Class - Device Execution and Handshake Control
- *
- * This class manages device-side execution through AICPU-AICore handshake
- * protocol. Task graph construction is handled by PTO2Runtime; this class
- * only handles:
- * - Handshake buffers for AICPU-AICore communication
- * - Execution parameters (block_dim, aicpu_thread_num)
- * - Tensor pair management for host-device memory tracking
- * - Device orchestration state (gm_sm_ptr_, orch_args_)
- * - Function address mapping (func_id_to_addr_)
- *
- * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler.
- * At dispatch time, build_payload() copies tensor pointers and scalars from
- * the task payload into the per-core args[], populates SPMD context, then
- * signals AICore via DATA_MAIN_BASE.
- */
 
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
@@ -42,10 +25,6 @@
 #include "pto2_dispatch_payload.h"
 #include "task_args.h"
 
-// =============================================================================
-// Configuration Macros
-// =============================================================================
-
 #define RUNTIME_MAX_ARGS 128
 #define RUNTIME_MAX_WORKER 72  // 24 AIC + 48 AIV cores
 #define RUNTIME_MAX_FUNC_ID 1024
@@ -55,42 +34,8 @@
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
 constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
 
-// =============================================================================
-// Data Structures
-// =============================================================================
-
-/**
- * Handshake Structure - Shared between Host, AICPU, and AICore
- *
- * This structure facilitates communication and synchronization between
- * AICPU and AICore during task execution.
- *
- * Protocol State Machine:
- * 1. Initialization: AICPU sets aicpu_ready=1
- * 2. Acknowledgment: AICore sets aicore_done=core_id+1
- * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload
- * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes
- * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion
- * 6. Shutdown: AICPU sets control=1, AICore exits
- *
- * Each AICore instance has its own handshake buffer to enable concurrent
- * task execution across multiple cores.
- */
-
-/**
- * Handshake buffer for AICPU-AICore communication
- *
- * Each AICore has its own handshake buffer for synchronization with AICPU.
- * The structure is cache-line aligned (64 bytes) to prevent false sharing
- * between cores and optimize cache coherency operations.
- *
- * Field Access Patterns:
- * - aicpu_ready: Written by AICPU, read by AICore
- * - aicore_done: Written by AICore, read by AICPU
- * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchPayload*)
- * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV)
- */
-struct Handshake {
+struct Handshake
+{
     volatile uint32_t aicpu_ready;        // AICPU ready signal: 0=not ready, 1=ready
     volatile uint32_t aicore_done;        // AICore ready signal: 0=not ready, core_id+1=ready
     volatile uint64_t task;               // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused
@@ -100,104 +45,40 @@ struct Handshake {
     volatile uint32_t aicore_regs_ready;  // AICore ID reported: 0=pending, 1=done
 } __attribute__((aligned(64)));
 
-/**
- * Tensor pair for tracking host-device memory mappings.
- * Used for copy-back during finalize.
- */
-struct TensorPair {
+struct TensorPair
+{
     void *host_ptr;
     void *dev_ptr;
     size_t size;
-    // false for read-only INPUT tensors: they are never written by the kernel,
-    // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown
-    // keep the safe default of copying back.
-    bool needs_copy_back = true;
 };
 
-/**
- * Host API function pointers for device memory operations.
- * Allows runtime to use pluggable device memory backends.
- */
-struct HostApi {
+struct HostApi
+{
     void *(*device_malloc)(size_t size);
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Set a device buffer to a byte value (device-side, no PCIe). Used to
-    // zero-init pure OUTPUT buffers in lieu of an H2D copy-in. May be
-    // null on backends that don't wire it; callers must fall back to
-    // copy_to_device.
     int (*device_memset)(void *dev_ptr, int value, size_t size);
-    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
-    // memory, trb prebuilt runtime arena) as three independent device
-    // allocations. `runtime_arena_size == 0` skips the third region (hbg
-    // path: hbg has no prebuilt runtime arena). Idempotent on identical
-    // sizes; returns 0 on success, -1 on allocation failure.
     int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
-    // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory / prebuilt runtime arena. setup_static_arena must have already
-    // committed the relevant region; the returned pointer is owned by the
-    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
-    // to device_free or record it in `tensor_pairs_`.
-    //
-    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
-    // only committed when setup_static_arena was invoked with
-    // runtime_arena_size > 0. Calling it on the hbg path
-    // (setup_static_arena(...,0)) returns nullptr (not undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
     void *(*acquire_pooled_runtime_arena)();
-    // Single-shot upload of the entire ChipCallable buffer. `callable` is a
-    // `const ChipCallable *` (declared void* to avoid pulling task_interface
-    // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
-    // total byte size, allocates device GM once, fixes up each child's
-    // resolved_addr_ in an internal host scratch (onboard: device addr; sim:
-    // dlopen function pointer), H2D's once, and returns the device-side
-    // address of the ChipCallable header. Pool-managed: identical buffer
-    // contents (FNV-1a 64-bit) hit the dedup cache; all chip buffers are
-    // bulk-freed in DeviceRunner::finalize(). Returns 0 on error or when
-    // child_count() == 0. Caller computes child addrs as
-    //     chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)
-    // and stores them via runtime->set_function_bin_addr(fid, child_dev).
     uint64_t (*upload_chip_callable_buffer)(const void *callable);
 };
 
-/**
- * Task structure - Compatibility stub for platform layer
- *
- * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
- * This stub exists only for API compatibility with device_runner.cpp.
- * Since get_task_count() returns 0, this struct is never actually used.
- */
-struct Task {
+struct Task
+{
     int func_id;
     uint64_t function_bin_addr;
 };
 
-// =============================================================================
-// Runtime Class
-// =============================================================================
-
-/**
- * Runtime class for device execution and handshake control
- *
- * This class manages AICPU-AICore communication through handshake buffers.
- * Task graph construction is handled by PTO2Runtime; this class only handles
- * execution control and device orchestration state.
- */
-class Runtime {
+class Runtime
+{
 public:
     // Handshake buffers for AICPU-AICore communication
     Handshake workers[RUNTIME_MAX_WORKER];  // Worker (AICore) handshake buffers
     int worker_count;                       // Number of active workers
 
-    // Execution parameters for AICPU scheduling.
-    //
-    // aicpu_thread_num is the *total* AICPU thread count launched on this run
-    // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
-    // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
-    // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
-    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
     int aicpu_thread_num;
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
@@ -210,10 +91,6 @@ class Runtime {
     // NOTE: Made public for direct access from aicore code
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
-    // Orchestrator-to-scheduler transition control
-    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
-    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
-    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
     bool orch_to_sched;
 
 private:
@@ -226,114 +103,207 @@ class Runtime {
     void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
     ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
 
-    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
-    // Runtime to device; AICPU reads them in the boot path to skip
-    // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer
-    // (already populated by runtime_init_data_from_layout + wire on host).
     void *prebuilt_arena_base_;
     size_t prebuilt_runtime_offset_;
 
-    // Device orchestration SO (for dlopen on AICPU thread 3).
-    // The SO bytes themselves live in a separately-allocated device buffer
-    // owned by DeviceRunner; only the metadata below travels inside Runtime.
     uint64_t dev_orch_so_addr_;
     uint64_t dev_orch_so_size_;
-    // Per-callable_id dispatch. AICPU dispatches via
-    // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
-    // signals whether the host is delivering a freshly-registered
-    // callable_id (write+dlopen) or reusing an already-loaded one.
     int32_t active_callable_id_;
     bool register_new_callable_id_;
     char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
     char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
 
 public:
-    /**
-     * Constructor - zero-initialize all arrays
-     */
-    Runtime();
-
-    // =========================================================================
-    // Performance Profiling
-    // =========================================================================
-
-    // =========================================================================
-    // Device orchestration (for AICPU thread 3)
-    // =========================================================================
-
-    void *get_gm_sm_ptr() const;
-    void *get_gm_heap_ptr() const;
-    const ChipStorageTaskArgs &get_orch_args() const;
-    void set_gm_sm_ptr(void *p);
-    void set_gm_heap(void *p);
-    void set_slot_states_ptr(void *p);
-    void set_orch_args(const ChipStorageTaskArgs &args);
-
-    // Prebuilt-arena fast path (trb only). Set by host's
-    // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a
-    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
-    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
-    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
-    // path can still detect "no prebuilt image set" via nullptr.
-    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
-    void *get_prebuilt_arena_base() const;
-    size_t get_prebuilt_runtime_offset() const;
+    Runtime()
+    {
+        // NOTE: host_api is initialized in InitRuntime() (host-only code)
+        // because the CApi functions don't exist when compiled for device.
+
+        // Initialize handshake buffers
+        memset(workers, 0, sizeof(workers));
+        worker_count = 0;
+        aicpu_thread_num = 1;
+        ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+        task_window_size = 0;
+        heap_size = 0;
+        dep_pool_size = 0;
+        orch_to_sched = false;
+
+        // Initialize device orchestration state
+        gm_sm_ptr_ = nullptr;
+        gm_heap_ptr_ = nullptr;
+        slot_states_ptr_ = nullptr;
+        orch_args_storage_.clear();
+        prebuilt_arena_base_ = nullptr;
+        prebuilt_runtime_offset_ = 0;
+
+        // Initialize device orchestration SO binary
+        dev_orch_so_addr_ = 0;
+        dev_orch_so_size_ = 0;
+        active_callable_id_ = -1;
+        register_new_callable_id_ = false;
+        device_orch_func_name_[0] = '\0';
+        device_orch_config_name_[0] = '\0';
+
+        // Initialize kernel binary tracking
+        registered_kernel_count_ = 0;
+
+        // Initialize function address mapping
+        for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) func_id_to_addr_[i] = 0;
+    }
+
+    void *get_gm_sm_ptr() const
+    {
+        return gm_sm_ptr_;
+    }
+    void *get_gm_heap_ptr() const
+    {
+        return gm_heap_ptr_;
+    }
+    const ChipStorageTaskArgs &get_orch_args() const
+    {
+        return orch_args_storage_;
+    }
+    void set_gm_sm_ptr(void *p)
+    {
+        gm_sm_ptr_ = p;
+    }
+    void set_gm_heap(void *p)
+    {
+        gm_heap_ptr_ = p;
+    }
+    void set_slot_states_ptr(void *p)
+    {
+        slot_states_ptr_ = p;
+    }
+    void set_orch_args(const ChipStorageTaskArgs &args)
+    {
+        orch_args_storage_ = args;
+    }
+
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off)
+    {
+        prebuilt_arena_base_ = arena_base;
+        prebuilt_runtime_offset_ = runtime_off;
+    }
+    void *get_prebuilt_arena_base() const
+    {
+        return prebuilt_arena_base_;
+    }
+    size_t get_prebuilt_runtime_offset() const
+    {
+        return prebuilt_runtime_offset_;
+    }
 
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
-    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
-    uint64_t get_dev_orch_so_addr() const;
-    uint64_t get_dev_orch_so_size() const;
-    // Per-callable_id dispatch. callable_id must be in
-    // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU
-    // whether to (re)load the orch SO into orch_so_table_[callable_id] or
-    // reuse the cached entry.
-    void set_active_callable_id(int32_t callable_id, bool is_new);
-    int32_t get_active_callable_id() const;
-    bool register_new_callable_id() const;
-    void set_device_orch_func_name(const char *name);
-    const char *get_device_orch_func_name() const;
-    void set_device_orch_config_name(const char *name);
-    const char *get_device_orch_config_name() const;
-
-    uint64_t get_function_bin_addr(int func_id) const;
-    void set_function_bin_addr(int func_id, uint64_t addr);
-    /**
-     * Replay a previously-uploaded kernel address onto a fresh Runtime
-     * without recording it in registered_kernel_func_ids_. Used by
-     * DeviceRunner::bind_callable_to_runtime so prepared kernel
-     * binaries are not freed by validate_runtime_impl across runs.
-     */
-    void replay_function_bin_addr(int func_id, uint64_t addr);
-
-    int get_registered_kernel_count() const;
-    int get_registered_kernel_func_id(int index) const;
-    void clear_registered_kernels();
-
-    // =========================================================================
-    // Deprecated API (for platform compatibility, always returns 0/nullptr)
-    // Task graph is now managed by PTO2Runtime, not Runtime
-    // =========================================================================
-
-    /** @deprecated Task count is now in PTO2 shared memory */
-    int get_task_count() const { return 0; }
-
-    /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */
-    Task *get_task(int) { return nullptr; }
-
-    // =========================================================================
-    // Host API (host-only, not copied to device)
-    // =========================================================================
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size)
+    {
+        dev_orch_so_addr_ = dev_addr;
+        dev_orch_so_size_ = size;
+    }
+    uint64_t get_dev_orch_so_addr() const
+    {
+        return dev_orch_so_addr_;
+    }
+    uint64_t get_dev_orch_so_size() const
+    {
+        return dev_orch_so_size_;
+    }
+    void set_active_callable_id(int32_t callable_id, bool is_new)
+    {
+        active_callable_id_ = callable_id;
+        register_new_callable_id_ = is_new;
+    }
+    int32_t get_active_callable_id() const
+    {
+        return active_callable_id_;
+    }
+    bool register_new_callable_id() const
+    {
+        return register_new_callable_id_;
+    }
+    void set_device_orch_func_name(const char *name)
+    {
+        if (name == nullptr)
+        {
+            device_orch_func_name_[0] = '\0';
+            return;
+        }
+        std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+        device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+    }
+    const char *get_device_orch_func_name() const
+    {
+        return device_orch_func_name_;
+    }
+    void set_device_orch_config_name(const char *name)
+    {
+        if (name == nullptr)
+        {
+            device_orch_config_name_[0] = '\0';
+            return;
+        }
+        std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+        device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+    }
+    const char *get_device_orch_config_name() const
+    {
+        return device_orch_config_name_;
+    }
+
+    uint64_t get_function_bin_addr(int func_id) const
+    {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+        return func_id_to_addr_[func_id];
+    }
+    void set_function_bin_addr(int func_id, uint64_t addr)
+    {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
+        if (addr != 0 && func_id_to_addr_[func_id] == 0)
+        {
+            if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID)
+            {
+                registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
+            }
+            else
+            {}
+        }
+        func_id_to_addr_[func_id] = addr;
+    }
+    void replay_function_bin_addr(int func_id, uint64_t addr)
+    {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
+        func_id_to_addr_[func_id] = addr;
+    }
+
+    int get_registered_kernel_count() const
+    {
+        return registered_kernel_count_;
+    }
+    int get_registered_kernel_func_id(int index) const
+    {
+        if (index < 0 || index >= registered_kernel_count_) return -1;
+        return registered_kernel_func_ids_[index];
+    }
+    void clear_registered_kernels()
+    {
+        registered_kernel_count_ = 0;
+    }
+
+    int get_task_count() const
+    {
+        return 0;
+    }
+
+    Task *get_task([[maybe_unused]] int taskId)
+    {
+        return nullptr;
+    }
 
     // Host API function pointers for device memory operations
     // NOTE: Placed at end of class to avoid affecting device memory layout
     HostApi host_api;
 
-    // Host-side tensor ledger for D2H copy-back at finalize. Populated by
-    // runtime_maker.cpp from orch_args at bind time, then iterated in
-    // validate_runtime_impl. Not read by AICPU/AICore — the device-side
-    // Runtime image carries the std::vector control block as harmless
-    // garbage, identical to host_api above. No fixed cap — grows with the
-    // chip-level entry-tensor count.
     std::vector<TensorPair> tensor_pairs_;
 };
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
deleted file mode 100644
index 4b7484bc9..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - Scheduler Implementation
- *
- * Implements scheduler state management, ready queues, and task lifecycle.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_scheduler.h"
-#include <inttypes.h>
-#include <stdlib.h>
-#include "common/unified_log.h"
-
-#if PTO2_PROFILING
-// Weak fallbacks for host/UT builds that don't link the scope_stats collector.
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
-extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
-#endif
-
-// =============================================================================
-// Scheduler Profiling Counters
-// =============================================================================
-
-#if PTO2_SCHED_PROFILING
-#include "common/platform_config.h"
-
-uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {};
-
-PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
-    PTO2SchedProfilingData d;
-    d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0);
-    d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0);
-    d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0);
-    d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0);
-    d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0);
-    d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0);
-    d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0);
-    d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0);
-    d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0);
-    d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0);
-    d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0);
-    d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0);
-    d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0);
-    return d;
-}
-#endif
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2SchedulerState::print_stats() {
-    PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Scheduler Statistics ===");
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (sched->ring_sched_states[r].last_task_alive > 0) {
-            LOG_INFO_V0("Ring %d:", r);
-            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
-            auto &dp = sched->ring_sched_states[r].dep_pool;
-            if (dp.top > 0) {
-                LOG_INFO_V0(
-                    "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
-                    dp.high_water, dp.capacity
-                );
-            }
-        }
-    }
-#if PTO2_SCHED_PROFILING
-    LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
-    LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
-#endif
-    LOG_INFO_V0("============================");
-}
-
-void PTO2SchedulerState::print_queues() {
-    PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Ready Queues ===");
-
-    const char *shape_names[] = {"AIC", "AIV", "MIX"};
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
-    }
-    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
-
-    LOG_INFO_V0("====================");
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
deleted file mode 100644
index bde75a291..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ /dev/null
@@ -1,1277 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Scheduler Interface
- *
- * The Scheduler is responsible for:
- * 1. Maintaining per-resource-shape ready queues
- * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED)
- * 3. Managing fanin/fanout refcounts for dependency resolution
- * 4. Advancing last_task_alive for heap reclamation
- * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete)
- *
- * The Scheduler runs on Device AI_CPU and processes:
- * - Task state transitions based on fanin_refcount
- * - Buffer lifecycle based on fanout_refcount
- * - Ring pointer advancement for flow control
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#pragma once
-
-#include <atomic>
-
-#include "common/core_type.h"
-#include "utils/device_arena.h"
-#include "pto_async_wait.h"
-#include "pto_ring_buffer.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-
-#if PTO2_SCHED_PROFILING
-#include "aicpu/device_time.h"
-#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1
-#define PTO2_SCHED_CYCLE_LAP(acc)   \
-    do {                            \
-        _st1 = get_sys_cnt_aicpu(); \
-        acc += (_st1 - _st0);       \
-        _st0 = _st1;                \
-    } while (0)
-#endif
-
-// =============================================================================
-// Ready Queue (Lock-free bounded MPMC — Vyukov design)
-// =============================================================================
-
-/**
- * Per-slot entry: sequence counter for ABA safety + task payload
- */
-struct PTO2ReadyQueueSlot {
-    std::atomic<int64_t> sequence;
-    PTO2TaskSlotState *slot_state;
-};
-
-/**
- * Thread-local ready buffer for local-first dispatch optimization.
- *
- * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1).
- * Initialized once before the scheduling loop; must be empty at
- * the start of each iteration (verified by always_assert).
- *
- * Phase 1 fills per-CoreType buffers via on_task_complete().
- * The dispatch stage drains them local-first via get_ready_tasks_batch,
- * with any remaining tasks pushed to the global ready queue.
- */
-// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
-static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
-
-struct PTO2LocalReadyBuffer {
-    PTO2TaskSlotState **slot_states = nullptr;
-    int count = 0;
-    int capacity = 0;
-
-    void reset(PTO2TaskSlotState **buf, int cap) {
-        slot_states = buf;
-        count = 0;
-        capacity = cap;
-    }
-
-    bool try_push(PTO2TaskSlotState *s) {
-        if (slot_states && count < capacity) {
-            slot_states[count++] = s;
-            return true;
-        }
-        return false;
-    }
-
-    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
-};
-
-/**
- * Lock-free bounded MPMC queue (Dmitry Vyukov design)
- *
- * Key properties:
- * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
- * - Per-slot sequence counter prevents ABA problem
- * - Empty queue pop returns immediately (single atomic load, no lock)
- * - CAS contention is split: producers only touch enqueue_pos,
- *   consumers only touch dequeue_pos
- */
-struct alignas(64) PTO2ReadyQueue {
-    PTO2ReadyQueueSlot *slots;
-    uint64_t capacity;
-    uint64_t mask;        // capacity - 1
-    char _pad0[64 - 24];  // Pad to own cache line
-
-    std::atomic<uint64_t> enqueue_pos;
-    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
-
-    std::atomic<uint64_t> dequeue_pos;
-    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
-
-    uint64_t size() {
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        return (e >= d) ? (e - d) : 0;
-    }
-
-    bool push(PTO2TaskSlotState *slot_state) {
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos);
-            if (diff == 0) {
-                if (enqueue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    break;
-                }
-            } else if (diff < 0) {
-                return false;  // Queue full
-            }
-        }
-
-        slot->slot_state = slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
-        return true;
-    }
-
-    // Batch push: reserve count slots with a single CAS after confirming
-    // every target slot is available under the usual Vyukov sequence check.
-    void push_batch(PTO2TaskSlotState **items, int count) {
-        if (count == 0) return;
-
-        uint64_t pos;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            bool ready = true;
-            for (int i = 0; i < count; i++) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + i);
-                if (diff != 0) {
-                    ready = false;
-                    break;
-                }
-            }
-            if (!ready) {
-                continue;
-            }
-            if (enqueue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                break;
-            }
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            slot->slot_state = items[i];
-            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
-        }
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos);
-            atomic_ops += 2;  // enqueue_pos.load + sequence.load
-            if (diff == 0) {
-                if (enqueue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    atomic_ops++;  // successful CAS
-                    break;
-                }
-                contended = true;
-                atomic_ops++;  // failed CAS
-            } else if (diff < 0) {
-                return false;  // Queue full
-            } else {
-                contended = true;  // diff > 0: slot not yet released, spin
-            }
-        }
-        atomic_ops++;  // final sequence.store
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-
-        slot->slot_state = slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
-        return true;
-    }
-#endif
-
-    PTO2TaskSlotState *pop() {
-        // Fast-path: skip slot load when queue is clearly empty
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        if (d >= e) {
-            return nullptr;
-        }
-
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos + 1);
-            if (diff == 0) {
-                if (dequeue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    ))
-                    break;
-            } else if (diff < 0) {
-                return nullptr;  // Queue empty
-            }
-        }
-
-        PTO2TaskSlotState *result = slot->slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
-        return result;
-    }
-
-#if PTO2_SCHED_PROFILING
-    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
-        // Fast-path: skip slot load when queue is clearly empty
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
-        if (d >= e) {
-            return nullptr;
-        }
-
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos + 1);
-            atomic_ops += 2;  // dequeue_pos.load + sequence.load
-            if (diff == 0) {
-                if (dequeue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    atomic_ops++;  // successful CAS
-                    break;
-                }
-                contended = true;
-                atomic_ops++;  // failed CAS
-            } else if (diff < 0) {
-                atomic_count += atomic_ops;
-                return nullptr;  // Queue empty
-            } else {
-                contended = true;
-            }
-        }
-        atomic_ops++;  // final sequence.store
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-
-        PTO2TaskSlotState *result = slot->slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
-        return result;
-    }
-#endif
-
-    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
-    // Returns actual number of items popped (may be less than max_count).
-    int pop_batch(PTO2TaskSlotState **out, int max_count) {
-        uint64_t pos;
-        int count;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            count = 0;
-            while (count < max_count) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
-                if (diff == 0) {
-                    count++;
-                    continue;
-                }
-                if (diff < 0) {
-                    break;
-                }
-                count = -1;
-                break;
-            }
-            if (count == 0) return 0;
-            if (count < 0) continue;
-            if (dequeue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                break;
-            }
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            out[i] = slot->slot_state;
-            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
-        }
-        return count;
-    }
-
-#if PTO2_SCHED_PROFILING
-    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t pos;
-        int count;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            atomic_ops++;  // dequeue_pos.load
-            count = 0;
-            while (count < max_count) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
-                atomic_ops++;  // sequence.load
-                if (diff == 0) {
-                    count++;
-                    continue;
-                }
-                if (diff < 0) {
-                    break;
-                }
-                contended = true;
-                count = -1;
-                break;
-            }
-            if (count == 0) {
-                atomic_count += atomic_ops;
-                return 0;
-            }
-            if (count < 0) {
-                continue;
-            }
-            if (dequeue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                atomic_ops++;  // successful CAS
-                break;
-            }
-            contended = true;
-            atomic_ops++;  // failed CAS
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            out[i] = slot->slot_state;
-            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
-            atomic_ops++;  // sequence.store
-        }
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-        return count;
-    }
-#endif
-};
-
-// Cold-path ready queue operations (defined in pto_scheduler.cpp). Declared
-// as non-member so PTO2ReadyQueue stays a POD-like struct with cache-line
-// alignment. Storage is owned by the caller-supplied arena.
-//   reserve_layout: declare the slots[] region on the arena (must precede commit)
-//   init_from_layout: bind slots pointer from arena.region_ptr(off) and
-//                     initialize sequence counters
-//   destroy: forget the slots pointer (arena owns the buffer)
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-// Writes everything *except* the arena-internal `slots` pointer field
-// (sequences/positions on the slot array, capacity, mask). Uses
-// arena.region_ptr(slots_off) only to address the slot array for writes;
-// does NOT store the pointer in `queue->slots`. Call
-// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
-bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
-// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
-void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
-void ready_queue_destroy(PTO2ReadyQueue *queue);
-
-// =============================================================================
-// SPSC Queue (Single-Producer Single-Consumer, wait-free)
-// =============================================================================
-//
-// Bounded ring buffer optimized for the wiring queue use case:
-//   - Producer: orchestrator thread (push)
-//   - Consumer: scheduler thread 0 (pop_batch)
-//
-// Design based on Rigtorp's cached-index technique: each side caches
-// the other's index locally, avoiding cross-core cache line bouncing
-// on the hot path. Only when the local cache says "full" or "empty"
-// does the thread issue an acquire load on the remote index.
-//
-// Memory layout: 5 cache-line-aligned fields ensure zero false sharing.
-
-struct alignas(64) PTO2SpscQueue {
-    // --- Producer cache lines (orchestrator thread) ---
-    alignas(64) std::atomic<uint64_t> head_{0};
-    alignas(64) uint64_t tail_cached_{0};
-
-    // --- Consumer cache lines (scheduler thread 0) ---
-    alignas(64) std::atomic<uint64_t> tail_{0};
-    alignas(64) uint64_t head_cached_{0};
-
-    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
-    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
-    uint64_t mask_{0};
-
-    // Padding to exactly 5 cache lines
-    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
-
-    // Reserve the backing buffer region on the supplied arena. Returns the
-    // region offset, to be passed to init_from_layout() after the arena is
-    // committed. Cache-line aligned: the buffer is shared between the
-    // orchestrator (push) and scheduler thread 0 (pop_batch), so its base
-    // must not false-share with neighboring regions.
-    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) {
-        return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
-    }
-
-    // Writes everything except the arena-internal `buffer_` pointer field
-    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
-    // image without storing a host address in buffer_; the AICPU wires
-    // buffer_ at boot via wire_arena_pointers().
-    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
-        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
-        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
-        // calloc'd-equivalent: zero the slot pointers so spurious early pops
-        // observe nullptr.
-        for (uint64_t i = 0; i < capacity; i++)
-            buf[i] = nullptr;
-        mask_ = capacity - 1;
-        head_.store(0, std::memory_order_relaxed);
-        tail_.store(0, std::memory_order_relaxed);
-        tail_cached_ = 0;
-        head_cached_ = 0;
-        return true;
-    }
-
-    // Wire the arena-internal pointer. Called by both host (with host arena)
-    // and AICPU (with device arena attached to the prebuilt image).
-    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
-        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
-    }
-
-    // Arena owns the buffer; here we only forget our pointer.
-    void destroy() { buffer_ = nullptr; }
-
-    // Push one item (producer only). Returns false if queue is full.
-    // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the
-    // effective usable capacity is capacity-1 (one slot is wasted as a
-    // sentinel to distinguish full from empty). uint64_t wrapping is safe
-    // since head and tail are monotonically increasing and subtraction
-    // wraps correctly.
-    bool push(PTO2TaskSlotState *item) {
-        uint64_t h = head_.load(std::memory_order_relaxed);
-        uint64_t next_h = h + 1;
-        if (next_h - tail_cached_ > mask_) {
-            tail_cached_ = tail_.load(std::memory_order_acquire);
-            if (next_h - tail_cached_ > mask_) {
-                return false;
-            }
-        }
-        buffer_[h & mask_] = item;
-        head_.store(next_h, std::memory_order_release);
-        return true;
-    }
-
-    // Pop up to max_count items (consumer only). Returns actual count.
-    int pop_batch(PTO2TaskSlotState **out, int max_count) {
-        uint64_t t = tail_.load(std::memory_order_relaxed);
-        uint64_t avail = head_cached_ - t;
-        if (avail < static_cast<uint64_t>(max_count)) {
-            head_cached_ = head_.load(std::memory_order_acquire);
-            avail = head_cached_ - t;
-            if (avail == 0) return 0;
-        }
-        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
-        for (int i = 0; i < count; i++) {
-            out[i] = buffer_[(t + i) & mask_];
-        }
-        tail_.store(t + count, std::memory_order_release);
-        return count;
-    }
-
-    // Approximate size (used for backoff decisions, not exact).
-    uint64_t size() const {
-        uint64_t h = head_.load(std::memory_order_acquire);
-        uint64_t t = tail_.load(std::memory_order_acquire);
-        return h - t;
-    }
-};
-
-static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
-// =============================================================================
-
-/**
- * Statistics returned by mixed-task completion processing
- */
-struct CompletionStats {
-    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
-    int32_t tasks_enqueued;     // Number of consumers that became READY
-    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
-    bool mixed_task_completed;  // True only when this callback completed a mixed task
-};
-
-/**
- * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). Holds
- * the arena offsets of every sub-region the scheduler needs plus the
- * capacities used at layout time (init_from_layout reuses them).
- */
-struct PTO2SchedulerLayout {
-    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
-    size_t off_dummy_ready_queue_slots;
-    size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH];
-    size_t off_wiring_spsc_buffer;
-    uint64_t ready_queue_capacity;
-    uint64_t spsc_capacity;
-    int32_t dep_pool_capacity;
-};
-
-/**
- * Scheduler state structure
- *
- * Contains dynamic state updated during task execution.
- * Separated from shared memory for cache efficiency.
- * Hot-path methods are defined inline (implicitly inline as member functions).
- */
-struct PTO2SchedulerState {
-    // Shared memory access
-    PTO2SharedMemoryHeader *sm_header;
-
-    // Per-ring state
-    struct alignas(64) RingSchedState {
-        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
-        PTO2SharedMemoryRingHeader *ring;
-        int32_t last_task_alive;
-        std::atomic<int32_t> advance_lock;  // multi-thread CAS
-
-        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
-        alignas(64) PTO2DepListPool dep_pool;
-#if PTO2_PROFILING
-        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
-        alignas(64) std::atomic<int32_t> dep_pool_snapshot_tail;
-        std::atomic<int32_t> dep_pool_snapshot_top;
-#endif
-
-        // Initialize arena-internal data + arena-external pointers; does NOT
-        // store dep_pool.base (that lives in the runtime arena and is wired
-        // by SchedulerState::wire_arena_pointers). The `ring` field stores
-        // the device address of the SM ring header — computed via offset
-        // arithmetic, no SM dereference.
-        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
-        void destroy();
-
-        void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
-
-#if PTO2_PROFILING
-        void publish_dep_pool_snapshot() {
-            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
-            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
-        }
-
-        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
-            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
-            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
-            if (tail > top) tail = top;
-        }
-#endif
-
-        void advance_ring_pointers() {
-            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
-            int32_t old_last_task_alive = last_task_alive;
-
-            while (last_task_alive < current_task_index) {
-                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
-                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) {
-                    break;
-                }
-                last_task_alive++;
-            }
-
-            // Eager reset: prepare reclaimed slots for reuse while still hot in cache.
-            // Safe because last_task_alive has advanced past these slots but
-            // sync_to_sm has not yet published — the orchestrator cannot reuse
-            // them until the release store below.
-            // Skips payload, task, ring_id — immutable after RingSchedState::init().
-            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) {
-                ring->get_slot_state_by_task_id(id).reset_for_reuse();
-            }
-
-            sync_to_sm();
-        }
-    } ring_sched_states[PTO2_MAX_RING_DEPTH];
-
-    // Ready queues remain global (scheduling is ring-agnostic)
-    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
-
-    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
-    // the dispatch loop and completed inline -- never goes to AICore.
-    PTO2ReadyQueue dummy_ready_queue;
-
-    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
-    //
-    // Three cache-line regions by writer:
-    //   1. batch_*  / backoff — thread 0 exclusive (local batch buffer)
-    //   2. queue    — SPSC: orchestrator push, thread 0 pop
-    //   3. orch_needs_drain — orchestrator write, thread 0 read
-    struct alignas(64) WiringState {
-        static constexpr uint64_t BATCH_SIZE = 30;
-        static constexpr int BACKOFF_LIMIT = 32;
-
-        // --- Thread 0 exclusive: local batch buffer + backoff ---
-        int batch_count = 0;
-        int batch_index = 0;
-        int backoff_counter = 0;
-        PTO2TaskSlotState *batch[BATCH_SIZE];
-
-        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
-        PTO2SpscQueue queue;
-
-        // --- Orchestrator write, thread 0 read ---
-        alignas(64) std::atomic<bool> orch_needs_drain{false};
-    } wiring;
-
-    static_assert(
-        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
-    );
-    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
-
-    alignas(64) AsyncWaitList async_wait_list;
-
-    // Statistics (cold path, isolated from hot-path fields)
-#if PTO2_SCHED_PROFILING
-    alignas(64) std::atomic<int64_t> tasks_completed;
-    std::atomic<int64_t> tasks_consumed;
-#endif
-    // =========================================================================
-    // Inline hot-path methods
-    // =========================================================================
-
-    /**
-     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
-     * Called by scheduler thread 0 each loop iteration. Sets fanin_count,
-     * acquires fanout_lock per producer, allocates dep_pool entries, and
-     * pushes ready tasks to the appropriate ready queue.
-     *
-     * @return Number of tasks wired this call.
-     */
-
-    int drain_wiring_queue(bool force_drain = false) {
-        int wired = 0;
-
-        // Refill local batch buffer when exhausted.
-        if (wiring.batch_index >= wiring.batch_count) {
-            // Backoff: defer pop when queue holds fewer than a full batch,
-            // unless force_drain, orch_needs_drain, or backoff limit reached.
-            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
-                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
-                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
-                    wiring.backoff_counter++;
-                    return 0;
-                }
-            }
-            wiring.backoff_counter = 0;
-            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
-            wiring.batch_index = 0;
-            if (wiring.batch_count == 0) return 0;
-        }
-
-        // Process tasks from local buffer in strict FIFO order.
-        while (wiring.batch_index < wiring.batch_count) {
-            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
-            int ring_id = ws->ring_id;
-            auto &rss = ring_sched_states[ring_id];
-            int32_t wfanin = ws->payload->fanin_actual_count;
-
-            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
-                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
-                if (rss.dep_pool.available() < wfanin) {
-#if PTO2_PROFILING
-                    if (is_scope_stats_enabled()) {
-                        rss.publish_dep_pool_snapshot();
-                    }
-#endif
-                    break;  // not enough dep_pool space — keep remainder for next call
-                }
-            }
-
-            wiring.batch_index++;
-            wire_task(rss, ws, wfanin);
-            wired++;
-        }
-
-        return wired;
-    }
-
-    // Route a ready slot to the right global queue. Dummy tasks (empty
-    // active_mask) live in dummy_ready_queue; everything else goes to the
-    // per-shape ready_queues[]. Used by paths that do not have a thread-local
-    // ready buffer (e.g. wiring). See push_ready_routed_local for the
-    // dispatch-time fast path.
-    void push_ready_routed(PTO2TaskSlotState *slot_state) {
-        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-        if (shape == PTO2ResourceShape::DUMMY) {
-            dummy_ready_queue.push(slot_state);
-        } else {
-            ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-        }
-    }
-
-    /**
-     * Wire fanout edges for a single task. Sets fanin_count, acquires each
-     * producer's fanout_lock, allocates dep_pool entries for live producers,
-     * pushes the task to the ready queue once its fanin refcount is satisfied.
-     */
-    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
-        PTO2TaskPayload *wp = ws->payload;
-        ws->fanin_count = wfanin + 1;
-
-        if (wfanin != 0) {
-            int32_t early_finished = 0;
-            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
-                producer->lock_fanout();
-                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
-                if (pstate >= PTO2_TASK_COMPLETED) {
-                    early_finished++;
-                } else {
-                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
-                }
-                producer->unlock_fanout();
-            });
-
-            int32_t init_rc = early_finished + 1;
-            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
-            if (new_rc >= ws->fanin_count) {
-                push_ready_routed(ws);
-            }
-        } else {
-            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
-            push_ready_routed(ws);
-        }
-
-        ws->dep_pool_mark = rss.dep_pool.top;
-#if PTO2_PROFILING
-        if (is_scope_stats_enabled()) {
-            rss.publish_dep_pool_snapshot();
-        }
-#endif
-    }
-
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
-        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
-
-        PTO2TaskState expected = PTO2_TASK_COMPLETED;
-        if (!slot_state.task_state.compare_exchange_strong(
-                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
-            )) {
-            return;
-        }
-
-#if PTO2_SCHED_PROFILING
-        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
-#endif
-
-        int32_t ring_id = slot_state.ring_id;
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
-                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
-            )) {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
-        }
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        int32_t fc = slot_state.fanout_count;
-        int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);
-
-        atomic_count += 2;  // fanout_count.load + fanout_refcount.load
-
-        if (rc != fc) return;
-
-        PTO2TaskState expected = PTO2_TASK_COMPLETED;
-        if (!slot_state.task_state.compare_exchange_strong(
-                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
-            )) {
-            atomic_count += 1;  // failed CAS
-            return;
-        }
-
-        atomic_count += 1;  // successful CAS
-
-#if PTO2_SCHED_PROFILING
-        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
-#endif
-
-        int32_t ring_id = slot_state.ring_id;
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
-                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
-            )) {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
-            atomic_count += 2;  // try-lock CAS + unlock store
-        } else {
-            atomic_count += 1;  // failed try-lock CAS
-        }
-    }
-#endif
-
-    void release_producer(PTO2TaskSlotState &slot_state) {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        check_and_handle_consumed(slot_state);
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        atomic_count += 1;  // fanout_refcount.fetch_add
-        check_and_handle_consumed(slot_state, atomic_count);
-    }
-#endif
-
-    bool release_fanin_and_check_ready(PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr) {
-        // Atomically increment fanin_refcount and check if all producers are done
-        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
-        // init release, making fanin_count visible — plain load suffices.
-        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-
-        if (new_refcount == slot_state.fanin_count) {
-            // Local-first: try per-CoreType thread-local buffer before global queue
-            // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1]
-            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
-            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
-            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
-            if (shape == PTO2ResourceShape::DUMMY) {
-                dummy_ready_queue.push(&slot_state);
-            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
-                ready_queues[static_cast<int32_t>(shape)].push(&slot_state);
-            }
-            return true;
-        }
-        return false;
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    bool release_fanin_and_check_ready(
-        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
-        PTO2LocalReadyBuffer *local_bufs = nullptr
-    ) {
-        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-        atomic_count += 1;  // fanin_refcount.fetch_add
-
-        if (new_refcount == slot_state.fanin_count) {
-            // Local-first: try per-CoreType thread-local buffer before global queue.
-            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
-            // and go straight to dummy_ready_queue; use the profiling-aware push so
-            // atomic_count / push_wait stay consistent with the non-dummy path.
-            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
-            if (shape == PTO2ResourceShape::DUMMY) {
-                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
-            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
-                ready_queues[static_cast<int32_t>(shape)].push(&slot_state, atomic_count, push_wait);
-            }
-            return true;
-        }
-        return false;
-    }
-#endif
-
-    int get_ready_tasks_batch(
-        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
-    ) {
-        int count = 0;
-        while (count < max_count && local_buf.count > 0) {
-            out[count++] = local_buf.slot_states[--local_buf.count];
-        }
-        int remaining = max_count - count;
-        if (remaining > 0) {
-            count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
-        }
-        return count;
-    }
-
-#if PTO2_SCHED_PROFILING
-    int get_ready_tasks_batch(
-        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
-        uint64_t &atomic_count, uint64_t &wait_cycle
-    ) {
-        int count = 0;
-        while (count < max_count && local_buf.count > 0) {
-            out[count++] = local_buf.slot_states[--local_buf.count];
-        }
-        int remaining = max_count - count;
-        if (remaining > 0) {
-            count +=
-                ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
-        }
-        return count;
-    }
-#endif
-
-    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
-#if PTO2_ORCH_PROFILING
-        extern uint64_t g_orch_scope_end_atomic_count;
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++) {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count);
-        }
-#else
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++) {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer(*task_slot_states[i]);
-        }
-#endif
-    }
-
-    /**
-     * Subtask completion: atomic counter model.
-     * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block.
-     * Atomically increments completed_subtasks and checks whether all subtasks
-     * across all blocks are done.
-     *
-     * @return true if this was the last subtask, completing the entire task.
-     */
-    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
-        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
-        return (prev + 1) == slot_state.total_required_subtasks;
-    }
-
-    /**
-     * Two-stage completion: second stage.
-     * Called exactly once when all subtasks of a mixed task are done
-     * (i.e., on_subtask_complete returned true).
-     * Handles fanout notification, fanin release, and self-consumption check.
-     */
-#if PTO2_SCHED_PROFILING
-    CompletionStats
-#else
-    void
-#endif
-    on_mixed_task_complete(
-        PTO2TaskSlotState &slot_state,
-#if PTO2_SCHED_PROFILING
-        int thread_idx,
-#endif
-
-        PTO2LocalReadyBuffer *local_bufs = nullptr
-    ) {
-#if PTO2_SCHED_PROFILING
-        CompletionStats stats = {0, 0, 0, true};
-#endif
-#if PTO2_SCHED_PROFILING
-        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
-        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
-        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
-        uint64_t lock_atomics = 0, lock_wait = 0;
-        PTO2_SCHED_CYCLE_START();
-#endif
-
-#if PTO2_SCHED_PROFILING
-        slot_state.lock_fanout(lock_atomics, lock_wait);
-#else
-        slot_state.lock_fanout();
-#endif
-        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
-        slot_state.unlock_fanout();
-
-#if PTO2_SCHED_PROFILING
-        lock_atomics += 2;  // state.store + unlock.store
-        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
-        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
-        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
-#endif
-
-        // Fanout: notify consumers
-#if PTO2_SCHED_PROFILING
-        uint64_t fanout_atomics = 0, push_wait = 0;
-#endif
-        while (current != nullptr) {
-            PTO2TaskSlotState &consumer_slot = *current->slot_state;
-#if PTO2_SCHED_PROFILING
-            stats.fanout_edges++;
-            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs)) {
-                stats.tasks_enqueued++;
-            }
-#else
-            release_fanin_and_check_ready(consumer_slot, local_bufs);
-#endif
-            current = current->next;
-        }
-
-#if PTO2_SCHED_PROFILING
-        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
-        g_sched_push_wait_cycle[thread_idx] += push_wait;
-        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
-        return stats;
-#endif
-    }
-
-    /**
-     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
-     * Returns fanin edge count for profiling.
-     */
-
-#if PTO2_SCHED_PROFILING
-    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
-        PTO2_SCHED_CYCLE_START();
-        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
-        extern uint64_t g_sched_self_atomic_count[];
-        extern uint64_t g_sched_self_consumed_cycle[];
-        extern uint64_t g_sched_complete_count[];
-        uint64_t fanin_atomics = 0;
-#else
-    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
-#endif
-        PTO2TaskPayload *payload = slot_state.payload;
-        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
-#if PTO2_SCHED_PROFILING
-            release_producer(*producer_slot_state, fanin_atomics);
-#else
-            release_producer(*producer_slot_state);
-#endif
-        });
-#if PTO2_SCHED_PROFILING
-        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
-        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
-#endif
-
-        // Self consumed check
-#if PTO2_SCHED_PROFILING
-        uint64_t self_atomics = 0;
-        check_and_handle_consumed(slot_state, self_atomics);
-        g_sched_self_atomic_count[thread_idx] += self_atomics;
-        PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]);
-        g_sched_complete_count[thread_idx]++;
-#else
-        check_and_handle_consumed(slot_state);
-#endif
-        return payload->fanin_actual_count;
-    }
-
-    // === Cold-path API (defined in pto_scheduler.cpp) ===
-
-    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
-    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
-    // Capacities are baked into the returned layout; init_data_from_layout uses
-    // the same values.
-    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
-
-    // Phase 3a: write everything *except* arena-internal pointer fields.
-    // `sm_dev_base` is the device address of the SM (only stored, never
-    // dereferenced here). Safe to call on a host arena that holds the
-    // prebuilt image buffer. (The orchestrator counterpart takes
-    // task_window_size for ring task_descriptors address arithmetic; the
-    // scheduler only needs the SM header / ring header base addresses,
-    // both window-size-independent.)
-    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
-
-    // Phase 3b: write the arena-internal pointer fields
-    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
-    // ring, wiring.queue.buffer_). Called on both host and device sides.
-    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
-
-    // Forget per-region pointers; arena owns the backing memory.
-    void destroy();
-    void print_stats();
-    void print_queues();
-};
-
-// Scheduler cold-path API is declared as PTO2SchedulerState member functions.
-// See init()/destroy()/print_stats()/print_queues() below the struct definition.
-
-// try_inline_complete_locked: short-circuit NotDeferred completions seen during
-// drain so they don't grow entries[]. Defined here (not in pto_async_wait.h)
-// because PTO2SchedulerState's on_mixed_task_complete signature is only known
-// after its full definition above.
-//
-// When the deferred_release_slot_states[] buffer is full, drain it via
-// on_task_release before appending — mirrors the same overflow-drain idiom
-// that scheduler_completion.cpp's inline NotDeferred path uses, so high task
-// rates don't surface as ASYNC_WAIT_OVERFLOW errors.
-inline bool
-AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
-#if PTO2_SCHED_PROFILING
-    sink.sched->on_mixed_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
-#else
-    sink.sched->on_mixed_task_complete(slot_state, sink.local_bufs);
-#endif
-    if (*sink.deferred_release_count >= sink.deferred_release_capacity) {
-        while (*sink.deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-            (void)sink.sched->on_task_release(
-                *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx
-            );
-#else
-            sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
-#endif
-        }
-    }
-    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
-    sink.inline_completed++;
-    return true;
-}
-
-template <bool Profiling>
-inline AsyncPollResult AsyncWaitList::poll_and_complete(
-    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
-    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
-#if PTO2_SCHED_PROFILING
-    ,
-    int thread_idx
-#endif
-) {
-    AsyncPollResult result;
-    if (!try_lock()) return result;
-
-    AsyncWaitList::DrainCompletionSink sink{};
-    sink.sched = sched;
-    sink.local_bufs = local_bufs;
-    sink.deferred_release_slot_states = deferred_release_slot_states;
-    sink.deferred_release_count = &deferred_release_count;
-    sink.deferred_release_capacity = deferred_release_capacity;
-#if PTO2_SCHED_PROFILING
-    sink.thread_idx = thread_idx;
-#endif
-
-    int32_t drain_err = PTO2_ERROR_NONE;
-    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
-    if (drain_err != PTO2_ERROR_NONE) {
-        result.error_code = drain_err;
-        unlock();
-        return result;
-    }
-    result.completed += sink.inline_completed;
-
-    for (int32_t i = count - 1; i >= 0; --i) {
-        AsyncWaitEntry &entry = entries[i];
-        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
-        for (int32_t c = 0; c < entry.condition_count; c++) {
-            CompletionCondition &cond = entry.conditions[c];
-            if (cond.satisfied) continue;
-            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
-                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
-                if (counter_line != last_invalidated_counter_line) {
-                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
-                    last_invalidated_counter_line = counter_line;
-                }
-            }
-            CompletionPollResult poll = cond.test();
-            if (poll.state == CompletionPollState::FAILED) {
-                result.error_code = poll.error_code;
-                result.failed_slot_state = entry.slot_state;
-                unlock();
-                return result;
-            }
-            if (poll.state == CompletionPollState::READY) {
-                cond.satisfied = true;
-                cond.retire();
-                entry.waiting_completion_count--;
-            }
-        }
-
-        if (entry.normal_done && entry.waiting_completion_count <= 0) {
-#if PTO2_SCHED_PROFILING
-            sched->on_mixed_task_complete(*entry.slot_state, thread_idx, local_bufs);
-#else
-            sched->on_mixed_task_complete(*entry.slot_state, local_bufs);
-#endif
-            // Drain deferred_release in place when the buffer fills — same
-            // overflow-drain idiom used by complete_slot_task's inline path
-            // and by try_inline_complete_locked. Without this, large bursts
-            // of completable wait_list entries in a single poll surfaced as
-            // ASYNC_WAIT_OVERFLOW under the MPSC model.
-            if (deferred_release_count >= deferred_release_capacity) {
-                while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-                }
-            }
-            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
-            result.completed++;
-
-            int32_t last = count - 1;
-            if (i != last) entries[i] = entries[last];
-            count = last;
-        }
-    }
-
-    unlock();
-    return result;
-}
-
-// =============================================================================
-// Scheduler Profiling Data
-// =============================================================================
-
-#if PTO2_SCHED_PROFILING
-struct PTO2SchedProfilingData {
-    // Sub-phase cycle breakdown within on_mixed_task_complete
-    uint64_t lock_cycle;           // lock_fanout + state store + unlock
-    uint64_t fanout_cycle;         // fanout traversal
-    uint64_t fanin_cycle;          // fanin traversal
-    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
-
-    // Wait times
-    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
-    uint64_t push_wait_cycle;  // CAS contention in push()
-    uint64_t pop_wait_cycle;   // CAS contention in pop()
-
-    // Atomic counts per sub-phase
-    uint64_t lock_atomic_count;
-    uint64_t fanout_atomic_count;
-    uint64_t fanin_atomic_count;
-    uint64_t self_atomic_count;
-    uint64_t pop_atomic_count;
-
-    int64_t complete_count;
-};
-
-/**
- * Get and reset scheduler profiling data for a specific thread.
- * Returns accumulated profiling data and resets counters.
- */
-PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
deleted file mode 100644
index 7f9011d47..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ /dev/null
@@ -1,1085 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "scheduler_context.h"
-
-#include <cinttypes>
-#include <cstdio>
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/platform_regs.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-#include "common/memory_barrier.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "pto_shared_memory.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// =============================================================================
-// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache)
-// =============================================================================
-
-static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) {
-    if (header == nullptr || error_code == PTO2_ERROR_NONE) {
-        return;
-    }
-    // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads.
-    int32_t expected = PTO2_ERROR_NONE;
-    if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
-        header->sched_error_thread.store(thread_idx, std::memory_order_release);
-    }
-    if (thread_idx >= 0 && thread_idx < 32) {
-        header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
-    }
-}
-
-LoopAction SchedulerContext::handle_orchestrator_exit(
-    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count
-) {
-    if (completed_.load(std::memory_order_acquire)) {
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
-    if (orch_err != PTO2_ERROR_NONE) {
-        LOG_ERROR(
-            "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. "
-            "completed_tasks=%d, total_tasks=%d",
-            thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_
-        );
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
-    if (sched_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-
-    bool orch_done = orchestrator_done_;
-    if (!orch_done) return LoopAction::NONE;
-
-    task_count = total_tasks_;
-    if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) {
-        completed_.store(true, std::memory_order_release);
-        LOG_INFO_V0(
-            "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed),
-            task_count
-        );
-        return LoopAction::BREAK_LOOP;
-    }
-    return LoopAction::NONE;
-}
-
-LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
-    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
-    if (!reassigned_.load(std::memory_order_acquire)) {
-        wait_reassign_.fetch_add(1, std::memory_order_release);
-        while (!reassigned_.load(std::memory_order_acquire)) {
-            if (completed_.load(std::memory_order_acquire)) {
-                return LoopAction::BREAK_LOOP;
-            }
-            SPIN_WAIT_HINT();
-        }
-    }
-    cores_released = true;
-    return LoopAction::NONE;
-}
-
-LoopAction
-SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
-    if (completed_.load(std::memory_order_acquire)) {
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
-    if (orch_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
-    if (sched_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    return LoopAction::NONE;
-}
-
-// =============================================================================
-// Stall diagnostic log format.
-//
-// Every line is self-contained — when scheduler threads emit concurrently and
-// device_log interleaves their output, each line still carries enough context
-// to identify which thread / iteration / object it belongs to.
-//
-// Prefix on every line:
-//   [STALL thread=N idle_iterations=K] CATEGORY ...
-//
-// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL
-// together, so lines with the same idle_iterations belong to one diagnostic
-// round; grep "idle_iterations=N" groups one round's output.
-//
-// Categories (and which thread emits them):
-//   SUMMARY  — completed / total counts and scan totals               (thread 0 only)
-//   TASK     — one per non-completed task scanned from shared rings   (thread 0 only)
-//              - state=RUNNING: includes running_on=[...] cross-ref
-//              - state=READY:   fanin satisfied but no idle core yet
-//              - state=WAIT:    includes missing_deps=N
-//   CLUSTER  — one per cluster owned by this thread                   (every thread)
-//              - busy slot shows kernel + task_id + cond_reg_state;
-//                ANOMALY suffix when COND register is fin while software
-//                still has the slot marked busy.
-//
-// Reader workflow:
-//   1. grep SUMMARY                          -> overall completion status
-//   2. grep "idle_iterations=N TASK"         -> stuck RUNNING task and which
-//                                               core/thread it is on
-//   3. grep "idle_iterations=N CLUSTER.*task=<id>" -> cross-check via the
-//                                                     cluster line (or just
-//                                                     read running_on in step 2)
-// =============================================================================
-
-namespace {
-
-// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines.
-// Layout (idle):    coreN(idle)
-// Layout (busy):    coreN(busy kernel=K task=T cond_reg_state=ack)
-// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY)
-//
-// Healthy busy: COND register reports ack (AICore still executing). fin means
-// AICore wrote completion but AICPU hasn't recycled the running slot yet —
-// either a completion-poll bug or the diagnostic raced the recycle.
-void format_core_status(
-    char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond
-) {
-    if (idle) {
-        snprintf(buf, buf_size, "core%d(idle)", core_id);
-        return;
-    }
-    int32_t kernel = -1;
-    int64_t task_id_raw = -1;
-    if (core_state && core_state->running_slot_state) {
-        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
-        kernel = core_state->running_slot_state->task->kernel_id[subslot];
-        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
-    }
-    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
-    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
-    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
-    if (hw_state == TASK_ACK_STATE) {
-        snprintf(
-            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw,
-            cond_reg_state_str
-        );
-    } else {
-        snprintf(
-            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s ANOMALY)", core_id, kernel,
-            task_id_raw, cond_reg_state_str
-        );
-    }
-}
-
-}  // namespace
-
-int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        const int32_t *ids = core_trackers_[t].core_ids();
-        int32_t n = core_trackers_[t].core_num();
-        for (int32_t i = 0; i < n; i++) {
-            if (ids[i] == core_id) return t;
-        }
-    }
-    return -1;
-}
-
-bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
-    const int32_t *cores = core_trackers_[thread_idx].core_ids();
-    int32_t core_num = core_trackers_[thread_idx].core_num();
-    for (int32_t i = 0; i < core_num; i++) {
-        if (core_exec_states_[cores[i]].running_slot_state != nullptr) {
-            return true;
-        }
-    }
-    return false;
-}
-
-bool SchedulerContext::no_thread_owns_running_task() const {
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        if (self_owns_running_task(t)) return false;
-    }
-    return true;
-}
-
-void SchedulerContext::log_stall_diagnostics(
-    int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-
-    // T0 owns the shared-ring scan; printing it from other threads would
-    // produce identical TASK lines once per scheduler thread.
-    if (thread_idx == 0) {
-        int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
-            int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
-            submitted_in_ring += ring_task_count;
-            for (int32_t si = 0; si < ring_task_count; si++) {
-                PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
-                PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
-                int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
-                int32_t fi = slot_state.fanin_count;
-                int32_t kid_aic = slot_state.task->kernel_id[0];
-                int32_t kid_aiv0 = slot_state.task->kernel_id[1];
-                int32_t kid_aiv1 = slot_state.task->kernel_id[2];
-                int64_t task_id = static_cast<int64_t>(slot_state.task->task_id.raw);
-                if (st >= PTO2_TASK_COMPLETED) continue;
-                // task_state has no intermediate ready/running value — it
-                // stays PENDING until the worker stores COMPLETED. Classify
-                // by the ground truth instead: a slot is RUNNING iff some
-                // core has it as running_slot_state. A task occupies at most
-                // 3 cores (one cluster), all under the same owner thread by
-                // construction of assign_cores_to_threads.
-                char running_on[192] = {0};
-                int32_t owner = -1;
-                int32_t pos = 0;
-                bool is_running = false;
-                for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) {
-                    if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
-                    is_running = true;
-                    if (owner < 0) owner = find_core_owner_thread(cid);
-                    const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
-                    int32_t written = snprintf(
-                        running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname
-                    );
-                    if (written > 0) pos += written;
-                }
-
-                if (is_running) {
-                    cnt_running++;
-                    if (cnt_running > STALL_DUMP_READY_MAX) continue;
-                    LOG_INFO_V9(
-                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                        " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] "
-                        "running_on=[owner_thread=%d cores=[%s]]",
-                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on
-                    );
-                    continue;
-                }
-                if (rc >= fi) {
-                    cnt_ready++;
-                    if (cnt_ready > STALL_DUMP_READY_MAX) continue;
-                    LOG_INFO_V9(
-                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                        " state=READY   fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]",
-                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1
-                    );
-                    continue;
-                }
-                cnt_waiting++;
-                if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
-                LOG_INFO_V9(
-                    "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                    " state=WAIT    fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d",
-                    thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc
-                );
-            }
-        }
-        int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring;
-        int32_t c = completed_tasks_.load(std::memory_order_relaxed);
-        LOG_INFO_V9(
-            "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d "
-            "scan_ready=%d scan_waiting=%d scan_running=%d",
-            thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running
-        );
-    }
-
-    // CLUSTER lines: one per cluster this thread owns.
-    // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
-    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
-    int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
-    for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
-        int32_t offset = cli * 3;
-        int32_t aic_id = tracker.get_aic_core_id(offset);
-        int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
-        int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
-        bool aic_idle = tracker.is_aic_core_idle(offset);
-        bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
-        bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
-        int32_t cluster_id = cli * ast + thread_idx;
-        char aic_buf[128], aiv0_buf[128], aiv1_buf[128];
-        format_core_status(
-            aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr
-        );
-        format_core_status(
-            aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id],
-            core_exec_states_[aiv0_id].reg_addr
-        );
-        format_core_status(
-            aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id],
-            core_exec_states_[aiv1_id].reg_addr
-        );
-        LOG_INFO_V9(
-            "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx,
-            idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf
-        );
-    }
-}
-
-void SchedulerContext::log_shutdown_stall_snapshot(
-    int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
-) {
-    LOG_WARN(
-        "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] "
-        "dumping all scheduler threads before emergency shutdown",
-        trigger_thread_idx, trigger_idle_iterations
-    );
-    int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
-    if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) {
-        LOG_ERROR(
-            "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx,
-            thread_count, MAX_AICPU_THREADS
-        );
-        thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
-    }
-    for (int32_t t = 0; t < thread_count; t++) {
-        log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count);
-    }
-}
-
-int32_t SchedulerContext::handle_timeout_exit(
-    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
-    int32_t last_progress_count
-#if PTO2_PROFILING
-    ,
-    uint64_t sched_start_ts
-#endif
-) {
-    LOG_ERROR(
-        "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations,
-        idle_iterations
-    );
-    latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
-    if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-        log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count);
-#if PTO2_PROFILING
-        // Capture the in-flight kernels' partial output before signalling the
-        // cores to exit, so the dump reflects the live stuck state.
-        if (is_dump_tensor_enabled()) {
-            dump_running_task_outputs<PTO2_SUBTASK_SLOT_COUNT>(
-                thread_idx, cores_total_num_,
-                [this](int32_t cid) {
-                    return core_exec_states_[cid].running_slot_state;
-                },
-                [](ActiveMask active_mask, int raw_subtask_id) {
-                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-                },
-                [this](int32_t func_id) {
-                    return get_function_bin_addr(func_id);
-                }
-            );
-        }
-#endif
-        emergency_shutdown(runtime);
-    }
-#if PTO2_PROFILING
-    uint64_t sched_timeout_ts = get_sys_cnt_aicpu();
-    LOG_INFO_V9(
-        "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts),
-        cycles_to_us(sched_timeout_ts - sched_start_ts)
-    );
-#endif
-    return -PTO2_ERROR_SCHEDULER_TIMEOUT;
-}
-
-#if PTO2_PROFILING
-void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-    uint64_t sched_end_ts = get_sys_cnt_aicpu();
-    LOG_INFO_V9(
-        "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
-        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
-    );
-
-    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
-                           l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
-    if (sched_total == 0) sched_total = 1;
-
-#if PTO2_SCHED_PROFILING
-    {
-        PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
-        uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
-        uint64_t complete_poll =
-            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
-                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
-                0;
-        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
-                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
-                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
-                                      l2_swimlane.sched_dispatch_setup_cycle) :
-                                     0;
-
-        LOG_INFO_V9(
-            "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
-            cycles_to_us(sched_total), cur_thread_completed
-        );
-
-        // fanout / fanin per-thread aggregates live in
-        // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
-        // × core_to_thread).
-        LOG_INFO_V9(
-            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
-            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
-        );
-
-        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
-        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
-                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
-                                           0;
-        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
-                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
-                                       0.0;
-        LOG_INFO_V9(
-            "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
-            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
-            complete_hit_rate
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_lock     : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
-            static_cast<uint64_t>(sp.lock_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_fanout   : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
-            static_cast<uint64_t>(sp.fanout_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_fanin    : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.fanin_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_self     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.self_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
-            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
-        );
-
-        LOG_INFO_V9(
-            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
-            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
-        );
-
-        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
-        LOG_INFO_V9(
-            "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
-            dispatch_poll * 100.0 / d_parent
-        );
-        LOG_INFO_V9(
-            "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
-            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
-            static_cast<uint64_t>(sp.pop_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
-            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
-        );
-
-#if PTO2_SCHED_PROFILING
-        LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
-            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
-            l2_swimlane.phase_wiring_count
-        );
-#else
-        LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
-            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
-        );
-#endif
-
-        LOG_INFO_V9(
-            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
-            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
-        );
-
-        if (cur_thread_completed > 0) {
-            LOG_INFO_V9(
-                "Thread %d:   avg/complete   : %.3fus", thread_idx,
-                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
-            );
-        }
-    }
-#endif
-    LOG_INFO_V9(
-        "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
-        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
-    );
-}
-#endif
-
-// =============================================================================
-// Shutdown: deinit AICore regs for this thread's cores (and PMU finalize if enabled).
-// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op.
-// platform_deinit_aicore_regs is idempotent; safe to call after early completion.
-// =============================================================================
-int32_t SchedulerContext::shutdown(int32_t thread_idx) {
-    const int32_t *cores = core_trackers_[thread_idx].core_ids();
-    int32_t core_num = core_trackers_[thread_idx].core_num();
-    if (core_num == 0) return 0;
-
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_finalize(cores, core_num);
-    }
-#endif
-
-    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num);
-    int32_t rc = 0;
-    for (int32_t i = 0; i < core_num; i++) {
-        int32_t core_id = cores[i];
-        uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
-        if (reg_addr != 0) {
-            // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
-            if (platform_deinit_aicore_regs(reg_addr) != 0) {
-                LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
-                rc = -1;
-            }
-        } else {
-            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
-        }
-    }
-    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
-    return rc;
-}
-
-// =============================================================================
-// Handshake with all AICore workers; discover core type and reg address.
-// =============================================================================
-int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    cores_total_num_ = runtime->worker_count;
-
-    // Validate cores_total_num_ before using as array index
-    if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) {
-        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER);
-        return -1;
-    }
-
-    aic_count_ = 0;
-    aiv_count_ = 0;
-
-    LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
-
-    // Step 1: Write per-core payload addresses and send handshake signal.
-    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
-    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
-        OUT_OF_ORDER_STORE_BARRIER();
-        all_handshakes[i].aicpu_ready = 1;
-    }
-    OUT_OF_ORDER_STORE_BARRIER();
-
-    // Get platform physical cores count for validation
-    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
-
-    // Step 2: Wait for all cores to respond, collect core type and register addresses
-    bool handshake_failed = false;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-
-        while (hank->aicore_regs_ready == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        uint32_t physical_core_id = hank->physical_core_id;
-
-        if (physical_core_id >= max_physical_cores_count) {
-            LOG_ERROR(
-                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
-                max_physical_cores_count
-            );
-            handshake_failed = true;
-            continue;
-        }
-
-        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
-        uint64_t reg_addr = regs[physical_core_id];
-
-        // Initialize AICore registers after discovery (first round)
-        platform_init_aicore_regs(reg_addr);
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-
-        OUT_OF_ORDER_STORE_BARRIER();
-
-        while (hank->aicore_done == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        CoreType type = hank->core_type;
-
-        core_exec_states_[i].reg_addr = reg_addr;
-        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
-
-#if PTO2_PROFILING
-        // Record physical_core_id for PMU init later (CoreExecState has no room
-        // for this field under PTO2_PROFILING).
-        physical_core_ids_[i] = physical_core_id;
-#endif
-#if !PTO2_PROFILING
-        core_exec_states_[i].worker_id = i;
-        core_exec_states_[i].physical_core_id = physical_core_id;
-        core_exec_states_[i].core_type = type;
-#endif
-
-        if (type == CoreType::AIC) {
-            aic_worker_ids_[aic_count_++] = i;
-            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        } else {
-            aiv_worker_ids_[aiv_count_++] = i;
-            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        }
-    }
-
-    if (handshake_failed) {
-        emergency_shutdown(runtime);
-        return -1;
-    }
-
-    LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
-    return 0;
-}
-
-// =============================================================================
-// Assign discovered cores to scheduler threads (cluster-aligned round-robin).
-// =============================================================================
-bool SchedulerContext::assign_cores_to_threads() {
-    // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
-    // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
-    active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-    int32_t cluster_count = aic_count_;
-
-    // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
-    int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
-    int32_t thread_cores_num = max_clusters_per_thread * 3;
-
-    if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) {
-        LOG_ERROR("Can't assign more then 64 cores in per scheduler");
-        return false;
-    }
-
-    LOG_INFO_V0(
-        "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count,
-        active_sched_threads_, aic_count_, aiv_count_
-    );
-
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // Count clusters per thread first (round-robin may distribute unevenly)
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % active_sched_threads_]++;
-    }
-    for (int32_t i = 0; i < active_sched_threads_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % active_sched_threads_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
-
-        LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid);
-    }
-
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        LOG_INFO_V0(
-            "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count()
-        );
-    }
-
-    LOG_INFO_V0(
-        "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num
-    );
-    return true;
-}
-
-// =============================================================================
-// Reassign all cores across all threads (sched + orchestrator) after orchestration.
-// =============================================================================
-void SchedulerContext::reassign_cores_for_all_threads() {
-    LOG_INFO_V0(
-        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
-    );
-
-    // Collect running worker_ids from all current trackers
-    bool running_cores[RUNTIME_MAX_WORKER] = {};
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        auto all_running = core_trackers_[i].get_all_running_cores();
-        int32_t bp;
-        while ((bp = all_running.pop_first()) >= 0) {
-            running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
-        }
-    }
-
-    // Count clusters per thread (round-robin across all threads)
-    int32_t cluster_count = aic_count_;
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % aicpu_thread_num_]++;
-    }
-
-    // Re-init all trackers and reset core counts
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    // Assign clusters round-robin and restore running state
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % aicpu_thread_num_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        int32_t cl_idx = cluster_idx_per_thread[t]++;
-        core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
-
-        // init() marks all idle; toggle cores that were running and restore pending_occupied
-        if (running_cores[aic_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3);
-        }
-        if (running_cores[aiv0_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 1);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
-        }
-        if (running_cores[aiv1_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 2);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
-        }
-    }
-
-    // Log final distribution
-    LOG_INFO_V0("Core reassignment complete:");
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        int32_t aic_running = core_trackers_[t].get_running_count<CoreType::AIC>();
-        int32_t aiv_running = core_trackers_[t].get_running_count<CoreType::AIV>();
-        LOG_INFO_V0(
-            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count(), aic_running, aiv_running
-        );
-    }
-    active_sched_threads_ = aicpu_thread_num_;
-}
-
-// =============================================================================
-// Emergency shutdown: broadcast exit signal to every handshake'd core and
-// deinit their AICore register blocks. Idempotent.
-// =============================================================================
-void SchedulerContext::emergency_shutdown(Runtime *runtime) {
-    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    int32_t timeout_count = 0;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-        if (core_exec_states_[i].reg_addr != 0) {
-            if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) {
-                timeout_count++;
-            }
-        }
-    }
-    if (timeout_count > 0) {
-        LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count);
-    }
-    LOG_WARN("Emergency shutdown complete");
-}
-
-// =============================================================================
-// Lifecycle: init / deinit
-// =============================================================================
-int32_t SchedulerContext::init(
-    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
-) {
-    always_assert(runtime != nullptr);
-
-    // Zero all per-core execution state before handshake
-    memset(core_exec_states_, 0, sizeof(core_exec_states_));
-
-    // Wire thread/transition configuration that handshake/assign need to read.
-    aicpu_thread_num_ = aicpu_thread_num;
-    sched_thread_num_ = sched_thread_num;
-    orch_to_sched_ = orch_to_sched;
-    regs_ = regs_base;
-
-#if PTO2_PROFILING
-    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
-    // header — must be called BEFORE caching the level, otherwise the cached
-    // value would still be 0 (only the binary enable bit has been seeded by
-    // kernel.cpp at this point). Reset the cached level on disabled runs so a
-    // prior enabled launch's level can't leak into the phase-record gates in
-    // scheduler_dispatch.
-    if (is_l2_swimlane_enabled()) {
-        l2_swimlane_aicpu_init(runtime->worker_count);
-        l2_swimlane_level_ = get_l2_swimlane_level();
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            // Sched-phase pool count: matches the dump_tensor_init branch in
-            // scheduler_dispatch.cpp. sched_thread_num_ <= 0 means "use all
-            // AICPU threads as scheduler threads" (see assign_cores_to_threads'
-            // active_sched_threads_ normalization at line 689). Without this
-            // normalization here, init_phase would prime zero sched pools and
-            // all sched_phase emits would silently drop.
-            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
-            // Orchestration is always single-threaded, so orch-phase is one pool
-            // (ordinal 0) in both modes — see record_orch_phase.
-            const int orch_phase_threads = 1;
-            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads);
-        }
-    } else {
-        l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
-    }
-#endif
-
-    // Discover cores and assign to scheduler threads.
-    int32_t rc = handshake_all_cores(runtime);
-    if (rc != 0) {
-        LOG_ERROR("handshake_all_cores failed");
-        return rc;
-    }
-    if (!assign_cores_to_threads()) {
-        return -1;
-    }
-
-    // Initialize task counters. Task count comes from PTO2 shared memory.
-    if (runtime->get_gm_sm_ptr()) {
-        auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
-        // Read at one-time boot init, before the SM is reset for the run, so a
-        // ring not yet written holds uninitialized memory (0xbe... under ASAN's
-        // malloc-fill). Sum in int64 and only count rings whose value is a
-        // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold
-        // more than the scope cap. This rejects any garbage pattern (negative
-        // or positive), so uninitialized rings contribute 0 (the correct boot
-        // count) while valid counts still add up, with no signed overflow.
-        int64_t pto2_count = 0;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-            if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
-        }
-        total_tasks_ = static_cast<int32_t>(pto2_count);
-    } else {
-        total_tasks_ = 0;
-    }
-    completed_tasks_.store(0, std::memory_order_release);
-
-    // Device orchestration: the orchestrator thread flips this when the graph is built.
-    orchestrator_done_ = false;
-
-    // Clear per-core dispatch payloads
-    memset(payload_per_core_, 0, sizeof(payload_per_core_));
-    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
-
-    // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
-    // This is done once at startup and never modified afterwards.
-    for (int32_t t = 0; t < sched_thread_num_; t++) {
-        CoreTracker &tracker = core_trackers_[t];
-        for (int32_t c = 0; c < tracker.get_cluster_count(); c++) {
-            int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
-            auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
-            auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
-            payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
-            payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
-            payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
-            payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
-        }
-    }
-
-    func_id_to_addr_ = runtime->func_id_to_addr_;
-
-    return 0;
-}
-
-void SchedulerContext::deinit() {
-    // Reset all per-core execution state
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i] = {};
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // Clear per-core dispatch payloads
-    memset(payload_per_core_, 0, sizeof(payload_per_core_));
-    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
-
-    // Reset sync-start drain coordination — a previous run that aborted mid-drain
-    // would otherwise leave dirty pending/elected/ack state for the next reuse.
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-    drain_state_.pending_task.store(nullptr, std::memory_order_release);
-
-    // Reset task counters and orchestrator state
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_ = 0;
-    orchestrator_done_ = false;
-    pto2_init_done_.store(false, std::memory_order_release);
-    pto2_init_complete_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
-    completed_.store(false, std::memory_order_release);
-
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    aicpu_thread_num_ = 0;
-    sched_thread_num_ = 0;
-    orch_to_sched_ = false;
-    active_sched_threads_ = 0;
-    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
-        core_trackers_[t] = CoreTracker{};
-    }
-
-    regs_ = 0;
-    sched_ = nullptr;
-    rt_ = nullptr;
-    func_id_to_addr_ = nullptr;
-}
-
-void SchedulerContext::wait_pto2_init_complete() const {
-    while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-        SPIN_WAIT_HINT();
-    }
-}
-
-void SchedulerContext::bind_runtime(PTO2Runtime *rt) {
-    rt_ = rt;
-    sched_ = &rt->scheduler;
-}
-
-// =============================================================================
-// Post-orchestration bookkeeping. Runs on the orchestrator thread once the
-// build phase finishes; folds inline-completed tasks, flips orchestrator_done_,
-// and drives the orchestrator → scheduler core transition (or fatal shutdown).
-// =============================================================================
-void SchedulerContext::on_orchestration_done(
-    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
-) {
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
-        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
-        // The orchestrator has no scheduler-phase pool of its own — those belong
-        // to the scheduler threads and are flushed in scheduler_dispatch.
-        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
-    }
-#endif
-
-    total_tasks_ = total_tasks;
-
-    // Fold tasks completed inline during orchestration
-    int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
-    if (inline_completed > 0) {
-        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
-#if PTO2_SCHED_PROFILING
-        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
-#endif
-    }
-    orchestrator_done_ = true;
-
-    // Check for fatal error from orchestration; if so, shut down immediately.
-    int32_t orch_err = 0;
-    if (sched_->sm_header) {
-        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
-    }
-    if (orch_err != PTO2_ERROR_NONE) {
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-    }
-
-    // Skip core transition on fatal error — cores already shut down above.
-    if (completed_.load(std::memory_order_acquire)) {
-        // Signal transition to unblock scheduler threads waiting at core transition
-        transition_requested_.store(true, std::memory_order_release);
-        reassigned_.store(true, std::memory_order_release);
-    } else if (orch_to_sched_) {
-        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
-        transition_requested_.store(true, std::memory_order_release);
-
-        // Wait for scheduler threads to acknowledge transition request
-        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
-            if (completed_.load(std::memory_order_acquire)) {
-                break;
-            }
-            SPIN_WAIT_HINT();
-        }
-        if (!completed_.load(std::memory_order_acquire)) {
-            reassign_cores_for_all_threads();
-            reassigned_.store(true, std::memory_order_release);
-        }
-    }
-
-#if PTO2_PROFILING
-    // Write core-to-thread mapping AFTER reassignment so the profiling data
-    // reflects the final distribution (all active_sched_threads_, including
-    // former orchestrator threads when orch_to_sched_ is enabled).
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
-        for (int32_t t = 0; t < active_sched_threads_; t++) {
-            l2_swimlane_aicpu_write_core_assignments_for_thread(
-                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
-            );
-        }
-    }
-#endif
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
deleted file mode 100644
index eda052769..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
+++ /dev/null
@@ -1,534 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "scheduler_context.h"
-
-#include <algorithm>
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/platform_regs.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/memory_barrier.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// Performance profiling headers
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-
-// =============================================================================
-// Dual-slot state machine helpers
-// =============================================================================
-
-namespace {
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-}
-
-// Pure function: read register result -> SlotTransition (no side effects).
-SlotTransition SchedulerContext::decide_slot_transition(
-    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id
-) {
-    SlotTransition t;
-    if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id) {
-        t.matched = true;
-        t.running_done = true;  // Serial execution: pending event implies running done
-        t.running_freed = true;
-        t.pending_freed = true;
-        if (reg_state == TASK_FIN_STATE) {
-            t.pending_done = true;  // Case 1: pending FIN
-        }
-        // else: Case 2: pending ACK (pending_done stays false)
-    } else if (reg_task_id == running_id) {
-        if (reg_state == TASK_FIN_STATE) {
-            if (pending_id == AICPU_TASK_INVALID) {
-                // Case 3.2: running FIN, no pending -> core goes idle
-                t.matched = true;
-                t.running_done = true;
-                t.running_freed = true;
-            }
-            // Case 3.1: running FIN, pending exists -> skip (transient state).
-            // Case 1/2 (pending ACK/FIN) will complete running implicitly via running_done=true.
-        } else {
-            // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
-            t.matched = true;
-            t.pending_freed = true;
-        }
-    }
-    return t;
-}
-
-// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
-void SchedulerContext::complete_slot_task(
-    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
-    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
-    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
-#if PTO2_PROFILING
-    ,
-    uint64_t dispatch_ts, uint64_t finish_ts
-#endif
-) {
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#else
-    (void)hank;
-#endif
-    // MPSC fast-path is opt-in per task: only tasks with at least one subtask
-    // that registered a deferred condition route through the mailbox. Pure
-    // non-deferred tasks complete inline on this thread (matching pre-MPSC
-    // behavior — keeps the common case parallelized across scheduler threads
-    // instead of serializing through the single consumer). The
-    // any_subtask_deferred flag on slot_state is the discriminator; it's set
-    // (release) before on_subtask_complete and read (acquire) after, so the
-    // last subtask sees flag writes from any earlier subtask of the same task.
-    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
-    bool defer_completion_to_consumer = false;
-
-    if (slot_state.payload != nullptr) {
-        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
-        int32_t slab_err = deferred_slab->error_code;
-        if (slab_err != PTO2_ERROR_NONE) {
-            int32_t expected = PTO2_ERROR_NONE;
-            sched_->sm_header->sched_error_code.compare_exchange_strong(
-                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            completed_.store(true, std::memory_order_release);
-            return;
-        }
-
-        uint32_t cond_count = deferred_slab->count;
-        if (cond_count > MAX_COMPLETIONS_PER_TASK) {
-            int32_t expected = PTO2_ERROR_NONE;
-            sched_->sm_header->sched_error_code.compare_exchange_strong(
-                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            completed_.store(true, std::memory_order_release);
-            return;
-        }
-
-        if (cond_count > 0) {
-            // Publish "this task is deferred" before on_subtask_complete so the
-            // acq_rel fetch_add inside on_subtask_complete makes the flag
-            // visible to whichever subtask sees mixed_complete=true (which may
-            // be this thread or a later one).
-            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
-
-            const PTO2TaskId token = slot_state.task->task_id;
-            for (uint32_t i = 0; i < cond_count; ++i) {
-                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
-                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
-                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
-                    SPIN_WAIT_HINT();
-                }
-            }
-        }
-    }
-
-    bool mixed_complete = sched_->on_subtask_complete(slot_state);
-
-    if (mixed_complete && slot_state.payload != nullptr &&
-        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
-        // Some subtask of this task registered conditions; finish the
-        // registration by handing the slot_state off to the consumer.
-        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state))) {
-            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
-            SPIN_WAIT_HINT();
-        }
-        defer_completion_to_consumer = true;
-    }
-
-    if (mixed_complete && !defer_completion_to_consumer) {
-#if PTO2_PROFILING
-        if (is_dump_tensor_enabled()) {
-            dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
-                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
-                [](ActiveMask active_mask, int raw_subtask_id) {
-                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-                },
-                [this](int32_t func_id) {
-                    return get_function_bin_addr(func_id);
-                }
-            );
-        }
-#endif
-#if PTO2_SCHED_PROFILING
-        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
-        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
-        // by the otc_* log lines). Its return value is unused.
-        (void)sched_->on_mixed_task_complete(slot_state, thread_idx, local_bufs);
-#else
-        sched_->on_mixed_task_complete(slot_state, local_bufs);
-#endif
-#if PTO2_PROFILING
-        l2_swimlane.phase_complete_count++;
-#endif
-        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
-            deferred_release_slot_states[deferred_release_count++] = &slot_state;
-        } else {
-            LOG_INFO_V9("Thread %d: release", thread_idx);
-            while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                // SCHED_PROFILING variant takes thread_idx for the per-thread
-                // atomic counter side-effects. The return value is unused.
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-            deferred_release_slot_states[deferred_release_count++] = &slot_state;
-        }
-        completed_this_turn++;
-    }
-
-#if PTO2_PROFILING
-    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
-    // {start, end, task_token_raw}, host resolves func_id/core_type from
-    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
-    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
-    // timestamps via complete_task. Bypassing here saves the per-completion
-    // hot-path cost (counter inc + ring lookup + record store + wmb + buffer
-    // rotation bookkeeping) for runs that only want AICore timing.
-    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-#if PTO2_SCHED_PROFILING
-        uint64_t t_perf_start = get_sys_cnt_aicpu();
-#endif
-
-        if (l2_swimlane_aicpu_complete_task(
-                core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), dispatch_ts, finish_ts
-            ) != 0) {
-            LOG_ERROR(
-                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
-                static_cast<uint64_t>(slot_state.task->task_id.raw)
-            );
-        }
-#if PTO2_SCHED_PROFILING
-        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
-#endif
-    }
-
-    if (is_pmu_enabled()) {
-        pmu_aicpu_record_task(
-            core_id, thread_idx, slot_state.task->task_id.raw,
-            slot_state.task->kernel_id[static_cast<int32_t>(subslot)], hank[core_id].core_type
-        );
-    }
-#endif
-}
-
-// Promote pending slot data to running slot. Clears pending fields.
-void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
-    core.running_slot_state = core.pending_slot_state;
-    core.running_reg_task_id = core.pending_reg_task_id;
-    core.running_subslot = core.pending_subslot;
-#if PTO2_PROFILING
-    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
-#endif
-    core.pending_slot_state = nullptr;
-    core.pending_reg_task_id = AICPU_TASK_INVALID;
-}
-
-// Clear running slot (core becomes idle).
-void SchedulerContext::clear_running_slot(CoreExecState &core) {
-    core.running_slot_state = nullptr;
-    core.running_reg_task_id = AICPU_TASK_INVALID;
-}
-
-void SchedulerContext::check_running_cores_for_completion(
-    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
-    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-    PTO2LocalReadyBuffer *local_bufs
-) {
-#if PTO2_SCHED_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#endif
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    auto running_core_states = tracker.get_all_running_cores();
-    while (running_core_states.has_value()) {
-        int32_t bit_pos = running_core_states.pop_first();
-        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
-        CoreExecState &core = core_exec_states_[core_id];
-
-        // --- Judgment phase: read register, derive transition ---
-        // Use the precomputed cond_ptr (resolved once in handshake) to skip
-        // the reg_offset switch and reg_addr addition on every poll.
-        uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
-        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
-        // rmb() pins any AICore-published cacheable reads downstream of the
-        // FIN observation. Replaces the post-`__sync_synchronize` that the
-        // old read_reg() helper carried implicitly.
-        rmb();
-        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
-        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
-
-#if PTO2_SCHED_PROFILING
-        if (l2_swimlane.l2_swimlane_enabled) {
-            l2_swimlane.complete_probe_count++;
-        }
-#endif
-
-        SlotTransition t =
-            decide_slot_transition(reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id);
-        if (!t.matched) continue;
-
-#if PTO2_SCHED_PROFILING
-        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
-            l2_swimlane.complete_hit_count++;
-        }
-#endif
-
-#if PTO2_PROFILING
-        // Capture finish_ts at the FIN observation point — right after rmb()
-        // above pinned the cacheable AICore reads downstream of the register
-        // load, and BEFORE any fanin / deferred-release work. Anything later
-        // (slot transition apply, complete_slot_task fanin processing) would
-        // charge AICPU completion-processing cost to the (end → finish)
-        // span, masking the actual FIN-delivery latency.
-        uint64_t finish_ts = 0;
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
-            finish_ts = get_sys_cnt_aicpu();
-        }
-#endif
-
-        // --- Apply phase: execute actions based on transition ---
-
-        // 1. Complete finished tasks (capture pointers before modifying core state)
-        if (t.pending_done) {
-            complete_slot_task(
-                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
-                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
-#if PTO2_PROFILING
-                ,
-                core.pending_dispatch_timestamp, finish_ts
-#endif
-            );
-            cur_thread_completed++;
-        }
-        if (t.running_done) {
-            complete_slot_task(
-                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
-                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
-#if PTO2_PROFILING
-                ,
-                core.running_dispatch_timestamp, finish_ts
-#endif
-            );
-            cur_thread_completed++;
-        }
-
-        // 2. Update slot data
-        if (t.running_freed) {
-            if (core.pending_slot_state != nullptr && !t.pending_done) {
-                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
-            } else {
-                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
-                if (t.pending_done) {
-                    // Case 1: pending FIN observed directly -- clear stale pending fields.
-                    // Without this, pending_reg_task_id retains a stale value that blocks
-                    // clear_pending_occupied and permanently degrades pipelining.
-                    core.pending_slot_state = nullptr;
-                    core.pending_reg_task_id = AICPU_TASK_INVALID;
-                }
-            }
-        }
-
-        // 3. Update tracker bitmap
-        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
-        if (is_idle) {
-            tracker.change_core_state(bit_pos);       // Mark idle
-            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
-        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
-            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
-            // when no pending task is currently held. Otherwise pending slot is occupied
-            // by a pre-loaded task and must stay protected.
-            tracker.clear_pending_occupied(bit_pos);
-        }
-
-        // 4. Progress signal (only when running task completes)
-        if (t.running_done) {
-            made_progress = true;
-        }
-    }
-}
-
-// =============================================================================
-// sync_start drain protocol
-// =============================================================================
-
-// Take ownership of slot_state and signal all threads to enter drain mode.
-// Returns true if this thread won the CAS and owns the drain slot.
-// Returns false if another thread already holds drain; caller must re-push slot_state.
-//
-// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and
-// reset election flag, then release-store block_num.  Other threads acquire-load
-// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible.
-bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
-    int32_t expected = 0;
-    if (!drain_state_.sync_start_pending.compare_exchange_strong(
-            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
-        )) {
-        return false;  // Another thread already holds the drain slot.
-    }
-    // We own the drain slot.  Store the task and reset election flag before making it visible.
-    drain_state_.pending_task.store(slot_state, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
-    // Release store: all stores above are now visible to any thread that
-    // acquire-loads sync_start_pending and sees block_num > 0.
-    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
-    return true;
-}
-
-// Count total available resources across all scheduler threads for a given shape.
-int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape) {
-    int32_t total = 0;
-    for (int32_t t = 0; t < active_sched_threads_; t++) {
-        total += core_trackers_[t].get_idle_core_offset_states(shape).count();
-    }
-    return total;
-}
-
-// Drain worker: dispatch all blocks in one pass across all threads' trackers.
-// Called only when global resources >= block_num, so one pass always suffices.
-// All other threads are spinning -- the drain worker has exclusive tracker access.
-void SchedulerContext::drain_worker_dispatch(int32_t block_num) {
-    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
-    if (!slot_state) {
-        drain_state_.sync_start_pending.store(0, std::memory_order_release);
-        return;
-    }
-    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-
-    for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++) {
-        auto valid = core_trackers_[t].get_idle_core_offset_states(shape);
-        int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
-        int32_t claim = std::min(valid.count(), remaining);
-        int32_t start = slot_state->next_block_idx;
-        slot_state->next_block_idx += claim;
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int handle_count = 0;
-        for (int32_t b = 0; b < claim; b++) {
-            auto core_offset = valid.pop_first();
-            handle_count += prepare_block_for_dispatch(
-                t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]
-            );
-        }
-        wmb();
-        uint64_t dispatch_ts = 0;
-#if PTO2_PROFILING
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-            dispatch_ts = get_sys_cnt_aicpu();
-        }
-#endif
-        for (int i = 0; i < handle_count; i++) {
-            publish_subtask_to_core(handles[i], dispatch_ts);
-        }
-    }
-
-    // All blocks dispatched -- clear drain state.
-    // Release fence ensures tracker mutations are visible to threads that
-    // acquire-load sync_start_pending == 0 and resume normal operation.
-    std::atomic_thread_fence(std::memory_order_release);
-    drain_state_.pending_task.store(nullptr, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-}
-
-// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
-//
-// Protocol (single-stage ack barrier):
-//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
-//      until all ack bits are set.
-//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
-//   2. Election: one thread wins the CAS and becomes the drain worker.
-//      If resources are insufficient, reset ack/election fields and return --
-//      all threads resume completion polling to free running cores, then retry.
-//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
-//      Non-elected threads spin-wait until sync_start_pending == 0.
-//      During dispatch the elected thread has exclusive tracker access.
-void SchedulerContext::handle_drain_mode(int32_t thread_idx) {
-    // Every spin in this function honors is_completed(): once the run latches
-    // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave
-    // the dispatch loop and stop participating in the drain. A thread parked in a
-    // drain spin would then wait forever for acks / a gate-open that can no longer
-    // arrive -- the AICPU watchdog never fires here because these spins live
-    // outside the dispatch loop's wall-clock budget, so the hang escalates straight
-    // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on
-    // completed_ is always safe: any pending sync_start task is either already
-    // dispatched (a stale re-popped slot) or moot under teardown, and deinit()
-    // resets drain_state_ before the next run, so leaving it dirty is harmless.
-    // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
-    int32_t block_num;
-    do {
-        if (is_completed()) return;
-        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
-    } while (block_num < 0);
-    if (block_num == 0) return;
-
-    uint32_t all_acked = (1u << active_sched_threads_) - 1;
-
-    // Ack barrier -- signal this thread has stopped dispatch.
-    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
-
-    // Spin until all threads have acked.
-    // If our bit is cleared while waiting, elected reset due to insufficient resources.
-    while (true) {
-        if (is_completed()) return;
-        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
-        if ((ack & all_acked) == all_acked) break;
-        if ((ack & (1u << thread_idx)) == 0) return;
-        SPIN_WAIT_HINT();
-    }
-
-    // Election -- exactly one thread wins the CAS.
-    int32_t expected = 0;
-    drain_state_.drain_worker_elected.compare_exchange_strong(
-        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
-    );
-
-    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
-        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
-        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
-            if (is_completed()) return;
-            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
-            SPIN_WAIT_HINT();
-        }
-        return;
-    }
-
-    // Elected: check if global resources are sufficient.
-    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
-    if (slot_state == nullptr) {
-        // pending_task is observed null only when a concurrent drain completion
-        // already cleared it (drain_worker_dispatch nulls it before reopening the
-        // gate). That drain is done and this is a stale-elected thread, so just
-        // release the election lock and return. Do NOT clear drain_ack_mask or
-        // sync_start_pending: a *new* drain run may already be active and
-        // accumulating acks, and zeroing them would corrupt it into a hang.
-        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-        return;
-    }
-    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-    int32_t available = count_global_available(shape);
-
-    if (available < block_num) {
-        // Insufficient resources -- reset drain fields so threads can resume
-        // completion polling to free running cores, then retry.
-        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-        return;
-    }
-
-    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
-    drain_worker_dispatch(block_num);
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
deleted file mode 100644
index e76f152a3..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#ifndef SCHEDULER_CONTEXT_H
-#define SCHEDULER_CONTEXT_H
-
-#include "aicpu/platform_regs.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/unified_log.h"
-#include "scheduler_types.h"
-
-#include "scheduler/pto_scheduler.h"
-
-#include "aicore_completion_mailbox.h"
-#include "pto2_dispatch_payload.h"
-
-// These macros are defined in runtime.h, but we cannot include it here
-// (it pulls in Handshake which we only forward-declare).  Mirror the
-// authoritative values so the class layout compiles standalone.
-#ifndef RUNTIME_MAX_WORKER
-#define RUNTIME_MAX_WORKER 72
-#endif
-#ifndef RUNTIME_MAX_FUNC_ID
-#define RUNTIME_MAX_FUNC_ID 1024
-#endif
-
-// Forward declarations — avoid pulling in full headers for pointer/reference params.
-class Runtime;
-struct Handshake;
-struct PTO2Runtime;
-
-/**
- * SchedulerContext: owns all scheduler-side state and methods.
- *
- * Held as a member of AicpuExecutor (sched_ctx_).  The single public entry
- * point is resolve_and_dispatch(), called once per scheduler thread.
- *
- * All dispatch/completion/drain/cold-path logic is implemented as private
- * member methods, split across three .cpp files by responsibility:
- *   - scheduler_completion.cpp  (completion polling, drain protocol)
- *   - scheduler_cold_path.cpp   (exit checks, stall diagnostics, profiling)
- *   - scheduler_dispatch.cpp    (task dispatch loop and helpers)
- */
-class SchedulerContext {
-public:
-    // =========================================================================
-    // Lifecycle
-    // =========================================================================
-
-    // Initialize scheduler state from the given runtime and thread layout.
-    // - Discovers cores via handshake_all_cores()
-    // - Assigns cores to scheduler threads
-    // - Resets task counters, payloads, per-core GlobalContext
-    // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
-    // - Captures AICore-register base (consumed by handshake_all_cores())
-    // Returns 0 on success, negative on failure (handshake / assignment error).
-    int32_t
-    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
-
-    // Reset all SchedulerContext-owned state to its post-construction defaults.
-    // Called by AicpuExecutor::deinit() during per-run teardown.
-    void deinit();
-
-    // =========================================================================
-    // Per-thread execution entry points (called by AicpuExecutor::run)
-    // =========================================================================
-
-    // Main scheduler thread entry: poll completion + dispatch ready tasks.
-    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
-
-    // Shutdown AICore registers for this thread's assigned cores.
-    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
-    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
-    int32_t shutdown(int32_t thread_idx);
-
-    // Run all post-orchestration scheduler bookkeeping:
-    //  - publishes core assignments to the perf collector (PTO2_PROFILING)
-    //  - latches submitted task count from PTO2 shared memory
-    //  - folds inline_completed_tasks into completed_tasks_
-    //  - flips orchestrator_done_ and triggers core transition
-    //    (skipped on fatal error — emergency_shutdown runs instead)
-    // Callers must invoke rt_orchestration_done(rt) before this — that
-    // step belongs to the orchestrator lifecycle, not the scheduler.
-    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
-
-    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
-    // mode where rt is created by the orchestrator thread after init().
-    void bind_runtime(PTO2Runtime *rt);
-
-    // =========================================================================
-    // State queries / external synchronization points
-    // =========================================================================
-
-    int32_t aic_count() const { return aic_count_; }
-    int32_t aiv_count() const { return aiv_count_; }
-    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
-    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
-
-    // Block until the first scheduler thread has finished one-time PTO2 init.
-    // Called by the orchestrator thread in device-orch mode.
-    void wait_pto2_init_complete() const;
-
-private:
-    // =========================================================================
-    // State
-    // =========================================================================
-
-    // --- Scheduler binding & per-core runtime state ---
-    alignas(64) PTO2SchedulerState *sched_{nullptr};
-    PTO2Runtime *rt_{nullptr};
-
-    // Per-core execution state, indexed by core_id (= worker_id)
-    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
-
-    // Cluster-ordered core trackers, one per scheduler thread
-    CoreTracker core_trackers_[MAX_AICPU_THREADS];
-
-    // Per-core dispatch payload storage: dual-buffer for pipelining.
-    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
-    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
-
-    // Per-core deferred-completion software registration storage.  This has
-    // the same runtime lifetime as payload_per_core_, but is kept out of the
-    // dispatch payload so normal task dispatch layout and cache footprint stay
-    // unchanged.
-    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
-
-    // sync_start drain coordination
-    SyncStartDrainState drain_state_;
-
-#if PTO2_PROFILING
-    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
-    // Cached once at init() from get_l2_swimlane_level(), AFTER
-    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
-    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
-#endif
-
-    // --- Task-execution tracking ---
-    std::atomic<int32_t> completed_tasks_{0};
-    int32_t total_tasks_{0};
-    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
-    // volatile prevents the compiler from hoisting the load out of spin loops.
-    volatile bool orchestrator_done_{false};
-    std::atomic<bool> completed_{false};
-    uint64_t *func_id_to_addr_{nullptr};
-
-    // --- Core-transition coordination ---
-    std::atomic<bool> transition_requested_{false};
-    std::atomic<int32_t> wait_reassign_{0};
-    std::atomic<bool> reassigned_{false};
-
-    // --- Thread/core configuration ---
-    int32_t active_sched_threads_{0};
-    int32_t sched_thread_num_{0};
-    bool orch_to_sched_{false};
-    int32_t aicpu_thread_num_{0};
-    int32_t cores_total_num_{0};
-
-    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
-    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
-    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
-    int32_t aic_count_{0};
-    int32_t aiv_count_{0};
-
-    // Platform AICore-register base array (set by AicpuExecutor before init()).
-    uint64_t regs_{0};
-
-#if PTO2_PROFILING
-    // PMU profiling: physical core IDs for PMU MMIO base resolution.
-    // Separate storage because CoreExecState's 64-byte budget has no room for
-    // physical_core_id when PTO2_PROFILING=1.
-    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
-#endif
-
-    // --- One-time init coordination ---
-    std::atomic<bool> pto2_init_done_{false};
-    std::atomic<bool> pto2_init_complete_{false};
-
-    // =========================================================================
-    // Core management (scheduler_cold_path.cpp)
-    // =========================================================================
-
-    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
-    int32_t handshake_all_cores(Runtime *runtime);
-
-    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
-    bool assign_cores_to_threads();
-
-    // Re-distribute all cores across all threads after orchestration completes.
-    void reassign_cores_for_all_threads();
-
-    // Emergency shutdown: broadcast exit signal to every handshake'd core and
-    // deinit their AICore register blocks. Idempotent.
-    void emergency_shutdown(Runtime *runtime);
-
-    // =========================================================================
-    // Dispatch (scheduler_dispatch.cpp)
-    // =========================================================================
-
-    static const char *shape_name(PTO2ResourceShape shape);
-
-    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
-    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
-    // convention already established in the stall log family.
-    static inline const char *subslot_name(PTO2SubtaskSlot s) {
-        switch (s) {
-        case PTO2SubtaskSlot::AIC:
-            return "aic";
-        case PTO2SubtaskSlot::AIV0:
-            return "aiv0";
-        case PTO2SubtaskSlot::AIV1:
-            return "aiv1";
-        }
-        return "?";
-    }
-
-    int pop_ready_tasks_batch(
-        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
-        int max_count
-    );
-
-    void build_payload(
-        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-        const AsyncCtx &async_ctx, int32_t block_idx
-    );
-
-    // Batched-dispatch primitives. prepare_* builds the payload and per-core
-    // state; publish_* issues the MMIO register write. Callers must wmb()
-    // between the prepare batch and the publish batch, then sample
-    // get_sys_cnt_aicpu() once and pass it to publish_* for every handle.
-    //
-    // dispatch_timestamp_slot points to the CoreExecState slot
-    // (pending_dispatch_timestamp / running_dispatch_timestamp) selected at
-    // prepare time, or nullptr when L2 swimlane is below AICPU_TIMING and no
-    // dispatch timestamp is being recorded.
-    struct PublishHandle {
-        uint64_t reg_addr;
-        uint32_t reg_task_id;
-        int32_t core_offset;
-        uint64_t *dispatch_timestamp_slot;
-    };
-
-    PublishHandle prepare_subtask_to_core(
-        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-        bool to_pending, int32_t block_idx
-    );
-
-    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts) {
-        if (h.dispatch_timestamp_slot != nullptr) {
-            *h.dispatch_timestamp_slot = dispatch_ts;
-        }
-        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));
-    }
-
-    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
-    // caller-supplied handles buffer. Returns the number of handles written.
-    int prepare_block_for_dispatch(
-        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
-        bool to_pending, int32_t block_idx, PublishHandle *out_handles
-    );
-
-    void dispatch_shape(
-        int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
-        CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
-    );
-
-    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
-    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
-    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
-    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
-    // skipped for the whole pass but MIX-PENDING still runs.
-    //
-    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
-    // current pass only. The next loop iteration re-evaluates after Phase 1
-    // completion polling and the global MIX queue draining (here or on any
-    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
-    // not unbounded — once mix completes on at least one cluster, the next
-    // pass either drains the residual or admits AIC/AIV.
-    void dispatch_ready_tasks(
-        int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
-        bool pmu_active, bool &made_progress, bool &try_pushed
-    );
-
-    // Returns true if any *other* scheduler thread currently has an idle core
-    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
-    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
-    // rationale and the safety argument against the drain worker.
-    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
-
-    // True if mix tasks remain anywhere this thread could see them: the caller's
-    // MIX local LIFO stack or the global MIX ready queue. Approximate —
-    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
-    // positions with std::memory_order_relaxed and may interleave with concurrent
-    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
-    // loads — that one isn't on this path. A stale read here causes at most one
-    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
-    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
-        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
-    }
-
-    // =========================================================================
-    // Completion & drain (scheduler_completion.cpp)
-    // =========================================================================
-
-    static SlotTransition
-    decide_slot_transition(int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id);
-
-    void complete_slot_task(
-        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
-        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
-        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-        PTO2LocalReadyBuffer *local_bufs
-#if PTO2_PROFILING
-        ,
-        uint64_t dispatch_ts, uint64_t finish_ts
-#endif
-    );
-
-    static void promote_pending_to_running(CoreExecState &core);
-    static void clear_running_slot(CoreExecState &core);
-
-    void check_running_cores_for_completion(
-        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
-        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-        PTO2LocalReadyBuffer *local_bufs
-    );
-
-    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
-    int32_t count_global_available(PTO2ResourceShape shape);
-    void drain_worker_dispatch(int32_t block_num);
-    void handle_drain_mode(int32_t thread_idx);
-
-    // =========================================================================
-    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
-    // =========================================================================
-
-    __attribute__((noinline, cold)) LoopAction
-    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
-
-    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
-
-    __attribute__((noinline, cold)) LoopAction
-    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
-
-    __attribute__((noinline, cold)) void
-    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
-
-    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
-        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
-    );
-
-    // Reverse lookup: given a global core_id, find which scheduler thread's
-    // tracker owns it. Returns -1 if not found. Linear scan — only used on
-    // the cold diagnostic path.
-    int32_t find_core_owner_thread(int32_t core_id) const;
-
-    // Does this thread own any core with a RUNNING task (running_slot_state set)?
-    // Gates the scheduler timeout fatal latch: a thread without an owned
-    // RUNNING task has no first-hand evidence of a stuck dispatch and must
-    // not declare global fatal on its own idle observation. The thread that
-    // does own the stuck task will reach the budget on its own polls and
-    // latch with valid evidence (or recover when the COND register flips).
-    bool self_owns_running_task(int32_t thread_idx) const;
-
-    // Does *any* scheduler thread own a RUNNING task? Used as the second
-    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
-    // owns RUNNING work AND tasks remain incomplete, the system is in a
-    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
-    // ownerless idle threads are the only observers — let one of them latch.
-    bool no_thread_owns_running_task() const;
-
-    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
-        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
-        int32_t last_progress_count
-#if PTO2_PROFILING
-        ,
-        uint64_t sched_start_ts
-#endif
-    );
-
-#if PTO2_PROFILING
-    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
-#endif
-
-    // =========================================================================
-    // Small inline helpers
-    // =========================================================================
-
-    uint64_t get_function_bin_addr(int func_id) const {
-        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
-            return 0;
-        }
-        return func_id_to_addr_[func_id];
-    }
-};
-
-#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
deleted file mode 100644
index 4082becad..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ /dev/null
@@ -1,1080 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "scheduler_context.h"
-
-#include <algorithm>
-#include <cinttypes>
-#include <limits>
-
-#include "common.h"  // debug_assert
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/platform_regs.h"
-#include "callable.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/memory_barrier.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// Performance profiling headers
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-
-#ifndef unlikely
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#endif
-
-// =============================================================================
-// Dispatch helpers
-// =============================================================================
-
-namespace {
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-}
-
-const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {
-    switch (shape) {
-    case PTO2ResourceShape::AIC:
-        return "AIC";
-    case PTO2ResourceShape::AIV:
-        return "AIV";
-    case PTO2ResourceShape::MIX:
-        return "MIX";
-    case PTO2ResourceShape::DUMMY:
-        return "DUMMY";
-    }
-    return "UNKNOWN";
-}
-
-bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
-    // Cross-thread read of peer trackers without explicit synchronization. The
-    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
-    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
-    // value is consumed only as a scheduling *hint* — a stale read at worst
-    // causes one missed/extra pending dispatch, corrected on the next iteration.
-    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
-    // barrier (all peers spin out of the dispatch path before any tracker
-    // mutation), so this routine is never racing the drain worker.
-    for (int32_t t = 0; t < active_sched_threads_; t++) {
-        if (t == self_thread_idx) continue;
-        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
-            return true;
-        }
-    }
-    return false;
-}
-
-int SchedulerContext::pop_ready_tasks_batch(
-    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
-) {
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#if PTO2_SCHED_PROFILING
-    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
-    uint64_t t_pop_start = get_sys_cnt_aicpu();
-    int count = sched_->get_ready_tasks_batch(
-        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
-    );
-    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
-#else
-    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-#endif
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        if (count > 0) {
-            l2_swimlane.pop_hit += count;
-        } else {
-            l2_swimlane.pop_miss++;
-        }
-    }
-#else
-    (void)thread_idx;
-    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-#endif
-    return count;
-}
-
-void SchedulerContext::build_payload(
-    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-    const AsyncCtx &async_ctx, int32_t block_idx
-) {
-    int32_t slot_idx = static_cast<int32_t>(subslot);
-    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
-    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
-    dispatch_payload.function_bin_addr = callable->resolved_addr();
-    auto &payload = *slot_state.payload;
-    int n = 0;
-    for (int32_t i = 0; i < payload.tensor_count; i++) {
-        dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
-    }
-    for (int32_t i = 0; i < payload.scalar_count; i++) {
-        dispatch_payload.args[n++] = payload.scalars[i];
-    }
-    dispatch_payload.local_context.block_idx = block_idx;
-    dispatch_payload.local_context.block_num = slot_state.logical_block_num;
-    dispatch_payload.local_context.async_ctx = async_ctx;
-    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
-    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
-}
-
-SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
-    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending,
-    int32_t block_idx
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    auto core_id = tracker.get_core_id_by_offset(core_offset);
-    CoreExecState &core_exec_state = core_exec_states_[core_id];
-
-    core_exec_state.dispatch_seq++;
-    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
-    static_assert(
-        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
-    );
-    if (reg_task_id >= AICORE_EXIT_SIGNAL) {
-        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
-        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
-    }
-
-    uint32_t buf_idx = reg_task_id & 1u;
-    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
-    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
-    deferred_slab->count = 0;
-    deferred_slab->error_code = PTO2_ERROR_NONE;
-    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
-    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
-
-    if (to_pending) {
-        core_exec_state.pending_subslot = subslot;
-        core_exec_state.pending_slot_state = &slot_state;
-        core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
-    } else {
-        core_exec_state.running_subslot = subslot;
-        core_exec_state.running_slot_state = &slot_state;
-        core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
-        tracker.change_core_state(core_offset);
-    }
-    tracker.set_pending_occupied(core_offset);
-
-    LOG_DEBUG(
-        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
-        " core_offset=%d core_id=%d reg_task_id=%u",
-        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
-        static_cast<int64_t>(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
-        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
-        core_offset, core_id, reg_task_id
-    );
-
-    // AICore buffer rotation lives on the dispatch path: count this dispatch
-    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
-    // boundary. The completion-before-dispatch invariant makes this race-free
-    // (all prior tasks on this core have FIN'd, so AICore has dcci'd their
-    // records out of the old buffer). Gated on the same enable bit as flush
-    // so level=1 (AICORE_TIMING-only) participates without needing complete_task.
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
-        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
-    }
-#endif
-
-    uint64_t *dispatch_timestamp_slot = nullptr;
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-        dispatch_timestamp_slot =
-            to_pending ? &core_exec_state.pending_dispatch_timestamp : &core_exec_state.running_dispatch_timestamp;
-    }
-#endif
-
-    return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
-}
-
-int SchedulerContext::prepare_block_for_dispatch(
-    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending,
-    int32_t block_idx, PublishHandle *out_handles
-) {
-#if PTO2_PROFILING
-    if (is_dump_tensor_enabled()) {
-        dump_tensors_for_task<PTO2_SUBTASK_SLOT_COUNT>(
-            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
-            [](ActiveMask active_mask, int raw_subtask_id) {
-                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-            },
-            [this](int32_t func_id) {
-                return get_function_bin_addr(func_id);
-            }
-        );
-    }
-#endif
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    if (shape == PTO2ResourceShape::MIX) {
-        uint8_t cmask = slot_state.active_mask.core_mask();
-        int n = 0;
-        if (cmask & PTO2_SUBTASK_MASK_AIC) {
-            bool p = to_pending && !tracker.is_aic_core_idle(core_offset);
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, p, block_idx
-            );
-        }
-        if (cmask & PTO2_SUBTASK_MASK_AIV0) {
-            bool p = to_pending && !tracker.is_aiv0_core_idle(core_offset);
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, p, block_idx
-            );
-        }
-        if (cmask & PTO2_SUBTASK_MASK_AIV1) {
-            bool p = to_pending && !tracker.is_aiv1_core_idle(core_offset);
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, p, block_idx
-            );
-        }
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(cmask);
-#endif
-        return n;
-    } else if (shape == PTO2ResourceShape::AIC) {
-        out_handles[0] =
-            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
-#endif
-        return 1;
-    } else {
-        out_handles[0] =
-            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
-#endif
-        return 1;
-    }
-}
-
-void SchedulerContext::dispatch_shape(
-    int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
-    CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
-) {
-#if PTO2_SCHED_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#endif
-    if (entered_drain) return;
-
-    bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
-    auto cores = tracker.get_dispatchable_cores(shape, phase);
-    if (!cores.has_value()) return;
-
-    while (cores.has_value() && !entered_drain) {
-        int want = cores.count();
-        PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
-        int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
-        if (got == 0) break;
-
-        // sync_start exclusion gate.
-        //
-        // When the popped batch contains a sync_start task we MUST publish each
-        // prior task with its own wmb so AICore receives them with time
-        // separation. The drain coordinator's `count_global_available()` check
-        // reads the per-thread CoreTracker, and although `prepare_block_for_dispatch`
-        // marks cores occupied synchronously, the head-start between successive
-        // tasks is what lets the surrounding completion loop catch up on FINs in
-        // the retry window when the sync_start task hits insufficient resources.
-        // Bursting all prior tasks at the end of the pop (cross-task batching)
-        // collapses that head-start and causes spmd_sync_start_stress to time
-        // out via 507018 on ~40% of runs — see
-        // docs/investigations/2026-06-cross-task-batched-publish.md.
-        //
-        // When the batch carries no sync_start task, no drain entry can happen
-        // in this pop, so we hoist `handles[]`, `wmb()`, and the publish loop
-        // out of the per-task body. One wmb amortizes across all tasks and one
-        // dispatch_ts is shared, which restores ~60 ns first-to-last AICore
-        // start span for single-block decode kernels (out_proj, q_proj, ...).
-        // Detection is a single mask check per task — cheap relative to even
-        // one register write.
-        bool any_sync_start = false;
-        for (int bi = 0; bi < got; bi++) {
-            if (batch[bi]->active_mask.requires_sync_start()) {
-                any_sync_start = true;
-                break;
-            }
-        }
-
-        // handles[] is sized for the MIX worst case: total claims across the
-        // pop bounded by `cores.count() ≤ MAX_CLUSTERS`, and each block
-        // contributes ≤ 3 subtasks for MIX.
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int handle_count = 0;
-        bool dispatched_any = false;
-#if PTO2_SCHED_PROFILING
-        uint64_t t_setup_start = get_sys_cnt_aicpu();
-#endif
-
-        // Flush prepared-but-unpublished handles. Required before
-        // `enter_drain_mode` so the drain coordinator sees cores as occupied,
-        // and at the per-task boundary when `any_sync_start` is true.
-        auto flush_publish = [&]() {
-            if (handle_count == 0) return;
-            wmb();
-            uint64_t dispatch_ts = 0;
-#if PTO2_PROFILING
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-                dispatch_ts = get_sys_cnt_aicpu();
-            }
-#endif
-            for (int i = 0; i < handle_count; i++) {
-                publish_subtask_to_core(handles[i], dispatch_ts);
-            }
-            handle_count = 0;
-            made_progress = true;
-        };
-
-        for (int bi = 0; bi < got; bi++) {
-            PTO2TaskSlotState *slot_state = batch[bi];
-
-            if (slot_state->active_mask.requires_sync_start()) {
-                if (is_pending) {
-                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    continue;
-                }
-                int32_t available = cores.count();
-                if (available < slot_state->logical_block_num) {
-                    flush_publish();
-                    if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
-                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    }
-                    for (int rem = bi + 1; rem < got; rem++) {
-                        sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
-                    }
-                    entered_drain = true;
-                    break;
-                }
-            }
-
-            if (!cores.has_value()) {
-                flush_publish();
-                sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
-                break;
-            }
-
-            dispatched_any = true;
-            try_pushed = true;
-            // Claim a contiguous range of blocks, hand the slot back to the
-            // ready queue immediately, then perform the expensive dispatches.
-            // This lets other schedulers concurrently claim and dispatch the
-            // remaining blocks of the same SPMD task instead of spinning while
-            // this thread fills all its own cores. Only local `start + b` is
-            // read after the push — `next_block_idx` may already be advanced
-            // by another scheduler that popped the slot.
-            int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
-            int32_t claim = std::min(cores.count(), remaining);
-            int32_t start = slot_state->next_block_idx;
-            slot_state->next_block_idx += claim;
-
-            if (slot_state->next_block_idx < slot_state->logical_block_num) {
-                sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-            }
-
-            for (int32_t b = 0; b < claim; b++) {
-                auto core_offset = cores.pop_first();
-                handle_count += prepare_block_for_dispatch(
-                    thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]
-                );
-            }
-
-            // Sync_start exclusion: flush per task so prior tasks have head-
-            // start time before any sync_start drain check. Normal batches
-            // fall through and accumulate for one cross-task flush at the
-            // end of the pop.
-            if (any_sync_start) {
-                flush_publish();
-            }
-        }
-
-        flush_publish();
-#if PTO2_SCHED_PROFILING
-        l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
-#endif
-
-        if (!dispatched_any) break;
-
-        if (!cores.has_value()) {
-            cores = tracker.get_dispatchable_cores(shape, phase);
-        }
-    }
-}
-
-void SchedulerContext::dispatch_ready_tasks(
-    int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
-    bool pmu_active, bool &made_progress, bool &try_pushed
-) {
-    using Phase = CoreTracker::DispatchPhase;
-    constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
-
-    // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle
-    // through this 2-elem array, with order toggled by thread parity for
-    // shape-level load balancing across threads.
-    static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
-        {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
-        {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
-    };
-    const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
-
-    // Spill overflow from local_bufs to the shared ready queue BEFORE we start
-    // dispatching. release_fanin's fast path packs all newly-ready consumers
-    // into the producing thread's local_bufs (zero atomic, peer-invisible). For
-    // batch releases (e.g. attn_fence → 50 out_proj consumers) that
-    // overshoots this thread's slot budget so peers are starving while we
-    // hoard. The cross-thread invisibility window between "complete pushes 50
-    // to local" and "IDLE-AIC's mid-phase flush exposes overflow to shared"
-    // is what shows up in the swimlane as the multi-microsecond inter-thread
-    // stagger on out_proj's first wave.
-    //
-    // Gate conditions:
-    //   (a) local count exceeds this thread's per-shape block budget — we
-    //       can't dispatch them all even with both RUNNING+PENDING slots;
-    //   (b) at least one peer has idle cores in this shape — they want work.
-    // Both must hold to avoid wasting a CAS push when we could profitably
-    // self-dispatch the overflow. Condition (b) reads peer CoreTracker
-    // (plain 8-byte load on a rarely-contended cache line, ~5 ns) — we
-    // deliberately avoid ready_queues[s].size() here, which is two atomic
-    // loads on lines pushers + poppers actively bounce.
-    //
-    // Capacity derives from how cores are partitioned across sched threads:
-    //   per-shape budget = (PLATFORM_MAX_BLOCKDIM / active_sched_threads_)
-    //                       × cores_per_blockdim_for_that_shape
-    //   MIX is 1 cluster per block dim, so its budget equals the block-dim
-    //   share without multiplying.
-    //
-    // Push the trailing `excess` slot pointers — O(1) count decrement, no
-    // memmove. push_batch is one CAS for the whole excess; peers see the
-    // batch immediately and can race for them.
-    const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;
-    const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {
-        /*AIC=*/bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
-        /*AIV=*/bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
-        /*MIX=*/bd_per_thread,
-    };
-    for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-        auto &lb = local_bufs[s];
-        int32_t excess = lb.count - thread_capacity[s];
-        if (excess <= 0) continue;
-        if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;
-        sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
-        lb.count -= excess;
-    }
-
-    auto flush_local_bufs = [&]() {
-        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-            auto &lb = local_bufs[s];
-            if (lb.count > 0) {
-                sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
-                lb.count = 0;
-            }
-        }
-    };
-    // Every return path below must flush; wrap in RAII so we cannot forget.
-    // The mid-function flush between IDLE and PENDING is still called
-    // explicitly — guard only covers exit.
-    struct FlushGuard {
-        decltype(flush_local_bufs) &flush_fn;
-        ~FlushGuard() { flush_fn(); }
-    } flush_guard{flush_local_bufs};
-
-    bool entered_drain = false;
-
-    // ===== IDLE stage =====
-    dispatch_shape(
-        thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress,
-        try_pushed
-    );
-    if (entered_drain) return;
-
-    // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass.
-    // MIX-PENDING below still runs — that is the core of "mix strict priority":
-    // pending slots are spent on mix before AIC/AIV get any chance.
-    bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
-
-    if (!skip_aic_aiv) {
-        for (int i = 0; i < 2; i++) {
-            PTO2ResourceShape s = aic_aiv[i];
-            dispatch_shape(
-                thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
-                try_pushed
-            );
-            if (entered_drain) return;
-        }
-    }
-
-    // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
-    // peer-thread reads see the IDLE-stage release_fanin output.
-    flush_local_bufs();
-
-    if (pmu_active) return;
-
-    // ===== PENDING stage =====
-    // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that
-    // peer's next IDLE-MIX iteration will pull the mix task from the global
-    // queue (already flushed above) at lower latency than us pre-loading a
-    // pending slot here. Forward progress for MIX is preserved: at least one
-    // thread will run MIX-IDLE next pass and consume the residual.
-    //
-    // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain
-    // via pending slots on this thread when no peer is idle.
-    if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) {
-        dispatch_shape(
-            thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain,
-            made_progress, try_pushed
-        );
-        if (entered_drain) return;
-    }
-
-    // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
-    // it set; otherwise, escalate iff PENDING-MIX left residual.
-    if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) {
-        skip_aic_aiv = true;
-    }
-
-    // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin
-    // during in-flight completions; flush_guard ensures these don't carry
-    // across to the next iteration's IDLE stage.
-    if (skip_aic_aiv) return;
-
-    // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
-    // will pull from the global queue on its next IDLE pass.
-    for (int i = 0; i < 2; i++) {
-        PTO2ResourceShape s = aic_aiv[i];
-        if (has_idle_in_other_threads(thread_idx, s)) continue;
-        dispatch_shape(
-            thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
-            try_pushed
-        );
-        if (entered_drain) return;
-    }
-}
-
-// =============================================================================
-// Main scheduler dispatch loop
-// =============================================================================
-
-int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
-    always_assert(sched_ != nullptr);
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
-
-    PTO2SharedMemoryHeader *header = sched_->sm_header;
-    if (!header) {
-        LOG_ERROR("PTO2 dispatch: header is null");
-        return -1;
-    }
-    LOG_INFO_V0(
-        "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast<void *>(header),
-        static_cast<uint64_t>(header->rings[0].task_descriptors_offset),
-        static_cast<uint64_t>(header->rings[0].task_window_size)
-    );
-
-    Handshake *hank = static_cast<Handshake *>(runtime->workers);
-    LOG_INFO_V0(
-        "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast<void *>(hank),
-        static_cast<uint64_t>(header->rings[0].task_window_size)
-    );
-
-    // One-time init: assign perf buffers (one thread does it; others wait)
-    if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) {
-        LOG_INFO_V0("Thread %d: doing one-time init", thread_idx);
-
-#if PTO2_PROFILING
-        if (is_dump_tensor_enabled()) {
-            dump_tensor_init(orch_to_sched_ ? aicpu_thread_num_ : sched_thread_num_);
-        }
-#endif
-
-#if PTO2_PROFILING
-        // Initialize PMU: program events, start counters, and pop initial buffers
-        if (is_pmu_enabled()) {
-            pmu_aicpu_init(physical_core_ids_, cores_total_num_);
-            LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
-        }
-#endif
-
-        LOG_INFO_V0("Thread %d: one-time init done", thread_idx);
-        pto2_init_complete_.store(true, std::memory_order_release);
-    } else {
-        while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-            SPIN_WAIT_HINT();
-        }
-    }
-
-    LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num());
-    int32_t cur_thread_completed = 0;
-    // Non-zero once a scheduler-hang timeout latches; returned in place of the
-    // completed count so the caller still sees the negative error rc while the
-    // shared end-of-loop flush below runs.
-    int32_t timeout_rc = 0;
-    int32_t idle_iterations = 0;
-    int32_t last_progress_count = 0;
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-    l2_swimlane.reset();
-    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
-#endif
-
-    constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
-    PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
-    PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
-    for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
-    }
-    PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
-    int32_t deferred_release_count = 0;
-
-    bool cores_released = false;
-
-    // PMU runs require single-issue dispatch — overlapping in-flight tasks
-    // pollute per-task PMU counters, so skip the PENDING pre-load phase.
-    // Cached at function scope: is_pmu_enabled() is extern "C" and the
-    // compiler cannot hoist it across the dispatch loop on its own.
-    const bool pmu_active = is_pmu_enabled();
-
-#if PTO2_PROFILING
-    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
-#endif
-
-#if PTO2_PROFILING
-    // Queue-depth snapshot carried across the iteration boundary: each phase
-    // emit consumes (phase_start_*) and refreshes them with its own end snapshot
-    // so the next phase's "at_start" equals the previous phase's "at_end".
-    //
-    // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX.
-    //
-    // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer)
-    // is a single int read on a register-cached stack — free. Shared depth
-    // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines
-    // that all peer sched threads also write to (enqueue_pos and dequeue_pos
-    // bounce on every flush_local_bufs + every pop). With both phases emitting
-    // per iter that's 12 cross-core loads × thousands of iters per run, a
-    // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared
-    // snapshot, refreshed at most once per iteration. The complete-emit and
-    // dispatch-emit in the same iter both reuse the same shared sample; the
-    // big transitions (local→shared flush) still show up across iter boundaries.
-    static_assert(
-        L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES,
-        "queue snapshot width must match runtime resource shape count"
-    );
-    int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    bool iter_shared_sampled = false;
-    auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
-        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-            local_out[s] = static_cast<int16_t>(local_bufs[s].count);
-        }
-    };
-    auto get_or_sample_shared = [&]() -> const int16_t * {
-        if (!iter_shared_sampled) {
-            // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE
-            // is in the low thousands today but could grow with platform
-            // scaling — without clamp, sizes above 32767 wrap to negatives
-            // and silently corrupt the snapshot.
-            constexpr size_t kMax = static_cast<size_t>(std::numeric_limits<int16_t>::max());
-            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                const size_t qsize = sched_->ready_queues[s].size();
-                iter_shared_snapshot[s] = static_cast<int16_t>(std::min(qsize, kMax));
-            }
-            iter_shared_sampled = true;
-        }
-        return iter_shared_snapshot;
-    };
-    auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES],
-                                 int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
-        capture_local_snapshot(local_out);
-        const int16_t *shared_cached = get_or_sample_shared();
-        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++)
-            shared_out[s] = shared_cached[s];
-    };
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        capture_phase_end(phase_start_local, phase_start_shared);
-    }
-#endif
-
-    // Wall-clock timestamp of the last completed task on this thread.
-    // Updated on made_progress; consulted to decide whether the wall-clock
-    // budget for declaring a scheduler hang has elapsed. Initialized to
-    // "now" so the first budget cycle starts when this thread does, not at
-    // an undefined value.
-    uint64_t last_progress_ts = get_sys_cnt_aicpu();
-
-    while (true) {
-        if (completed_.load(std::memory_order_acquire)) {
-            break;
-        }
-        bool made_progress = false;
-#if PTO2_PROFILING
-        CYCLE_COUNT_START();
-        l2_swimlane.sched_loop_count++;
-        uint64_t _t0_phase = _t0;
-        // Per-iter lazy shared-queue snapshot: first phase emit in this iter
-        // pays the atomic-load cost, subsequent emits in the same iter reuse
-        // the cached value. Reset here so we re-sample exactly once per iter
-        // (or skip entirely on iters with no phase emit).
-        iter_shared_sampled = false;
-#endif
-        int32_t task_count = 0;
-        if (!tracker.has_any_running_cores()) {
-            LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
-        if (!cores_released && orch_to_sched_) {
-            LoopAction action = handle_core_transition(cores_released);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
-#if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-#endif
-
-        // Phase 1: Check running cores for completion
-        int32_t completed_this_turn = 0;
-
-        bool try_completed = tracker.has_any_running_cores();
-        if (try_completed) {
-            check_running_cores_for_completion(
-                thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress,
-                deferred_release_slot_states, deferred_release_count, local_bufs
-            );
-        }
-        if (completed_this_turn > 0) {
-#if PTO2_SCHED_PROFILING
-            sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed);
-#endif
-            int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
-            int32_t new_total = prev + completed_this_turn;
-            last_progress_count = new_total;
-            if (thread_idx == 0 && task_count > 0) {
-                if (new_total <= PROGRESS_VERBOSE_THRESHOLD ||
-                    new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) {
-                    LOG_INFO_V9(
-                        "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count,
-                        100.0 * new_total / task_count
-                    );
-                }
-            }
-        }
-
-        if (rt_ != nullptr && rt_->aicore_mailbox != nullptr &&
-            (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) {
-            AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(
-                rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count,
-                PTO2_DEFERRED_RELEASE_CAP
-#if PTO2_SCHED_PROFILING
-                ,
-                thread_idx
-#endif
-            );
-            if (poll_result.error_code != PTO2_ERROR_NONE) {
-                int32_t expected = PTO2_ERROR_NONE;
-                header->sched_error_code.compare_exchange_strong(
-                    expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire
-                );
-                completed_.store(true, std::memory_order_release);
-                break;
-            }
-            if (poll_result.completed > 0) {
-#if PTO2_SCHED_PROFILING
-                sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed);
-#endif
-                int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
-                int32_t new_total = prev + poll_result.completed;
-                last_progress_count = new_total;
-                made_progress = true;
-            }
-        }
-
-#if PTO2_PROFILING
-        if (!try_completed) {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-        } else {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_complete_count > 0) {
-                // Local depth is cheap (this thread's own buffer counter).
-                // Shared depth is NOT sampled here: complete's release_fanin
-                // pushes to local_bufs in the fast path (try_push succeeds
-                // until cap=64). Shared only changes on dispatch's flush
-                // path. Carrying phase_start_shared forward as end_shared
-                // is the right answer 99% of the time AND skips three
-                // contended atomic loads per emit.
-                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                capture_local_snapshot(phase_end_local);
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count,
-                    l2_swimlane.phase_complete_count, /*pop_hit=*/0, /*pop_miss=*/0, phase_start_local,
-                    phase_start_shared, phase_end_local, phase_start_shared
-                );
-                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                    phase_start_local[s] = phase_end_local[s];
-                    // phase_start_shared unchanged — carried forward
-                }
-                _t0_phase = _t1;
-                l2_swimlane.phase_complete_count = 0;
-            }
-        }
-#endif
-
-        bool try_pushed = false;
-
-        // Phase 2 drain check
-        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
-            handle_drain_mode(thread_idx);
-            continue;
-        }
-
-        // Phase 3: Drain wiring queue (thread 0 only)
-        if (thread_idx == 0) {
-            int wired = sched_->drain_wiring_queue(orchestrator_done_);
-            if (wired > 0) {
-                made_progress = true;
-#if PTO2_SCHED_PROFILING
-                l2_swimlane.phase_wiring_count += wired;
-#endif
-            }
-        }
-#if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
-#endif
-
-        // Phase 3b: Drain dummy ready queue (thread 0 only).
-        //
-        // Dependency-only tasks bypass AICore dispatch: they go through the
-        // scheduler so fanin/fanout edges stay consistent, but completion is
-        // signalled inline here. Pinned to thread 0 to avoid cross-thread
-        // races and to keep cache hot near the wiring drain above.
-        if (thread_idx == 0) {
-            constexpr int DUMMY_DRAIN_BATCH = 16;
-            PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
-            int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
-            for (int di = 0; di < dummy_got; di++) {
-                PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
-#if PTO2_SCHED_PROFILING
-                sched_->on_mixed_task_complete(dummy_slot, thread_idx, local_bufs);
-#else
-                sched_->on_mixed_task_complete(dummy_slot, local_bufs);
-#endif
-                // Dummy tasks have no subtasks to retire and no fanout pre-conditions
-                // beyond their own producers; release self-reference so the slot can
-                // reach CONSUMED once all consumers drain.
-                deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
-                if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
-                    while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                        (void)sched_->on_task_release(
-                            *deferred_release_slot_states[--deferred_release_count], thread_idx
-                        );
-#else
-                        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-                    }
-                }
-                int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
-                last_progress_count = prev + 1;
-                cur_thread_completed++;
-            }
-            if (dummy_got > 0) {
-                made_progress = true;
-            }
-        }
-
-        // Phase 4: MIX-strict-priority dispatch with phase-split and
-        // cross-thread idle gating. See dispatch_ready_tasks for the policy.
-        dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
-
-#if PTO2_PROFILING
-        if (!try_pushed) {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-        } else {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) {
-                // Final-drain at loop end emits the trailing-idle tail so
-                // sum-of-deltas == run-cumulative.
-                uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
-                uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
-                // L2SwimlaneAicpuSchedPhaseRecord's pop_hit / pop_miss are uint32 — a delta that overflows means
-                // an emit was missed for ~4 billion pops, which is well outside any
-                // realistic dispatch cadence and silently truncates without this guard.
-                debug_assert(pop_hit_delta < (1ULL << 32));
-                debug_assert(pop_miss_delta < (1ULL << 32));
-                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                capture_phase_end(phase_end_local, phase_end_shared);
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, _t1, l2_swimlane.sched_loop_count,
-                    l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
-                    static_cast<uint32_t>(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local,
-                    phase_end_shared
-                );
-                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                    phase_start_local[s] = phase_end_local[s];
-                    phase_start_shared[s] = phase_end_shared[s];
-                }
-                _t0_phase = _t1;
-                l2_swimlane.phase_dispatch_count = 0;
-                l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
-                l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
-            }
-        }
-#endif
-
-#if !PTO2_PROFILING
-        (void)try_completed;
-        (void)try_pushed;
-#endif
-
-        if (made_progress) {
-            idle_iterations = 0;
-            last_progress_ts = get_sys_cnt_aicpu();
-        } else {
-            while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-            idle_iterations++;
-
-            if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) {
-                LoopAction action = check_idle_fatal_error(thread_idx, header, runtime);
-                if (action == LoopAction::BREAK_LOOP) break;
-            }
-
-            if (idle_iterations % STALL_LOG_INTERVAL == 0) {
-                log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
-            }
-            // Wall-clock budget gate, with two fatal-latch branches:
-            //
-            // 1. Self owns a RUNNING task — first-hand evidence the
-            //    dispatch is stuck. Latch.
-            // 2. No thread anywhere owns a RUNNING task AND tasks remain
-            //    unfinished — the system is in a pre-dispatch / WAIT-only
-            //    deadlock (e.g. dependency cycle). Ownerless idle threads
-            //    are the only observers; let this one latch on the global
-            //    evidence (`completed_tasks_ < total_tasks_` and
-            //    `no_thread_owns_running_task()`).
-            //
-            // Otherwise: a sibling thread owns a RUNNING task but hasn't
-            // hit its own budget yet (typical distributed startup-skew
-            // case) — refresh last_progress_ts and keep spinning. The
-            // STALL diagnostic above still fires periodically so
-            // observability is preserved.
-            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
-                bool self_owns = self_owns_running_task(thread_idx);
-                bool global_stuck = !self_owns && total_tasks_ > 0 &&
-                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
-                                    no_thread_owns_running_task();
-                if (self_owns || global_stuck) {
-                    // Latch the error + emergency_shutdown, then break to the
-                    // shared end-of-loop cleanup so the diagnostic buffers get
-                    // flushed to the host. An early return here would strand the
-                    // stuck task's already-dumped inputs and every completed
-                    // task's in/out records in the unflushed per-thread dump
-                    // buffer — exactly the state we need to triage the hang.
-                    timeout_rc = handle_timeout_exit(
-                        thread_idx, header, runtime, idle_iterations, last_progress_count
-#if PTO2_PROFILING
-                        ,
-                        l2_swimlane.sched_start_ts
-#endif
-                    );
-                    break;
-                }
-                last_progress_ts = get_sys_cnt_aicpu();
-            }
-            SPIN_WAIT_HINT();
-#if PTO2_PROFILING
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-            // Idle iterations no longer emit a phase record. Host tooling
-            // recovers idle spans from the gap between consecutive sched
-            // phase records on the same thread. _t0_phase still advances
-            // so the next emitted COMPLETE/DISPATCH gets the correct
-            // start_time (the iter it actually ran in), not the start of
-            // the preceding idle stretch.
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-                _t0_phase = _t1;
-            }
-#endif
-        }
-    }
-
-    // Drain any entries left in the deferred-release batch. The in-loop flush
-    // only fires on idle iterations and on buffer-full; a loop exit while the
-    // last iteration made progress can leave entries un-released. Drop them
-    // here so every consumed producer slot completes its on_task_release
-    // regardless of which loop-exit path fired.
-    while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-    }
-
-#if PTO2_PROFILING
-    // Final-drain: emit any pop_hit / pop_miss accrued since the last
-    // dispatch emit (typically the trailing idle loops while waiting for
-    // orchestrator_done_) as a zero-duration synthetic dispatch record so
-    // sum(record.pop_*) reconciles with the run-cumulative counter.
-    // Gate on SCHED_PHASES — at lower levels the phase buffer is never
-    // flushed (see below), so writing this record would be wasted work.
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
-        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
-        debug_assert(final_pop_hit_delta < (1ULL << 32));
-        debug_assert(final_pop_miss_delta < (1ULL << 32));
-        if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
-            uint64_t t_now = get_sys_cnt_aicpu();
-            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            capture_phase_end(phase_end_local, phase_end_shared);
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0,
-                static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta),
-                phase_end_local, phase_end_shared, phase_end_local, phase_end_shared
-            );
-            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
-            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
-        }
-    }
-    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
-#endif
-
-#if PTO2_PROFILING
-    if (l2_swimlane.l2_swimlane_enabled) {
-        l2_swimlane_aicpu_flush(
-            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
-        );
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
-        }
-    }
-#endif
-#if PTO2_PROFILING
-    if (is_dump_tensor_enabled()) {
-        dump_tensor_flush(thread_idx);
-    }
-#endif
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_flush_buffers(
-            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
-        );
-    }
-#endif
-
-    return timeout_rc != 0 ? timeout_rc : cur_thread_completed;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
deleted file mode 100644
index 9d52cf1ea..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#ifndef SCHEDULER_TYPES_H
-#define SCHEDULER_TYPES_H
-
-#include <atomic>
-#include <cstdint>
-
-#include "common/core_type.h"
-#include "common/platform_config.h"
-#include "pto_runtime2_types.h"
-#include "spin_hint.h"
-
-// =============================================================================
-// Profiling macros (compile-time gated)
-// =============================================================================
-
-#if PTO2_PROFILING
-#include "aicpu/device_time.h"
-// Accumulated nanoseconds per sub-step
-#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#endif
-
-// =============================================================================
-// Scheduler constants
-// =============================================================================
-
-constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
-
-// Periodic cadence (in idle iterations) for emitting the per-thread STALL
-// diagnostic while no progress is being made. Purely an observability knob,
-// independent of the wall-clock timeout below: small enough to fire a few times
-// before the budget expires, large enough not to flood device_log.
-constexpr int32_t STALL_LOG_INTERVAL = 480000;
-constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
-
-// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
-// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS
-// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread
-// diagnostic cadence.
-//
-// Using wall-clock here is load-bearing for distributed runs: with per-thread
-// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
-// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
-// same iteration count. The fast spinner racing ahead and latching fatal
-// kills the slower-but-correct poller mid-poll — see the distributed
-// startup-skew scenario in issue #897.
-//
-// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h)
-// because the safe value differs per variant: onboard trims it to 2 s so the
-// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight
-// partial output) before STARS reaps the op and poisons the context (chain:
-// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to
-// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant
-// rationale.
-constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
-constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
-    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
-constexpr int32_t STALL_DUMP_READY_MAX = 8;
-constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
-constexpr int32_t STALL_DUMP_CORE_MAX = 8;
-constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
-constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
-
-// =============================================================================
-// Control flow signal from cold-path helpers back to the main dispatch loop.
-// =============================================================================
-
-enum class LoopAction : int8_t {
-    NONE,        // cold path did not trigger; proceed normally
-    BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
-};
-
-// =============================================================================
-// Per-core state: one cache line per core to eliminate false sharing
-// and co-locate all hot-path fields for minimal cache misses.
-// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup).
-// =============================================================================
-
-struct alignas(64) CoreExecState {
-    // --- Hot fields (completion + dispatch, every iteration) ---
-    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
-    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
-    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
-    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
-    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
-    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
-    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
-    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
-    uint8_t pad0_[2];                       // offset 38: alignment padding
-    // Precomputed COND register pointer; resolved once in handshake so the
-    // hot completion poll does a single volatile load instead of recomputing
-    // reg_base + reg_offset(COND) on every iteration.
-    volatile uint32_t *cond_ptr;  // offset 40: precomputed pointer to COND register
-#if PTO2_PROFILING
-    // --- Profiling fields (dispatch path, compile-time gated) ---
-    uint64_t running_dispatch_timestamp;  // offset 48: AICPU dispatch timestamp for running task
-    uint64_t pending_dispatch_timestamp;  // offset 56: AICPU dispatch timestamp for pending task
-#else
-    // --- Cold fields (init/diagnostics only, never in hot path) ---
-    int32_t worker_id;          // offset 48: index in runtime.workers[]
-    uint32_t physical_core_id;  // offset 52: hardware physical core ID
-    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
-    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
-#endif
-};
-static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
-
-// =============================================================================
-// CoreTracker: cluster-based bitmask tracker for idle/running core state.
-//
-// core_states_ encodes per-cluster core idle/running in 3 bits per cluster:
-//   bit i*3   = AIC of cluster i   (1 = idle, 0 = running)
-//   bit i*3+1 = AIV0 of cluster i
-//   bit i*3+2 = AIV1 of cluster i
-// Max 21 clusters per tracker (63 bits in uint64_t).
-// =============================================================================
-
-class alignas(64) CoreTracker {
-public:
-    static inline int32_t MAX_CORE_PER_THREAD = 63;
-    static constexpr int32_t MAX_CLUSTERS = 63 / 3;
-
-public:
-    CoreTracker() = default;
-
-    class BitStates {
-    public:
-        BitStates() = default;
-
-        explicit BitStates(uint64_t states) :
-            states_(states) {}
-        void init() { states_ = 0; }
-
-        BitStates operator~() const { return BitStates(~states_); }
-        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
-        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
-        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
-        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
-        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
-        void operator&=(const BitStates &other) { states_ &= other.states_; }
-        void operator|=(const BitStates &other) { states_ |= other.states_; }
-        void operator^=(const BitStates &other) { states_ ^= other.states_; }
-
-        bool has_value() const { return states_ > 0; }
-        int32_t count() const { return __builtin_popcountll(states_); }
-
-        // Extract the lowest set bit from mask, clear it, and return its position.
-        // Returns -1 if mask is empty.
-        int32_t pop_first() {
-            if (states_ == 0) return -1;
-            int32_t pos = __builtin_ctzll(states_);
-            states_ &= states_ - 1;
-            return pos;
-        }
-
-    private:
-        uint64_t states_{0};
-    };
-
-public:
-    void init(int32_t cluster_count) {
-        cluster_count_ = cluster_count;
-        aic_mask_.init();
-        aiv_mask_.init();
-        pending_occupied_.init();
-        for (int32_t i = 0; i < cluster_count; i++) {
-            aic_mask_ |= BitStates(1ULL << (i * 3));
-            aiv_mask_ |= BitStates(6ULL << (i * 3));
-        }
-        core_states_ = aic_mask_ | aiv_mask_;
-    }
-
-    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
-        core_id_map_[cluster_idx * 3] = aic_wid;
-        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
-        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
-    }
-
-    int32_t get_cluster_count() const { return cluster_count_; }
-
-    // --- Running core queries ---
-
-    template <CoreType CT>
-    bool has_running_cores() const {
-        if constexpr (CT == CoreType::AIC) {
-            return ((~core_states_) & aic_mask_).has_value();
-        } else {
-            return ((~core_states_) & aiv_mask_).has_value();
-        }
-    }
-
-    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
-
-    template <CoreType CT>
-    int32_t get_running_count() const {
-        if constexpr (CT == CoreType::AIC) {
-            return ((~core_states_) & aic_mask_).count();
-        } else {
-            return ((~core_states_) & aiv_mask_).count();
-        }
-    }
-
-    // Return an opaque bitmask for iterating running cores of a given type.
-    // Use pop_first() to extract core bit offsets one at a time.
-    template <CoreType CT>
-    BitStates get_running_cores() const {
-        if constexpr (CT == CoreType::AIC) {
-            return (~core_states_) & aic_mask_;
-        } else {
-            return (~core_states_) & aiv_mask_;
-        }
-    }
-
-    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
-
-    // --- Cluster matching ---
-
-    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {
-        switch (shape) {
-        case PTO2ResourceShape::AIC:
-            return core_states_ & aic_mask_;
-        case PTO2ResourceShape::AIV:
-            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
-        case PTO2ResourceShape::MIX:
-            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
-        case PTO2ResourceShape::DUMMY:
-            // DUMMY tasks never reach the core-tracker dispatch path; they are
-            // completed inline by resolve_and_dispatch via dummy_ready_queue.
-            return BitStates(0ULL);
-        }
-        return BitStates(0ULL);
-    }
-
-    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }
-    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
-    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
-
-    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
-    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
-    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
-
-    bool is_aic_core_idle(int32_t cluster_offset) const {
-        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
-    }
-    bool is_aiv0_core_idle(int32_t cluster_offset) const {
-        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
-    }
-    bool is_aiv1_core_idle(int32_t cluster_offset) const {
-        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
-    }
-
-    // --- State mutation ---
-
-    // Toggle bit at the given bit offset (running <-> idle)
-    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
-
-    // --- Pending-occupied tracking ---
-    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
-    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
-
-    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
-    void clear_pending_occupied(int32_t bit_offset) {
-        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
-    }
-
-    // --- Two-phase dispatch queries ---
-
-    // Idle dispatch: returns bit offsets of idle cores for the given shape.
-    // For AIC: 1 bit per cluster (core offset == cluster offset).
-    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
-    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
-    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
-    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
-    // would incorrectly block AIV idle dispatch on the same cluster.
-    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
-        if (shape == PTO2ResourceShape::AIC) {
-            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
-        }
-        if (shape == PTO2ResourceShape::AIV) {
-            return core_states_ & aiv_mask_;
-        }
-        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
-    }
-
-    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
-    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
-    // MIX: 1 bit per cluster where ALL 3 cores have free pending slots AND at least one is running.
-    //       Idle cores participate via to_pending=false in the MIX prepare path.
-    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
-        if (shape == PTO2ResourceShape::MIX) {
-            // Any core without a pending payload can accept a dispatch (idle or running).
-            BitStates available = ~pending_occupied_;
-            BitStates mix_available =
-                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
-            // Exclude fully-idle clusters (handled by IDLE phase) to prevent double-dispatch.
-            BitStates running = ~core_states_;
-            BitStates cluster_has_running =
-                (running & aic_mask_) | ((running >> 1) & aic_mask_) | ((running >> 2) & aic_mask_);
-            return mix_available & cluster_has_running;
-        }
-        if (shape == PTO2ResourceShape::AIC) {
-            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
-        }
-        // AIV
-        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
-    }
-
-    // --- Two-phase dispatch unified query ---
-
-    enum class DispatchPhase : uint8_t { IDLE, PENDING };
-
-    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
-        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
-                                                get_pending_core_offset_states(shape);
-    }
-
-    // --- Bit offset <-> worker_id mapping ---
-
-    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
-
-    const int32_t *core_ids() const { return core_id_map_; }
-    int32_t core_num() const { return cluster_count_ * 3; }
-
-private:
-    int32_t cluster_count_;
-    BitStates aic_mask_;
-    BitStates aiv_mask_;
-    BitStates core_states_;
-    BitStates pending_occupied_;
-    int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
-};
-
-// =============================================================================
-// SlotTransition: pure event signals from a single register poll.
-// true = event occurred, false = no-op (maintain current state).
-// =============================================================================
-
-struct SlotTransition {
-    bool running_done = false;   // running task completed
-    bool pending_done = false;   // pending task completed
-    bool running_freed = false;  // running slot data should be released
-    bool pending_freed = false;  // pending_occupied can be cleared
-    bool matched = false;        // some case was hit (otherwise skip apply)
-};
-
-// =============================================================================
-// Profiling counters (compile-time gated)
-// =============================================================================
-
-#if PTO2_PROFILING
-struct alignas(64) SchedL2SwimlaneCounters {
-    bool l2_swimlane_enabled{false};
-    uint64_t sched_start_ts{0};
-    uint64_t sched_complete_cycle{0};
-    uint64_t sched_dispatch_cycle{0};
-    uint64_t sched_wiring_cycle{0};
-    uint64_t sched_idle_cycle{0};
-    uint64_t sched_loop_count{0};
-    uint32_t phase_complete_count{0};
-    uint32_t phase_dispatch_count{0};
-    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
-    // l2_swimlane_level_ >= SCHED_PHASES.
-    uint64_t pop_hit{0};
-    uint64_t pop_miss{0};
-    uint64_t pop_hit_at_last_emit{0};
-    uint64_t pop_miss_at_last_emit{0};
-#if PTO2_SCHED_PROFILING
-    uint32_t phase_wiring_count{0};
-    uint64_t complete_probe_count{0};
-    uint64_t complete_hit_count{0};
-    uint64_t sched_complete_perf_cycle{0};
-    uint64_t sched_dispatch_pop_cycle{0};
-    uint64_t sched_dispatch_setup_cycle{0};
-#endif
-    void reset() { *this = SchedL2SwimlaneCounters{}; }
-};
-#endif
-
-// =============================================================================
-// sync_start drain coordination
-// =============================================================================
-
-// When sync_start_pending != 0, all scheduler threads skip dispatch
-// (only process completions) until the drain worker finishes launching all blocks.
-struct alignas(64) SyncStartDrainState {
-    std::atomic<int32_t> sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
-    std::atomic<int32_t> drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
-    std::atomic<uint32_t> drain_ack_mask{0};       // bit per thread; all-set = all threads reached ack barrier
-    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
-    int32_t _pad[10];
-};
-static_assert(sizeof(SyncStartDrainState) == 64);
-
-#endif  // SCHEDULER_TYPES_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
new file mode 100644
index 000000000..b2c178a92
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_CONTEXT_H
+#define SCHEDULER_CONTEXT_H
+
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "scheduler_types.h"
+
+#include "pto_scheduler.h"
+
+#include "aicore_completion_mailbox.h"
+#include "pto2_dispatch_payload.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include "runtime.h"
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+#include "aicpu/device_time.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "spin_hint.h"
+
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
+
+inline void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code)
+{
+    if (header == nullptr || error_code == PTO2_ERROR_NONE) return;
+    int32_t expected = PTO2_ERROR_NONE;
+    if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) header->sched_error_thread.store(thread_idx, std::memory_order_release);
+    if (thread_idx >= 0 && thread_idx < 32) header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
+}
+
+inline void format_core_status(char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond)
+{
+    if (idle)
+    {
+        snprintf(buf, buf_size, "core%d(idle)", core_id);
+        return;
+    }
+    int32_t kernel = -1;
+    int64_t task_id_raw = -1;
+    if (core_state && core_state->running_slot_state)
+    {
+        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
+        kernel = core_state->running_slot_state->task->kernel_id[subslot];
+        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
+    }
+    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
+    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
+    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
+    if (hw_state == TASK_ACK_STATE) snprintf(buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw, cond_reg_state_str);
+    else snprintf(buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s ANOMALY)", core_id, kernel, task_id_raw, cond_reg_state_str);
+}
+
+#ifndef RUNTIME_MAX_WORKER
+#define RUNTIME_MAX_WORKER 72
+#endif
+#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_FUNC_ID 1024
+#endif
+
+// Forward declarations — avoid pulling in full headers for pointer/reference params.
+class Runtime;
+struct Handshake;
+struct PTO2Runtime;
+
+class SchedulerContext
+{
+public:
+    int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base)
+    {
+        always_assert(runtime != nullptr);
+
+        // Zero all per-core execution state before handshake
+        memset(core_exec_states_, 0, sizeof(core_exec_states_));
+
+        // Wire thread/transition configuration that handshake/assign need to read.
+        aicpu_thread_num_ = aicpu_thread_num;
+        sched_thread_num_ = sched_thread_num;
+        orch_to_sched_ = orch_to_sched;
+        regs_ = regs_base;
+
+        // Discover cores and assign to scheduler threads.
+        int32_t rc = handshake_all_cores(runtime);
+        if (rc != 0) return rc;
+        if (!assign_cores_to_threads()) return -1;
+
+        // Initialize task counters. Task count comes from PTO2 shared memory.
+        if (runtime->get_gm_sm_ptr())
+        {
+            auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
+            int64_t pto2_count = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+                if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
+            }
+            total_tasks_ = static_cast<int32_t>(pto2_count);
+        }
+        else
+        {
+            total_tasks_ = 0;
+        }
+        completed_tasks_.store(0, std::memory_order_release);
+
+        // Device orchestration: the orchestrator thread flips this when the graph is built.
+        orchestrator_done_ = false;
+
+        // Clear per-core dispatch payloads
+        memset(payload_per_core_, 0, sizeof(payload_per_core_));
+        memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+        // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
+        // This is done once at startup and never modified afterwards.
+        for (int32_t t = 0; t < sched_thread_num_; t++)
+        {
+            CoreTracker &tracker = core_trackers_[t];
+            for (int32_t c = 0; c < tracker.get_cluster_count(); c++)
+            {
+                int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
+                auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
+                auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
+                payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
+                payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
+                payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
+                payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
+            }
+        }
+
+        func_id_to_addr_ = runtime->func_id_to_addr_;
+
+        return 0;
+    }
+
+    // Reset all SchedulerContext-owned state to its post-construction defaults.
+    // Called by AicpuExecutor::deinit() during per-run teardown.
+    void deinit()
+    {
+        // Reset all per-core execution state
+        for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++)
+        {
+            core_exec_states_[i] = {};
+            core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+            core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+        }
+
+        // Clear per-core dispatch payloads
+        memset(payload_per_core_, 0, sizeof(payload_per_core_));
+        memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+        // Reset sync-start drain coordination — a previous run that aborted mid-drain
+        // would otherwise leave dirty pending/elected/ack state for the next reuse.
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.pending_task.store(nullptr, std::memory_order_release);
+
+        // Reset task counters and orchestrator state
+        completed_tasks_.store(0, std::memory_order_release);
+        total_tasks_ = 0;
+        orchestrator_done_ = false;
+        pto2_init_done_.store(false, std::memory_order_release);
+        pto2_init_complete_.store(false, std::memory_order_release);
+
+        // Reset core transition state
+        transition_requested_.store(false, std::memory_order_release);
+        wait_reassign_.store(0, std::memory_order_release);
+        reassigned_.store(false, std::memory_order_release);
+        completed_.store(false, std::memory_order_release);
+
+        // Reset core discovery and assignment state
+        aic_count_ = 0;
+        aiv_count_ = 0;
+        cores_total_num_ = 0;
+        aicpu_thread_num_ = 0;
+        sched_thread_num_ = 0;
+        orch_to_sched_ = false;
+        active_sched_threads_ = 0;
+        for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) core_trackers_[t] = CoreTracker{};
+
+        regs_ = 0;
+        sched_ = nullptr;
+        rt_ = nullptr;
+        func_id_to_addr_ = nullptr;
+    }
+
+    // Main scheduler thread entry: poll completion + dispatch ready tasks.
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx)
+    {
+        always_assert(sched_ != nullptr);
+        CoreTracker &tracker = core_trackers_[thread_idx];
+
+        PTO2SharedMemoryHeader *header = sched_->sm_header;
+        if (!header) return -1;
+
+        Handshake *hank = static_cast<Handshake *>(runtime->workers);
+
+        // One-time init: assign perf buffers (one thread does it; others wait)
+        if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) pto2_init_complete_.store(true, std::memory_order_release);
+        else
+            while (!pto2_init_complete_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+
+        int32_t cur_thread_completed = 0;
+        int32_t idle_iterations = 0;
+        int32_t last_progress_count = 0;
+
+        constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
+        PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
+        PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
+        for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
+        PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
+        int32_t deferred_release_count = 0;
+
+        bool cores_released = false;
+
+        const bool pmu_active = is_pmu_enabled();
+
+        uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
+        while (true)
+        {
+            if (completed_.load(std::memory_order_acquire)) break;
+            bool made_progress = false;
+            int32_t task_count = 0;
+            if (!tracker.has_any_running_cores())
+            {
+                LoopAction action = handle_orchestrator_exit(header, runtime, task_count);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            if (!cores_released && orch_to_sched_)
+            {
+                LoopAction action = handle_core_transition(cores_released);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            // Phase 1: Check running cores for completion
+            int32_t completed_this_turn = 0;
+
+            bool try_completed = tracker.has_any_running_cores();
+            if (try_completed) check_running_cores_for_completion(thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, deferred_release_slot_states, deferred_release_count, local_bufs);
+            if (completed_this_turn > 0)
+            {
+                int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
+                int32_t new_total = prev + completed_this_turn;
+                last_progress_count = new_total;
+                if (thread_idx == 0 && task_count > 0)
+                {
+                    if (new_total <= PROGRESS_VERBOSE_THRESHOLD || new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count)
+                    {}
+                }
+            }
+
+            if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending()))
+            {
+                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count, PTO2_DEFERRED_RELEASE_CAP);
+                if (poll_result.error_code != PTO2_ERROR_NONE)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    header->sched_error_code.compare_exchange_strong(expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    break;
+                }
+                if (poll_result.completed > 0)
+                {
+                    int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
+                    int32_t new_total = prev + poll_result.completed;
+                    last_progress_count = new_total;
+                    made_progress = true;
+                }
+            }
+
+            bool try_pushed = false;
+
+            // Phase 2 drain check
+            if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
+            {
+                handle_drain_mode(thread_idx);
+                continue;
+            }
+
+            // Phase 3: Drain wiring queue (thread 0 only)
+            if (thread_idx == 0)
+            {
+                int wired = sched_->drain_wiring_queue(orchestrator_done_);
+                if (wired > 0) made_progress = true;
+            }
+
+            if (thread_idx == 0)
+            {
+                constexpr int DUMMY_DRAIN_BATCH = 16;
+                PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
+                int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+                for (int di = 0; di < dummy_got; di++)
+                {
+                    PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
+                    sched_->on_mixed_task_complete(dummy_slot, local_bufs);
+                    deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
+                    if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP)
+                        while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+                    int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
+                    last_progress_count = prev + 1;
+                    cur_thread_completed++;
+                }
+                if (dummy_got > 0) made_progress = true;
+            }
+
+            // Phase 4: MIX-strict-priority dispatch with phase-split and
+            // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+            dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+
+            (void)try_completed;
+            (void)try_pushed;
+
+            if (made_progress)
+            {
+                idle_iterations = 0;
+                last_progress_ts = get_sys_cnt_aicpu();
+            }
+            else
+            {
+                while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+                idle_iterations++;
+
+                if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0)
+                {
+                    LoopAction action = check_idle_fatal_error(header, runtime);
+                    if (action == LoopAction::BREAK_LOOP) break;
+                }
+
+                if (idle_iterations % STALL_LOG_INTERVAL == 0) log_stall_diagnostics(thread_idx, total_tasks_);
+                if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES)
+                {
+                    bool self_owns = self_owns_running_task(thread_idx);
+                    bool global_stuck = !self_owns && total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && no_thread_owns_running_task();
+                    if (self_owns || global_stuck) return handle_timeout_exit(thread_idx, header, runtime, idle_iterations, last_progress_count);
+                    last_progress_ts = get_sys_cnt_aicpu();
+                }
+                SPIN_WAIT_HINT();
+            }
+        }
+
+        while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+
+        return cur_thread_completed;
+    }
+
+    int32_t shutdown(int32_t thread_idx)
+    {
+        const int32_t *cores = core_trackers_[thread_idx].core_ids();
+        int32_t core_num = core_trackers_[thread_idx].core_num();
+        if (core_num == 0) return 0;
+
+        int32_t rc = 0;
+        for (int32_t i = 0; i < core_num; i++)
+        {
+            int32_t core_id = cores[i];
+            uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
+            if (reg_addr != 0)
+            {
+                // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
+                if (platform_deinit_aicore_regs(reg_addr) != 0) rc = -1;
+            }
+            else
+            {}
+        }
+        return rc;
+    }
+
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t total_tasks)
+    {
+        total_tasks_ = total_tasks;
+
+        // Fold tasks completed inline during orchestration
+        int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
+        if (inline_completed > 0) completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+        orchestrator_done_ = true;
+
+        // Check for fatal error from orchestration; if so, shut down immediately.
+        int32_t orch_err = 0;
+        if (sched_->sm_header) orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+        }
+
+        // Skip core transition on fatal error — cores already shut down above.
+        if (completed_.load(std::memory_order_acquire))
+        {
+            // Signal transition to unblock scheduler threads waiting at core transition
+            transition_requested_.store(true, std::memory_order_release);
+            reassigned_.store(true, std::memory_order_release);
+        }
+        else if (orch_to_sched_)
+        {
+            transition_requested_.store(true, std::memory_order_release);
+
+            // Wait for scheduler threads to acknowledge transition request
+            while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_)
+            {
+                if (completed_.load(std::memory_order_acquire)) break;
+                SPIN_WAIT_HINT();
+            }
+            if (!completed_.load(std::memory_order_acquire))
+            {
+                reassign_cores_for_all_threads();
+                reassigned_.store(true, std::memory_order_release);
+            }
+        }
+    }
+
+    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
+    // mode where rt is created by the orchestrator thread after init().
+    void bind_runtime(PTO2Runtime *rt)
+    {
+        rt_ = rt;
+        sched_ = &rt->scheduler;
+    }
+
+    int32_t aic_count() const
+    {
+        return aic_count_;
+    }
+    int32_t aiv_count() const
+    {
+        return aiv_count_;
+    }
+    bool is_completed() const
+    {
+        return completed_.load(std::memory_order_acquire);
+    }
+    int32_t completed_tasks_count() const
+    {
+        return completed_tasks_.load(std::memory_order_acquire);
+    }
+
+    // Block until the first scheduler thread has finished one-time PTO2 init.
+    // Called by the orchestrator thread in device-orch mode.
+    void wait_pto2_init_complete() const
+    {
+        while (!pto2_init_complete_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+    }
+
+private:
+    // --- Scheduler binding & per-core runtime state ---
+    alignas(64) PTO2SchedulerState *sched_{nullptr};
+    PTO2Runtime *rt_{nullptr};
+
+    // Per-core execution state, indexed by core_id (= worker_id)
+    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
+
+    // Cluster-ordered core trackers, one per scheduler thread
+    CoreTracker core_trackers_[MAX_AICPU_THREADS];
+
+    // Per-core dispatch payload storage: dual-buffer for pipelining.
+    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
+    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
+
+    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // sync_start drain coordination
+    SyncStartDrainState drain_state_;
+
+    // --- Task-execution tracking ---
+    std::atomic<int32_t> completed_tasks_{0};
+    int32_t total_tasks_{0};
+    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
+    // volatile prevents the compiler from hoisting the load out of spin loops.
+    volatile bool orchestrator_done_{false};
+    std::atomic<bool> completed_{false};
+    uint64_t *func_id_to_addr_{nullptr};
+
+    // --- Core-transition coordination ---
+    std::atomic<bool> transition_requested_{false};
+    std::atomic<int32_t> wait_reassign_{0};
+    std::atomic<bool> reassigned_{false};
+
+    // --- Thread/core configuration ---
+    int32_t active_sched_threads_{0};
+    int32_t sched_thread_num_{0};
+    bool orch_to_sched_{false};
+    int32_t aicpu_thread_num_{0};
+    int32_t cores_total_num_{0};
+
+    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
+    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aic_count_{0};
+    int32_t aiv_count_{0};
+
+    // Platform AICore-register base array (set by AicpuExecutor before init()).
+    uint64_t regs_{0};
+
+    // --- One-time init coordination ---
+    std::atomic<bool> pto2_init_done_{false};
+    std::atomic<bool> pto2_init_complete_{false};
+
+    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
+    int32_t handshake_all_cores(Runtime *runtime)
+    {
+        Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+        cores_total_num_ = runtime->worker_count;
+
+        // Validate cores_total_num_ before using as array index
+        if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) return -1;
+
+        aic_count_ = 0;
+        aiv_count_ = 0;
+
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
+            OUT_OF_ORDER_STORE_BARRIER();
+            all_handshakes[i].aicpu_ready = 1;
+        }
+        OUT_OF_ORDER_STORE_BARRIER();
+
+        // Get platform physical cores count for validation
+        uint32_t max_physical_cores_count = platform_get_physical_cores_count();
+
+        // Step 2: Wait for all cores to respond, collect core type and register addresses
+        bool handshake_failed = false;
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            Handshake *hank = &all_handshakes[i];
+
+            while (hank->aicore_regs_ready == 0) SPIN_WAIT_HINT();
+
+            uint32_t physical_core_id = hank->physical_core_id;
+
+            if (physical_core_id >= max_physical_cores_count)
+            {
+                handshake_failed = true;
+                continue;
+            }
+
+            uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+            uint64_t reg_addr = regs[physical_core_id];
+
+            // Initialize AICore registers after discovery (first round)
+            platform_init_aicore_regs(reg_addr);
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
+
+            OUT_OF_ORDER_STORE_BARRIER();
+
+            while (hank->aicore_done == 0) SPIN_WAIT_HINT();
+
+            CoreType type = hank->core_type;
+
+            core_exec_states_[i].reg_addr = reg_addr;
+            core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+
+            core_exec_states_[i].worker_id = i;
+            core_exec_states_[i].physical_core_id = physical_core_id;
+            core_exec_states_[i].core_type = type;
+
+            if (type == CoreType::AIC) aic_worker_ids_[aic_count_++] = i;
+            else aiv_worker_ids_[aiv_count_++] = i;
+        }
+
+        if (handshake_failed)
+        {
+            emergency_shutdown(runtime);
+            return -1;
+        }
+
+        return 0;
+    }
+
+    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
+    bool assign_cores_to_threads()
+    {
+        // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
+        // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
+        active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+        int32_t cluster_count = aic_count_;
+
+        // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
+        int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
+        int32_t thread_cores_num = max_clusters_per_thread * 3;
+
+        if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) return false;
+
+        for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++)
+        {
+            core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+            core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+        }
+
+        // Count clusters per thread first (round-robin may distribute unevenly)
+        int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++) clusters_per_thread[ci % active_sched_threads_]++;
+        for (int32_t i = 0; i < active_sched_threads_; i++) core_trackers_[i].init(clusters_per_thread[i]);
+
+        int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+
+        for (int32_t ci = 0; ci < cluster_count; ci++)
+        {
+            int32_t t = ci % active_sched_threads_;
+
+            int32_t aic_wid = aic_worker_ids_[ci];
+            int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+            int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+            core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
+        }
+
+        for (int32_t t = 0; t < aicpu_thread_num_; t++)
+        {}
+
+        return true;
+    }
+
+    // Re-distribute all cores across all threads after orchestration completes.
+    void reassign_cores_for_all_threads()
+    {
+        // Collect running worker_ids from all current trackers
+        bool running_cores[RUNTIME_MAX_WORKER] = {};
+        for (int32_t i = 0; i < aicpu_thread_num_; i++)
+        {
+            auto all_running = core_trackers_[i].get_all_running_cores();
+            int32_t bp;
+            while ((bp = all_running.pop_first()) >= 0) running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
+        }
+
+        // Count clusters per thread (round-robin across all threads)
+        int32_t cluster_count = aic_count_;
+        int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++) clusters_per_thread[ci % aicpu_thread_num_]++;
+
+        // Re-init all trackers and reset core counts
+        for (int32_t i = 0; i < aicpu_thread_num_; i++) core_trackers_[i].init(clusters_per_thread[i]);
+
+        // Assign clusters round-robin and restore running state
+        int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++)
+        {
+            int32_t t = ci % aicpu_thread_num_;
+
+            int32_t aic_wid = aic_worker_ids_[ci];
+            int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+            int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+            int32_t cl_idx = cluster_idx_per_thread[t]++;
+            core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
+
+            // init() marks all idle; toggle cores that were running and restore pending_occupied
+            if (running_cores[aic_wid])
+            {
+                core_trackers_[t].change_core_state(cl_idx * 3);
+                core_trackers_[t].set_pending_occupied(cl_idx * 3);
+            }
+            if (running_cores[aiv0_wid])
+            {
+                core_trackers_[t].change_core_state(cl_idx * 3 + 1);
+                core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
+            }
+            if (running_cores[aiv1_wid])
+            {
+                core_trackers_[t].change_core_state(cl_idx * 3 + 2);
+                core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
+            }
+        }
+
+        active_sched_threads_ = aicpu_thread_num_;
+    }
+
+    // Emergency shutdown: broadcast exit signal to every handshake'd core and
+    // deinit their AICore register blocks. Idempotent.
+    void emergency_shutdown(Runtime *runtime)
+    {
+        Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+        int32_t timeout_count = 0;
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            Handshake *hank = &all_handshakes[i];
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
+            if (core_exec_states_[i].reg_addr != 0)
+            {
+                if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) timeout_count++;
+            }
+        }
+        if (timeout_count > 0)
+        {}
+    }
+
+    static const char *shape_name(PTO2ResourceShape shape)
+    {
+        switch (shape)
+        {
+        case PTO2ResourceShape::AIC:
+            return "AIC";
+        case PTO2ResourceShape::AIV:
+            return "AIV";
+        case PTO2ResourceShape::MIX:
+            return "MIX";
+        case PTO2ResourceShape::DUMMY:
+            return "DUMMY";
+        }
+        return "UNKNOWN";
+    }
+
+    static inline const char *subslot_name(PTO2SubtaskSlot s)
+    {
+        switch (s)
+        {
+        case PTO2SubtaskSlot::AIC:
+            return "aic";
+        case PTO2SubtaskSlot::AIV0:
+            return "aiv0";
+        case PTO2SubtaskSlot::AIV1:
+            return "aiv1";
+        }
+        return "?";
+    }
+
+    int pop_ready_tasks_batch(PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
+    {
+        (void)thread_idx;
+        int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+        return count;
+    }
+
+    void build_payload(PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, const AsyncCtx &async_ctx, int32_t block_idx)
+    {
+        int32_t slot_idx = static_cast<int32_t>(subslot);
+        uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+        const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
+        dispatch_payload.function_bin_addr = callable->resolved_addr();
+        auto &payload = *slot_state.payload;
+        int n = 0;
+        for (int32_t i = 0; i < payload.tensor_count; i++) dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
+        for (int32_t i = 0; i < payload.scalar_count; i++) dispatch_payload.args[n++] = payload.scalars[i];
+        dispatch_payload.local_context.block_idx = block_idx;
+        dispatch_payload.local_context.block_num = slot_state.logical_block_num;
+        dispatch_payload.local_context.async_ctx = async_ctx;
+        dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
+        dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
+    }
+
+    struct PublishHandle
+    {
+        uint64_t reg_addr;
+        uint32_t reg_task_id;
+        int32_t core_offset;
+        uint64_t *dispatch_timestamp_slot;
+    };
+
+    SchedulerContext::PublishHandle prepare_subtask_to_core(int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending, int32_t block_idx)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        auto core_id = tracker.get_core_id_by_offset(core_offset);
+        CoreExecState &core_exec_state = core_exec_states_[core_id];
+
+        core_exec_state.dispatch_seq++;
+        uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+        static_assert((TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity");
+        if (reg_task_id >= AICORE_EXIT_SIGNAL)
+        {
+            core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
+            reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+        }
+
+        uint32_t buf_idx = reg_task_id & 1u;
+        PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+        DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+        deferred_slab->count = 0;
+        deferred_slab->error_code = PTO2_ERROR_NONE;
+        AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+        build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+        if (to_pending)
+        {
+            core_exec_state.pending_subslot = subslot;
+            core_exec_state.pending_slot_state = &slot_state;
+            core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
+        }
+        else
+        {
+            core_exec_state.running_subslot = subslot;
+            core_exec_state.running_slot_state = &slot_state;
+            core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
+            tracker.change_core_state(core_offset);
+        }
+        tracker.set_pending_occupied(core_offset);
+
+        uint64_t *dispatch_timestamp_slot = nullptr;
+
+        return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
+    }
+
+    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts)
+    {
+        if (h.dispatch_timestamp_slot != nullptr) *h.dispatch_timestamp_slot = dispatch_ts;
+        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));
+    }
+
+    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
+    // caller-supplied handles buffer. Returns the number of handles written.
+    int prepare_block_for_dispatch(int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending, int32_t block_idx, PublishHandle *out_handles)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        if (shape == PTO2ResourceShape::MIX)
+        {
+            uint8_t cmask = slot_state.active_mask.core_mask();
+            int n = 0;
+            if (cmask & PTO2_SUBTASK_MASK_AIC)
+            {
+                bool p = to_pending && !tracker.is_aic_core_idle(core_offset);
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, p, block_idx);
+            }
+            if (cmask & PTO2_SUBTASK_MASK_AIV0)
+            {
+                bool p = to_pending && !tracker.is_aiv0_core_idle(core_offset);
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, p, block_idx);
+            }
+            if (cmask & PTO2_SUBTASK_MASK_AIV1)
+            {
+                bool p = to_pending && !tracker.is_aiv1_core_idle(core_offset);
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, p, block_idx);
+            }
+            return n;
+        }
+        else if (shape == PTO2ResourceShape::AIC)
+        {
+            out_handles[0] = prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
+            return 1;
+        }
+        else
+        {
+            out_handles[0] = prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
+            return 1;
+        }
+    }
+
+    void dispatch_shape(int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed)
+    {
+        if (entered_drain) return;
+
+        bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
+        auto cores = tracker.get_dispatchable_cores(shape, phase);
+        if (!cores.has_value()) return;
+
+        while (cores.has_value() && !entered_drain)
+        {
+            int want = cores.count();
+            PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
+            int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
+            if (got == 0) break;
+
+            bool any_sync_start = false;
+            for (int bi = 0; bi < got; bi++)
+            {
+                if (batch[bi]->active_mask.requires_sync_start())
+                {
+                    any_sync_start = true;
+                    break;
+                }
+            }
+
+            PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+            int handle_count = 0;
+            bool dispatched_any = false;
+
+            auto flush_publish = [&]() {
+                if (handle_count == 0) return;
+                wmb();
+                uint64_t dispatch_ts = 0;
+                for (int i = 0; i < handle_count; i++) publish_subtask_to_core(handles[i], dispatch_ts);
+                handle_count = 0;
+                made_progress = true;
+            };
+
+            for (int bi = 0; bi < got; bi++)
+            {
+                PTO2TaskSlotState *slot_state = batch[bi];
+
+                if (slot_state->active_mask.requires_sync_start())
+                {
+                    if (is_pending)
+                    {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                        continue;
+                    }
+                    int32_t available = cores.count();
+                    if (available < slot_state->logical_block_num)
+                    {
+                        flush_publish();
+                        if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                        for (int rem = bi + 1; rem < got; rem++) sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
+                        entered_drain = true;
+                        break;
+                    }
+                }
+
+                if (!cores.has_value())
+                {
+                    flush_publish();
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                    break;
+                }
+
+                dispatched_any = true;
+                try_pushed = true;
+                int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
+                int32_t claim = std::min(cores.count(), remaining);
+                int32_t start = slot_state->next_block_idx;
+                slot_state->next_block_idx += claim;
+
+                if (slot_state->next_block_idx < slot_state->logical_block_num) sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+
+                for (int32_t b = 0; b < claim; b++)
+                {
+                    auto core_offset = cores.pop_first();
+                    handle_count += prepare_block_for_dispatch(thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]);
+                }
+
+                if (any_sync_start) flush_publish();
+            }
+
+            flush_publish();
+
+            if (!dispatched_any) break;
+
+            if (!cores.has_value()) cores = tracker.get_dispatchable_cores(shape, phase);
+        }
+    }
+
+    void dispatch_ready_tasks(int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress, bool &try_pushed)
+    {
+        using Phase = CoreTracker::DispatchPhase;
+        constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
+
+        static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
+            {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
+            {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
+        };
+        const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
+
+        const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;
+        const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {
+            bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
+            bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
+            bd_per_thread,
+        };
+        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++)
+        {
+            auto &lb = local_bufs[s];
+            int32_t excess = lb.count - thread_capacity[s];
+            if (excess <= 0) continue;
+            if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;
+            sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
+            lb.count -= excess;
+        }
+
+        auto flush_local_bufs = [&]() {
+            for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++)
+            {
+                auto &lb = local_bufs[s];
+                if (lb.count > 0)
+                {
+                    sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
+                    lb.count = 0;
+                }
+            }
+        };
+        struct FlushGuard
+        {
+            decltype(flush_local_bufs) &flush_fn;
+            ~FlushGuard()
+            {
+                flush_fn();
+            }
+        } flush_guard{flush_local_bufs};
+
+        bool entered_drain = false;
+
+        // ===== IDLE stage =====
+        dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress, try_pushed);
+        if (entered_drain) return;
+
+        bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
+
+        if (!skip_aic_aiv)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                PTO2ResourceShape s = aic_aiv[i];
+                dispatch_shape(thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress, try_pushed);
+                if (entered_drain) return;
+            }
+        }
+
+        // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
+        // peer-thread reads see the IDLE-stage release_fanin output.
+        flush_local_bufs();
+
+        if (pmu_active) return;
+
+        if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX))
+        {
+            dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, made_progress, try_pushed);
+            if (entered_drain) return;
+        }
+
+        // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
+        // it set; otherwise, escalate iff PENDING-MIX left residual.
+        if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) skip_aic_aiv = true;
+
+        if (skip_aic_aiv) return;
+
+        // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
+        // will pull from the global queue on its next IDLE pass.
+        for (int i = 0; i < 2; i++)
+        {
+            PTO2ResourceShape s = aic_aiv[i];
+            if (has_idle_in_other_threads(thread_idx, s)) continue;
+            dispatch_shape(thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress, try_pushed);
+            if (entered_drain) return;
+        }
+    }
+
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const
+    {
+        for (int32_t t = 0; t < active_sched_threads_; t++)
+        {
+            if (t == self_thread_idx) continue;
+            if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) return true;
+        }
+        return false;
+    }
+
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const
+    {
+        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
+    }
+
+    static SlotTransition decide_slot_transition(int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id)
+    {
+        SlotTransition t;
+        if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id)
+        {
+            t.matched = true;
+            t.running_done = true;  // Serial execution: pending event implies running done
+            t.running_freed = true;
+            t.pending_freed = true;
+            if (reg_state == TASK_FIN_STATE) t.pending_done = true;  // Case 1: pending FIN
+            // else: Case 2: pending ACK (pending_done stays false)
+        }
+        else if (reg_task_id == running_id)
+        {
+            if (reg_state == TASK_FIN_STATE)
+            {
+                if (pending_id == AICPU_TASK_INVALID)
+                {
+                    // Case 3.2: running FIN, no pending -> core goes idle
+                    t.matched = true;
+                    t.running_done = true;
+                    t.running_freed = true;
+                }
+                // Case 3.1: running FIN, pending exists -> skip (transient state).
+                // Case 1/2 (pending ACK/FIN) will complete running implicitly via running_done=true.
+            }
+            else
+            {
+                // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
+                t.matched = true;
+                t.pending_freed = true;
+            }
+        }
+        return t;
+    }
+
+    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, Handshake *hank, int32_t &completed_this_turn, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs)
+    {
+        (void)hank;
+        AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
+        bool defer_completion_to_consumer = false;
+
+        if (slot_state.payload != nullptr)
+        {
+            volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
+            int32_t slab_err = deferred_slab->error_code;
+            if (slab_err != PTO2_ERROR_NONE)
+            {
+                int32_t expected = PTO2_ERROR_NONE;
+                sched_->sm_header->sched_error_code.compare_exchange_strong(expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire);
+                completed_.store(true, std::memory_order_release);
+                return;
+            }
+
+            uint32_t cond_count = deferred_slab->count;
+            if (cond_count > MAX_COMPLETIONS_PER_TASK)
+            {
+                int32_t expected = PTO2_ERROR_NONE;
+                sched_->sm_header->sched_error_code.compare_exchange_strong(expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire);
+                completed_.store(true, std::memory_order_release);
+                return;
+            }
+
+            if (cond_count > 0)
+            {
+                slot_state.any_subtask_deferred.store(true, std::memory_order_release);
+
+                const PTO2TaskId token = slot_state.task->task_id;
+                for (uint32_t i = 0; i < cond_count; ++i)
+                {
+                    volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                    while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type))
+                    {
+                        sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                        SPIN_WAIT_HINT();
+                    }
+                }
+            }
+        }
+
+        bool mixed_complete = sched_->on_subtask_complete(slot_state);
+
+        if (mixed_complete && slot_state.payload != nullptr && slot_state.any_subtask_deferred.load(std::memory_order_acquire))
+        {
+            // Some subtask of this task registered conditions; finish the
+            // registration by handing the slot_state off to the consumer.
+            while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state)))
+            {
+                sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                SPIN_WAIT_HINT();
+            }
+            defer_completion_to_consumer = true;
+        }
+
+        if (mixed_complete && !defer_completion_to_consumer)
+        {
+            sched_->on_mixed_task_complete(slot_state, local_bufs);
+            if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP)
+            {
+                deferred_release_slot_states[deferred_release_count++] = &slot_state;
+            }
+            else
+            {
+                while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
+                deferred_release_slot_states[deferred_release_count++] = &slot_state;
+            }
+            completed_this_turn++;
+        }
+    }
+
+    static void promote_pending_to_running(CoreExecState &core)
+    {
+        core.running_slot_state = core.pending_slot_state;
+        core.running_reg_task_id = core.pending_reg_task_id;
+        core.running_subslot = core.pending_subslot;
+        core.pending_slot_state = nullptr;
+        core.pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+    static void clear_running_slot(CoreExecState &core)
+    {
+        core.running_slot_state = nullptr;
+        core.running_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    void check_running_cores_for_completion(int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        auto running_core_states = tracker.get_all_running_cores();
+        while (running_core_states.has_value())
+        {
+            int32_t bit_pos = running_core_states.pop_first();
+            int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+            CoreExecState &core = core_exec_states_[core_id];
+
+            uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
+            rmb();
+            int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+            int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+            SlotTransition t = decide_slot_transition(reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id);
+            if (!t.matched) continue;
+
+            // --- Apply phase: execute actions based on transition ---
+
+            // 1. Complete finished tasks (capture pointers before modifying core state)
+            if (t.pending_done)
+            {
+                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, hank, completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs);
+                cur_thread_completed++;
+            }
+            if (t.running_done)
+            {
+                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, hank, completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs);
+                cur_thread_completed++;
+            }
+
+            // 2. Update slot data
+            if (t.running_freed)
+            {
+                if (core.pending_slot_state != nullptr && !t.pending_done)
+                {
+                    promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+                }
+                else
+                {
+                    clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                    if (t.pending_done)
+                    {
+                        core.pending_slot_state = nullptr;
+                        core.pending_reg_task_id = AICPU_TASK_INVALID;
+                    }
+                }
+            }
+
+            // 3. Update tracker bitmap
+            bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
+            if (is_idle)
+            {
+                tracker.change_core_state(bit_pos);       // Mark idle
+                tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+            }
+            else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID)
+            {
+                tracker.clear_pending_occupied(bit_pos);
+            }
+
+            // 4. Progress signal (only when running task completes)
+            if (t.running_done) made_progress = true;
+        }
+    }
+
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num)
+    {
+        int32_t expected = 0;
+        if (!drain_state_.sync_start_pending.compare_exchange_strong(expected, -1, std::memory_order_relaxed, std::memory_order_relaxed)) return false;  // Another thread already holds the drain slot.
+        // We own the drain slot.  Store the task and reset election flag before making it visible.
+        drain_state_.pending_task.store(slot_state, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        // Release store: all stores above are now visible to any thread that
+        // acquire-loads sync_start_pending and sees block_num > 0.
+        drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
+        return true;
+    }
+    int32_t count_global_available(PTO2ResourceShape shape)
+    {
+        int32_t total = 0;
+        for (int32_t t = 0; t < active_sched_threads_; t++) total += core_trackers_[t].get_idle_core_offset_states(shape).count();
+        return total;
+    }
+    void drain_worker_dispatch(int32_t block_num)
+    {
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+        if (!slot_state)
+        {
+            drain_state_.sync_start_pending.store(0, std::memory_order_release);
+            return;
+        }
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+
+        for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++)
+        {
+            auto valid = core_trackers_[t].get_idle_core_offset_states(shape);
+            int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
+            int32_t claim = std::min(valid.count(), remaining);
+            int32_t start = slot_state->next_block_idx;
+            slot_state->next_block_idx += claim;
+            PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+            int handle_count = 0;
+            for (int32_t b = 0; b < claim; b++)
+            {
+                auto core_offset = valid.pop_first();
+                handle_count += prepare_block_for_dispatch(t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]);
+            }
+            wmb();
+            uint64_t dispatch_ts = 0;
+            for (int i = 0; i < handle_count; i++) publish_subtask_to_core(handles[i], dispatch_ts);
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        drain_state_.pending_task.store(nullptr, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    }
+    void handle_drain_mode(int32_t thread_idx)
+    {
+        // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
+        int32_t block_num;
+        do {
+            block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+        } while (block_num < 0);
+        if (block_num == 0) return;
+
+        uint32_t all_acked = (1u << active_sched_threads_) - 1;
+
+        // Ack barrier -- signal this thread has stopped dispatch.
+        drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+        // Spin until all threads have acked.
+        // If our bit is cleared while waiting, elected reset due to insufficient resources.
+        while (true)
+        {
+            uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+            if ((ack & all_acked) == all_acked) break;
+            if ((ack & (1u << thread_idx)) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+
+        // Election -- exactly one thread wins the CAS.
+        int32_t expected = 0;
+        drain_state_.drain_worker_elected.compare_exchange_strong(expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed);
+
+        if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1)
+        {
+            // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+            while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
+            {
+                if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+                SPIN_WAIT_HINT();
+            }
+            return;
+        }
+
+        // Elected: check if global resources are sufficient.
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+        if (slot_state == nullptr)
+        {
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        int32_t available = count_global_available(shape);
+
+        if (available < block_num)
+        {
+            // Insufficient resources -- reset drain fields so threads can resume
+            // completion polling to free running cores, then retry.
+            drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+
+        // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+        drain_worker_dispatch(block_num);
+    }
+
+    LoopAction handle_orchestrator_exit(PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count)
+    {
+        if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+        int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+        if (sched_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+
+        bool orch_done = orchestrator_done_;
+        if (!orch_done) return LoopAction::NONE;
+
+        task_count = total_tasks_;
+        if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count)
+        {
+            completed_.store(true, std::memory_order_release);
+            return LoopAction::BREAK_LOOP;
+        }
+        return LoopAction::NONE;
+    }
+
+    LoopAction handle_core_transition(bool &cores_released)
+    {
+        if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
+        if (!reassigned_.load(std::memory_order_acquire))
+        {
+            wait_reassign_.fetch_add(1, std::memory_order_release);
+            while (!reassigned_.load(std::memory_order_acquire))
+            {
+                if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+                SPIN_WAIT_HINT();
+            }
+        }
+        cores_released = true;
+        return LoopAction::NONE;
+    }
+
+    LoopAction check_idle_fatal_error(PTO2SharedMemoryHeader *header, Runtime *runtime)
+    {
+        if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+        int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+        if (sched_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        return LoopAction::NONE;
+    }
+
+    void log_stall_diagnostics(int32_t thread_idx, [[maybe_unused]] int32_t task_count)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+
+        // T0 owns the shared-ring scan; printing it from other threads would
+        // produce identical TASK lines once per scheduler thread.
+        if (thread_idx == 0)
+        {
+            int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
+                int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
+                submitted_in_ring += ring_task_count;
+                for (int32_t si = 0; si < ring_task_count; si++)
+                {
+                    PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
+                    PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
+                    bool fanin_ready = sched_->fanin_satisfied(&slot_state);
+                    if (st >= PTO2_TASK_COMPLETED) continue;
+                    char running_on[192] = {0};
+                    int32_t owner = -1;
+                    int32_t pos = 0;
+                    bool is_running = false;
+                    for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++)
+                    {
+                        if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
+                        is_running = true;
+                        if (owner < 0) owner = find_core_owner_thread(cid);
+                        const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
+                        int32_t written = snprintf(running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname);
+                        if (written > 0) pos += written;
+                    }
+
+                    if (is_running)
+                    {
+                        cnt_running++;
+                        if (cnt_running > STALL_DUMP_READY_MAX) continue;
+                        continue;
+                    }
+                    if (fanin_ready)
+                    {
+                        cnt_ready++;
+                        if (cnt_ready > STALL_DUMP_READY_MAX) continue;
+                        continue;
+                    }
+                    cnt_waiting++;
+                    if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
+                }
+            }
+        }
+
+        for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++)
+        {
+            int32_t offset = cli * 3;
+            int32_t aic_id = tracker.get_aic_core_id(offset);
+            int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
+            int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
+            bool aic_idle = tracker.is_aic_core_idle(offset);
+            bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
+            bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
+            char aic_buf[128], aiv0_buf[128], aiv1_buf[128];
+            format_core_status(aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr);
+            format_core_status(aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id], core_exec_states_[aiv0_id].reg_addr);
+            format_core_status(aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id], core_exec_states_[aiv1_id].reg_addr);
+        }
+    }
+
+    void log_shutdown_stall_snapshot([[maybe_unused]] int32_t trigger_idle_iterations, [[maybe_unused]] int32_t trigger_last_progress_count)
+    {
+        int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+        if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
+        for (int32_t t = 0; t < thread_count; t++) log_stall_diagnostics(t, total_tasks_);
+    }
+
+    int32_t find_core_owner_thread(int32_t core_id) const
+    {
+        for (int32_t t = 0; t < aicpu_thread_num_; t++)
+        {
+            const int32_t *ids = core_trackers_[t].core_ids();
+            int32_t n = core_trackers_[t].core_num();
+            for (int32_t i = 0; i < n; i++)
+                if (ids[i] == core_id) return t;
+        }
+        return -1;
+    }
+
+    bool self_owns_running_task(int32_t thread_idx) const
+    {
+        const int32_t *cores = core_trackers_[thread_idx].core_ids();
+        int32_t core_num = core_trackers_[thread_idx].core_num();
+        for (int32_t i = 0; i < core_num; i++)
+            if (core_exec_states_[cores[i]].running_slot_state != nullptr) return true;
+        return false;
+    }
+
+    bool no_thread_owns_running_task() const
+    {
+        for (int32_t t = 0; t < aicpu_thread_num_; t++)
+            if (self_owns_running_task(t)) return false;
+        return true;
+    }
+
+    int32_t handle_timeout_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations, int32_t last_progress_count)
+    {
+        latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
+        if (!completed_.exchange(true, std::memory_order_acq_rel))
+        {
+            log_shutdown_stall_snapshot(idle_iterations, last_progress_count);
+            emergency_shutdown(runtime);
+        }
+        return -PTO2_ERROR_SCHEDULER_TIMEOUT;
+    }
+
+    uint64_t get_function_bin_addr(int func_id) const
+    {
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+        return func_id_to_addr_[func_id];
+    }
+};
+
+#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
new file mode 100644
index 000000000..f2dc71ed5
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_TYPES_H
+#define SCHEDULER_TYPES_H
+
+#include <atomic>
+#include <cstdint>
+
+#include "common/core_type.h"
+#include "common/platform_config.h"
+#include "pto_runtime2_types.h"
+#include "spin_hint.h"
+
+constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
+
+// PLATFORM_MAX_IDLE_ITERATIONS was removed upstream; fixed cadence matches a5's
+// equivalent (used only for per-thread diagnostic logging, not for the fatal-
+// timeout path which uses wall-clock).
+constexpr int32_t STALL_LOG_INTERVAL = 480000;
+constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
+
+constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES = static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+constexpr int32_t STALL_DUMP_READY_MAX = 8;
+constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
+constexpr int32_t STALL_DUMP_CORE_MAX = 8;
+constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
+constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+
+enum class LoopAction : int8_t
+{
+    NONE,        // cold path did not trigger; proceed normally
+    BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
+};
+
+struct alignas(64) CoreExecState
+{
+    // --- Hot fields (completion + dispatch, every iteration) ---
+    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
+    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
+    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
+    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
+    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
+    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
+    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
+    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
+    uint8_t pad0_[2];                       // offset 38: alignment padding
+    volatile uint32_t *cond_ptr;            // offset 40: precomputed pointer to COND register
+    // --- Cold fields (init/diagnostics only, never in hot path) ---
+    int32_t worker_id;          // offset 48: index in runtime.workers[]
+    uint32_t physical_core_id;  // offset 52: hardware physical core ID
+    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
+    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
+};
+static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
+
+class alignas(64) CoreTracker
+{
+public:
+    static inline int32_t MAX_CORE_PER_THREAD = 63;
+    static constexpr int32_t MAX_CLUSTERS = 63 / 3;
+
+public:
+    CoreTracker() = default;
+
+    class BitStates
+    {
+    public:
+        BitStates() = default;
+
+        explicit BitStates(uint64_t states) :
+            states_(states)
+        {}
+        void init()
+        {
+            states_ = 0;
+        }
+
+        BitStates operator~() const
+        {
+            return BitStates(~states_);
+        }
+        BitStates operator&(const BitStates &other) const
+        {
+            return BitStates(states_ & other.states_);
+        }
+        BitStates operator|(const BitStates &other) const
+        {
+            return BitStates(states_ | other.states_);
+        }
+        BitStates operator^(const BitStates &other) const
+        {
+            return BitStates(states_ ^ other.states_);
+        }
+        BitStates operator>>(int32_t offset) const
+        {
+            return BitStates(states_ >> offset);
+        }
+        BitStates operator<<(int32_t offset) const
+        {
+            return BitStates(states_ << offset);
+        }
+        void operator&=(const BitStates &other)
+        {
+            states_ &= other.states_;
+        }
+        void operator|=(const BitStates &other)
+        {
+            states_ |= other.states_;
+        }
+        void operator^=(const BitStates &other)
+        {
+            states_ ^= other.states_;
+        }
+
+        bool has_value() const
+        {
+            return states_ > 0;
+        }
+        int32_t count() const
+        {
+            return __builtin_popcountll(states_);
+        }
+
+        // Extract the lowest set bit from mask, clear it, and return its position.
+        // Returns -1 if mask is empty.
+        int32_t pop_first()
+        {
+            if (states_ == 0) return -1;
+            int32_t pos = __builtin_ctzll(states_);
+            states_ &= states_ - 1;
+            return pos;
+        }
+
+    private:
+        uint64_t states_{0};
+    };
+
+public:
+    void init(int32_t cluster_count)
+    {
+        cluster_count_ = cluster_count;
+        aic_mask_.init();
+        aiv_mask_.init();
+        pending_occupied_.init();
+        for (int32_t i = 0; i < cluster_count; i++)
+        {
+            aic_mask_ |= BitStates(1ULL << (i * 3));
+            aiv_mask_ |= BitStates(6ULL << (i * 3));
+        }
+        core_states_ = aic_mask_ | aiv_mask_;
+    }
+
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid)
+    {
+        core_id_map_[cluster_idx * 3] = aic_wid;
+        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
+        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
+    }
+
+    int32_t get_cluster_count() const
+    {
+        return cluster_count_;
+    }
+
+    // --- Running core queries ---
+
+    template <CoreType CT>
+    bool has_running_cores() const
+    {
+        if constexpr (CT == CoreType::AIC) return ((~core_states_) & aic_mask_).has_value();
+        else return ((~core_states_) & aiv_mask_).has_value();
+    }
+
+    bool has_any_running_cores() const
+    {
+        return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value();
+    }
+
+    template <CoreType CT>
+    int32_t get_running_count() const
+    {
+        if constexpr (CT == CoreType::AIC) return ((~core_states_) & aic_mask_).count();
+        else return ((~core_states_) & aiv_mask_).count();
+    }
+
+    // Return an opaque bitmask for iterating running cores of a given type.
+    // Use pop_first() to extract core bit offsets one at a time.
+    template <CoreType CT>
+    BitStates get_running_cores() const
+    {
+        if constexpr (CT == CoreType::AIC) return (~core_states_) & aic_mask_;
+        else return (~core_states_) & aiv_mask_;
+    }
+
+    BitStates get_all_running_cores() const
+    {
+        return (~core_states_) & (aic_mask_ | aiv_mask_);
+    }
+
+    // --- Cluster matching ---
+
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const
+    {
+        switch (shape)
+        {
+        case PTO2ResourceShape::AIC:
+            return core_states_ & aic_mask_;
+        case PTO2ResourceShape::AIV:
+            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
+        case PTO2ResourceShape::MIX:
+            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
+        case PTO2ResourceShape::DUMMY:
+            // DUMMY tasks never reach the core-tracker dispatch path; they are
+            // completed inline by resolve_and_dispatch via dummy_ready_queue.
+            return BitStates(0ULL);
+        }
+        return BitStates(0ULL);
+    }
+
+    int32_t get_aic_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset];
+    }
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset + 1];
+    }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset + 2];
+    }
+
+    int32_t get_aic_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset;
+    }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset + 1;
+    }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset + 2;
+    }
+
+    bool is_aic_core_idle(int32_t cluster_offset) const
+    {
+        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv0_core_idle(int32_t cluster_offset) const
+    {
+        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv1_core_idle(int32_t cluster_offset) const
+    {
+        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
+    }
+
+    // --- State mutation ---
+
+    // Toggle bit at the given bit offset (running <-> idle)
+    void change_core_state(int32_t bit_offset)
+    {
+        core_states_ ^= BitStates(1ULL << bit_offset);
+    }
+
+    void set_pending_occupied(int32_t bit_offset)
+    {
+        pending_occupied_ |= BitStates(1ULL << bit_offset);
+    }
+    void clear_pending_occupied(int32_t bit_offset)
+    {
+        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
+    }
+
+    // --- Two-phase dispatch queries ---
+
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const
+    {
+        if (shape == PTO2ResourceShape::AIC) return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        if (shape == PTO2ResourceShape::AIV) return core_states_ & aiv_mask_;
+        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
+    }
+
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const
+    {
+        if (shape == PTO2ResourceShape::MIX)
+        {
+            // Any core without a pending payload can accept a dispatch (idle or running).
+            BitStates available = ~pending_occupied_;
+            BitStates mix_available = (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Exclude fully-idle clusters (handled by IDLE phase) to prevent double-dispatch.
+            BitStates running = ~core_states_;
+            BitStates cluster_has_running = (running & aic_mask_) | ((running >> 1) & aic_mask_) | ((running >> 2) & aic_mask_);
+            return mix_available & cluster_has_running;
+        }
+        if (shape == PTO2ResourceShape::AIC) return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+        // AIV
+        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
+    }
+
+    // --- Two-phase dispatch unified query ---
+
+    enum class DispatchPhase : uint8_t
+    {
+        IDLE,
+        PENDING
+    };
+
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const
+    {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) : get_pending_core_offset_states(shape);
+    }
+
+    // --- Bit offset <-> worker_id mapping ---
+
+    int32_t get_core_id_by_offset(int32_t offset) const
+    {
+        return core_id_map_[offset];
+    }
+
+    const int32_t *core_ids() const
+    {
+        return core_id_map_;
+    }
+    int32_t core_num() const
+    {
+        return cluster_count_ * 3;
+    }
+
+private:
+    int32_t cluster_count_;
+    BitStates aic_mask_;
+    BitStates aiv_mask_;
+    BitStates core_states_;
+    BitStates pending_occupied_;
+    int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
+};
+
+struct SlotTransition
+{
+    bool running_done = false;   // running task completed
+    bool pending_done = false;   // pending task completed
+    bool running_freed = false;  // running slot data should be released
+    bool pending_freed = false;  // pending_occupied can be cleared
+    bool matched = false;        // some case was hit (otherwise skip apply)
+};
+
+// When sync_start_pending != 0, all scheduler threads skip dispatch
+// (only process completions) until the drain worker finishes launching all blocks.
+struct alignas(64) SyncStartDrainState
+{
+    std::atomic<int32_t> sync_start_pending{0};              // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic<int32_t> drain_worker_elected{0};            // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic<uint32_t> drain_ack_mask{0};                 // bit per thread; all-set = all threads reached ack barrier
+    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
+    int32_t _pad[10];
+};
+static_assert(sizeof(SyncStartDrainState) == 64);
+
+#endif  // SCHEDULER_TYPES_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
deleted file mode 100644
index 24585db85..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
- *
- * Lives under runtime/shared/ so it is included in both the host_runtime.so
- * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
- * build (AICPU runs wire_arena_pointers + destroy after attach). The
- * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
- * (ops table, scope/submit/dispatch business logic, profiling) stay in their
- * original files and the aicpu build only.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "pto_orchestrator.h"
-#include "pto_runtime2.h"
-#include "pto_ring_buffer.h"
-#include "pto_shared_memory.h"
-#include "pto_tensormap.h"
-#include "scheduler/pto_scheduler.h"
-
-// =============================================================================
-// Ready queue
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    // Address the slots region for data writes without storing the pointer in
-    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
-    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        slots_arena[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
-    // ring stores the device address of the SM ring header — pure offset
-    // arithmetic, no SM load.
-    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-#if PTO2_PROFILING
-    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
-    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
-#endif
-
-    // Per-slot SM-side initialization (bind_ring + reset_for_reuse +
-    // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle::
-    // init_header_per_ring so the AICPU performs it during SM reset; host
-    // prebuilt-arena init skips SM access here.
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_data_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
-            return false;
-        }
-    }
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_data_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_data_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err);
-    }
-
-    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
-    PTO2SchedulerState *sched = this;
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
-    }
-    ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].dep_pool.base =
-            static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-    }
-    sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-    sched->wiring.queue.destroy();
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-}
-
-// =============================================================================
-// Orchestrator
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    PTO2OrchestratorLayout layout{};
-    layout.dep_pool_capacity = dep_pool_capacity;
-    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks = arena.reserve(
-        static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *)
-    );
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_data_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
-    uint64_t task_window_size
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    // Mirror the SM API's per-ring window-size shape so a future per-ring
-    // SM layout cannot silently disagree with the addresses we compute here.
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
-        task_window_sizes[r] = task_window_size;
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
-        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
-        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
-
-        orch->rings[r].task_allocator.init(
-            task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base,
-            heap_size, orch_err
-        );
-
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err);
-    }
-
-    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-void PTO2OrchestratorState::wire_arena_pointers(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
-) {
-    auto *orch = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-    }
-    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scheduler = scheduler_arg;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
-// =============================================================================
-// Top-level runtime arena
-// =============================================================================
-
-PTO2RuntimeArenaLayout
-runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
-    PTO2RuntimeArenaLayout layout{};
-    layout.task_window_size = task_window_size;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = static_cast<int32_t>(task_window_size);
-    }
-
-    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
-    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
-    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    layout.arena_size = arena.total_size();
-    return layout;
-}
-
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
-    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
-) {
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
-    memset(rt, 0, sizeof(*rt));
-
-    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
-    memset(sm_wrap, 0, sizeof(*sm_wrap));
-
-    // rt->ops is filled by the AICPU at boot.
-    rt->mode = mode;
-    rt->gm_heap = gm_heap_dev_base;
-    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
-    rt->gm_heap_owned = false;
-    rt->total_cycles = 0;
-
-    if (!rt->orchestrator.init_data_from_layout(
-            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size
-        )) {
-        return nullptr;
-    }
-    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
-        return nullptr;
-    }
-
-    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
-    memset(mailbox, 0, sizeof(*mailbox));
-
-    return rt;
-}
-
-void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
-    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
-    rt->scheduler.wire_arena_pointers(layout.sched, arena);
-}
-
-void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
-    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
-    if (!rt) return;
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;
-    rt->sm_handle = nullptr;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
deleted file mode 100644
index 1e1edff92..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - Shared Memory Implementation
- *
- * Implements shared memory allocation, initialization, and management
- * for Orchestrator-Scheduler communication.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_shared_memory.h"
-#include <inttypes.h>
-#include <stdlib.h>
-#include <string.h>
-#include "common/unified_log.h"
-
-// =============================================================================
-// Size Calculation
-// =============================================================================
-
-uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-    }
-    return calculate_size_per_ring(task_window_sizes);
-}
-
-uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    uint64_t size = 0;
-
-    // Header (aligned to cache line)
-    size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-
-    // Per-ring task descriptors and payloads
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-
-    return size;
-}
-
-// =============================================================================
-// Creation and Destruction
-// =============================================================================
-
-void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    char *ptr = (char *)sm_base;
-
-    // Header
-    header = (PTO2SharedMemoryHeader *)ptr;
-    ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-
-    // Per-ring task descriptors, payloads, and slot states
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto &ring = header->rings[r];
-        ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
-        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-
-        ring.task_payloads = (PTO2TaskPayload *)ptr;
-        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-
-        ring.slot_states = (PTO2TaskSlotState *)ptr;
-        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-}
-
-void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-    }
-    setup_pointers_per_ring(task_window_sizes);
-}
-
-bool PTO2SharedMemoryHandle::init(
-    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
-) {
-    if (!sm_base_arg || sm_size_arg == 0) return false;
-    if (sm_size_arg < calculate_size(task_window_size)) return false;
-
-    sm_base = sm_base_arg;
-    sm_size = sm_size_arg;
-    is_owner = false;
-    setup_pointers(task_window_size);
-    init_header(task_window_size, heap_size);
-    return true;
-}
-
-PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) {
-    const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
-    const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
-    if (arena.commit() == nullptr) return nullptr;
-
-    auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
-    memset(handle, 0, sizeof(*handle));
-    void *buffer = arena.region_ptr(off_buffer);
-    memset(buffer, 0, static_cast<size_t>(buffer_size));
-    if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
-    return handle;
-}
-
-void PTO2SharedMemoryHandle::destroy() {
-    // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
-    // calling destroy on them is a no-op so existing callers stay safe.
-    if (is_owner && sm_base) {
-        free(sm_base);
-        free(this);
-    }
-}
-
-// =============================================================================
-// Initialization
-// =============================================================================
-//
-// no need init data in pool, init pool data when used
-void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-        heap_sizes[r] = heap_size;
-    }
-    init_header_per_ring(task_window_sizes, heap_sizes);
-}
-
-void PTO2SharedMemoryHandle::init_header_per_ring(
-    const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    // Per-ring flow control (start at 0)
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        header->rings[r].fc.init();
-    }
-
-    header->orchestrator_done.store(0, std::memory_order_relaxed);
-
-    // Per-ring layout info
-    uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        header->rings[r].task_window_size = task_window_sizes[r];
-        header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
-        header->rings[r].heap_size = heap_sizes[r];
-        header->rings[r].task_descriptors_offset = offset;
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-
-    header->total_size = sm_size;
-    header->graph_output_ptr.store(0, std::memory_order_relaxed);
-    header->graph_output_size.store(0, std::memory_order_relaxed);
-
-    // Error reporting
-    header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
-    header->sched_error_bitmap.store(0, std::memory_order_relaxed);
-    header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
-    header->sched_error_thread.store(-1, std::memory_order_relaxed);
-
-    // Per-ring slot_states reset. Previously lived in
-    // PTO2SchedulerState::RingSchedState::init(), but it writes into
-    // ring->slot_states[] which is SM-side storage — keeping it here lets
-    // host-side prebuilt-arena init skip all SM dereferences.
-    // bind_ring() pins the ring_id (slot-invariant after this point);
-    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
-    // submit doesn't need an explicit reset.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto &ring = header->rings[r];
-        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
-            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
-            ring.slot_states[i].reset_for_reuse();
-            ring.slot_states[i].fanin_count = 0;
-            ring.slot_states[i].active_mask = ActiveMask{};
-        }
-    }
-}
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2SharedMemoryHandle::print_layout() {
-    if (!header) return;
-
-    PTO2SharedMemoryHeader *h = header;
-
-    LOG_INFO_V0("=== PTO2 Shared Memory Layout ===");
-    LOG_INFO_V0("Base address:       %p", sm_base);
-    LOG_INFO_V0("Total size:         %" PRIu64 " bytes", h->total_size);
-    LOG_INFO_V0("Ring depth:         %d", PTO2_MAX_RING_DEPTH);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        LOG_INFO_V0("Ring %d:", r);
-        LOG_INFO_V0("  task_window_size: %" PRIu64, h->rings[r].task_window_size);
-        LOG_INFO_V0("  heap_size:        %" PRIu64 " bytes", h->rings[r].heap_size);
-        LOG_INFO_V0(
-            "  descriptors_off:  %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset,
-            h->rings[r].task_descriptors_offset
-        );
-        LOG_INFO_V0("  current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire));
-        LOG_INFO_V0("  last_task_alive:  %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire));
-    }
-    LOG_INFO_V0("orchestrator_done:  %d", h->orchestrator_done.load(std::memory_order_acquire));
-    LOG_INFO_V0("Error state:");
-    LOG_INFO_V0("  orch_error_code:    %d", h->orch_error_code.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_code:   %d", h->sched_error_code.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed));
-    LOG_INFO_V0("================================");
-}
-
-bool PTO2SharedMemoryHandle::validate() {
-    if (!sm_base) return false;
-    if (!header) return false;
-
-    PTO2SharedMemoryHeader *h = header;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!h->rings[r].fc.validate(this, r)) return false;
-    }
-
-    return true;
-}
-
-bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const {
-    if (!handle) return false;
-    if (!handle->header) return false;
-    if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false;
-
-    const PTO2SharedMemoryHeader *h = handle->header;
-
-    // Check that offsets are within bounds
-    if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false;
-
-    // Check pointer alignment
-    if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false;
-
-    // Check flow control pointer sanity
-    int32_t current = current_task_index.load(std::memory_order_acquire);
-    int32_t last_alive = last_task_alive.load(std::memory_order_acquire);
-    if (current < 0) return false;
-    if (last_alive < 0) return false;
-
-    return true;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
deleted file mode 100644
index b99c67233..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - TensorMap Implementation
- *
- * Implements TensorMap with ring buffer pool, lazy invalidation,
- * and chain truncation optimization.
- *
- * Key features:
- * 1. O(1) insert at bucket head
- * 2. O(valid_entries) lookup with chain truncation
- * 3. Automatic stale entry cleanup during lookup
- * 4. Periodic explicit cleanup for long chains
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_tensormap.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "common.h"
-#include "common/unified_log.h"
-
-// =============================================================================
-// TensorMap Lookup Chain Length Statistics (compile-time toggle)
-// =============================================================================
-#if PTO2_TENSORMAP_PROFILING
-uint64_t g_lookup_chain_total = 0;
-uint64_t g_lookup_count = 0;
-int32_t g_lookup_chain_max = 0;
-uint64_t g_lookup_overlap_checks = 0;
-uint64_t g_lookup_overlap_hits = 0;
-uint64_t g_insert_count = 0;
-#endif
-
-// =============================================================================
-// Initialization and Destruction
-// =============================================================================
-
-PTO2TensorMapLayout PTO2TensorMap::reserve_layout(
-    DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size,
-    const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    // num_buckets must be a power of two for the hash truncation to work.
-    always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
-
-    PTO2TensorMapLayout layout{};
-    layout.num_buckets = new_num_buckets;
-    layout.pool_size = new_pool_size;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.task_window_sizes[r] = new_task_window_sizes[r];
-    }
-
-    layout.off_buckets = arena.reserve(
-        static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
-    );
-    layout.off_entry_pool =
-        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
-    layout.off_free_entry_list =
-        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.off_task_entry_heads[r] = arena.reserve(
-            static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
-        );
-    }
-    return layout;
-}
-
-PTO2TensorMapLayout
-PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
-}
-
-bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
-    num_buckets = layout.num_buckets;
-    pool_size = layout.pool_size;
-
-    // Address arena regions for data writes; do not store these in struct
-    // fields (wire_arena_pointers does that).
-    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-
-    // buckets[]: empty == nullptr.
-    for (int32_t i = 0; i < num_buckets; i++) {
-        buckets_arena[i] = nullptr;
-    }
-
-    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
-    // The pool's persistent invariant after init is "bucket_index == -1 means
-    // not linked", set explicitly below.
-    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
-    for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool_arena[i].bucket_index = -1;
-        entry_pool_arena[i].next_in_bucket = nullptr;
-        entry_pool_arena[i].prev_in_bucket = nullptr;
-        entry_pool_arena[i].next_in_task = nullptr;
-        entry_pool_arena[i].prev_in_task = nullptr;
-        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
-    }
-
-    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
-    // only after entries are freed back, so the body of the array stays as 0.
-    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
-
-    next_entry_idx = 0;
-    free_num = 0;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-        for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            heads_arena[i] = nullptr;
-        }
-        task_window_sizes[r] = layout.task_window_sizes[r];
-        last_task_alives[r] = 0;
-        last_cleanup[r] = 0;
-    }
-
-    return true;
-}
-
-void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-    }
-}
-
-void PTO2TensorMap::destroy() {
-    // Arena owns the backing memory; here we only forget our pointers so any
-    // stray post-destroy access trips a nullptr dereference instead of reading
-    // a recycled allocation.
-    buckets = nullptr;
-    entry_pool = nullptr;
-    free_entry_list = nullptr;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = nullptr;
-    }
-}
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2TensorMap::print_stats() {
-    int32_t valid = 0;
-    int32_t stale = 0;
-    int32_t empty_buckets = 0;
-    int32_t max_chain = 0;
-    int64_t total_chain = 0;
-    int32_t non_empty_buckets = 0;
-
-    // Count entries
-    for (int32_t i = 0; i < pool_size; i++) {
-        if (entry_pool[i].bucket_index != -1) {
-            if (entry_valid(entry_pool[i])) {
-                valid++;
-            } else {
-                stale++;
-            }
-        }
-    }
-
-    // Count bucket stats
-    for (int32_t b = 0; b < num_buckets; b++) {
-        int32_t chain_len = 0;
-        auto cur_entry = buckets[b];
-
-        while (cur_entry != nullptr) {
-            chain_len++;
-            cur_entry = cur_entry->next_in_bucket;
-        }
-
-        if (chain_len == 0) {
-            empty_buckets++;
-        } else {
-            non_empty_buckets++;
-            total_chain += chain_len;
-            if (chain_len > max_chain) {
-                max_chain = chain_len;
-            }
-        }
-    }
-
-    LOG_INFO_V0("=== TensorMap Statistics ===");
-    LOG_INFO_V0("Pool size:           %d", pool_size);
-    LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx);
-    LOG_INFO_V0("Pool free_num:       %d", free_num);
-    LOG_INFO_V0("Num buckets:         %d", num_buckets);
-    LOG_INFO_V0("Valid entries:       %d", valid);
-    LOG_INFO_V0("Stale entries:       %d", stale);
-    LOG_INFO_V0("Empty buckets:       %d", empty_buckets);
-    LOG_INFO_V0("Max chain len:       %d", max_chain);
-    LOG_INFO_V0("Avg chain len:       %.2f", non_empty_buckets > 0 ? (float)total_chain / non_empty_buckets : 0);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]);
-    }
-    LOG_INFO_V0("============================");
-}
-
-int32_t PTO2TensorMap::valid_count() {
-    int32_t count = 0;
-
-    for (int32_t i = 0; i < pool_size; i++) {
-        if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) {
-            count++;
-        }
-    }
-
-    return count;
-}
-
-void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) {
-    auto ring_id = task_id.ring();
-    auto local_id = task_id.local();
-    sync_validity(ring_id, sm_last_task_alive);
-
-    // Only attempt cleanup when last_task_alive has actually advanced;
-    // otherwise cleanup_retired would empty-loop and we'd spin forever.
-    auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
-    if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) {
-        cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
-        last_cleanup[ring_id] = sm_last_task_alive;
-    }
-}
-
-// =============================================================================
-// TensorMap Lookup Profiling
-// =============================================================================
-#if PTO2_TENSORMAP_PROFILING
-PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
-    PTO2TensorMapProfilingData d;
-    d.lookup_chain_total = g_lookup_chain_total;
-    d.lookup_count = g_lookup_count;
-    d.lookup_chain_max = g_lookup_chain_max;
-    d.overlap_checks = g_lookup_overlap_checks;
-    d.overlap_hits = g_lookup_overlap_hits;
-    d.insert_count = g_insert_count;
-
-    // Reset
-    g_lookup_chain_total = 0;
-    g_lookup_count = 0;
-    g_lookup_chain_max = 0;
-    g_lookup_overlap_checks = 0;
-    g_lookup_overlap_hits = 0;
-    g_insert_count = 0;
-    return d;
-}
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
deleted file mode 100644
index b3347b53c..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Runtime Class - Implementation
- *
- * Device execution and handshake control.
- * Task graph construction is handled by PTO2Runtime.
- */
-
-#include "runtime.h"
-
-#include "common/unified_log.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-
-// =============================================================================
-// Constructor
-// =============================================================================
-
-Runtime::Runtime() {
-    // NOTE: host_api is initialized in InitRuntime() (host-only code)
-    // because the CApi functions don't exist when compiled for device.
-
-    // Initialize handshake buffers
-    memset(workers, 0, sizeof(workers));
-    worker_count = 0;
-    aicpu_thread_num = 1;
-    ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
-    task_window_size = 0;
-    heap_size = 0;
-    dep_pool_size = 0;
-    orch_to_sched = false;
-
-    // Initialize device orchestration state
-    gm_sm_ptr_ = nullptr;
-    gm_heap_ptr_ = nullptr;
-    slot_states_ptr_ = nullptr;
-    orch_args_storage_.clear();
-    prebuilt_arena_base_ = nullptr;
-    prebuilt_runtime_offset_ = 0;
-
-    // Initialize device orchestration SO binary
-    dev_orch_so_addr_ = 0;
-    dev_orch_so_size_ = 0;
-    active_callable_id_ = -1;
-    register_new_callable_id_ = false;
-    device_orch_func_name_[0] = '\0';
-    device_orch_config_name_[0] = '\0';
-
-    // Initialize kernel binary tracking
-    registered_kernel_count_ = 0;
-
-    // Initialize function address mapping
-    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
-        func_id_to_addr_[i] = 0;
-    }
-}
-
-// =============================================================================
-// Device orchestration
-// =============================================================================
-
-void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; }
-void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; }
-const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; }
-void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; }
-void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
-void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
-void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
-
-void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
-    prebuilt_arena_base_ = arena_base;
-    prebuilt_runtime_offset_ = runtime_off;
-}
-void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
-size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
-
-// Device orchestration SO metadata (bytes live in a separate device buffer
-// owned by DeviceRunner; only the address/size travels in Runtime).
-void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
-    dev_orch_so_addr_ = dev_addr;
-    dev_orch_so_size_ = size;
-}
-
-uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
-
-uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
-
-void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
-    active_callable_id_ = callable_id;
-    register_new_callable_id_ = is_new;
-}
-
-int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
-
-bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
-
-void Runtime::set_device_orch_func_name(const char *name) {
-    if (name == nullptr) {
-        device_orch_func_name_[0] = '\0';
-        return;
-    }
-    std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
-    device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
-}
-
-const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; }
-
-void Runtime::set_device_orch_config_name(const char *name) {
-    if (name == nullptr) {
-        device_orch_config_name_[0] = '\0';
-        return;
-    }
-    std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
-    device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
-}
-
-const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; }
-
-uint64_t Runtime::get_function_bin_addr(int func_id) const {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
-    return func_id_to_addr_[func_id];
-}
-
-void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-        return;
-    }
-    if (addr != 0 && func_id_to_addr_[func_id] == 0) {
-        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
-            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
-        } else {
-            LOG_ERROR(
-                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
-                func_id
-            );
-        }
-    }
-    func_id_to_addr_[func_id] = addr;
-}
-
-void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-        return;
-    }
-    func_id_to_addr_[func_id] = addr;
-}
-
-int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
-
-int Runtime::get_registered_kernel_func_id(int index) const {
-    if (index < 0 || index >= registered_kernel_count_) return -1;
-    return registered_kernel_func_ids_[index];
-}
-
-void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/tensor.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/tensor.h
index 385fbf897..c0e6ac5c6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/tensor.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/tensor.h
@@ -24,50 +24,38 @@
 
 constexpr int RUNTIME_MAX_TENSOR_DIMS = 5;
 
-/**
- * Buffer Handle
- *
- * Represents a device memory buffer with address and total size in bytes.
- * This is the underlying memory allocation that a Tensor describes access patterns for.
- */
-struct PTOBufferHandle {
+struct PTOBufferHandle
+{
     uint64_t addr;  // Device memory address (bytes)
     uint64_t size;  // Total buffer size in bytes
 };
 
-enum class OverlapStatus {
+enum class OverlapStatus
+{
     NO_OVERLAP,
     COVERED,
     OTHER,
 };
 
-struct Segment {
+struct Segment
+{
     uint64_t begin;
     uint64_t end;
 
-    bool line_segment_intersection(const Segment &other) const { return end > other.begin && other.end > begin; }
-    bool contains(const Segment &other) const { return begin <= other.begin && other.end <= end; }
+    bool line_segment_intersection(const Segment &other) const
+    {
+        return end > other.begin && other.end > begin;
+    }
+    bool contains(const Segment &other) const
+    {
+        return begin <= other.begin && other.end <= end;
+    }
 };
 
-/**
- * TensorCreateInfo — submit-time create-info for runtime-allocated outputs.
- *
- * Carries the metadata required to materialize a fresh contiguous output:
- * dtype, ndims, shapes, manual_dep, and an optional initial value fill.
- *
- * Layout (64B) is aligned with Tensor cache line 1 so that
- * init_from_create_info() can copy the entire cache line with a single memcpy,
- * then overwrite buffer/owner metadata and compute the contiguous stride in
- * cache line 2.
- *
- * Arg::add_output() stores a pointer to this object, so the original
- * must remain valid (not a temporary) until after the submit call.
- */
-class alignas(64) TensorCreateInfo {
+class alignas(64) TensorCreateInfo
+{
 public:
-    TensorCreateInfo(
-        const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false
-    ) :
+    TensorCreateInfo(const uint32_t shapes_in[], uint32_t ndims_in, DataType dtype_in = DataType::FLOAT32, bool manual_dep_in = false) :
         initial_value(0),
         has_initial_value(false),
         __pad2__(0),
@@ -77,33 +65,31 @@ class alignas(64) TensorCreateInfo {
         dtype(dtype_in),
         manual_dep(manual_dep_in),
         is_contiguous(true),  // mirrors Tensor::is_contiguous; pre-set for create-info outputs
-        __pad_flags__(0) {
-        for (uint32_t i = 0; i < ndims_in; i++) {
-            shapes[i] = shapes_in[i];
-        }
+        __pad_flags__(0)
+    {
+        for (uint32_t i = 0; i < ndims_in; i++) shapes[i] = shapes_in[i];
     }
 
-    void copy(const TensorCreateInfo &other) { memcpy(this, &other, sizeof(other)); }
+    void copy(const TensorCreateInfo &other)
+    {
+        memcpy(this, &other, sizeof(other));
+    }
 
     template <typename T = uint64_t>
-    void set_initial_value(T value) {
+    void set_initial_value(T value)
+    {
         has_initial_value = true;
         initial_value = to_u64(value);
     }
 
-    uint64_t buffer_size_bytes() const {
+    uint64_t buffer_size_bytes() const
+    {
         uint64_t total = 1;
-        for (uint32_t i = 0; i < ndims; i++) {
-            total *= shapes[i];
-        }
+        for (uint32_t i = 0; i < ndims; i++) total *= shapes[i];
         return total * get_element_size(dtype);
     }
 
 public:
-    // --- Bytes [0, 32): TensorCreateInfo-only fields ---
-    // These occupy the same positions as Tensor::buffer, Tensor::owner_task_id,
-    // and Tensor::start_offset. The runtime overwrites owner metadata after the
-    // memcpy and recomputes start_offset / stride during payload materialization.
     uint64_t initial_value;
     bool has_initial_value;
     uint8_t __pad1__[7];
@@ -126,106 +112,47 @@ class alignas(64) TensorCreateInfo {
 
 static_assert(sizeof(TensorCreateInfo) == 64);
 
-/**
- * Tensor descriptor for Task input/output (128B = 2 cache lines)
- *
- * Describes a strided memory access pattern on Global Memory (GM) using:
- *   - `buffer`: underlying memory allocation (addr/size in bytes)
- *   - `start_offset`: 1D element offset of the view origin from `buffer.addr`
- *   - `shapes[i]`, `strides[i]`: per-dim view shape and **element** stride
- *
- * Stride semantics:
- *   - Element-granularity (matches start_offset). Byte offset of element
- *     `coords[]` is `(start_offset + Σ coords[i] · strides[i]) · dtype_bytes`.
- *   - strides[i] > 0 STRICTLY. Broadcast (stride=0) and negative slice step
- *     (stride<0) are NOT supported.
- *
- * Fast-path flags on cache line 1:
- *   - manual_dep: when true, dependency tracking is creator-only (skip OverlapMap)
- *   - is_contiguous: cached PyTorch-style contiguous flag — i.e.
- *     `strides[i] == prod(shapes[i+1..ndims-1])`. When true AND start_offset==0,
- *     all hot paths can compute extent_elem from `shapes` alone and never read
- *     cache line 2. NOTE: this is strictly tighter than the pre-#808
- *     `shapes[i] == raw_shapes[i]` test, but equivalent on every view the old
- *     (raw_shapes-based) encoding could express; the two only diverge on
- *     post-#808-only views (transpose / permute / slice-with-step results).
- *
- * Layout: cache line 1 holds hot-path fields (buffer, owner_task_id,
- * start_offset, version, ndims, dtype, flags, shapes); cache line 2 holds
- * stride + cached extent_elem.
- *
- * Construction:
- * Users cannot default-construct or directly construct a Tensor.
- * Valid Tensors are obtained only through controlled entry points:
- *   - make_tensor_external(...)
- *   - from_tensor_arg(...)
- *   - TaskOutputTensors returned by submit(...)
- *   - Tensor::view() / reshape() / transpose() / permute() / slice() on an existing valid Tensor
- */
-struct alignas(64) Tensor {
+struct alignas(64) Tensor
+{
     // === Cache line 1 (64B) — hot path ===
-    PTOBufferHandle buffer;    // Underlying memory buffer (addr in bytes, size in bytes)
-    PTO2TaskId owner_task_id;  // Creator task; PTO2TaskId::invalid() for external tensors
-    uint64_t start_offset;     // 1D ELEMENT offset of the view origin into `buffer`
-    int32_t version;           // Tensor version for overlap detection
-    uint32_t ndims;            // Number of dimensions used
-    DataType dtype;            // Data type of tensor elements
-    bool manual_dep;           // True when dependency tracking is creator-only (skip OverlapMap lookup/insert)
-    bool is_contiguous;        // Cached: strides[] == row_major_stride(shapes)
-    uint8_t _pad_cl1;          // Pad to align shapes[5] at byte 44
+    PTOBufferHandle buffer;                    // Underlying memory buffer (addr in bytes, size in bytes)
+    PTO2TaskId owner_task_id;                  // Creator task; PTO2TaskId::invalid() for external tensors
+    uint64_t start_offset;                     // 1D ELEMENT offset of the view origin into `buffer`
+    int32_t version;                           // Tensor version for overlap detection
+    uint32_t ndims;                            // Number of dimensions used
+    DataType dtype;                            // Data type of tensor elements
+    bool manual_dep;                           // True when dependency tracking is creator-only (skip OverlapMap lookup/insert)
+    bool is_contiguous;                        // Cached: strides[] == row_major_stride(shapes)
+    uint8_t _pad_cl1;                          // Pad to align shapes[5] at byte 44
     uint32_t shapes[RUNTIME_MAX_TENSOR_DIMS];  // Current view shape per dimension (elements)
 
-    // === Cache line 2 (64B) — warm path (view metadata) ===
-    // Field order: place the 8B-aligned cache before the 4B-aligned strides[]
-    // to avoid 4B padding between them (sizeof(Tensor) must stay 128).
     uint64_t extent_elem_cache;                 // Cached extent_elem (see extent_elem()); maintained by ops
     uint32_t strides[RUNTIME_MAX_TENSOR_DIMS];  // Element stride per dimension; ALWAYS > 0 (type-enforced)
     uint8_t _pad_cl2[36];                       // Reserved for future extension
 
-    // --- Copy / move / destroy ---
-    // Kept trivially copyable (default copy = byte-for-byte) so other modules
-    // (PTO2TensorMapEntry::copy_from_tensor, TensorCreateInfo memcpy path)
-    // can rely on memcpy semantics. The contiguous fast-path optimization
-    // lives in `init(const Tensor&)`; call sites that care should use
-    // `result.init(*this)` instead of the default copy ctor.
     Tensor(const Tensor &) = default;
     Tensor &operator=(const Tensor &) = default;
     Tensor(Tensor &&) = default;
     Tensor &operator=(Tensor &&) = default;
     ~Tensor() = default;
 
-    // ========================================================================
-    // Accessors / helpers
-    // ========================================================================
-
-    /// Number of logical elements covered by the view (NOT the extent).
-    /// ndims > 0 is a construction-time invariant (see init_external /
-    /// init_from_create_info), so the loop always runs at least once.
-    uint64_t numel() const {
+    uint64_t numel() const
+    {
         uint64_t total = 1;
-        for (uint32_t i = 0; i < ndims; i++)
-            total *= shapes[i];
+        for (uint32_t i = 0; i < ndims; i++) total *= shapes[i];
         return total;
     }
 
     /// Element extent — the smallest M such that every reachable element lies in [start_offset, start_offset+M).
     /// For strides[i]>0: extent_elem = 1 + Σ (shapes[i]-1) · strides[i].
-    uint64_t extent_elem() const {
+    uint64_t extent_elem() const
+    {
         if (is_contiguous) return numel();  // fast path: line 2 not needed when contiguous
         return extent_elem_cache;
     }
 
-    // ========================================================================
-    // Initialization (operates on already-constructed Tensor)
-    // ========================================================================
-
-    /// Initialize as a contiguous tensor that covers `shapes[]` starting at `addr`.
-    /// stride is set to row_major(shapes); start_offset = 0; is_contiguous = true.
-    /// Enforces the ndims > 0 invariant relied upon by every downstream op.
-    void init_external(
-        void *addr, uint64_t buffer_size_bytes, const uint32_t in_shapes[], uint32_t in_ndims, DataType in_dtype,
-        int32_t in_version, bool in_manual_dep = false
-    ) {
+    void init_external(void *addr, uint64_t buffer_size_bytes, const uint32_t in_shapes[], uint32_t in_ndims, DataType in_dtype, int32_t in_version, bool in_manual_dep = false)
+    {
         always_assert(in_ndims > 0 && in_ndims <= RUNTIME_MAX_TENSOR_DIMS);
         buffer = {reinterpret_cast<uint64_t>(addr), buffer_size_bytes};
         ndims = in_ndims;
@@ -236,11 +163,9 @@ struct alignas(64) Tensor {
         _pad_cl1 = 0;
         start_offset = 0;
         owner_task_id = PTO2TaskId::invalid();
-        // Single reverse pass: write shapes, accumulate row-major stride, and
-        // track numel — `s` ends as prod(shapes) which is also extent_elem
-        // for a contiguous view.
         uint32_t s = 1;
-        for (int32_t i = static_cast<int32_t>(in_ndims) - 1; i >= 0; --i) {
+        for (int32_t i = static_cast<int32_t>(in_ndims) - 1; i >= 0; --i)
+        {
             shapes[i] = in_shapes[i];
             strides[i] = s;
             s *= in_shapes[i];
@@ -248,111 +173,89 @@ struct alignas(64) Tensor {
         extent_elem_cache = s;
     }
 
-    /// Deep copy with contiguous fast-path optimization.
-    ///
-    /// Always copies cache line 1 (always needed: buffer, shapes, dtype, ...).
-    /// When `other` is in canonical contiguous form (is_contiguous &&
-    /// start_offset == 0), cache line 2 (stride / extent_elem_cache) is fully
-    /// derivable from line 1, so we **skip reading other's cache line 2** and
-    /// write dst's line 2 from the local shapes instead. Non-contiguous source
-    /// pays one line 2 read; contiguous source does not.
-    void init_from(const Tensor &other) {
+    void init_from(const Tensor &other)
+    {
         init_from_line1(other);
-        if (other.is_contiguous && other.start_offset == 0) {
+        if (other.is_contiguous && other.start_offset == 0)
+        {
             // Derive line 2 from line 1: stride = row-major of shapes; extent = numel.
             uint32_t s = 1;
-            for (int32_t i = static_cast<int32_t>(ndims) - 1; i >= 0; --i) {
+            for (int32_t i = static_cast<int32_t>(ndims) - 1; i >= 0; --i)
+            {
                 strides[i] = s;
                 s *= shapes[i];
             }
             extent_elem_cache = s;
-        } else {
+        }
+        else
+        {
             extent_elem_cache = other.extent_elem_cache;
-            for (uint32_t i = 0; i < other.ndims; i++) {
-                strides[i] = other.strides[i];
-            }
+            for (uint32_t i = 0; i < other.ndims; i++) strides[i] = other.strides[i];
             // _pad_cl2 left stale on purpose — reserved bytes are not
             // semantically read by any consumer.
         }
     }
 
-    /// View ops use this: copy cache line 1 only, leaving cache line 2 (stride,
-    /// extent_elem_cache) untouched. The op then mutates shapes / start_offset
-    /// in place and calls `refresh_derived()` to recompute line 2 once. This
-    /// avoids the wasted line 2 writes that `init_from()` would do just before
-    /// the op overwrites them.
-    void init_from_line1(const Tensor &other) { memcpy(this, &other, 64); }
+    void init_from_line1(const Tensor &other)
+    {
+        memcpy(this, &other, 64);
+    }
 
     /// Backward-compat alias used by orchestrator hot paths that need a full
     /// deep copy. Equivalent to `init_from(other)`.
-    void copy(const Tensor &other) { init_from(other); }
+    void copy(const Tensor &other)
+    {
+        init_from(other);
+    }
 
-    /// Materialize a TensorCreateInfo into this Tensor (fresh contiguous output).
-    /// Single 64B memcpy covers cache line 1; ci pre-initialises start_offset (=0)
-    /// and is_contiguous (=true) in its line-1 slots so they need no reset here.
-    /// Cache line 2 (stride/extent) is computed from `ci.shapes` in a single reverse pass.
-    void init_from_create_info(const TensorCreateInfo &ci, void *addr, uint64_t buffer_size) {
+    void init_from_create_info(const TensorCreateInfo &ci, void *addr, uint64_t buffer_size)
+    {
         always_assert(ci.ndims > 0 && ci.ndims <= RUNTIME_MAX_TENSOR_DIMS);
         memcpy(this, &ci, 64);
         buffer = {reinterpret_cast<uint64_t>(addr), buffer_size};
         owner_task_id = PTO2TaskId::invalid();  // caller (orchestrator) overwrites with actual task_id
         uint32_t s = 1;
-        for (int32_t i = static_cast<int32_t>(ndims) - 1; i >= 0; --i) {
+        for (int32_t i = static_cast<int32_t>(ndims) - 1; i >= 0; --i)
+        {
             strides[i] = s;
             s *= shapes[i];
         }
         extent_elem_cache = s;
-        if (ci.has_initial_value) {
-            fill_initial_value(ci.initial_value);
-        }
+        if (ci.has_initial_value) fill_initial_value(ci.initial_value);
     }
 
-    void fill_initial_value(uint64_t initial_value) {
+    void fill_initial_value(uint64_t initial_value)
+    {
         always_assert(reinterpret_cast<char *>(buffer.addr) != nullptr);
         uint64_t elem_size = get_element_size(dtype);
         char *dst = reinterpret_cast<char *>(buffer.addr);
         constexpr uint64_t blk_size = 64;
         uint64_t blk = (buffer.size < blk_size) ? buffer.size : blk_size;
-        for (uint64_t b = 0; b < blk; b += elem_size) {
-            memcpy(dst + b, &initial_value, elem_size);
-        }
+        for (uint64_t b = 0; b < blk; b += elem_size) memcpy(dst + b, &initial_value, elem_size);
         uint64_t filled = blk;
-        while (filled < buffer.size) {
+        while (filled < buffer.size)
+        {
             uint64_t copy_size = ((buffer.size - filled) < filled) ? (buffer.size - filled) : filled;
             memcpy(dst + filled, dst, copy_size);
             filled += copy_size;
         }
     }
 
-    // ========================================================================
-    // Address / offset computation
-    // ========================================================================
-
-    /// Compute 1D flat ELEMENT offset of `indices[]` from `buffer.addr`.
-    /// Callers multiply by `get_element_size(dtype)` to obtain a byte offset.
-    /// Works for any view (transpose / permute / slice / reshape).
-    uint64_t compute_flat_offset(const uint32_t indices[], uint32_t in_ndims) const {
+    uint64_t compute_flat_offset(const uint32_t indices[], uint32_t in_ndims) const
+    {
         uint64_t elem_off = start_offset;
-        for (uint32_t d = 0; d < in_ndims; d++) {
-            elem_off += static_cast<uint64_t>(indices[d]) * static_cast<uint64_t>(strides[d]);
-        }
+        for (uint32_t d = 0; d < in_ndims; d++) elem_off += static_cast<uint64_t>(indices[d]) * static_cast<uint64_t>(strides[d]);
         return elem_off;
     }
 
-    // ========================================================================
-    // View operations (zero-copy metadata rewrites)
-    // ========================================================================
-
-    /// Sub-tensor at per-dim offsets, with new per-dim shape.
-    /// Updates start_offset += Σ off[i]·strides[i]; shapes := new_shape; stride unchanged.
-    /// Each (offset[i], new_shape[i]) must stay within the current shapes[i] —
-    /// i.e. a view cannot expand any dimension beyond what the parent view sees.
-    Tensor view(const uint32_t view_shapes[], const uint32_t view_offsets[], bool in_manual_dep = false) const {
+    Tensor view(const uint32_t view_shapes[], const uint32_t view_offsets[], bool in_manual_dep = false) const
+    {
         Tensor result;
         // Copy line 1 only; stride from *this is still in result's line 2 garbage
         // — we need to bring it forward explicitly since view keeps stride.
         result.init_from_line1(*this);
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             debug_assert(view_offsets[i] + view_shapes[i] <= shapes[i]);
             result.start_offset += static_cast<uint64_t>(view_offsets[i]) * static_cast<uint64_t>(strides[i]);
             result.shapes[i] = view_shapes[i];
@@ -364,16 +267,19 @@ struct alignas(64) Tensor {
         return result;
     }
 
-    bool valid_transpose(uint32_t x, uint32_t y) const { return x < ndims && y < ndims; }
+    bool valid_transpose(uint32_t x, uint32_t y) const
+    {
+        return x < ndims && y < ndims;
+    }
 
     /// Swap two dimensions: shapes/stride swapped together. start_offset unchanged.
-    Tensor transpose(uint32_t x, uint32_t y, bool in_manual_dep = false) const {
+    Tensor transpose(uint32_t x, uint32_t y, bool in_manual_dep = false) const
+    {
         debug_assert(valid_transpose(x, y));
         Tensor result;
         result.init_from_line1(*this);
         // Carry forward source's stride before swapping (line 2 was not memcpy'd).
-        for (uint32_t i = 0; i < ndims; i++)
-            result.strides[i] = strides[i];
+        for (uint32_t i = 0; i < ndims; i++) result.strides[i] = strides[i];
         std::swap(result.shapes[x], result.shapes[y]);
         std::swap(result.strides[x], result.strides[y]);
         result.manual_dep = in_manual_dep;
@@ -383,10 +289,12 @@ struct alignas(64) Tensor {
 
     /// Permute dimensions according to `order[]` (length = ndims).
     /// Both shapes and stride are reordered in-place; start_offset unchanged.
-    Tensor permute(const uint32_t order[], bool in_manual_dep = false) const {
+    Tensor permute(const uint32_t order[], bool in_manual_dep = false) const
+    {
         Tensor result;
         result.init_from_line1(*this);
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             debug_assert(order[i] < ndims);
             result.shapes[i] = shapes[order[i]];
             result.strides[i] = strides[order[i]];
@@ -398,7 +306,8 @@ struct alignas(64) Tensor {
 
     /// Slice dimension `dim` with `[start, end)` and positive `step`.
     /// strides[dim] *= step; shapes[dim] = ⌈(end-start)/step⌉; start_offset += start·strides[dim_old].
-    Tensor slice(uint32_t dim, uint32_t start, uint32_t end, uint32_t step = 1, bool in_manual_dep = false) const {
+    Tensor slice(uint32_t dim, uint32_t start, uint32_t end, uint32_t step = 1, bool in_manual_dep = false) const
+    {
         debug_assert(dim < ndims);
         debug_assert(step >= 1);
         debug_assert(end > start);
@@ -406,8 +315,7 @@ struct alignas(64) Tensor {
         Tensor result;
         result.init_from_line1(*this);
         // Carry forward source's stride before patching the sliced dim.
-        for (uint32_t i = 0; i < ndims; i++)
-            result.strides[i] = strides[i];
+        for (uint32_t i = 0; i < ndims; i++) result.strides[i] = strides[i];
         const uint32_t old_stride_d = strides[dim];
         result.start_offset += static_cast<uint64_t>(start) * static_cast<uint64_t>(old_stride_d);
         const uint32_t new_len = (end - start + step - 1) / step;
@@ -419,19 +327,16 @@ struct alignas(64) Tensor {
         return result;
     }
 
-    bool valid_reshape(const uint32_t new_shapes[], uint32_t new_ndims) const {
+    bool valid_reshape(const uint32_t new_shapes[], uint32_t new_ndims) const
+    {
         uint64_t x = numel();
         uint64_t y = 1;
-        for (uint32_t i = 0; i < new_ndims; i++)
-            y *= new_shapes[i];
+        for (uint32_t i = 0; i < new_ndims; i++) y *= new_shapes[i];
         return x == y;
     }
 
-    /// Reshape — zero-copy only if source is_contiguous; otherwise asserts.
-    /// Materialize fallback (allocating a contiguous copy) is NOT in this op;
-    /// callers must reach contiguous via a copy before calling reshape on a
-    /// non-contiguous view.
-    Tensor reshape(const uint32_t new_shapes[], uint32_t new_ndims, bool in_manual_dep = false) const {
+    Tensor reshape(const uint32_t new_shapes[], uint32_t new_ndims, bool in_manual_dep = false) const
+    {
         debug_assert(valid_reshape(new_shapes, new_ndims));
         always_assert(is_contiguous);
         Tensor result;
@@ -440,7 +345,8 @@ struct alignas(64) Tensor {
         result.manual_dep = in_manual_dep;
         // Single reverse pass: write new shapes, accumulate row-major stride, track numel.
         uint32_t s = 1;
-        for (int32_t i = static_cast<int32_t>(new_ndims) - 1; i >= 0; --i) {
+        for (int32_t i = static_cast<int32_t>(new_ndims) - 1; i >= 0; --i)
+        {
             result.shapes[i] = new_shapes[i];
             result.strides[i] = s;
             s *= new_shapes[i];
@@ -450,11 +356,8 @@ struct alignas(64) Tensor {
         return result;
     }
 
-    // ========================================================================
-    // Dump for diagnostics
-    // ========================================================================
-
-    std::string dump() const {
+    std::string dump() const
+    {
         std::stringstream ss;
         std::string indent = "    ";
         ss << "{" << '\n';
@@ -466,13 +369,15 @@ struct alignas(64) Tensor {
         ss << indent << "start_offset: " << start_offset << " (elements)" << '\n';
         ss << indent << "is_contiguous: " << (is_contiguous ? "true" : "false") << '\n';
         ss << indent << "shapes: [";
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             if (i > 0) ss << ", ";
             ss << shapes[i];
         }
         ss << "]" << '\n';
         ss << indent << "strides: [";
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             if (i > 0) ss << ", ";
             ss << strides[i];
         }
@@ -486,30 +391,20 @@ struct alignas(64) Tensor {
     // Valid Tensors come only from controlled entry points.
     Tensor() = default;
 
-    Tensor(
-        void *addr, uint64_t buffer_size_bytes, const uint32_t in_shapes[], uint32_t in_ndims, DataType in_dtype,
-        int32_t in_version, bool in_manual_dep = false
-    ) {
+    Tensor(void *addr, uint64_t buffer_size_bytes, const uint32_t in_shapes[], uint32_t in_ndims, DataType in_dtype, int32_t in_version, bool in_manual_dep = false)
+    {
         init_external(addr, buffer_size_bytes, in_shapes, in_ndims, in_dtype, in_version, in_manual_dep);
     }
 
-    // ------------------------------------------------------------------------
-    // Internal helpers
-    // ------------------------------------------------------------------------
-
-    /// Recompute extent_elem_cache and is_contiguous from current shapes / stride.
-    /// Called after any op that mutates view metadata. Single reverse pass:
-    ///   extent_elem += (shapes[i] - 1) · strides[i]
-    ///   is_contiguous &&= (strides[i] == prod(shapes[i+1..]))
-    void refresh_derived() {
+    void refresh_derived()
+    {
         uint64_t e = 1;
         uint64_t expected = 1;
         bool contig = true;
-        for (int32_t i = static_cast<int32_t>(ndims) - 1; i >= 0; --i) {
+        for (int32_t i = static_cast<int32_t>(ndims) - 1; i >= 0; --i)
+        {
             if (strides[i] != expected) contig = false;
-            if (shapes[i] > 0) {
-                e += static_cast<uint64_t>(shapes[i] - 1) * static_cast<uint64_t>(strides[i]);
-            }
+            if (shapes[i] > 0) e += static_cast<uint64_t>(shapes[i] - 1) * static_cast<uint64_t>(strides[i]);
             expected *= shapes[i];
         }
         extent_elem_cache = e;
@@ -517,7 +412,8 @@ struct alignas(64) Tensor {
     }
 
     /// Assert the view stays inside the underlying buffer (byte-range safety).
-    void assert_in_buffer_bounds() const {
+    void assert_in_buffer_bounds() const
+    {
         const uint64_t elem_size = get_element_size(dtype);
         const uint64_t buffer_elems = buffer.size / elem_size;
         debug_assert(start_offset + extent_elem_cache <= buffer_elems);
@@ -525,9 +421,7 @@ struct alignas(64) Tensor {
 
     // Friends that need to construct Tensors
     friend struct PTO2TaskPayload;
-    friend inline Tensor make_tensor_external(
-        void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype, bool manual_dep, int32_t version
-    );
+    friend inline Tensor make_tensor_external(void *addr, const uint32_t shapes[], uint32_t ndims, DataType dtype, bool manual_dep, int32_t version);
 };
 
 static_assert(sizeof(Tensor) == 128, "Tensor must be exactly 2 cache lines (128 bytes)");
diff --git a/src/common/platform/onboard/aicpu/platform_aicpu_affinity.cpp b/src/common/platform/onboard/aicpu/platform_aicpu_affinity.cpp
index 56549f777..665f26ed0 100644
--- a/src/common/platform/onboard/aicpu/platform_aicpu_affinity.cpp
+++ b/src/common/platform/onboard/aicpu/platform_aicpu_affinity.cpp
@@ -18,137 +18,45 @@
 
 #include "common/unified_log.h"
 
-static constexpr int32_t AICPU_CORES_PER_CHIP = 8;
-static constexpr int32_t MAX_CLUSTERS = 2;
-static constexpr int32_t CPUS_PER_CLUSTER = 4;
 // 16 = headroom for a5's launch budget (14 logical user cpus on the
 // 0x7ffe SKU) + a small over-launch margin. a2a3 only ever launches 6
 // threads and never approaches this bound.
 static constexpr int32_t MAX_GATE_THREADS = 16;
 
-static std::atomic<uint64_t> s_cpumask{0};
-static std::atomic<int32_t> s_reported{0};
-static std::atomic<int32_t> s_gate_init{0};
-static std::atomic<int32_t> s_gate_ready{0};
-
-static int32_t s_thread_cpu[MAX_GATE_THREADS];
-static bool s_thread_survive[MAX_GATE_THREADS];
-
-static inline int32_t popcount64(uint64_t v) { return __builtin_popcountll(static_cast<unsigned long long>(v)); }
+static std::atomic<uint16_t> g_cpumask{0};
 
+/**
+ * This function determines which threads to use.
+ *
+ * It tries to use all the threads in the same NUMA domain (Both A2A3 and A5)
+ *
+ * @return: true if the thread is used, false if it gets dumped
+ */
 bool platform_aicpu_affinity_gate(int32_t logical_count, int32_t total_launched) {
+    // This should be impossible...
+    // Going to return false to dump all the threads.
     if (logical_count >= total_launched) {
-        return true;
-    }
-
-    // Assign thread index
-    int32_t idx = s_reported.fetch_add(1, std::memory_order_acq_rel);
-
-    // Report CPU
-#if defined(__aarch64__)
-    int32_t cpu = sched_getcpu();
-#elif defined(__x86_64__)
-    int32_t cpu = sched_getcpu();
-#else
-    int32_t cpu = -1;
-#endif
-
-    int32_t normalized_cpu = -1;
-    if (cpu >= 0) {
-        if (cpu < 63) {
-            s_cpumask.fetch_or(1ULL << cpu, std::memory_order_release);
-        }
-        normalized_cpu = cpu % AICPU_CORES_PER_CHIP;
-    }
-    if (idx < MAX_GATE_THREADS) {
-        s_thread_cpu[idx] = normalized_cpu;
+        LOG_ERROR("Illegal: logical_count=%d is greater or equal then total_launched=%d", logical_count, total_launched);
+        return false;
     }
 
-    // Barrier: wait until all total_launched threads have reported
-    while (popcount64(s_cpumask.load(std::memory_order_acquire)) < total_launched &&
-           s_reported.load(std::memory_order_acquire) < total_launched) {}
+    // Get current CPU ID
+    int cpu = sched_getcpu();
 
-    // CAS winner does cluster classification
-    int32_t expected = 0;
-    if (s_gate_init.compare_exchange_strong(expected, 1, std::memory_order_acq_rel, std::memory_order_acquire)) {
-        // Initialize survive flags
-        for (int32_t i = 0; i < total_launched; ++i) {
-            s_thread_survive[i] = false;
-        }
+    // At to cpumask
+    g_cpumask.fetch_or(1 << cpu, std::memory_order_relaxed);
 
-        struct ClusterInfo {
-            int32_t count{0};
-            int32_t tids[MAX_GATE_THREADS];
-        };
-        ClusterInfo clusters[MAX_CLUSTERS];
+    // Barrier wait until all the spawned threads are here before choosing which ones will be used.
+    while(__builtin_popcount(g_cpumask) < total_launched) {}
 
-        for (int32_t tid = 0; tid < total_launched; ++tid) {
-            int32_t c = s_thread_cpu[tid];
-            if (c < 0) continue;
-            int32_t cluster_id = c / CPUS_PER_CLUSTER;
-            if (cluster_id < 0 || cluster_id >= MAX_CLUSTERS) continue;
-            ClusterInfo &info = clusters[cluster_id];
-            if (info.count < MAX_GATE_THREADS) info.tids[info.count++] = tid;
-        }
+    // Choose the thread based on reverse bit order (highest cpu id to lowest)
+    // This assures that all the threads lie in the same NUMA domain
+    int how_many_on_top = __builtin_popcount(g_cpumask >> cpu);
+    bool will_be_used = how_many_on_top <= logical_count ? true : false;
 
-        int32_t major_id = (clusters[0].count >= clusters[1].count) ? 0 : 1;
-        int32_t minor_id = 1 - major_id;
-        int32_t major_cnt = clusters[major_id].count;
-        int32_t minor_cnt = clusters[minor_id].count;
+    LOG_INFO_V0("Thread[%d] how_many_on_top=%d, logical_count=%d, will_be_used=%d", cpu, how_many_on_top, logical_count, will_be_used);
 
-        LOG_INFO_V0(
-            "AICPU affinity gate: major=%d(cnt=%d) minor=%d(cnt=%d) logical=%d", major_id, major_cnt, minor_id,
-            minor_cnt, logical_count
-        );
-
-        if (major_cnt == logical_count && minor_cnt == (total_launched - logical_count)) {
-            // Expected topology: major cluster threads survive
-            for (int32_t i = 0; i < clusters[major_id].count; ++i) {
-                s_thread_survive[clusters[major_id].tids[i]] = true;
-            }
-        } else {
-            // Unexpected topology: fall back to first logical_count threads
-            LOG_WARN(
-                "AICPU affinity gate: unexpected topology (major=%d minor=%d), "
-                "falling back to index-based cutoff",
-                major_cnt, minor_cnt
-            );
-            for (int32_t i = 0; i < logical_count && i < total_launched; ++i) {
-                s_thread_survive[i] = true;
-            }
-        }
-
-        s_gate_ready.store(1, std::memory_order_release);
-    }
-
-    // Wait for classification to complete
-    while (s_gate_ready.load(std::memory_order_acquire) == 0) {}
-
-    bool survive = (idx < total_launched) ? s_thread_survive[idx] : false;
-
-    // Last thread resets state for next invocation
-    int32_t finished = s_reported.load(std::memory_order_acquire);
-    (void)finished;
-    // Reset is deferred: the statics persist but are re-initialized by the CAS winner
-    // on next call. We reset the atomics after all threads have read their result.
-    // Use a second atomic counter for cleanup.
-    static std::atomic<int32_t> s_cleanup{0};
-    int32_t cleanup_idx = s_cleanup.fetch_add(1, std::memory_order_acq_rel);
-    if (cleanup_idx + 1 == total_launched) {
-        s_cpumask.store(0, std::memory_order_release);
-        s_reported.store(0, std::memory_order_release);
-        s_gate_init.store(0, std::memory_order_release);
-        s_gate_ready.store(0, std::memory_order_release);
-        s_cleanup.store(0, std::memory_order_release);
-    }
-
-    if (!survive) {
-        LOG_INFO_V0("AICPU affinity gate: thread idx=%d cpu=%d DROPPED", idx, normalized_cpu);
-    } else {
-        LOG_INFO_V0("AICPU affinity gate: thread idx=%d cpu=%d ACTIVE", idx, normalized_cpu);
-    }
-
-    return survive;
+    return will_be_used;
 }
 
 // =============================================================================
@@ -286,4 +194,4 @@ bool platform_aicpu_affinity_gate_filter(const int32_t *allowed_cpus, int32_t al
     return survive;
 }
 
-int32_t platform_aicpu_affinity_thread_idx() { return tl_filter_exec_idx; }
+int32_t platform_aicpu_affinity_thread_idx() { return tl_filter_exec_idx; }
\ No newline at end of file

From 1f6b37bbaa5bd67beb1f671b886cd06bdbabf081 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 17 Jun 2026 11:21:02 +0200
Subject: [PATCH 02/14] Replace consumer->producer notification with watermark
 reclamation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the per-completion fanout_refcount notification from consumer tasks
to their fanin producers. Each ring now carries a single monotonic
completed_watermark — the highest local_id W such that every task 0..W
has reached COMPLETED. On submit, the orchestrator stamps each producer's
last_consumer_local_id with max(prev, self) (single-writer, plain
int32_t). On completion, the scheduler CAS-advances the watermark forward
through consecutive COMPLETED slots up to its own id, then retires tail
slots whose last_consumer_local_id is at or below the watermark.

Removes fanout_count/fanout_refcount, the CONSUMED state, on_task_release,
release_producer, check_and_handle_consumed, on_scope_end's release loop,
and the deferred_release_slot_states buffer threaded through
complete_slot_task / check_running_cores_for_completion /
poll_and_complete.

Case4 trimmed device avg: 1360 us. Case1 trimmed device avg:
28286 us (vs rebased baseline ~28801 us).
---
 .../runtime/pto_async_wait.h                  |   8 +-
 .../runtime/pto_orchestrator.h                |  19 +++-
 .../runtime/pto_runtime2.h                    |   7 +-
 .../runtime/pto_runtime2_types.h              |  29 ++---
 .../runtime/pto_scheduler.h                   | 106 +++++++-----------
 .../runtime/pto_shared_memory.h               |  14 ++-
 .../runtime/scheduler_context.h               |  31 +----
 7 files changed, 99 insertions(+), 115 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
index 429dd65b4..8bc1afa61 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
@@ -168,9 +168,6 @@ struct AsyncWaitList
     {
         PTO2SchedulerState *sched{nullptr};
         PTO2LocalReadyBuffer *local_bufs{nullptr};
-        PTO2TaskSlotState **deferred_release_slot_states{nullptr};
-        int32_t *deferred_release_count{nullptr};
-        int32_t deferred_release_capacity{0};
         int32_t inline_completed{0};
 
         bool can_inline_complete() const
@@ -179,8 +176,7 @@ struct AsyncWaitList
         }
     };
 
-    // Inline-complete a NotDeferred task during drain. Returns false on
-    // deferred_release_slot_states overflow.
+    // Inline-complete a NotDeferred task during drain.
     bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
 
     int32_t drain_aicore_completion_mailbox_locked(AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code)
@@ -270,7 +266,7 @@ struct AsyncWaitList
     }
 
     template <bool Profiling>
-    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity);
+    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs);
 };
 
 #endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 081d97bf8..1261f565b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -235,12 +235,10 @@ struct PTO2OrchestratorState
 
         bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
         int32_t begin = orch->scope_begins[orch->scope_stack_top--];
-        int32_t count = orch->scope_tasks_size - begin;
         if (ending_manual_scope) orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
 
-        if (orch->scheduler && count > 0) orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
-
-        // Rewind the task buffer — these entries are no longer needed
+        // Watermark-based reclamation: scope-end has no work to do — consumers
+        // no longer need to notify producers.
         orch->scope_tasks_size = begin;
     }
     TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const Arg &args)
@@ -473,6 +471,9 @@ inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t t
     out->slot_state->bind_buffers(out->payload, out->task);
 
     out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    // Seed last_consumer_local_id to self — with no consumers, the slot is
+    // safe to reclaim as soon as the watermark reaches this task itself.
+    out->slot_state->last_consumer_local_id = out->alloc_result.task_id;
     int16_t block_num = args.launch_spec.block_num();
     out->slot_state->total_required_subtasks = static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
     out->slot_state->logical_block_num = block_num;
@@ -564,7 +565,15 @@ inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const A
     task.packed_buffer_base = prepared.alloc_result.packed_base;
     task.packed_buffer_end = prepared.alloc_result.packed_end;
 
-    for (int32_t i = 0; i < fanin_builder.count; i++) fanin_builder.slots[i]->fanout_count++;
+    // Push this consumer's local_id into each producer's last_consumer high-
+    // water-mark, replacing the per-completion fanout_refcount notification.
+    // Reclamation gates on the global completed_watermark reaching this value.
+    const int32_t self_local = static_cast<int32_t>(task_id.local());
+    for (int32_t i = 0; i < fanin_builder.count; i++)
+    {
+        PTO2TaskSlotState *prod = fanin_builder.slots[i];
+        if (self_local > prod->last_consumer_local_id) prod->last_consumer_local_id = self_local;
+    }
 
     payload.fanin_count = fanin_builder.count;
     for (int32_t i = 0; i < fanin_builder.count; i++) payload.fanin_slot_states[i] = fanin_builder.slots[i];
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 004a386c5..d38e84cdf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -234,9 +234,14 @@ inline bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa
     auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
         uint8_t ring_id = slot.ring_id;
         int32_t local_id = slot.task->task_id.local();
+        // With watermark-based reclamation, "all consumers done" means the
+        // per-ring completed_watermark has reached this slot's recorded
+        // last_consumer_local_id.
+        PTO2SharedMemoryRingHeader &ring_hdr = rt->orchestrator.sm_header->rings[ring_id];
+        int32_t target = slot.last_consumer_local_id;
         uint64_t t0 = get_sys_cnt_aicpu();
         int32_t spin_count = 0;
-        while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1)
+        while (ring_hdr.completed_watermark.load(std::memory_order_acquire) < target)
         {
             SPIN_WAIT_HINT();
             if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index a22825088..742027aca 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -66,9 +66,10 @@ constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL;
 
 typedef enum
 {
-    PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
-    PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
-    PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
+    PTO2_TASK_PENDING = 0,   // Submitted; awaiting fanin, queued, or dispatched
+    PTO2_TASK_COMPLETED = 1  // Execution finished; per-ring completed_watermark
+                             // advances past this slot's last_consumer_local_id
+                             // to make its heap chunk reclaimable.
 } PTO2TaskState;
 
 struct PTO2TaskAllocResult
@@ -153,14 +154,17 @@ static_assert(sizeof(PTO2TaskPayload) == offsetof(PTO2TaskPayload, scalars) + MA
 
 struct alignas(64) PTO2TaskSlotState
 {
-    // Fanout: tracks producer->CONSUMED transition. Incremented by the
-    // orchestrator (+1 sentinel and once per consumer of this slot) and
-    // matched by release_producer in on_task_release.
-    int32_t fanout_count;
-    std::atomic<int32_t> fanout_refcount;
-
-    // Task state (PENDING/COMPLETED/CONSUMED). Polling readiness reads
-    // task_state on producer slots.
+    // Highest local task id among this slot's consumers. Set to this slot's
+    // own local_id in prepare_task; bumped via max() in submit_task_common for
+    // each consumer that has this slot as a fanin. The slot's heap chunk is
+    // safe to reclaim when the per-ring completed_watermark reaches at least
+    // this id (i.e. every task up to and including the last consumer has
+    // transitioned to COMPLETED). Single-writer (orchestrator) at submit time.
+    int32_t last_consumer_local_id;
+
+    // Task state (PENDING/COMPLETED). Polling readiness reads task_state on
+    // producer slots; reclamation gates on the completed_watermark instead of
+    // a separate CONSUMED transition.
     std::atomic<PTO2TaskState> task_state;
 
     PTO2TaskPayload *payload;
@@ -193,12 +197,11 @@ struct alignas(64) PTO2TaskSlotState
 
     void reset_for_reuse()
     {
-        fanout_count = 1;
-        fanout_refcount.store(0, std::memory_order_relaxed);
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx = 0;
         any_subtask_deferred.store(false, std::memory_order_relaxed);
         next_pending = nullptr;
+        // last_consumer_local_id is reset in prepare_task once the task_id is known.
     }
 };
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index 98a7f7c26..a7673bef3 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -378,13 +378,18 @@ struct PTO2SchedulerState
 
         void advance_ring_pointers()
         {
-            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
+            const int32_t watermark = ring->completed_watermark.load(std::memory_order_acquire);
             int32_t old_last_task_alive = last_task_alive;
 
-            while (last_task_alive < current_task_index)
+            // Retire any slot at the tail whose last consumer is at or below
+            // the global completed watermark — i.e. every consumer of this
+            // producer has reached COMPLETED. Implies this slot itself is
+            // COMPLETED because the seed value of last_consumer_local_id is
+            // the slot's own local_id.
+            while (last_task_alive <= watermark)
             {
                 PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
-                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) break;
+                if (watermark < slot_state.last_consumer_local_id) break;
                 last_task_alive++;
             }
 
@@ -506,29 +511,6 @@ struct PTO2SchedulerState
         return drained + routed;
     }
 
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state)
-    {
-        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
-
-        PTO2TaskState expected = PTO2_TASK_COMPLETED;
-        if (!slot_state.task_state.compare_exchange_strong(expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire)) return;
-
-        int32_t ring_id = slot_state.ring_id;
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed))
-        {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
-        }
-    }
-
-    void release_producer(PTO2TaskSlotState &slot_state)
-    {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        check_and_handle_consumed(slot_state);
-    }
-
     int get_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
     {
         int count = 0;
@@ -538,44 +520,49 @@ struct PTO2SchedulerState
         return count;
     }
 
-    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count)
-    {
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++)
-        {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer(*task_slot_states[i]);
-        }
-    }
-
     bool on_subtask_complete(PTO2TaskSlotState &slot_state)
     {
         int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
         return (prev + 1) == slot_state.total_required_subtasks;
     }
 
-    void on_mixed_task_complete(
-        PTO2TaskSlotState &slot_state,
-
-        [[maybe_unused]] PTO2LocalReadyBuffer *local_bufs = nullptr
-    )
+    // Publish this slot as COMPLETED, then advance the per-ring monotonic
+    // completed_watermark — the highest local_id W such that every task
+    // 0..W has reached COMPLETED. Reclamation in advance_ring_pointers gates
+    // on watermark >= producer.last_consumer_local_id, so no consumer→producer
+    // notification edge is needed.
+    void on_mixed_task_complete(PTO2TaskSlotState &slot_state, [[maybe_unused]] PTO2LocalReadyBuffer *local_bufs = nullptr)
     {
-        // Polling model: just publish COMPLETED. Thread 0's pending-poll loop
-        // observes producer task_state and routes consumers when their fanin
-        // is satisfied.
         slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-    }
 
-    int32_t on_task_release(PTO2TaskSlotState &slot_state)
-    {
-        PTO2TaskPayload *payload = slot_state.payload;
-        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
-            release_producer(*producer_slot_state);
-        });
+        const int32_t my_id = static_cast<int32_t>(slot_state.task->task_id.local());
+        int32_t ring_id = slot_state.ring_id;
+        auto &rss = ring_sched_states[ring_id];
+        auto &ring = *rss.ring;
+
+        // CAS-advance the watermark, bounded by my_id (which we know is
+        // published since we just completed it). If a forward task we observe
+        // as COMPLETED is also published, but a gap remains, we stop — the
+        // task filling the gap will resume the walk when it completes.
+        int32_t w = ring.completed_watermark.load(std::memory_order_acquire);
+        while (w < my_id)
+        {
+            int32_t next = w + 1;
+            PTO2TaskSlotState &cand = ring.get_slot_state_by_task_id(next);
+            if (cand.task_state.load(std::memory_order_acquire) != PTO2_TASK_COMPLETED) break;
+            if (ring.completed_watermark.compare_exchange_weak(w, next, std::memory_order_acq_rel, std::memory_order_acquire))
+            {
+                w = next;
+            }
+        }
 
-        // Self consumed check
-        check_and_handle_consumed(slot_state);
-        return payload->fanin_count;
+        // Try to retire slots whose last consumer has reached COMPLETED.
+        int32_t expected_lock = 0;
+        if (rss.advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed))
+        {
+            rss.advance_ring_pointers();
+            rss.advance_lock.store(0, std::memory_order_release);
+        }
     }
 
     // === Cold-path API ===
@@ -642,15 +629,12 @@ struct PTO2SchedulerState
 inline bool AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state)
 {
     sink.sched->on_mixed_task_complete(slot_state, sink.local_bufs);
-    if (*sink.deferred_release_count >= sink.deferred_release_capacity)
-        while (*sink.deferred_release_count > 0) sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
-    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
     sink.inline_completed++;
     return true;
 }
 
 template <bool Profiling>
-inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs, PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity)
+inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs)
 {
     AsyncPollResult result;
     if (!try_lock()) return result;
@@ -658,9 +642,6 @@ inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox
     AsyncWaitList::DrainCompletionSink sink{};
     sink.sched = sched;
     sink.local_bufs = local_bufs;
-    sink.deferred_release_slot_states = deferred_release_slot_states;
-    sink.deferred_release_count = &deferred_release_count;
-    sink.deferred_release_capacity = deferred_release_capacity;
 
     int32_t drain_err = PTO2_ERROR_NONE;
     drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
@@ -708,9 +689,6 @@ inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox
         if (entry.normal_done && entry.waiting_completion_count <= 0)
         {
             sched->on_mixed_task_complete(*entry.slot_state, local_bufs);
-            if (deferred_release_count >= deferred_release_capacity)
-                while (deferred_release_count > 0) sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
             result.completed++;
 
             int32_t last = count - 1;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index 47c2115be..a52366993 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -39,6 +39,12 @@ struct alignas(64) PTO2SharedMemoryRingHeader
 {
     PTO2RingFlowControl fc;
 
+    // Highest task_id such that every task with id in [0, completed_watermark]
+    // has reached COMPLETED. Maintained at task-completion time. Used to gate
+    // slot reclamation: a producer slot P is safe to retire when
+    // completed_watermark >= P.last_consumer_local_id.
+    alignas(64) std::atomic<int32_t> completed_watermark;
+
     // Layout metadata (set once at init)
     uint64_t task_window_size;
     int32_t task_window_mask;
@@ -223,7 +229,13 @@ struct PTO2SharedMemoryHandle
     void init_header_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH])
     {
         // Per-ring flow control (start at 0)
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) header->rings[r].fc.init();
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            header->rings[r].fc.init();
+            // -1 = "no task completed yet"; first task to complete (local_id 0)
+            // will advance the watermark to 0.
+            header->rings[r].completed_watermark.store(-1, std::memory_order_relaxed);
+        }
 
         header->orchestrator_done.store(0, std::memory_order_relaxed);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
index b2c178a92..6e0f71b08 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -36,8 +36,6 @@
 #define unlikely(x) __builtin_expect(!!(x), 0)
 #endif
 
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-
 inline void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code)
 {
     if (header == nullptr || error_code == PTO2_ERROR_NONE) return;
@@ -224,8 +222,6 @@ class SchedulerContext
         PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
         PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
         for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
-        PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
-        int32_t deferred_release_count = 0;
 
         bool cores_released = false;
 
@@ -254,7 +250,7 @@ class SchedulerContext
             int32_t completed_this_turn = 0;
 
             bool try_completed = tracker.has_any_running_cores();
-            if (try_completed) check_running_cores_for_completion(thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, deferred_release_slot_states, deferred_release_count, local_bufs);
+            if (try_completed) check_running_cores_for_completion(thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, local_bufs);
             if (completed_this_turn > 0)
             {
                 int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
@@ -269,7 +265,7 @@ class SchedulerContext
 
             if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending()))
             {
-                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count, PTO2_DEFERRED_RELEASE_CAP);
+                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_, local_bufs);
                 if (poll_result.error_code != PTO2_ERROR_NONE)
                 {
                     int32_t expected = PTO2_ERROR_NONE;
@@ -311,9 +307,6 @@ class SchedulerContext
                 {
                     PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
                     sched_->on_mixed_task_complete(dummy_slot, local_bufs);
-                    deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
-                    if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP)
-                        while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
                     int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
                     last_progress_count = prev + 1;
                     cur_thread_completed++;
@@ -335,7 +328,6 @@ class SchedulerContext
             }
             else
             {
-                while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
                 idle_iterations++;
 
                 if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0)
@@ -356,8 +348,6 @@ class SchedulerContext
             }
         }
 
-        while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-
         return cur_thread_completed;
     }
 
@@ -1089,7 +1079,7 @@ class SchedulerContext
         return t;
     }
 
-    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, Handshake *hank, int32_t &completed_this_turn, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs)
+    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, Handshake *hank, int32_t &completed_this_turn, PTO2LocalReadyBuffer *local_bufs)
     {
         (void)hank;
         AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
@@ -1150,15 +1140,6 @@ class SchedulerContext
         if (mixed_complete && !defer_completion_to_consumer)
         {
             sched_->on_mixed_task_complete(slot_state, local_bufs);
-            if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP)
-            {
-                deferred_release_slot_states[deferred_release_count++] = &slot_state;
-            }
-            else
-            {
-                while (deferred_release_count > 0) sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-                deferred_release_slot_states[deferred_release_count++] = &slot_state;
-            }
             completed_this_turn++;
         }
     }
@@ -1177,7 +1158,7 @@ class SchedulerContext
         core.running_reg_task_id = AICPU_TASK_INVALID;
     }
 
-    void check_running_cores_for_completion(int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs)
+    void check_running_cores_for_completion(int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress, PTO2LocalReadyBuffer *local_bufs)
     {
         CoreTracker &tracker = core_trackers_[thread_idx];
         auto running_core_states = tracker.get_all_running_cores();
@@ -1200,12 +1181,12 @@ class SchedulerContext
             // 1. Complete finished tasks (capture pointers before modifying core state)
             if (t.pending_done)
             {
-                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, hank, completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs);
+                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, hank, completed_this_turn, local_bufs);
                 cur_thread_completed++;
             }
             if (t.running_done)
             {
-                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, hank, completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs);
+                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, hank, completed_this_turn, local_bufs);
                 cur_thread_completed++;
             }
 

From f1387d58799a396ea80f20cc85e3f4f356fc27c2 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 17 Jun 2026 11:41:50 +0200
Subject: [PATCH 03/14] Poll fanin readiness via compact byte array

Replace the per-fanin pointer chase to producer slot_state.task_state
with a byte read from a contiguous per-ring completion_flags array
indexed by producer local_id & task_window_mask. Each task carries
fanin_local_ids[] (4B per id) in place of fanin_slot_states[] (8B
per pointer), and the completer writes a single byte instead of
publishing through a 128B-aligned slot.

For Case1's working set (16384 slots), the flag array is 16KB and
fits L1. Thread 0's fanin_satisfied polling now condenses 16 fanin
checks into 1-2 cache lines instead of one per producer slot.

The orchestrator clears the new slot's byte in prepare_task before
the wiring-queue push (release) makes it visible to thread 0; reset
happens single-threaded so no atomic is needed. The completer's set
uses release ordering to publish the producer's output writes to
acquire-loading consumers.

Case4 trimmed device avg: 1308 us (was 1360). Case1 trimmed device
avg: 28047 us (was 28286); trimmed host avg: 292834 us (was 453591).
---
 .../runtime/pto_orchestrator.h                | 37 ++++++++++++-------
 .../runtime/pto_ring_buffer.h                 | 31 ----------------
 .../runtime/pto_runtime2_types.h              |  9 +++--
 .../runtime/pto_scheduler.h                   | 10 ++++-
 .../runtime/pto_shared_memory.h               | 13 +++++++
 5 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 1261f565b..4d5cf0138 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -46,12 +46,7 @@ struct PTO2FaninBuilder
 {
     int32_t count{0};
     PTO2TaskSlotState *slots[PTO2_MAX_FANIN];
-
-    template <typename Fn>
-    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const
-    {
-        return for_each_fanin_in(slots, count, static_cast<Fn &&>(fn));
-    }
+    int32_t local_ids[PTO2_MAX_FANIN];
 
     bool contains(PTO2TaskSlotState *prod_state) const
     {
@@ -68,7 +63,7 @@ inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code,
 inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
 inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out);
 inline PTO2OutputLayout calculate_output_layout(const Arg &args);
-inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder);
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder);
 inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator);
 inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count);
 inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id);
@@ -355,7 +350,12 @@ struct PTO2OrchestratorState
         payload.init(args, outputs, prepared.alloc_result, layout);
         payload.fanin_count = 0;
 
-        if (prepared.slot_state != nullptr) prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+        if (prepared.slot_state != nullptr)
+        {
+            prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+            uint8_t ring_id = prepared.task_id.ring();
+            orch->sm_header->rings[ring_id].completion_flags[prepared.alloc_result.slot].store(1, std::memory_order_release);
+        }
         orch->inline_completed_tasks++;
 
         return outputs;
@@ -398,7 +398,7 @@ inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code,
     (void)message;
 }
 
-inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, PTO2FaninBuilder *fanin_builder)
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder)
 {
     if (fanin_builder->contains(prod_state)) return true;
     if (fanin_builder->count >= PTO2_MAX_FANIN)
@@ -406,7 +406,9 @@ inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState
         orch_mark_fatal(orch, PTO2_ERROR_DEPENDENCY_OVERFLOW);
         return false;
     }
-    fanin_builder->slots[fanin_builder->count++] = prod_state;
+    int32_t idx = fanin_builder->count++;
+    fanin_builder->slots[idx] = prod_state;
+    fanin_builder->local_ids[idx] = prod_local_id;
     return true;
 }
 
@@ -471,6 +473,12 @@ inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t t
     out->slot_state->bind_buffers(out->payload, out->task);
 
     out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
+    // Clear the polling-fast completion byte for the newly-allocated slot.
+    // The previous incarnation's completer set this byte to 1; we publish 0
+    // before this task can be added as a fanin to any consumer (single-
+    // orchestrator-thread guarantee) and before the wiring-queue push
+    // (release-acquire) makes the slot visible to thread 0.
+    orch->sm_header->rings[ring_id].completion_flags[out->alloc_result.slot].store(0, std::memory_order_relaxed);
     // Seed last_consumer_local_id to self — with no consumers, the slot is
     // safe to reclaim as soon as the watermark reaches this task itself.
     out->slot_state->last_consumer_local_id = out->alloc_result.task_id;
@@ -541,7 +549,7 @@ inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const A
         int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
         if (dep_local_task_id < dep_last_task_alive) continue;
         PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_task_id(dep_local_task_id);
-        if (!append_fanin_or_fail(orch, producer_slot_state, &fanin_builder)) return result;
+        if (!append_fanin_or_fail(orch, producer_slot_state, dep_local_task_id, &fanin_builder)) return result;
     }
 
     DepInputs dep_inputs{
@@ -549,8 +557,9 @@ inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const A
     };
 
     auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
-        PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(producer_task_id.local());
-        return append_fanin_or_fail(orch, prod_state, &fanin_builder);
+        int32_t prod_local = static_cast<int32_t>(producer_task_id.local());
+        PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(prod_local);
+        return append_fanin_or_fail(orch, prod_state, prod_local, &fanin_builder);
     };
 
     if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) return result;
@@ -576,7 +585,7 @@ inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const A
     }
 
     payload.fanin_count = fanin_builder.count;
-    for (int32_t i = 0; i < fanin_builder.count; i++) payload.fanin_slot_states[i] = fanin_builder.slots[i];
+    for (int32_t i = 0; i < fanin_builder.count; i++) payload.fanin_local_ids[i] = fanin_builder.local_ids[i];
 
     payload.init(args, result, prepared.alloc_result, layout);
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index ebc91f324..3faef6b4c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -243,37 +243,6 @@ class PTO2TaskAllocator
     }
 };
 
-template <typename Fn>
-using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
-
-template <typename Fn>
-using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
-
-template <typename Slots, typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_in(Slots &&slot_states, int32_t fanin_count, Fn &&fn)
-{
-    using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
-    static_assert(std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>, "fanin callback must return void or bool");
-
-    if constexpr (std::is_void_v<FaninCallbackResult>)
-    {
-        for (int32_t i = 0; i < fanin_count; i++) fn(slot_states[i]);
-    }
-    else
-    {
-        for (int32_t i = 0; i < fanin_count; i++)
-            if (!fn(slot_states[i])) return false;
-        return true;
-    }
-}
-
-template <typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn)
-{
-    return for_each_fanin_in(payload.fanin_slot_states, payload.fanin_count, static_cast<Fn &&>(fn));
-}
-
-
 struct PTO2RingSet
 {
     PTO2TaskAllocator task_allocator;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 742027aca..910e17f24 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -112,8 +112,11 @@ struct PTO2TaskPayload
     // === Cache lines 0-2 (192B) — metadata + fanin ===
     int32_t tensor_count{0};
     int32_t scalar_count{0};
-    int32_t fanin_count{0};  // Number of valid entries in fanin_slot_states
-    PTO2TaskSlotState *fanin_slot_states[PTO2_MAX_FANIN];
+    int32_t fanin_count{0};  // Number of valid entries in fanin_local_ids
+    // Local ids of fanin producers, used by the thread-0 polling loop to
+    // index a compact ring-level completion_flags byte array. Avoids a
+    // pointer chase per fanin into a 128B-aligned slot_state.
+    int32_t fanin_local_ids[PTO2_MAX_FANIN];
     // === Tensors (Tensor is alignas(64); array is naturally aligned) ===
     Tensor tensors[MAX_TENSOR_ARGS];
     // === Scalars ===
@@ -148,7 +151,7 @@ struct PTO2TaskPayload
 };
 
 // PTO2TaskPayload layout verification (offsetof requires complete type).
-static_assert(offsetof(PTO2TaskPayload, fanin_slot_states) == 16, "fanin array must follow metadata words");
+static_assert(offsetof(PTO2TaskPayload, fanin_local_ids) == 12, "fanin array must follow metadata words");
 static_assert(offsetof(PTO2TaskPayload, scalars) == offsetof(PTO2TaskPayload, tensors) + MAX_TENSOR_ARGS * sizeof(Tensor), "scalars must immediately follow tensors");
 static_assert(sizeof(PTO2TaskPayload) == offsetof(PTO2TaskPayload, scalars) + MAX_SCALAR_ARGS * sizeof(uint64_t), "no trailing padding after scalars");
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index a7673bef3..3df1c0226 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -463,8 +463,11 @@ struct PTO2SchedulerState
     bool fanin_satisfied(PTO2TaskSlotState *s) const
     {
         const PTO2TaskPayload &p = *s->payload;
+        const auto &ring = *ring_sched_states[s->ring_id].ring;
+        const int32_t mask = ring.task_window_mask;
+        std::atomic<uint8_t> *flags = ring.completion_flags;
         for (int32_t i = 0; i < p.fanin_count; i++)
-            if (p.fanin_slot_states[i]->task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) return false;
+            if (flags[p.fanin_local_ids[i] & mask].load(std::memory_order_acquire) == 0) return false;
         return true;
     }
 
@@ -540,6 +543,11 @@ struct PTO2SchedulerState
         auto &rss = ring_sched_states[ring_id];
         auto &ring = *rss.ring;
 
+        // Publish to the polling-fast completion array. Release ordering
+        // makes the producer's output writes visible to consumers that
+        // acquire-load this byte in fanin_satisfied.
+        ring.completion_flags[my_id & ring.task_window_mask].store(1, std::memory_order_release);
+
         // CAS-advance the watermark, bounded by my_id (which we know is
         // published since we just completed it). If a forward task we observe
         // as COMPLETED is also published, but a gap remains, we stop — the
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index a52366993..a5e029ee8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -56,6 +56,15 @@ struct alignas(64) PTO2SharedMemoryRingHeader
     PTO2TaskPayload *task_payloads;
     PTO2TaskSlotState *slot_states;
 
+    // Compact contiguous array (one byte per slot) holding the polling-fast
+    // "task X completed?" flag. 0 = pending, 1 = completed. Indexed by
+    // local_id & task_window_mask. Writer: the task's completer at
+    // on_mixed_task_complete; Resetter: orchestrator in prepare_task for the
+    // newly-allocated slot. Reader: thread-0 fanin polling. Replaces a chain
+    // of 128B-aligned slot_state pointer derefs with byte reads into a single
+    // array — typically condenses 16 fanin checks into 1-2 cache lines.
+    std::atomic<uint8_t> *completion_flags;
+
     PTO2TaskDescriptor &get_task_by_slot(int32_t slot)
     {
         return task_descriptors[slot];
@@ -149,6 +158,7 @@ struct PTO2SharedMemoryHandle
             size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
             size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
             size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
         }
 
         return size;
@@ -299,6 +309,9 @@ struct PTO2SharedMemoryHandle
 
             ring.slot_states = (PTO2TaskSlotState *)ptr;
             ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+
+            ring.completion_flags = (std::atomic<uint8_t> *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
         }
     }
 };

From 4eaaf840c2c78e60d4ff873572d519aeca52e261 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Wed, 17 Jun 2026 12:01:22 +0200
Subject: [PATCH 04/14] Move pending FIFO out of PTO2TaskSlotState
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the intrusive next_pending pointer in PTO2TaskSlotState with a
thread-0-private circular FIFO of slot pointers, sized to the per-ring
task window (PTO2_TASK_WINDOW_SIZE) and allocated from the scheduler
arena. Same memory budget (was 8B per slot × window_size; now one
contiguous buffer of the same total size), but keeps scheduler-private
linkage out of the task struct.

Push/pop become array writes/reads at head_idx/tail_idx & mask. The
buffer's cache lines amortize across 64 entries per line, matching the
hit rate the old design got from co-locating next_pending with the
slot_state cache line that fanin_satisfied already loaded.

Case4 trimmed device avg: 1319 us (was 1308 us). Case1 trimmed device
avg: 28080 us (was 28047 us). Differences are within shared-box noise.
---
 .../runtime/pto_runtime2_types.h              |  4 --
 .../runtime/pto_scheduler.h                   | 63 +++++++++++--------
 2 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 910e17f24..6d2fa9ba5 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -173,9 +173,6 @@ struct alignas(64) PTO2TaskSlotState
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
-    // Intrusive linkage for the thread-0 pending-readiness queue.
-    PTO2TaskSlotState *next_pending{nullptr};
-
     // --- Set per-submit (depend on task inputs) ---
     ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
     uint8_t ring_id;         // Ring layer (immutable after init)
@@ -203,7 +200,6 @@ struct alignas(64) PTO2TaskSlotState
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx = 0;
         any_subtask_deferred.store(false, std::memory_order_relaxed);
-        next_pending = nullptr;
         // last_consumer_local_id is reset in prepare_task once the task_id is known.
     }
 };
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index 3df1c0226..a23139157 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -345,8 +345,10 @@ struct PTO2SchedulerLayout
     size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
     size_t off_dummy_ready_queue_slots;
     size_t off_pending_spsc_buffer;
+    size_t off_pending_buffer;
     uint64_t ready_queue_capacity;
     uint64_t spsc_capacity;
+    uint64_t pending_capacity;
 };
 
 struct PTO2SchedulerState
@@ -406,9 +408,11 @@ struct PTO2SchedulerState
     // the dispatch loop and completed inline -- never goes to AICore.
     PTO2ReadyQueue dummy_ready_queue;
 
-    // Thread 0 exclusive: intrusive pending list of tasks awaiting fanin
-    // readiness. SPSC queue receives slot_states from the orchestrator; thread 0
-    // drains them into the pending list and polls fanin producers' task_state.
+    // Thread 0 exclusive: circular FIFO of tasks awaiting fanin readiness.
+    // SPSC queue receives slot_states from the orchestrator; thread 0 drains
+    // them into the pending ring and polls fanin readiness. Storing the FIFO
+    // out of band (instead of intrusively in PTO2TaskSlotState) keeps the
+    // task struct free of scheduler-private state.
     struct alignas(64) PendingState
     {
         static constexpr int BACKOFF_LIMIT = 32;
@@ -416,9 +420,11 @@ struct PTO2SchedulerState
         static constexpr int POLL_MAX_PER_ITER = 128;
 
         // --- Thread 0 exclusive ---
-        PTO2TaskSlotState *pending_head{nullptr};
-        PTO2TaskSlotState *pending_tail{nullptr};
-        int32_t pending_count{0};
+        PTO2TaskSlotState **pending_buf{nullptr};  // capacity slots, arena-owned
+        uint32_t pending_cap{0};
+        uint32_t pending_mask{0};
+        uint32_t pending_head_idx{0};  // next pop
+        uint32_t pending_tail_idx{0};  // next push
         int backoff_counter{0};
         PTO2TaskSlotState *drain_buf[DRAIN_BATCH];
 
@@ -427,6 +433,9 @@ struct PTO2SchedulerState
 
         // --- Orchestrator write, thread 0 read ---
         alignas(64) std::atomic<bool> orch_needs_drain{false};
+
+        uint32_t pending_count() const { return pending_tail_idx - pending_head_idx; }
+        bool pending_empty() const { return pending_tail_idx == pending_head_idx; }
     } wiring;
 
     alignas(64) AsyncWaitList async_wait_list;
@@ -438,25 +447,19 @@ struct PTO2SchedulerState
         else ready_queues[static_cast<int32_t>(shape)].push(slot_state);
     }
 
-    // Append slot to the tail of the intrusive pending list.
+    // Append slot to the tail of the pending FIFO.
     void pending_push_back(PTO2TaskSlotState *s)
     {
-        s->next_pending = nullptr;
-        if (wiring.pending_tail) wiring.pending_tail->next_pending = s;
-        else wiring.pending_head = s;
-        wiring.pending_tail = s;
-        wiring.pending_count++;
+        wiring.pending_buf[wiring.pending_tail_idx & wiring.pending_mask] = s;
+        wiring.pending_tail_idx++;
     }
 
-    // Pop the head of the pending list (or nullptr).
+    // Pop the head of the pending FIFO (or nullptr).
     PTO2TaskSlotState *pending_pop_front()
     {
-        PTO2TaskSlotState *s = wiring.pending_head;
-        if (s == nullptr) return nullptr;
-        wiring.pending_head = s->next_pending;
-        if (wiring.pending_head == nullptr) wiring.pending_tail = nullptr;
-        s->next_pending = nullptr;
-        wiring.pending_count--;
+        if (wiring.pending_empty()) return nullptr;
+        PTO2TaskSlotState *s = wiring.pending_buf[wiring.pending_head_idx & wiring.pending_mask];
+        wiring.pending_head_idx++;
         return s;
     }
 
@@ -477,12 +480,12 @@ struct PTO2SchedulerState
     // 0 signals no productive work.
     int drain_wiring_queue(bool force_drain = false)
     {
-        // Stage 1: drain SPSC → pending list tail
+        // Stage 1: drain SPSC → pending FIFO tail
         int drained = wiring.queue.pop_batch(wiring.drain_buf, PendingState::DRAIN_BATCH);
         for (int i = 0; i < drained; i++) pending_push_back(wiring.drain_buf[i]);
 
         // Backoff when nothing to do and orchestrator isn't pressing
-        if (drained == 0 && wiring.pending_head == nullptr)
+        if (drained == 0 && wiring.pending_empty())
         {
             if (!force_drain && !wiring.orch_needs_drain.load(std::memory_order_acquire) && wiring.backoff_counter < PendingState::BACKOFF_LIMIT)
             {
@@ -492,9 +495,9 @@ struct PTO2SchedulerState
         }
         wiring.backoff_counter = 0;
 
-        // Stage 2: poll pending list, route ready tasks
+        // Stage 2: poll pending FIFO, route ready tasks
         int routed = 0;
-        int to_visit = wiring.pending_count;
+        int to_visit = static_cast<int>(wiring.pending_count());
         if (to_visit > PendingState::POLL_MAX_PER_ITER) to_visit = PendingState::POLL_MAX_PER_ITER;
         for (int i = 0; i < to_visit; i++)
         {
@@ -580,10 +583,12 @@ struct PTO2SchedulerState
         PTO2SchedulerLayout layout{};
         layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
         layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+        layout.pending_capacity = PTO2_TASK_WINDOW_SIZE;  // bounded by per-ring slot window
 
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
         layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
         layout.off_pending_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+        layout.off_pending_buffer = arena.reserve(layout.pending_capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
         return layout;
     }
 
@@ -600,9 +605,13 @@ struct PTO2SchedulerState
         if (!ready_queue_init_data_from_layout(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity)) return false;
 
         if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_pending_spsc_buffer, layout.spsc_capacity)) return false;
-        sched->wiring.pending_head = nullptr;
-        sched->wiring.pending_tail = nullptr;
-        sched->wiring.pending_count = 0;
+
+        if (layout.pending_capacity == 0 || (layout.pending_capacity & (layout.pending_capacity - 1)) != 0) return false;
+        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
+        sched->wiring.pending_cap = static_cast<uint32_t>(layout.pending_capacity);
+        sched->wiring.pending_mask = sched->wiring.pending_cap - 1;
+        sched->wiring.pending_head_idx = 0;
+        sched->wiring.pending_tail_idx = 0;
         sched->wiring.backoff_counter = 0;
 
         return true;
@@ -614,6 +623,7 @@ struct PTO2SchedulerState
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
         ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
         sched->wiring.queue.wire_arena_pointers(arena, layout.off_pending_spsc_buffer);
+        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
     }
 
     // Forget per-region pointers; arena owns the backing memory.
@@ -622,6 +632,7 @@ struct PTO2SchedulerState
         PTO2SchedulerState *sched = this;
         for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) sched->ring_sched_states[r].destroy();
         sched->wiring.queue.destroy();
+        sched->wiring.pending_buf = nullptr;
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
         ready_queue_destroy(&sched->dummy_ready_queue);
     }

From 047b20d5219e8e2b85a1270f97a7921d2883ae57 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 18 Jun 2026 12:50:04 +0200
Subject: [PATCH 05/14] Per-thread phase cycle profiling for
 resolve_and_dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add SchedulerThreadProfile (per-phase cumulative cycles + entry counts)
and instrument the main loop to attribute time to:
  - completion check
  - async wait poll
  - drain_wiring_queue (split into SPSC drain vs pending FIFO poll)
  - dummy ready-queue drain
  - dispatch_ready_tasks
  - idle spin

Dump via LOG_INFO_V9 once per resolve_and_dispatch exit so the hot path
only accumulates cycle counters. Output is tagged CLAUDE_PROFILING and
written to ${HOME}/ascend/log/debug/; pull it with
  cat /root/ascend/log/debug/*/* | grep CLAUDE_PROFILING

Used to identify thread 0's pending FIFO fanin polling as the
dominant cost in Case1 (54% of round time) — the data-driven basis
for the wake-list optimization that follows.
---
 .../runtime/pto_scheduler.h                   | 25 ++++++-
 .../runtime/scheduler_context.h               | 65 +++++++++++++++++--
 .../runtime/scheduler_types.h                 | 28 ++++++++
 3 files changed, 113 insertions(+), 5 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index a23139157..2bfc5f693 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -20,6 +20,11 @@
 #include "pto_runtime2_types.h"
 #include "pto_shared_memory.h"
 
+// Forward declaration so this header can compile under both AICPU and host
+// builds. The actual definition is provided by aicpu/device_time.cpp (AICPU)
+// or a weak stub in pto_runtime2.h (host). Used only for sub-phase profiling.
+uint64_t get_sys_cnt_aicpu();
+
 struct PTO2ReadyQueueSlot
 {
     std::atomic<int64_t> sequence;
@@ -478,11 +483,23 @@ struct PTO2SchedulerState
     // for newly-ready tasks. Not-ready tasks rotate to the tail.
     // Returns >0 if anything moved (SPSC drained OR tasks routed to ready);
     // 0 signals no productive work.
-    int drain_wiring_queue(bool force_drain = false)
+    //
+    // Sub-phase timing pointers (optional). If non-null, cumulative cycle/
+    // iteration counters for Stage 1 (SPSC drain) and Stage 2 (pending poll)
+    // are accumulated into them.
+    int drain_wiring_queue(bool force_drain = false,
+                           uint64_t *spsc_cyc_out = nullptr, uint64_t *spsc_iters_out = nullptr,
+                           uint64_t *poll_cyc_out = nullptr, uint64_t *poll_iters_out = nullptr)
     {
         // Stage 1: drain SPSC → pending FIFO tail
+        uint64_t t0 = spsc_cyc_out ? get_sys_cnt_aicpu() : 0;
         int drained = wiring.queue.pop_batch(wiring.drain_buf, PendingState::DRAIN_BATCH);
         for (int i = 0; i < drained; i++) pending_push_back(wiring.drain_buf[i]);
+        if (spsc_cyc_out)
+        {
+            *spsc_cyc_out += get_sys_cnt_aicpu() - t0;
+            if (spsc_iters_out) (*spsc_iters_out)++;
+        }
 
         // Backoff when nothing to do and orchestrator isn't pressing
         if (drained == 0 && wiring.pending_empty())
@@ -496,6 +513,7 @@ struct PTO2SchedulerState
         wiring.backoff_counter = 0;
 
         // Stage 2: poll pending FIFO, route ready tasks
+        uint64_t t1 = poll_cyc_out ? get_sys_cnt_aicpu() : 0;
         int routed = 0;
         int to_visit = static_cast<int>(wiring.pending_count());
         if (to_visit > PendingState::POLL_MAX_PER_ITER) to_visit = PendingState::POLL_MAX_PER_ITER;
@@ -513,6 +531,11 @@ struct PTO2SchedulerState
                 pending_push_back(s);
             }
         }
+        if (poll_cyc_out)
+        {
+            *poll_cyc_out += get_sys_cnt_aicpu() - t1;
+            if (poll_iters_out) (*poll_iters_out)++;
+        }
 
         return drained + routed;
     }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
index 6e0f71b08..857a7113c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -30,7 +30,10 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
+#include "common/unified_log.h"
 #include "spin_hint.h"
+// SchedulerThreadProfile is defined in scheduler_types.h (above) so the
+// drain_wiring_queue method in pto_scheduler.h can take a pointer to it.
 
 #ifndef unlikely
 #define unlikely(x) __builtin_expect(!!(x), 0)
@@ -229,10 +232,18 @@ class SchedulerContext
 
         uint64_t last_progress_ts = get_sys_cnt_aicpu();
 
+        // Profile reset + total-cycle start. Reset here so each
+        // resolve_and_dispatch call (≈ one kernel launch) records its own
+        // breakdown. The dump happens at loop exit, well outside the hot path.
+        SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
+        profile.reset();
+        const uint64_t profile_loop_start = get_sys_cnt_aicpu();
+
         while (true)
         {
             if (completed_.load(std::memory_order_acquire)) break;
             bool made_progress = false;
+            profile.total_iters++;
             int32_t task_count = 0;
             if (!tracker.has_any_running_cores())
             {
@@ -250,7 +261,13 @@ class SchedulerContext
             int32_t completed_this_turn = 0;
 
             bool try_completed = tracker.has_any_running_cores();
-            if (try_completed) check_running_cores_for_completion(thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, local_bufs);
+            if (try_completed)
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                check_running_cores_for_completion(thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, local_bufs);
+                profile.completion_cycles += get_sys_cnt_aicpu() - t0;
+                profile.completion_iters++;
+            }
             if (completed_this_turn > 0)
             {
                 int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
@@ -263,8 +280,10 @@ class SchedulerContext
                 }
             }
 
+            uint64_t t0_async = 0;
             if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending()))
             {
+                t0_async = get_sys_cnt_aicpu();
                 AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_, local_bufs);
                 if (poll_result.error_code != PTO2_ERROR_NONE)
                 {
@@ -280,6 +299,8 @@ class SchedulerContext
                     last_progress_count = new_total;
                     made_progress = true;
                 }
+                profile.async_wait_cycles += get_sys_cnt_aicpu() - t0_async;
+                profile.async_wait_iters++;
             }
 
             bool try_pushed = false;
@@ -291,15 +312,23 @@ class SchedulerContext
                 continue;
             }
 
-            // Phase 3: Drain wiring queue (thread 0 only)
+            // Phase 3: Drain wiring queue (thread 0 only). Pass cumulative
+            // sub-phase counters (SPSC drain stage 1 / pending-FIFO poll
+            // stage 2) so drain_wiring_queue accumulates into them.
             if (thread_idx == 0)
             {
-                int wired = sched_->drain_wiring_queue(orchestrator_done_);
+                uint64_t t0 = get_sys_cnt_aicpu();
+                int wired = sched_->drain_wiring_queue(orchestrator_done_,
+                    &profile.spsc_drain_cycles, &profile.spsc_drain_iters,
+                    &profile.pending_poll_cycles, &profile.pending_poll_iters);
                 if (wired > 0) made_progress = true;
+                profile.drain_wiring_cycles += get_sys_cnt_aicpu() - t0;
+                profile.drain_wiring_iters++;
             }
 
             if (thread_idx == 0)
             {
+                uint64_t t0 = get_sys_cnt_aicpu();
                 constexpr int DUMMY_DRAIN_BATCH = 16;
                 PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
                 int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
@@ -312,11 +341,18 @@ class SchedulerContext
                     cur_thread_completed++;
                 }
                 if (dummy_got > 0) made_progress = true;
+                profile.dummy_drain_cycles += get_sys_cnt_aicpu() - t0;
+                profile.dummy_drain_iters++;
             }
 
             // Phase 4: MIX-strict-priority dispatch with phase-split and
             // cross-thread idle gating. See dispatch_ready_tasks for the policy.
-            dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+                profile.dispatch_cycles += get_sys_cnt_aicpu() - t0;
+                profile.dispatch_iters++;
+            }
 
             (void)try_completed;
             (void)try_pushed;
@@ -328,6 +364,7 @@ class SchedulerContext
             }
             else
             {
+                uint64_t t0_idle = get_sys_cnt_aicpu();
                 idle_iterations++;
 
                 if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0)
@@ -345,9 +382,28 @@ class SchedulerContext
                     last_progress_ts = get_sys_cnt_aicpu();
                 }
                 SPIN_WAIT_HINT();
+                profile.idle_spin_cycles += get_sys_cnt_aicpu() - t0_idle;
+                profile.idle_iters++;
             }
         }
 
+        // Dump profile breakdown for this thread. Logged AFTER the hot loop
+        // exits, so this adds no overhead to the measured phases.
+        profile.total_cycles = get_sys_cnt_aicpu() - profile_loop_start;
+        LOG_INFO_V9(
+            "CLAUDE_PROFILING thread=%d total_cyc=%lu iters=%lu compl_cyc=%lu compl_n=%lu async_cyc=%lu async_n=%lu drain_cyc=%lu drain_n=%lu spsc_cyc=%lu spsc_n=%lu poll_cyc=%lu poll_n=%lu poll_skipped=%lu dummy_cyc=%lu dummy_n=%lu dispatch_cyc=%lu dispatch_n=%lu idle_cyc=%lu idle_n=%lu",
+            (int)thread_idx,
+            (unsigned long)profile.total_cycles, (unsigned long)profile.total_iters,
+            (unsigned long)profile.completion_cycles, (unsigned long)profile.completion_iters,
+            (unsigned long)profile.async_wait_cycles, (unsigned long)profile.async_wait_iters,
+            (unsigned long)profile.drain_wiring_cycles, (unsigned long)profile.drain_wiring_iters,
+            (unsigned long)profile.spsc_drain_cycles, (unsigned long)profile.spsc_drain_iters,
+            (unsigned long)profile.pending_poll_cycles, (unsigned long)profile.pending_poll_iters,
+            (unsigned long)profile.pending_poll_skipped,
+            (unsigned long)profile.dummy_drain_cycles, (unsigned long)profile.dummy_drain_iters,
+            (unsigned long)profile.dispatch_cycles, (unsigned long)profile.dispatch_iters,
+            (unsigned long)profile.idle_spin_cycles, (unsigned long)profile.idle_iters);
+
         return cur_thread_completed;
     }
 
@@ -457,6 +513,7 @@ class SchedulerContext
 
     // Cluster-ordered core trackers, one per scheduler thread
     CoreTracker core_trackers_[MAX_AICPU_THREADS];
+    SchedulerThreadProfile thread_profiles_[MAX_AICPU_THREADS];
 
     // Per-core dispatch payload storage: dual-buffer for pipelining.
     // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
index f2dc71ed5..c2c8159fc 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
@@ -41,6 +41,34 @@ enum class LoopAction : int8_t
     BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
 };
 
+// Per-thread phase profiling. Accumulates cumulative cycle counts and entry
+// counts for each phase of resolve_and_dispatch's main loop. Dumped once at
+// loop exit via LOG_INFO_V9 — the hot path only does cycle counter math.
+struct alignas(64) SchedulerThreadProfile
+{
+    uint64_t total_cycles{0};
+    uint64_t completion_cycles{0};
+    uint64_t async_wait_cycles{0};
+    uint64_t drain_wiring_cycles{0};
+    uint64_t spsc_drain_cycles{0};    // sub-phase of drain_wiring: SPSC → pending FIFO
+    uint64_t pending_poll_cycles{0};  // sub-phase of drain_wiring: pending FIFO → ready
+    uint64_t dummy_drain_cycles{0};
+    uint64_t dispatch_cycles{0};
+    uint64_t idle_spin_cycles{0};
+    uint64_t completion_iters{0};
+    uint64_t async_wait_iters{0};
+    uint64_t drain_wiring_iters{0};
+    uint64_t spsc_drain_iters{0};
+    uint64_t pending_poll_iters{0};
+    uint64_t pending_poll_skipped{0};  // (a) gate hits: poll calls skipped due to no new completions
+    uint64_t dummy_drain_iters{0};
+    uint64_t dispatch_iters{0};
+    uint64_t idle_iters{0};
+    uint64_t total_iters{0};
+
+    void reset() { *this = SchedulerThreadProfile{}; }
+};
+
 struct alignas(64) CoreExecState
 {
     // --- Hot fields (completion + dispatch, every iteration) ---

From ed1ec09fde475ee6f9c061331f98c4133c570f64 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 18 Jun 2026 12:52:57 +0200
Subject: [PATCH 06/14] Wake-list notification for last unmet fanin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the pure-polling pending-FIFO loop with a hybrid:
  - 0 unmet fanins  → push to ready_queues (unchanged)
  - exactly 1 unmet → register the consumer on that producer's wake list
                      and remove from FIFO (was: push back to FIFO)
  - 2+ unmet        → push back to FIFO for the next poll (unchanged)

Each producer slot gets a wake_list_head atomic pointer. Registration
is a CAS push onto the head. Completion does an atomic-exchange to a
SENTINEL (refusing further registrations) and pushes every waiter to
ready_queues. Slots reset wake_list_head to nullptr on reuse.

The intuition: most pending lifetime is spent waiting on the last
fanin to complete. The polling model re-walks every fanin on every
poll iteration even though only one byte changes. Wake-list registration
costs one CAS per task and zero further polls — the producer pushes the
waiter on completion. The submission-time variant of this idea ((f) in
the investigation) regressed because cross-thread cache traffic on the
orchestrator's hot path overwhelmed the savings; restricting wake-list
work to the scheduler-side keeps the writers on the same cache line.

Case1 (large workload, 65K tasks): -2.2% trimmed device time
  (~28072 µs → ~27451 µs).
Case4 (small workload): +2.2% trimmed device time
  (~1322 µs → ~1351 µs). The per-task atomic exchange overhead is not
  amortized at this scale.

Profile shift on Case1 (thread 0):
  drain_wiring_cycles  819K → 396K (-52%)
  pending_poll_cycles  767K → 343K (-55%)
  All threads run ~40% fewer main-loop iterations (denser per-iteration
  work).
---
 .../runtime/pto_runtime2_types.h              | 18 ++++
 .../runtime/pto_scheduler.h                   | 84 ++++++++++++++++++-
 2 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 6d2fa9ba5..93cf0ffe9 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -173,6 +173,16 @@ struct alignas(64) PTO2TaskSlotState
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
+    // --- (e) Wake-list: lightweight last-fanin notification ---
+    // When a pending consumer's fanin scan finds exactly ONE unmet fanin,
+    // it registers itself on the producer's wake list (CAS push). On producer
+    // completion, the producer atomic-exchanges wake_list_head to the
+    // SENTINEL value and pushes every waiter to the ready queues. Consumers
+    // that observe SENTINEL during registration push themselves directly
+    // (producer already completed). Reset to nullptr on slot reuse.
+    std::atomic<PTO2TaskSlotState *> wake_list_head{nullptr};
+    PTO2TaskSlotState *next_in_wake_list{nullptr};
+
     // --- Set per-submit (depend on task inputs) ---
     ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
     uint8_t ring_id;         // Ring layer (immutable after init)
@@ -200,10 +210,18 @@ struct alignas(64) PTO2TaskSlotState
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx = 0;
         any_subtask_deferred.store(false, std::memory_order_relaxed);
+        // (e) Wake list: clear for the next incarnation. Previous incarnation
+        // left it at WAKE_LIST_SENTINEL (set by its on_mixed_task_complete).
+        wake_list_head.store(nullptr, std::memory_order_relaxed);
+        next_in_wake_list = nullptr;
         // last_consumer_local_id is reset in prepare_task once the task_id is known.
     }
 };
 
+// (e) Sentinel marking a wake list as "owner already completed; no more
+// registrations accepted". Distinct from any real slot_state pointer.
+inline PTO2TaskSlotState *const WAKE_LIST_SENTINEL = reinterpret_cast<PTO2TaskSlotState *>(uintptr_t{1});
+
 static_assert(sizeof(PTO2TaskSlotState) <= 128, "slot state should fit in two cache lines");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index 2bfc5f693..bee5613d2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -479,6 +479,53 @@ struct PTO2SchedulerState
         return true;
     }
 
+    // (e) Single-pass fanin classification used by the pending poll. Returns:
+    //   -2: all fanins met (route directly to ready)
+    //   -1: 2+ fanins unmet (push back to pending FIFO)
+    //   ≥0: exactly 1 fanin unmet, returned index identifies which fanin
+    //       (register on that producer's wake list).
+    int classify_fanin_state(PTO2TaskSlotState *s) const
+    {
+        const PTO2TaskPayload &p = *s->payload;
+        const auto &ring = *ring_sched_states[s->ring_id].ring;
+        const int32_t mask = ring.task_window_mask;
+        std::atomic<uint8_t> *flags = ring.completion_flags;
+        int unmet_idx = -2;
+        for (int32_t i = 0; i < p.fanin_count; i++)
+        {
+            if (flags[p.fanin_local_ids[i] & mask].load(std::memory_order_acquire) == 0)
+            {
+                if (unmet_idx >= 0) return -1;  // 2+ unmet
+                unmet_idx = i;
+            }
+        }
+        return unmet_idx;
+    }
+
+    // (e) Register `consumer` on `producer`'s wake list. If producer has
+    // already completed (head == WAKE_LIST_SENTINEL), push consumer directly
+    // to ready_queues. Otherwise CAS push-onto the head.
+    void register_wake(PTO2TaskSlotState *producer, PTO2TaskSlotState *consumer)
+    {
+        PTO2TaskSlotState *expected = producer->wake_list_head.load(std::memory_order_relaxed);
+        while (true)
+        {
+            if (expected == WAKE_LIST_SENTINEL)
+            {
+                // Producer already completed and drained its wake list. The
+                // last unmet fanin is now satisfied; push consumer to ready.
+                push_ready_routed(consumer);
+                return;
+            }
+            consumer->next_in_wake_list = expected;
+            if (producer->wake_list_head.compare_exchange_weak(expected, consumer, std::memory_order_acq_rel, std::memory_order_relaxed))
+            {
+                return;  // registered
+            }
+            // CAS failed: expected was updated by load on retry. Loop.
+        }
+    }
+
     // Thread 0 entry point: drain SPSC into pending list, then poll pending
     // for newly-ready tasks. Not-ready tasks rotate to the tail.
     // Returns >0 if anything moved (SPSC drained OR tasks routed to ready);
@@ -512,7 +559,11 @@ struct PTO2SchedulerState
         }
         wiring.backoff_counter = 0;
 
-        // Stage 2: poll pending FIFO, route ready tasks
+        // Stage 2: poll pending FIFO. Three-way classification:
+        //   - all fanins met → push to ready_queues
+        //   - exactly 1 unmet → register on that producer's wake list (no
+        //     more polling for this task; producer wakes it on completion)
+        //   - 2+ unmet → push back to FIFO for the next poll cycle
         uint64_t t1 = poll_cyc_out ? get_sys_cnt_aicpu() : 0;
         int routed = 0;
         int to_visit = static_cast<int>(wiring.pending_count());
@@ -521,14 +572,24 @@ struct PTO2SchedulerState
         {
             PTO2TaskSlotState *s = pending_pop_front();
             if (s == nullptr) break;
-            if (fanin_satisfied(s))
+            int state = classify_fanin_state(s);
+            if (state == -2)
             {
                 push_ready_routed(s);
                 routed++;
             }
+            else if (state == -1)
+            {
+                pending_push_back(s);  // 2+ missing, re-check next cycle
+            }
             else
             {
-                pending_push_back(s);
+                // exactly 1 unmet at index `state`; register and remove from FIFO
+                int32_t prod_local = s->payload->fanin_local_ids[state];
+                auto &ring = *ring_sched_states[s->ring_id].ring;
+                PTO2TaskSlotState *producer = &ring.get_slot_state_by_task_id(prod_local);
+                register_wake(producer, s);
+                routed++;  // count as routed since it's no longer in FIFO
             }
         }
         if (poll_cyc_out)
@@ -574,6 +635,23 @@ struct PTO2SchedulerState
         // acquire-load this byte in fanin_satisfied.
         ring.completion_flags[my_id & ring.task_window_mask].store(1, std::memory_order_release);
 
+        // (e) Drain the wake list. Any consumer registered on this slot was
+        // waiting on us as their last unmet fanin. After completion_flag is
+        // set above, atomic-exchange wake_list_head to SENTINEL (refusing
+        // any future registrations) and push every waiter to the ready
+        // queues. Ordering: completion_flag is set BEFORE the exchange, so
+        // any consumer that races a registration against our exchange and
+        // observes a SENTINEL during retry will see completion_flag=1 and
+        // push itself directly.
+        PTO2TaskSlotState *waiter = slot_state.wake_list_head.exchange(WAKE_LIST_SENTINEL, std::memory_order_acq_rel);
+        while (waiter != nullptr && waiter != WAKE_LIST_SENTINEL)
+        {
+            PTO2TaskSlotState *next = waiter->next_in_wake_list;
+            waiter->next_in_wake_list = nullptr;
+            push_ready_routed(waiter);
+            waiter = next;
+        }
+
         // CAS-advance the watermark, bounded by my_id (which we know is
         // published since we just completed it). If a forward task we observe
         // as COMPLETED is also published, but a gap remains, we stop — the

From f708f193a8f4a13f815cddbc2373692b59a8cc05 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 18 Jun 2026 15:58:32 +0200
Subject: [PATCH 07/14] Sub-phase profiling for complete_slot_task and core
 scans

Break down the completion phase further: separate complete_slot_task
body time from the per-iter cond_ptr-read + transition-decide overhead,
plus a count of cores scanned per iter. Lets future investigations see
which sub-phase actually dominates compl_cyc.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../runtime/scheduler_context.h                      | 12 +++++++++++-
 .../runtime/scheduler_types.h                        |  7 +++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
index 857a7113c..f1d44d17e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -391,10 +391,12 @@ class SchedulerContext
         // exits, so this adds no overhead to the measured phases.
         profile.total_cycles = get_sys_cnt_aicpu() - profile_loop_start;
         LOG_INFO_V9(
-            "CLAUDE_PROFILING thread=%d total_cyc=%lu iters=%lu compl_cyc=%lu compl_n=%lu async_cyc=%lu async_n=%lu drain_cyc=%lu drain_n=%lu spsc_cyc=%lu spsc_n=%lu poll_cyc=%lu poll_n=%lu poll_skipped=%lu dummy_cyc=%lu dummy_n=%lu dispatch_cyc=%lu dispatch_n=%lu idle_cyc=%lu idle_n=%lu",
+            "CLAUDE_PROFILING thread=%d total_cyc=%lu iters=%lu compl_cyc=%lu compl_n=%lu ctask_cyc=%lu ctask_n=%lu cores_scan=%lu async_cyc=%lu async_n=%lu drain_cyc=%lu drain_n=%lu spsc_cyc=%lu spsc_n=%lu poll_cyc=%lu poll_n=%lu poll_skipped=%lu dummy_cyc=%lu dummy_n=%lu dispatch_cyc=%lu dispatch_n=%lu idle_cyc=%lu idle_n=%lu",
             (int)thread_idx,
             (unsigned long)profile.total_cycles, (unsigned long)profile.total_iters,
             (unsigned long)profile.completion_cycles, (unsigned long)profile.completion_iters,
+            (unsigned long)profile.complete_task_cycles, (unsigned long)profile.complete_task_calls,
+            (unsigned long)profile.cores_scanned,
             (unsigned long)profile.async_wait_cycles, (unsigned long)profile.async_wait_iters,
             (unsigned long)profile.drain_wiring_cycles, (unsigned long)profile.drain_wiring_iters,
             (unsigned long)profile.spsc_drain_cycles, (unsigned long)profile.spsc_drain_iters,
@@ -1217,6 +1219,7 @@ class SchedulerContext
 
     void check_running_cores_for_completion(int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress, PTO2LocalReadyBuffer *local_bufs)
     {
+        SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
         CoreTracker &tracker = core_trackers_[thread_idx];
         auto running_core_states = tracker.get_all_running_cores();
         while (running_core_states.has_value())
@@ -1224,6 +1227,7 @@ class SchedulerContext
             int32_t bit_pos = running_core_states.pop_first();
             int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
             CoreExecState &core = core_exec_states_[core_id];
+            profile.cores_scanned++;
 
             uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
             rmb();
@@ -1238,12 +1242,18 @@ class SchedulerContext
             // 1. Complete finished tasks (capture pointers before modifying core state)
             if (t.pending_done)
             {
+                uint64_t tc0 = get_sys_cnt_aicpu();
                 complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, hank, completed_this_turn, local_bufs);
+                profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
+                profile.complete_task_calls++;
                 cur_thread_completed++;
             }
             if (t.running_done)
             {
+                uint64_t tc0 = get_sys_cnt_aicpu();
                 complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, hank, completed_this_turn, local_bufs);
+                profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
+                profile.complete_task_calls++;
                 cur_thread_completed++;
             }
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
index c2c8159fc..68718affd 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
@@ -48,6 +48,13 @@ struct alignas(64) SchedulerThreadProfile
 {
     uint64_t total_cycles{0};
     uint64_t completion_cycles{0};
+    // Sub-phase of completion: time spent INSIDE complete_slot_task, and
+    // count of times it ran (one per subtask completion observed).
+    uint64_t complete_task_cycles{0};
+    uint64_t complete_task_calls{0};
+    // Sub-phase of completion: count of cores scanned per iter (proxy for
+    // cond_ptr read cost; aggregate / completion_iters = avg cores/iter).
+    uint64_t cores_scanned{0};
     uint64_t async_wait_cycles{0};
     uint64_t drain_wiring_cycles{0};
     uint64_t spsc_drain_cycles{0};    // sub-phase of drain_wiring: SPSC → pending FIFO

From a50988103d55e05f6b7fdedd945c835dcf79375a Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 18 Jun 2026 15:58:58 +0200
Subject: [PATCH 08/14] Drop task_state field, gate slab read on count==0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(m) PTO2TaskSlotState::task_state was a redundant completion signal —
completion_flags already records the same transition with the right
memory ordering. Drop the atomic release store on the completion path,
switch the watermark CAS-advance loop and the wait/stall-dump readers
to consult completion_flags directly. Saves one atomic store per task.

(q) In complete_slot_task, read deferred_slab->count before
deferred_slab->error_code. Kernels that don't register async conditions
leave count at 0 (the dispatch-time reset value), so checking count
first lets the common path skip the error_code load + branch and the
condition-forwarding loop.

Each change is neutral on Case1 in isolation (within ±50 µs run-to-run
variance over 80-round trimmed avgs), but both clean up redundant
work on the completion hot path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../runtime/pto_orchestrator.h                |  2 +-
 .../runtime/pto_runtime2.h                    |  5 ++-
 .../runtime/pto_scheduler.h                   | 10 +++--
 .../runtime/scheduler_context.h               | 42 ++++++++++---------
 4 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 4d5cf0138..314862915 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -352,7 +352,7 @@ struct PTO2OrchestratorState
 
         if (prepared.slot_state != nullptr)
         {
-            prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
+            // (m) Inline completion uses completion_flags only.
             uint8_t ring_id = prepared.task_id.ring();
             orch->sm_header->rings[ring_id].completion_flags[prepared.alloc_result.slot].store(1, std::memory_order_release);
         }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index d38e84cdf..ca06791aa 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -217,9 +217,12 @@ inline bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wa
     auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
         uint8_t ring_id = slot.ring_id;
         int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        auto &ring_hdr = orch.sm_header->rings[ring_id];
+        const int32_t mask = ring_hdr.task_window_mask;
         uint64_t t0 = get_sys_cnt_aicpu();
         int32_t spin_count = 0;
-        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED)
+        // (m) Use completion_flags as the single completion signal.
+        while (ring_hdr.completion_flags[local_id & mask].load(std::memory_order_acquire) == 0)
         {
             SPIN_WAIT_HINT();
             if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index bee5613d2..2dae488f6 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -623,8 +623,8 @@ struct PTO2SchedulerState
     // notification edge is needed.
     void on_mixed_task_complete(PTO2TaskSlotState &slot_state, [[maybe_unused]] PTO2LocalReadyBuffer *local_bufs = nullptr)
     {
-        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-
+        // (m) Skip slot_state.task_state.store here; completion_flags below is
+        // the single source of truth. Saves one atomic release store per task.
         const int32_t my_id = static_cast<int32_t>(slot_state.task->task_id.local());
         int32_t ring_id = slot_state.ring_id;
         auto &rss = ring_sched_states[ring_id];
@@ -660,8 +660,10 @@ struct PTO2SchedulerState
         while (w < my_id)
         {
             int32_t next = w + 1;
-            PTO2TaskSlotState &cand = ring.get_slot_state_by_task_id(next);
-            if (cand.task_state.load(std::memory_order_acquire) != PTO2_TASK_COMPLETED) break;
+            // (m) Read completion_flags (already published by the candidate's
+            // completer) instead of cand.task_state — one fewer atomic store
+            // per task in the common path.
+            if (ring.completion_flags[next & ring.task_window_mask].load(std::memory_order_acquire) == 0) break;
             if (ring.completed_watermark.compare_exchange_weak(w, next, std::memory_order_acq_rel, std::memory_order_acquire))
             {
                 w = next;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
index f1d44d17e..4d637e5c4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -1147,26 +1147,30 @@ class SchedulerContext
         if (slot_state.payload != nullptr)
         {
             volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
-            int32_t slab_err = deferred_slab->error_code;
-            if (slab_err != PTO2_ERROR_NONE)
-            {
-                int32_t expected = PTO2_ERROR_NONE;
-                sched_->sm_header->sched_error_code.compare_exchange_strong(expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire);
-                completed_.store(true, std::memory_order_release);
-                return;
-            }
-
+            // (q) Read count first. AICore only writes error_code as part of a
+            // condition-registration attempt that also increments count, so
+            // count == 0 ⇒ no error and no conditions to forward. This is the
+            // common path for kernels that don't use async waits (paged
+            // attention, GEMM, etc.) and saves an L1 load + branch per call.
             uint32_t cond_count = deferred_slab->count;
-            if (cond_count > MAX_COMPLETIONS_PER_TASK)
+            if (cond_count != 0)
             {
-                int32_t expected = PTO2_ERROR_NONE;
-                sched_->sm_header->sched_error_code.compare_exchange_strong(expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire);
-                completed_.store(true, std::memory_order_release);
-                return;
-            }
+                int32_t slab_err = deferred_slab->error_code;
+                if (slab_err != PTO2_ERROR_NONE)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    sched_->sm_header->sched_error_code.compare_exchange_strong(expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    return;
+                }
+                if (cond_count > MAX_COMPLETIONS_PER_TASK)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    sched_->sm_header->sched_error_code.compare_exchange_strong(expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    return;
+                }
 
-            if (cond_count > 0)
-            {
                 slot_state.any_subtask_deferred.store(true, std::memory_order_release);
 
                 const PTO2TaskId token = slot_state.task->task_id;
@@ -1487,9 +1491,9 @@ class SchedulerContext
                 for (int32_t si = 0; si < ring_task_count; si++)
                 {
                     PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
-                    PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
+                    // (m) task_state retired; use completion_flags directly.
                     bool fanin_ready = sched_->fanin_satisfied(&slot_state);
-                    if (st >= PTO2_TASK_COMPLETED) continue;
+                    if (ring.completion_flags[si & ring.task_window_mask].load(std::memory_order_relaxed) != 0) continue;
                     char running_on[192] = {0};
                     int32_t owner = -1;
                     int32_t pos = 0;

From 0a3e24f622c6027f868d8edd725dae7ec5ba822e Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 18 Jun 2026 16:09:56 +0200
Subject: [PATCH 09/14] Remove dead task_state field from a2a3 slot state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous commit dropped the producer-side .store(COMPLETED) — the
field had no remaining writers on the hot path. Remove the field itself,
the orchestrator's no-longer-needed PENDING-init at submit time, and
the SCALAR_DATA_ACCESS / MULTI_RING doc snippets that still spelled the
spin-wait and watermark-walk in terms of task_state. completion_flags
is now the sole completion signal in a2a3.

The a2a3 test_task_state.cpp UT was a leftover copy of the a5 version —
it #includes "scheduler/pto_scheduler.h" (an a5-only path) and calls
release_fanin_and_check_ready / release_producer methods that don't
exist in the a2a3 scheduler. It never compiled against a2a3; remove it
and the matching CMakeLists entry.

Note: RUNTIME_LOGIC.md sections 6.2 / 7.3 / 8.2 / 8.4 still describe a
much older fanout_lock + CONSUMED state architecture that no longer
exists in the codebase. That cleanup is out of scope here — flagged
for a follow-up doc pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../docs/MULTI_RING.md                        |   5 +-
 .../docs/SCALAR_DATA_ACCESS.md                |   2 +-
 .../runtime/pto_orchestrator.h                |   1 -
 .../runtime/pto_runtime2_types.h              |   5 -
 tests/ut/cpp/CMakeLists.txt                   |   1 -
 tests/ut/cpp/a2a3/test_task_state.cpp         | 203 ------------------
 6 files changed, 4 insertions(+), 213 deletions(-)
 delete mode 100644 tests/ut/cpp/a2a3/test_task_state.cpp

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
index ff8f8a531..0ec9b155f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
@@ -179,8 +179,9 @@ Each ring's `last_task_alive` advances independently:
 
 ```text
 advance_ring_pointers(ring_id):  // protected by per-ring advance_lock
-    la = ring->fc.last_task_alive
-    while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED:
+    watermark = ring->completed_watermark
+    la = last_task_alive
+    while la <= watermark and watermark >= slot[la].last_consumer_local_id:
         reset slot for reuse
         la++
     sync_to_sm()  // release-store last_task_alive
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
index bd93f87da..846cdf377 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
@@ -32,7 +32,7 @@ addr null-check → TensorMap lookup → spin-wait producer COMPLETED → comput
 
 - **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0
 - **TensorMap lookup**: find producer task by `buffer.addr`
-- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED`
+- **spin-wait**: wait until producer's `completion_flags[local_id & mask] == 1`
 - **No producer** (lookup callback never fires): skip waiting, read immediately
 
 ### 3.2 set_tensor_data Flow
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 314862915..e16f71e88 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -472,7 +472,6 @@ inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t t
 
     out->slot_state->bind_buffers(out->payload, out->task);
 
-    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
     // Clear the polling-fast completion byte for the newly-allocated slot.
     // The previous incarnation's completer set this byte to 1; we publish 0
     // before this task can be added as a fanin to any consumer (single-
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 93cf0ffe9..8f4ffe4ca 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -165,11 +165,6 @@ struct alignas(64) PTO2TaskSlotState
     // transitioned to COMPLETED). Single-writer (orchestrator) at submit time.
     int32_t last_consumer_local_id;
 
-    // Task state (PENDING/COMPLETED). Polling readiness reads task_state on
-    // producer slots; reclamation gates on the completed_watermark instead of
-    // a separate CONSUMED transition.
-    std::atomic<PTO2TaskState> task_state;
-
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 437451f14..af41c0e37 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -366,7 +366,6 @@ add_a2a3_test(test_a2a3_fatal a2a3/test_a2a3_fatal.cpp)
 add_a2a3_runtime_test(test_task_allocator   a2a3/test_task_allocator.cpp)
 add_a2a3_runtime_test(test_dep_list_pool    a2a3/test_dep_list_pool.cpp)
 add_a2a3_runtime_test(test_scheduler_state  a2a3/test_scheduler_state.cpp)
-add_a2a3_runtime_test(test_task_state       a2a3/test_task_state.cpp)
 add_a2a3_runtime_test(test_ready_queue      a2a3/test_ready_queue.cpp)
 add_a2a3_runtime_test(test_shared_memory    a2a3/test_shared_memory.cpp)
 add_a2a3_runtime_test(test_a2a3_tensormap   a2a3/test_tensormap.cpp)
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
deleted file mode 100644
index c0773ec22..000000000
--- a/tests/ut/cpp/a2a3/test_task_state.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Unit tests for PTO2TaskSlotState lifecycle through PTO2SchedulerState API.
- *
- * These tests drive state transitions via src methods (release_fanin,
- * on_subtask_complete, check_and_handle_consumed) rather than manually
- * operating atomic fields.  For concurrent exactly-once semantics of
- * fanin/subtask/fanout, see test_scheduler_state.cpp which already
- * covers those paths via the same API.
- *
- * This file focuses on:
- * - Full lifecycle through src API
- * - Ready-path behavior (task_state stays PENDING through dispatch)
- * - Double subtask completion (counter-model weakness)
- */
-
-#include <gtest/gtest.h>
-#include <atomic>
-#include <cstring>
-#include <thread>
-#include <vector>
-#include "utils/device_arena.h"
-#include "scheduler/pto_scheduler.h"
-
-class TaskStateTest : public ::testing::Test {
-protected:
-    PTO2SchedulerState sched;
-    PTO2SharedMemoryHandle *sm_handle = nullptr;
-    DeviceArena sm_arena;
-    DeviceArena sched_arena;
-
-    void SetUp() override {
-        sm_handle = PTO2SharedMemoryHandle::create_and_init_default(sm_arena);
-        ASSERT_NE(sm_handle, nullptr);
-        auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
-        ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
-        sched.wire_arena_pointers(layout, sched_arena);
-    }
-
-    void TearDown() override {
-        sched.destroy();
-        sched_arena.release();
-        sm_arena.release();
-    }
-
-    void init_slot(PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count) {
-        memset(&slot, 0, sizeof(slot));
-        slot.task_state.store(state);
-        slot.fanin_count = fanin_count;
-        slot.fanin_refcount.store(0);
-        slot.fanout_count = fanout_count;
-        slot.fanout_refcount.store(0);
-        slot.fanout_lock.store(0);
-        slot.fanout_head = nullptr;
-        slot.ring_id = 0;
-        slot.active_mask = ActiveMask(PTO2_SUBTASK_MASK_AIC);
-        slot.completed_subtasks.store(0);
-        slot.total_required_subtasks = 1;
-        slot.logical_block_num = 1;
-    }
-};
-
-// =============================================================================
-// Full lifecycle through src API: PENDING -> (fanin) -> (queued + dispatched)
-// -> (subtask) -> COMPLETED -> (fanout) -> CONSUMED
-// =============================================================================
-TEST_F(TaskStateTest, FullLifecycleThroughAPI) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-    slot.total_required_subtasks = 1;
-    slot.completed_subtasks.store(0);
-
-    // Fanin satisfied -> task becomes ready
-    bool ready = sched.release_fanin_and_check_ready(slot);
-    EXPECT_TRUE(ready);
-
-    // Subtask completes -> task done
-    bool done = sched.on_subtask_complete(slot);
-    EXPECT_TRUE(done);
-
-    // Manually transition to COMPLETED (normally done by scheduler dispatch loop)
-    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-
-    // Fanout released -> CONSUMED
-    sched.release_producer(slot);
-    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
-}
-
-// =============================================================================
-// release_fanin does not write task_state.
-//
-// Readiness is determined solely by fanin_refcount reaching fanin_count.
-// task_state stays PENDING from submit through "queued in ready_queue" and
-// "dispatched to a worker" until the worker stores COMPLETED.
-// =============================================================================
-TEST_F(TaskStateTest, ReadyPathStaysPending) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-
-    bool ready = sched.release_fanin_and_check_ready(slot);
-    ASSERT_TRUE(ready) << "Task should be detected as ready via refcount";
-
-    // task_state remains PENDING -- there is no intermediate ready/running state.
-    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING) << "release_fanin_and_check_ready must not write task_state";
-}
-
-// =============================================================================
-// Multi-fanin: partial release does not trigger ready
-// =============================================================================
-TEST_F(TaskStateTest, MultiFaninPartialNotReady) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
-
-    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
-    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
-    EXPECT_TRUE(sched.release_fanin_and_check_ready(slot));
-}
-
-// =============================================================================
-// Concurrent fanin: exactly one thread detects ready (via src API)
-// =============================================================================
-TEST_F(TaskStateTest, ConcurrentFaninExactlyOneReady) {
-    constexpr int ROUNDS = 500;
-
-    for (int round = 0; round < ROUNDS; round++) {
-        alignas(64) PTO2TaskSlotState slot;
-        init_slot(slot, PTO2_TASK_PENDING, 3, 1);
-        std::atomic<int> ready_count{0};
-
-        auto release = [&]() {
-            if (sched.release_fanin_and_check_ready(slot)) {
-                ready_count.fetch_add(1);
-            }
-        };
-
-        std::thread t1(release), t2(release), t3(release);
-        t1.join();
-        t2.join();
-        t3.join();
-
-        EXPECT_EQ(ready_count.load(), 1) << "Round " << round;
-    }
-}
-
-// =============================================================================
-// Concurrent subtask completion: exactly one thread sees done (via src API)
-// =============================================================================
-TEST_F(TaskStateTest, ConcurrentSubtaskCompletion) {
-    constexpr int ROUNDS = 500;
-
-    for (int round = 0; round < ROUNDS; round++) {
-        alignas(64) PTO2TaskSlotState slot;
-        init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-        slot.total_required_subtasks = 3;
-        slot.completed_subtasks.store(0);
-        std::atomic<int> done_count{0};
-
-        auto complete = [&]() {
-            if (sched.on_subtask_complete(slot)) {
-                done_count.fetch_add(1);
-            }
-        };
-
-        std::thread t1(complete), t2(complete), t3(complete);
-        t1.join();
-        t2.join();
-        t3.join();
-
-        EXPECT_EQ(done_count.load(), 1) << "Round " << round;
-        EXPECT_EQ(slot.completed_subtasks.load(), 3);
-    }
-}
-
-// =============================================================================
-// Double subtask completion (counter-model weakness).
-// With the counter model, double-completing the same subtask increments
-// completed_subtasks twice, potentially reaching total prematurely.
-// Unlike the old bitmask model, the counter cannot detect duplicates.
-// =============================================================================
-TEST_F(TaskStateTest, DoubleSubtaskCompletionCounterWeakness) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-    slot.total_required_subtasks = 2;
-    slot.completed_subtasks.store(0);
-
-    // First subtask completion
-    bool done1 = sched.on_subtask_complete(slot);
-    EXPECT_FALSE(done1) << "Single completion doesn't complete the task";
-
-    // Same subtask completes AGAIN (logic error at caller level)
-    bool done2 = sched.on_subtask_complete(slot);
-    EXPECT_TRUE(done2) << "Counter model: double-completion falsely triggers done";
-}

From 63aa69c36659b39425ae7cc1dc4fbb351c27c93b Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Thu, 18 Jun 2026 16:41:10 +0200
Subject: [PATCH 10/14] Drop dead code in a2a3 tensormap_and_ringbuffer runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Walk the recently-touched scheduler / orchestrator surface for unused
parameters and dead state, and drop what no caller or body actually
exercises:

- on_mixed_task_complete / complete_slot_task / check_running_cores_for_completion:
  drop the threaded-through `local_bufs` argument (none of these bodies
  read it anymore — it was a leftover from the (g)/(g') wake-list-via-
  local-bufs variants that didn't ship). Also drops `local_bufs` from
  AsyncWaitList::poll_and_complete and the DrainCompletionSink field.
- check_running_cores_for_completion / complete_slot_task: drop the
  `Handshake *hank` argument (only forwarded, never read). The local
  `hank` in resolve_and_dispatch's loop scope is dropped with it.
- dispatch_shape / dispatch_ready_tasks: drop the `bool &try_pushed`
  out-param chain. Set deep inside dispatch_shape but the only
  consumer in resolve_and_dispatch was a (void) suppression.
- pop_ready_tasks_batch: drop the unused `thread_idx` argument.
- log_stall_diagnostics: drop the [[maybe_unused]] `task_count`.
- log_shutdown_stall_snapshot + handle_timeout_exit: drop the
  [[maybe_unused]] `trigger_idle_iterations` / `trigger_last_progress_count`
  and the matching unused `idle_iterations` / `last_progress_count` on
  the timeout-exit caller.
- handle_orchestrator_exit: drop the `int32_t &task_count` out-param —
  the caller's only use was a `if (...task_count > 0) { if (...) {} }`
  with an empty inner body. Read total_tasks_ directly instead.
- resolve_and_dispatch loop: drop the now-dead `task_count` and
  `last_progress_count` locals (and the three write-only updates to
  the latter); inline the `try_completed = ...; if (try_completed)`
  pattern into a single `if`.
- PTO2SchedulerState::print_stats / print_queues: empty no-op stubs,
  never called — remove (along with the cold-path API comment that
  pointed at them).
- PTO2TensorMap::print_stats: 45-line stat-collection function whose
  output goes nowhere (the per-ring loop body is also empty) — remove.
- orch_report_fatal_v: drop the dead vsnprintf-into-a-buffer-then-
  discard block; just latch the error code via orch_mark_fatal. The
  fmt + va_list params are kept (unnamed) since callers pass them and
  the wider rt_report_fatal -> orchestrator.report_fatal -> v API
  surface is symmetric for a future logging-sink hookup.

Build is clean, Case4 and Case1 pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../runtime/pto_async_wait.h                  |  3 +-
 .../runtime/pto_orchestrator.h                | 14 +--
 .../runtime/pto_scheduler.h                   | 15 +--
 .../runtime/pto_tensormap.h                   | 47 ----------
 .../runtime/scheduler_context.h               | 92 +++++++------------
 5 files changed, 44 insertions(+), 127 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
index 8bc1afa61..7c0d891ee 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
@@ -167,7 +167,6 @@ struct AsyncWaitList
     struct DrainCompletionSink
     {
         PTO2SchedulerState *sched{nullptr};
-        PTO2LocalReadyBuffer *local_bufs{nullptr};
         int32_t inline_completed{0};
 
         bool can_inline_complete() const
@@ -266,7 +265,7 @@ struct AsyncWaitList
     }
 
     template <bool Profiling>
-    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs);
+    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched);
 };
 
 #endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index e16f71e88..e5f3ddd36 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -386,16 +386,12 @@ inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code)
     return expected;
 }
 
-inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *fmt, va_list args)
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *, va_list)
 {
-    int32_t latched_code = orch_mark_fatal(orch, error_code);
-
-    if (fmt == nullptr || fmt[0] == '\0') return;
-
-    char message[1024];
-    vsnprintf(message, sizeof(message), fmt, args);
-    (void)latched_code;
-    (void)message;
+    // fmt + args are accepted for future logging-sink wiring but are not yet
+    // routed anywhere — the error_code is latched in shared memory via
+    // orch_mark_fatal and that's what callers actually observe.
+    orch_mark_fatal(orch, error_code);
 }
 
 inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
index 2dae488f6..6305ad10b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -621,7 +621,7 @@ struct PTO2SchedulerState
     // 0..W has reached COMPLETED. Reclamation in advance_ring_pointers gates
     // on watermark >= producer.last_consumer_local_id, so no consumer→producer
     // notification edge is needed.
-    void on_mixed_task_complete(PTO2TaskSlotState &slot_state, [[maybe_unused]] PTO2LocalReadyBuffer *local_bufs = nullptr)
+    void on_mixed_task_complete(PTO2TaskSlotState &slot_state)
     {
         // (m) Skip slot_state.task_state.store here; completion_flags below is
         // the single source of truth. Saves one atomic release store per task.
@@ -739,31 +739,26 @@ struct PTO2SchedulerState
         for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
         ready_queue_destroy(&sched->dummy_ready_queue);
     }
-    void print_stats()
-    {}
-    void print_queues()
-    {}
 };
 
 // Scheduler cold-path API is declared as PTO2SchedulerState member functions.
-// See init()/destroy()/print_stats()/print_queues() below the struct definition.
+// See init()/destroy() below the struct definition.
 
 inline bool AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state)
 {
-    sink.sched->on_mixed_task_complete(slot_state, sink.local_bufs);
+    sink.sched->on_mixed_task_complete(slot_state);
     sink.inline_completed++;
     return true;
 }
 
 template <bool Profiling>
-inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs)
+inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched)
 {
     AsyncPollResult result;
     if (!try_lock()) return result;
 
     AsyncWaitList::DrainCompletionSink sink{};
     sink.sched = sched;
-    sink.local_bufs = local_bufs;
 
     int32_t drain_err = PTO2_ERROR_NONE;
     drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
@@ -810,7 +805,7 @@ inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox
 
         if (entry.normal_done && entry.waiting_completion_count <= 0)
         {
-            sched->on_mixed_task_complete(*entry.slot_state, local_bufs);
+            sched->on_mixed_task_complete(*entry.slot_state);
             result.completed++;
 
             int32_t last = count - 1;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index e9e29e2d5..d1be5e2da 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -472,53 +472,6 @@ struct PTO2TensorMap
         entry.prev_in_task = nullptr;
     }
 
-    void print_stats()
-    {
-        int32_t valid = 0;
-        int32_t stale = 0;
-        int32_t empty_buckets = 0;
-        int32_t max_chain = 0;
-        int64_t total_chain = 0;
-        int32_t non_empty_buckets = 0;
-
-        // Count entries
-        for (int32_t i = 0; i < pool_size; i++)
-        {
-            if (entry_pool[i].bucket_index != -1)
-            {
-                if (entry_valid(entry_pool[i])) valid++;
-                else stale++;
-            }
-        }
-
-        // Count bucket stats
-        for (int32_t b = 0; b < num_buckets; b++)
-        {
-            int32_t chain_len = 0;
-            auto cur_entry = buckets[b];
-
-            while (cur_entry != nullptr)
-            {
-                chain_len++;
-                cur_entry = cur_entry->next_in_bucket;
-            }
-
-            if (chain_len == 0)
-            {
-                empty_buckets++;
-            }
-            else
-            {
-                non_empty_buckets++;
-                total_chain += chain_len;
-                if (chain_len > max_chain) max_chain = chain_len;
-            }
-        }
-
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
-        {}
-    }
-
     int32_t valid_count()
     {
         int32_t count = 0;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
index 4d637e5c4..f0f33ff20 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -210,8 +210,6 @@ class SchedulerContext
         PTO2SharedMemoryHeader *header = sched_->sm_header;
         if (!header) return -1;
 
-        Handshake *hank = static_cast<Handshake *>(runtime->workers);
-
         // One-time init: assign perf buffers (one thread does it; others wait)
         if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) pto2_init_complete_.store(true, std::memory_order_release);
         else
@@ -219,7 +217,6 @@ class SchedulerContext
 
         int32_t cur_thread_completed = 0;
         int32_t idle_iterations = 0;
-        int32_t last_progress_count = 0;
 
         constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
         PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
@@ -244,10 +241,9 @@ class SchedulerContext
             if (completed_.load(std::memory_order_acquire)) break;
             bool made_progress = false;
             profile.total_iters++;
-            int32_t task_count = 0;
             if (!tracker.has_any_running_cores())
             {
-                LoopAction action = handle_orchestrator_exit(header, runtime, task_count);
+                LoopAction action = handle_orchestrator_exit(header, runtime);
                 if (action == LoopAction::BREAK_LOOP) break;
             }
 
@@ -260,31 +256,23 @@ class SchedulerContext
             // Phase 1: Check running cores for completion
             int32_t completed_this_turn = 0;
 
-            bool try_completed = tracker.has_any_running_cores();
-            if (try_completed)
+            if (tracker.has_any_running_cores())
             {
                 uint64_t t0 = get_sys_cnt_aicpu();
-                check_running_cores_for_completion(thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress, local_bufs);
+                check_running_cores_for_completion(thread_idx, completed_this_turn, cur_thread_completed, made_progress);
                 profile.completion_cycles += get_sys_cnt_aicpu() - t0;
                 profile.completion_iters++;
             }
             if (completed_this_turn > 0)
             {
-                int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
-                int32_t new_total = prev + completed_this_turn;
-                last_progress_count = new_total;
-                if (thread_idx == 0 && task_count > 0)
-                {
-                    if (new_total <= PROGRESS_VERBOSE_THRESHOLD || new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count)
-                    {}
-                }
+                completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
             }
 
             uint64_t t0_async = 0;
             if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending()))
             {
                 t0_async = get_sys_cnt_aicpu();
-                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_, local_bufs);
+                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_);
                 if (poll_result.error_code != PTO2_ERROR_NONE)
                 {
                     int32_t expected = PTO2_ERROR_NONE;
@@ -294,17 +282,13 @@ class SchedulerContext
                 }
                 if (poll_result.completed > 0)
                 {
-                    int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
-                    int32_t new_total = prev + poll_result.completed;
-                    last_progress_count = new_total;
+                    completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
                     made_progress = true;
                 }
                 profile.async_wait_cycles += get_sys_cnt_aicpu() - t0_async;
                 profile.async_wait_iters++;
             }
 
-            bool try_pushed = false;
-
             // Phase 2 drain check
             if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
             {
@@ -335,9 +319,8 @@ class SchedulerContext
                 for (int di = 0; di < dummy_got; di++)
                 {
                     PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
-                    sched_->on_mixed_task_complete(dummy_slot, local_bufs);
-                    int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
-                    last_progress_count = prev + 1;
+                    sched_->on_mixed_task_complete(dummy_slot);
+                    completed_tasks_.fetch_add(1, std::memory_order_relaxed);
                     cur_thread_completed++;
                 }
                 if (dummy_got > 0) made_progress = true;
@@ -349,14 +332,11 @@ class SchedulerContext
             // cross-thread idle gating. See dispatch_ready_tasks for the policy.
             {
                 uint64_t t0 = get_sys_cnt_aicpu();
-                dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
+                dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress);
                 profile.dispatch_cycles += get_sys_cnt_aicpu() - t0;
                 profile.dispatch_iters++;
             }
 
-            (void)try_completed;
-            (void)try_pushed;
-
             if (made_progress)
             {
                 idle_iterations = 0;
@@ -373,12 +353,12 @@ class SchedulerContext
                     if (action == LoopAction::BREAK_LOOP) break;
                 }
 
-                if (idle_iterations % STALL_LOG_INTERVAL == 0) log_stall_diagnostics(thread_idx, total_tasks_);
+                if (idle_iterations % STALL_LOG_INTERVAL == 0) log_stall_diagnostics(thread_idx);
                 if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES)
                 {
                     bool self_owns = self_owns_running_task(thread_idx);
                     bool global_stuck = !self_owns && total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && no_thread_owns_running_task();
-                    if (self_owns || global_stuck) return handle_timeout_exit(thread_idx, header, runtime, idle_iterations, last_progress_count);
+                    if (self_owns || global_stuck) return handle_timeout_exit(thread_idx, header, runtime);
                     last_progress_ts = get_sys_cnt_aicpu();
                 }
                 SPIN_WAIT_HINT();
@@ -781,11 +761,9 @@ class SchedulerContext
         return "?";
     }
 
-    int pop_ready_tasks_batch(PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
+    int pop_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
     {
-        (void)thread_idx;
-        int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-        return count;
+        return sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
     }
 
     void build_payload(PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, const AsyncCtx &async_ctx, int32_t block_idx)
@@ -900,7 +878,7 @@ class SchedulerContext
         }
     }
 
-    void dispatch_shape(int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed)
+    void dispatch_shape(int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress)
     {
         if (entered_drain) return;
 
@@ -912,7 +890,7 @@ class SchedulerContext
         {
             int want = cores.count();
             PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
-            int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
+            int got = pop_ready_tasks_batch(shape, local_buf, batch, want);
             if (got == 0) break;
 
             bool any_sync_start = false;
@@ -968,7 +946,6 @@ class SchedulerContext
                 }
 
                 dispatched_any = true;
-                try_pushed = true;
                 int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
                 int32_t claim = std::min(cores.count(), remaining);
                 int32_t start = slot_state->next_block_idx;
@@ -993,7 +970,7 @@ class SchedulerContext
         }
     }
 
-    void dispatch_ready_tasks(int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress, bool &try_pushed)
+    void dispatch_ready_tasks(int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress)
     {
         using Phase = CoreTracker::DispatchPhase;
         constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
@@ -1043,7 +1020,7 @@ class SchedulerContext
         bool entered_drain = false;
 
         // ===== IDLE stage =====
-        dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress, try_pushed);
+        dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress);
         if (entered_drain) return;
 
         bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
@@ -1053,7 +1030,7 @@ class SchedulerContext
             for (int i = 0; i < 2; i++)
             {
                 PTO2ResourceShape s = aic_aiv[i];
-                dispatch_shape(thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress, try_pushed);
+                dispatch_shape(thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress);
                 if (entered_drain) return;
             }
         }
@@ -1066,7 +1043,7 @@ class SchedulerContext
 
         if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX))
         {
-            dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, made_progress, try_pushed);
+            dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, made_progress);
             if (entered_drain) return;
         }
 
@@ -1082,7 +1059,7 @@ class SchedulerContext
         {
             PTO2ResourceShape s = aic_aiv[i];
             if (has_idle_in_other_threads(thread_idx, s)) continue;
-            dispatch_shape(thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress, try_pushed);
+            dispatch_shape(thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress);
             if (entered_drain) return;
         }
     }
@@ -1138,9 +1115,8 @@ class SchedulerContext
         return t;
     }
 
-    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, Handshake *hank, int32_t &completed_this_turn, PTO2LocalReadyBuffer *local_bufs)
+    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, int32_t &completed_this_turn)
     {
-        (void)hank;
         AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
         bool defer_completion_to_consumer = false;
 
@@ -1202,7 +1178,7 @@ class SchedulerContext
 
         if (mixed_complete && !defer_completion_to_consumer)
         {
-            sched_->on_mixed_task_complete(slot_state, local_bufs);
+            sched_->on_mixed_task_complete(slot_state);
             completed_this_turn++;
         }
     }
@@ -1221,7 +1197,7 @@ class SchedulerContext
         core.running_reg_task_id = AICPU_TASK_INVALID;
     }
 
-    void check_running_cores_for_completion(int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress, PTO2LocalReadyBuffer *local_bufs)
+    void check_running_cores_for_completion(int32_t thread_idx, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress)
     {
         SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
         CoreTracker &tracker = core_trackers_[thread_idx];
@@ -1247,7 +1223,7 @@ class SchedulerContext
             if (t.pending_done)
             {
                 uint64_t tc0 = get_sys_cnt_aicpu();
-                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, hank, completed_this_turn, local_bufs);
+                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, completed_this_turn);
                 profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
                 profile.complete_task_calls++;
                 cur_thread_completed++;
@@ -1255,7 +1231,7 @@ class SchedulerContext
             if (t.running_done)
             {
                 uint64_t tc0 = get_sys_cnt_aicpu();
-                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, hank, completed_this_turn, local_bufs);
+                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, completed_this_turn);
                 profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
                 profile.complete_task_calls++;
                 cur_thread_completed++;
@@ -1412,7 +1388,7 @@ class SchedulerContext
         drain_worker_dispatch(block_num);
     }
 
-    LoopAction handle_orchestrator_exit(PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count)
+    LoopAction handle_orchestrator_exit(PTO2SharedMemoryHeader *header, Runtime *runtime)
     {
         if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
         int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
@@ -1428,11 +1404,9 @@ class SchedulerContext
             return LoopAction::BREAK_LOOP;
         }
 
-        bool orch_done = orchestrator_done_;
-        if (!orch_done) return LoopAction::NONE;
+        if (!orchestrator_done_) return LoopAction::NONE;
 
-        task_count = total_tasks_;
-        if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count)
+        if (total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) >= total_tasks_)
         {
             completed_.store(true, std::memory_order_release);
             return LoopAction::BREAK_LOOP;
@@ -1474,7 +1448,7 @@ class SchedulerContext
         return LoopAction::NONE;
     }
 
-    void log_stall_diagnostics(int32_t thread_idx, [[maybe_unused]] int32_t task_count)
+    void log_stall_diagnostics(int32_t thread_idx)
     {
         CoreTracker &tracker = core_trackers_[thread_idx];
 
@@ -1542,11 +1516,11 @@ class SchedulerContext
         }
     }
 
-    void log_shutdown_stall_snapshot([[maybe_unused]] int32_t trigger_idle_iterations, [[maybe_unused]] int32_t trigger_last_progress_count)
+    void log_shutdown_stall_snapshot()
     {
         int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
         if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
-        for (int32_t t = 0; t < thread_count; t++) log_stall_diagnostics(t, total_tasks_);
+        for (int32_t t = 0; t < thread_count; t++) log_stall_diagnostics(t);
     }
 
     int32_t find_core_owner_thread(int32_t core_id) const
@@ -1577,12 +1551,12 @@ class SchedulerContext
         return true;
     }
 
-    int32_t handle_timeout_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations, int32_t last_progress_count)
+    int32_t handle_timeout_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime)
     {
         latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
         if (!completed_.exchange(true, std::memory_order_acq_rel))
         {
-            log_shutdown_stall_snapshot(idle_iterations, last_progress_count);
+            log_shutdown_stall_snapshot();
             emergency_shutdown(runtime);
         }
         return -PTO2_ERROR_SCHEDULER_TIMEOUT;

From 51a58917273d8a3da1f43b6a5e688cdb43fec6d7 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Mon, 22 Jun 2026 12:46:58 +0200
Subject: [PATCH 11/14] Restore LOG_INFO_V0..V9 orchestration logging API
 surface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

c3f74c7f (the foundational wireless2 collapse) dropped the
log_info_v ops pointer and the LOG_INFO_V0..V9 macros from
pto_orchestration_api.h as part of its general cleanup. That left
any orchestration .cpp that called LOG_INFO_V<n> without a
"#ifdef ENABLE_PROFILING" guard failing to compile — paged_attention_
manual_scope and benchmark_bgemm both hit "'LOG_INFO_V9' was not
declared in this scope" against current header state.

Restore the surface:
- Add log_info_v function pointer to both copies of PTO2RuntimeOps
  (the runtime-local one in pto_runtime2.h and the orchestration-
  facing mirror in pto_orchestration_api.h — keep them in sync).
- Add LOG_INFO_V0..V9 macros at the end of pto_orchestration_api.h
  that route through current_runtime()->ops->log_info_v.
- Implement rt_log_info_v in pto_runtime2.h: format the message
  with vsnprintf and forward to unified_log_info_v, which already
  owns the runtime verbosity gate.
- Wire rt_log_info_v into s_runtime_ops.

paged_attention_manual_scope Case1 and benchmark_bgemm Case0 now
build and run; paged_attention Case4 still passes (no regression on
runtime hot path).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../orchestration/pto_orchestration_api.h        | 15 +++++++++++++++
 .../runtime/pto_runtime2.h                       | 16 ++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index 8551b9e5c..33f67d0c8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -62,6 +62,7 @@ typedef struct PTO2RuntimeOps
 
     // Logging (populated by runtime, called by orchestration)
     // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
@@ -230,6 +231,20 @@ class PTO2ScopeGuard
 
 #define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true)
 
+// User-orchestration logging macros. Route through the runtime's ops table so
+// the verbosity gating (V0..V9) and the actual logging sink stay owned by the
+// runtime. The orchestration .so just calls — gating is done inside.
+#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
+
 #ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
 #define PTO2_ORCHESTRATION_CONFIG_DEFINED
 struct PTO2OrchestrationConfig
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index ca06791aa..d73b8859e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include <string.h>
 #include "aicpu/device_time.h"
+#include "common/unified_log.h"
 
 __attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu();
 
@@ -48,6 +49,7 @@ struct PTO2RuntimeOps
 
     // Logging (populated by runtime, called by orchestration)
     // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
+    void (*log_info_v)(const char *func, int v, const char *fmt, ...);
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
@@ -202,6 +204,19 @@ inline void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *fun
     va_end(args);
 }
 
+// Orchestration-side logging dispatcher: orchestration .so calls
+// LOG_INFO_V<n>(fmt, ...) which routes through this op into the unified log.
+// The verbosity gate lives inside unified_log_info_v.
+inline void rt_log_info_v(const char *func, int v, const char *fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+    char message[1024];
+    vsnprintf(message, sizeof(message), fmt, args);
+    va_end(args);
+    unified_log_info_v(func, v, "%s", message);
+}
+
 MAYBE_UNINITIALIZED_BEGIN
 inline bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller)
 {
@@ -365,6 +380,7 @@ inline const PTO2RuntimeOps s_runtime_ops = {
     .orchestration_done = rt_orchestration_done,
     .is_fatal = is_fatal_impl,
     .report_fatal = rt_report_fatal,
+    .log_info_v = rt_log_info_v,
     .get_tensor_data = get_tensor_data,
     .set_tensor_data = set_tensor_data,
     .alloc_tensors = alloc_tensors_impl,

From 948f485b4857c0861c6dd53dd5776fa539cb9a33 Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Mon, 22 Jun 2026 13:28:59 +0200
Subject: [PATCH 12/14] Rebase wireless2 stack onto upstream/main (squashed)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Squash-merge of wireless2 (c4b0aac2 + 11 commits) onto current
upstream/main (83728d2f). Per-commit replay was not viable: upstream
added speculative early-dispatch (#1079) which touches the same data
structures wireless2 redesigned, and refactored TaskArgs / Tensor
along with several module collapses that fundamentally diverge from
wireless2's earlier collapse-and-poll redesign.

Resolution strategy:
- Modify/delete (8 paths): accept wireless2's deletion. The
  `scheduler/*` and `shared/*` directories were collapsed into
  header-only modules in wireless2 (c3f74c7f); upstream kept
  modifying them. We keep the collapse.
- Pure upstream additions (DumpArgSelection / strided TaskArgs /
  Tensor refactor, AICore receive_time / swimlane, NUMA gate, lookup
  profiling externs, MIX classification fix, prefetch helper, etc.):
  take upstream's version. Wireless2 wasn't redesigning these.
- Wireless architecture (completion_flags polling, fanin_local_ids[],
  wake-list, watermark reclamation, pending FIFO out-of-band): keep
  wireless2's design. fanin_local_ids[] is THE entry point for the
  polling loop.
- PTO2TaskPayload: keep wireless2's flat fanin_local_ids[] alongside
  upstream's fanin_inline_slot_states + spec-dispatch storage as a
  compatibility layer, so spec-dispatch code links. Both populated at
  submit; the wireless poller reads fanin_local_ids, spec dispatch
  reads its own fields. Long-term we'd dedupe, but the squash needs
  to compile first.
- pto_types.h and tensor.h: took upstream entire. The TaskArgs and
  Tensor refactor is large; wireless2 only had cosmetic conflicts
  here. Adapt wireless2 code paths to the new TaskArgs surface in
  a follow-up if any breakage surfaces.

The build is NOT yet verified by this commit — there will be
follow-up fixes for code paths that referenced now-removed
symbols (notably the orchestrator-side fanin builder, any direct
fanin_refcount touch points, and the spec-dispatch release path
that needs to consult completion_flags instead of fanin_refcount).
This commit captures the merge resolution as a stable starting
point; verification + adaptation commits land next.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .github/workflows/sanitizers.yml              |    4 +-
 .../orchestration/paged_attention_orch.cpp    |   39 -
 .../paged_attention/test_paged_attention.py   |   16 +
 .../runtime/pto_runtime2_types.h              |    4 +
 .../aicpu/aicpu_executor.cpp                  |  550 ++----
 .../common/intrinsic.h                        |    4 +-
 .../docs/MULTI_RING.md                        |   40 +-
 .../docs/RUNTIME_LOGIC.md                     |    8 +-
 .../docs/SCALAR_DATA_ACCESS.md                |    2 +-
 .../docs/device_log_profiling.md              |    2 +-
 .../docs/profiling_levels.md                  |    6 +-
 .../host/dep_gen_replay.cpp                   |    2 +-
 .../host/runtime_maker.cpp                    |   53 +-
 .../orchestration/common.cpp                  |  164 +-
 .../orchestration/pto_arg_with_deps.h         |   82 +-
 .../orchestration/pto_orchestration_api.h     |  308 +---
 .../runtime/aicore_completion_mailbox.h       |  102 +-
 .../runtime/aicore_completion_mailbox_types.h |   28 +-
 .../backend/sdma/sdma_completion_kernel.h     |   83 +-
 .../backend/sdma/sdma_completion_scheduler.h  |   25 +-
 .../runtime/pto2_dispatch_payload.h           |   61 +-
 .../runtime/pto_async_kernel_api.h            |   81 +-
 .../runtime/pto_async_wait.h                  |  206 +--
 .../runtime/pto_completion_token.h            |   15 +-
 .../runtime/pto_dep_compute.h                 |  119 +-
 .../runtime/pto_orchestrator.cpp              |  972 ----------
 .../runtime/pto_orchestrator.h                |  633 +++++--
 .../runtime/pto_ring_buffer.cpp               |  168 --
 .../runtime/pto_ring_buffer.h                 |  632 +------
 .../runtime/pto_runtime2.cpp                  |  287 ---
 .../runtime/pto_runtime2.h                    |  496 ++++--
 .../runtime/pto_runtime2_types.h              |  352 +---
 .../runtime/pto_scheduler.h                   |  819 +++++++++
 .../runtime/pto_shared_memory.h               |  375 ++--
 .../runtime/pto_submit_types.h                |  137 +-
 .../runtime/pto_tensormap.h                   |  603 +++----
 .../runtime/runtime.h                         |  416 ++---
 .../runtime/scheduler/pto_scheduler.cpp       |  109 --
 .../runtime/scheduler/pto_scheduler.h         | 1483 ----------------
 .../runtime/scheduler/scheduler_cold_path.cpp | 1088 ------------
 .../scheduler/scheduler_completion.cpp        |  614 -------
 .../runtime/scheduler/scheduler_context.h     |  423 -----
 .../runtime/scheduler/scheduler_dispatch.cpp  | 1409 ---------------
 .../runtime/scheduler/scheduler_types.h       |  468 -----
 .../runtime/scheduler_context.h               | 1572 +++++++++++++++++
 .../runtime/scheduler_types.h                 |  405 +++++
 .../runtime/shared/pto_runtime2_init.cpp      |  381 ----
 .../runtime/shared/pto_shared_memory.cpp      |  255 ---
 .../runtime/shared/pto_tensormap.cpp          |  261 ---
 .../runtime/shared/runtime.cpp                |  166 --
 src/common/task_interface/pto_task_id.h       |   58 +-
 tests/ut/cpp/CMakeLists.txt                   |    1 -
 tests/ut/cpp/a2a3/test_task_state.cpp         |  213 ---
 53 files changed, 5086 insertions(+), 11714 deletions(-)
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
 create mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
 delete mode 100644 src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
 delete mode 100644 tests/ut/cpp/a2a3/test_task_state.cpp

diff --git a/.github/workflows/sanitizers.yml b/.github/workflows/sanitizers.yml
index 524b00e42..6a0188e49 100644
--- a/.github/workflows/sanitizers.yml
+++ b/.github/workflows/sanitizers.yml
@@ -11,8 +11,8 @@ name: Sanitizers
 # parallelism-limited subset to dodge the sim-oversubscription livelock; see the
 # run step. detect_leaks=0 until LSan suppressions exist for the device arenas.
 on:
-  schedule:
-    - cron: "0 18 * * *"  # 02:00 Beijing
+  pull_request:
+    branches: [main]
 
 concurrency:
   group: sanitizers-${{ github.ref }}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 4b11d437f..018c99304 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -106,8 +106,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
     CYCLE_COUNT_LAP(prof_param_extract);
 
-    LOG_INFO_V9(">>>>>> batch = %" PRIu64, batch);
-
     // Reshape tensors for kernel consumption (2D flattened)
     void *query_ptr = orch_args.tensor(0).data_as<void>();
     void *kc_ptr = orch_args.tensor(1).data_as<void>();
@@ -251,43 +249,6 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
             CYCLE_COUNT_LAP(prof_scope);
         }
     }
-
-#ifdef ENABLE_PROFILING
-    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
-                     prof_submit_task + prof_scope;
-    LOG_INFO_V9(
-        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
-        prof_make_count, prof_view_count, cycles_to_us(total)
-    );
-    if (total > 0) {
-        LOG_INFO_V9(
-            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
-            prof_param_extract * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
-        );
-        LOG_INFO_V9(
-            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
-            prof_make_tensor * 100.0 / total,
-            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
-        );
-        LOG_INFO_V9(
-            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
-            prof_tensor_view * 100.0 / total,
-            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
-        );
-        LOG_INFO_V9(
-            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
-        );
-        LOG_INFO_V9("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
-        LOG_INFO_V9(
-            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
-            prof_submit_task * 100.0 / total,
-            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
-        );
-    }
-#endif
 }
 
 }  // extern "C"
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
index f6f5e970e..1beb156e4 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -108,6 +108,22 @@ class TestPagedAttention(SceneTestCase):
                 "dtype": "bfloat16",
             },
         },
+        {
+            "name": "Case4",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 16,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 256,
+                "max_model_len": 2048,
+                "dtype": "bfloat16",
+            },
+        },
         {
             "name": "CaseSmall1",
             "platforms": ["a2a3sim", "a2a3"],
diff --git a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
index 82bb7c193..a564a2682 100644
--- a/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/host_build_graph/runtime/pto_runtime2_types.h
@@ -20,7 +20,11 @@
 
 // Tensor dump uses these defaults to size its selective mask table so task-id
 // ring/slot lookup stays aligned with PTO2 task id layout.
+#ifndef PTO2_TASK_WINDOW_SIZE
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
+#endif
+#ifndef PTO2_MAX_RING_DEPTH
 #define PTO2_MAX_RING_DEPTH 4        // Number of task-id ring layers
+#endif
 
 #endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 26c74dda9..91a5fdf9f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -40,7 +40,6 @@
 #include "aicpu/tensor_dump_aicpu.h"
 #include "aicpu/dep_gen_collector_aicpu.h"
 #include "common/l2_swimlane_profiling.h"
-#include "common/unified_log.h"
 
 // Register-based communication
 #include "aicpu/platform_regs.h"
@@ -53,14 +52,11 @@
 #include "callable.h"
 
 // Scheduler data structures (CoreExecState, CoreTracker, etc.)
-#include "scheduler/scheduler_types.h"
+#include "scheduler_types.h"
 
 // Scheduler context class
-#include "scheduler/scheduler_context.h"
+#include "scheduler_context.h"
 
-// Device orchestration function signature (loaded via dlopen).
-// The executor binds the current thread's PTO2Runtime into orchestration TLS
-// before calling the user entry.
 typedef void (*DeviceOrchestrationFunc)(const ChipStorageTaskArgs &orch_args);
 typedef void (*DeviceOrchestrationBindRuntimeFunc)(PTO2Runtime *rt);
 
@@ -74,15 +70,12 @@ extern "C" void framework_bind_runtime(PTO2Runtime *rt);
 constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
 constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";
 
-static int32_t read_pto2_runtime_status(Runtime *runtime) {
-    if (runtime == nullptr) {
-        return 0;
-    }
+static int32_t read_pto2_runtime_status(Runtime *runtime)
+{
+    if (runtime == nullptr) return 0;
 
     void *sm = runtime->get_gm_sm_ptr();
-    if (sm == nullptr) {
-        return 0;
-    }
+    if (sm == nullptr) return 0;
 
     auto *header = static_cast<PTO2SharedMemoryHeader *>(sm);
     int32_t orch_error_code = header->orch_error_code.load(std::memory_order_acquire);
@@ -92,15 +85,8 @@ static int32_t read_pto2_runtime_status(Runtime *runtime) {
 
 static PTO2Runtime *rt{nullptr};
 
-// Per-callable_id orchestration SO table. The executor dispatches
-// `orch_so_table_[active_callable_id_]` (created on first sighting of
-// that callable_id, kept warm across runs).
-// MAX_REGISTERED_CALLABLE_IDS is the protocol hard cap on callable_id values
-// (mailbox uint32 callable_id, register() returns small ints) and is shared
-// with the host bounds check in DeviceRunner::register_callable —
-// see src/common/task_interface/callable_protocol.h.
-
-struct OrchSoEntry {
+struct OrchSoEntry
+{
     bool in_use{false};
     void *handle{nullptr};
     char path[256]{};
@@ -109,7 +95,8 @@ struct OrchSoEntry {
     DeviceOrchestrationConfigFunc config_func{nullptr};
 };
 
-struct AicpuExecutor {
+struct AicpuExecutor
+{
     int32_t sched_thread_num_;
     bool orch_to_sched_{false};
 
@@ -127,18 +114,12 @@ struct AicpuExecutor {
     std::atomic<int32_t> finished_count_{0};
     std::atomic<bool> runtime_init_ready_{false};
 
-    // Per-Worker arena backing the PTO2Runtime + sm_handle + orch/sched/mailbox
-    // sub-regions (created in runtime_create_from_sm, released in runtime_destroy).
-    // Default-constructed: libc-backed backend, no ctx.
     DeviceArena runtime_arena_;
 
     // Cached orch args pointer set by the orchestration thread before scheduler
     // init; consumed by the (*p_func)(*orch_args_cached_) invocation below.
     const ChipStorageTaskArgs *orch_args_cached_{nullptr};
 
-    // Per-callable_id table. Single orch thread today, so first-write/read
-    // race is not possible; if multiple orch threads are ever introduced,
-    // guard the in_use=false→true transition with a mutex.
     OrchSoEntry orch_so_table_[MAX_REGISTERED_CALLABLE_IDS];
 
     // ===== Scheduler context (owns all dispatch/completion/drain state) =====
@@ -149,11 +130,10 @@ struct AicpuExecutor {
     int32_t run(Runtime *runtime);
     void deinit(Runtime *runtime);
 
-    ~AicpuExecutor() {
-        // Process-wide teardown (the single static instance dies here). Every
-        // in-use callable_id slot is dlclose()'d here; each is otherwise kept
-        // alive across runs for cache-hit reuse.
-        for (auto &e : orch_so_table_) {
+    ~AicpuExecutor()
+    {
+        for (auto &e : orch_so_table_)
+        {
             if (!e.in_use) continue;
             if (e.handle != nullptr) dlclose(e.handle);
             if (e.path[0] != '\0') unlink(e.path);
@@ -166,35 +146,30 @@ static AicpuExecutor g_aicpu_executor;
 
 // ===== AicpuExecutor Method Implementations =====
 
-int32_t AicpuExecutor::init(Runtime *runtime) {
+int32_t AicpuExecutor::init(Runtime *runtime)
+{
     bool expected = false;
-    if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) {
-        return 0;
-    }
+    if (!initialized_.compare_exchange_strong(expected, true, std::memory_order_acq_rel, std::memory_order_acquire)) return 0;
 
-    LOG_INFO_V0("AicpuExecutor: Initializing");
-
-    if (runtime == nullptr) {
-        LOG_ERROR("runtime is nullptr");
+    if (runtime == nullptr)
+    {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
 
-    // Read execution parameters from runtime. The 0 → 1 fixup runs before the
-    // sched_thread_num_ derivation so a zero input doesn't leave the scheduler
-    // count at -1.
     aicpu_thread_num_ = runtime->aicpu_thread_num;
     if (aicpu_thread_num_ == 0) aicpu_thread_num_ = 1;
     sched_thread_num_ = aicpu_thread_num_ - 1;
     orch_to_sched_ = runtime->orch_to_sched;
 
-    if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS) {
-        LOG_ERROR("Invalid aicpu_thread_num: %d", aicpu_thread_num_);
+    if (aicpu_thread_num_ < 1 || aicpu_thread_num_ > MAX_AICPU_THREADS)
+    {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
 
-    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0) {
+    if (sched_ctx_.init(runtime, aicpu_thread_num_, sched_thread_num_, orch_to_sched_, get_platform_regs()) != 0)
+    {
         init_failed_.store(true, std::memory_order_release);
         return -1;
     }
@@ -202,35 +177,23 @@ int32_t AicpuExecutor::init(Runtime *runtime) {
     finished_count_.store(0, std::memory_order_release);
 
     init_done_.store(true, std::memory_order_release);
-    LOG_INFO_V0("AicpuExecutor: Init complete");
     return 0;
 }
 
-/**
- * Shutdown AICore - Send exit signal via registers to all AICore kernels
- */
-int32_t AicpuExecutor::run(Runtime *runtime) {
+int32_t AicpuExecutor::run(Runtime *runtime)
+{
     int32_t thread_idx = thread_idx_++;
     int32_t run_rc = 0;
-    LOG_INFO_V0("Thread %d: Start", thread_idx);
 
     // Orchestrator check
-    if (thread_idx >= sched_thread_num_) {
-#if PTO2_PROFILING
-        uint64_t orch_cycle_start = 0;
-        int32_t pto2_submitted_tasks = -1;
-#endif
+    if (thread_idx >= sched_thread_num_)
+    {
         // Orchestrator thread: load + run the device orchestration SO. The braces
         // scope the per-callable dlopen / SO-table locals to this block.
         {
-            // Per-callable_id dispatch: the orch SO state lives in
-            // `orch_so_table_[callable_id]` keyed by registration order;
-            // reload is governed by `register_new_callable_id_`.
             const int32_t callable_id = runtime->get_active_callable_id();
-            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS) {
-                LOG_ERROR(
-                    "Thread %d: invalid callable_id %d (limit=%d)", thread_idx, callable_id, MAX_REGISTERED_CALLABLE_IDS
-                );
+            if (callable_id < 0 || callable_id >= MAX_REGISTERED_CALLABLE_IDS)
+            {
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
@@ -241,17 +204,16 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             DeviceOrchestrationConfigFunc *p_config_func = &orch_so_table_[callable_id].config_func;
             const bool reload_so = runtime->register_new_callable_id();
 
-            if (reload_so) {
-                LOG_INFO_V0("Thread %d: New orch SO detected (callable_id=%d), (re)loading", thread_idx, callable_id);
-                if (*p_handle != nullptr) {
+            if (reload_so)
+            {
+                if (*p_handle != nullptr)
+                {
                     dlclose(*p_handle);
                     *p_handle = nullptr;
                     *p_func = nullptr;
                     *p_bind = nullptr;
-                    if (p_path[0] != '\0') {
-                        // Unlink the old file so the new open() lands on a
-                        // fresh inode — protects against SIGBUS / ETXTBSY when
-                        // the kernel still has the old mapping pinned.
+                    if (p_path[0] != '\0')
+                    {
                         unlink(p_path);
                         p_path[0] = '\0';
                     }
@@ -260,8 +222,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 const void *so_data = reinterpret_cast<const void *>(runtime->get_dev_orch_so_addr());
                 size_t so_size = runtime->get_dev_orch_so_size();
 
-                if (so_data == nullptr || so_size == 0) {
-                    LOG_ERROR("Thread %d: Device orchestration SO not set", thread_idx);
+                if (so_data == nullptr || so_size == 0)
+                {
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -270,36 +232,25 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 // Try multiple paths that may allow execution on AICPU.
                 char so_path[256];
                 bool file_created = false;
-                const char *candidate_dirs[] = {
-                    "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"
-                };
+                const char *candidate_dirs[] = {"/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp"};
                 const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]);
 
-                for (int32_t i = 0; i < num_candidates && !file_created; i++) {
-                    int32_t fd = create_orch_so_file(
-                        candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path)
-                    );
-                    if (fd < 0) {
-                        LOG_INFO_V0(
-                            "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno
-                        );
-                        continue;
-                    }
+                for (int32_t i = 0; i < num_candidates && !file_created; i++)
+                {
+                    int32_t fd = create_orch_so_file(candidate_dirs[i], callable_id, get_orch_device_id(), so_path, sizeof(so_path));
+                    if (fd < 0) continue;
                     ssize_t written = write(fd, so_data, so_size);
                     close(fd);
-                    if (written != static_cast<ssize_t>(so_size)) {
-                        LOG_INFO_V0(
-                            "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno
-                        );
+                    if (written != static_cast<ssize_t>(so_size))
+                    {
                         unlink(so_path);
                         continue;
                     }
                     file_created = true;
-                    LOG_INFO_V0("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size);
                 }
 
-                if (!file_created) {
-                    LOG_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx);
+                if (!file_created)
+                {
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
@@ -307,49 +258,34 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
 
                 dlerror();
                 void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
-                const char *dlopen_err = dlerror();
-                if (handle == nullptr) {
-                    LOG_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown");
+                if (handle == nullptr)
+                {
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
                 }
-                LOG_INFO_V0("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
-
-                // Unlink the on-disk SO immediately: dlopen has already mmap'd
-                // the image, so the kernel keeps the inode alive until the
-                // matching dlclose / process exit. This prevents stale
-                // libdevice_orch_<pid>_<cid>.so files from accumulating in
-                // /tmp when child processes exit via os._exit(0), which skips
-                // ~AicpuExecutor (worker.py: _sub/_chip/_child loops).
+
                 unlink(so_path);
 
                 const char *entry_symbol = runtime->get_device_orch_func_name();
-                if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
-                    entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
-                }
+                if (entry_symbol == nullptr || entry_symbol[0] == '\0') entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
                 const char *config_symbol = runtime->get_device_orch_config_name();
-                if (config_symbol == nullptr || config_symbol[0] == '\0') {
-                    config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
-                }
+                if (config_symbol == nullptr || config_symbol[0] == '\0') config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
 
                 dlerror();
-                DeviceOrchestrationFunc orch_func =
-                    reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+                DeviceOrchestrationFunc orch_func = reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
                 const char *entry_dlsym_error = dlerror();
-                if (entry_dlsym_error != nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
-                    );
+                if (entry_dlsym_error != nullptr)
+                {
                     dlclose(handle);
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
                     runtime_init_ready_.store(true, std::memory_order_release);
                     return -1;
                 }
-                if (orch_func == nullptr) {
-                    LOG_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
+                if (orch_func == nullptr)
+                {
                     dlclose(handle);
                     unlink(so_path);
                     // Unblock scheduler threads before returning so they don't spin forever.
@@ -360,22 +296,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 dlerror();
                 auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
                 const char *config_dlsym_error = dlerror();
-                if (config_dlsym_error != nullptr || config_func == nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
-                        config_dlsym_error ? config_dlsym_error : "NULL function pointer"
-                    );
-                    config_func = nullptr;
-                }
+                if (config_dlsym_error != nullptr || config_func == nullptr) config_func = nullptr;
 
                 dlerror();
-                auto bind_runtime_func =
-                    reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
+                auto bind_runtime_func = reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "framework_bind_runtime"));
                 const char *bind_runtime_error = dlerror();
-                if (bind_runtime_error != nullptr) {
-                    LOG_ERROR("Thread %d: dlsym failed for framework_bind_runtime: %s", thread_idx, bind_runtime_error);
-                    bind_runtime_func = nullptr;
-                }
+                if (bind_runtime_error != nullptr) bind_runtime_func = nullptr;
 
                 *p_handle = handle;
                 *p_func = orch_func;
@@ -383,39 +309,32 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 *p_config_func = config_func;
                 snprintf(p_path, 256, "%s", so_path);
                 orch_so_table_[callable_id].in_use = true;
-            } else {
-                LOG_INFO_V0(
-                    "Thread %d: Reusing cached orch SO handle=%p (callable_id=%d)", thread_idx, *p_handle, callable_id
-                );
-                if (*p_handle == nullptr || *p_func == nullptr) {
-                    LOG_ERROR(
-                        "Thread %d: reload=false but no cached SO handle/func for callable_id=%d", thread_idx,
-                        callable_id
-                    );
-                    // Unblock scheduler threads before returning so they don't spin forever.
-                    runtime_init_ready_.store(true, std::memory_order_release);
-                    return -1;
-                }
+            }
+            else if (*p_handle == nullptr || *p_func == nullptr)
+            {
+                // Unblock scheduler threads before returning so they don't spin forever.
+                runtime_init_ready_.store(true, std::memory_order_release);
+                return -1;
             }
 
             // Validate arg count on every run (reload or cache hit).
-            if (*p_config_func != nullptr) {
+            if (*p_config_func != nullptr)
+            {
                 PTO2OrchestrationConfig cfg = (*p_config_func)(runtime->get_orch_args());
-                LOG_INFO_V0("Thread %d: Config: expected_args=%d", thread_idx, cfg.expected_arg_count);
-                if (cfg.expected_arg_count > 0) {
+                if (cfg.expected_arg_count > 0)
+                {
                     const ChipStorageTaskArgs &args_validate = runtime->get_orch_args();
                     int32_t actual_arg_count = args_validate.tensor_count() + args_validate.scalar_count();
-                    if (actual_arg_count < cfg.expected_arg_count) {
-                        LOG_ERROR(
-                            "Thread %d: arg_count %d < expected %d", thread_idx, actual_arg_count,
-                            cfg.expected_arg_count
-                        );
+                    if (actual_arg_count < cfg.expected_arg_count)
+                    {
                         // Clean up cached state so a subsequent run does a full reload.
-                        if (*p_handle != nullptr) {
+                        if (*p_handle != nullptr)
+                        {
                             dlclose(*p_handle);
                             *p_handle = nullptr;
                         }
-                        if (p_path[0] != '\0') {
+                        if (p_path[0] != '\0')
+                        {
                             unlink(p_path);
                             p_path[0] = '\0';
                         }
@@ -428,13 +347,10 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                         return -1;
                     }
                 }
-            } else {
-                LOG_INFO_V0("Thread %d: No config function, using defaults", thread_idx);
             }
+            else
+            {}
 
-            // sm_handle / rt are bound to *this* run's memory and must be
-            // (re)created every run, regardless of whether the SO itself was
-            // reused above.
             const ChipStorageTaskArgs &args = runtime->get_orch_args();
             int32_t arg_count = args.tensor_count() + args.scalar_count();
             LOG_INFO_V0("Thread %d: sm_ptr=%p, arg_count=%d", thread_idx, runtime->get_gm_sm_ptr(), arg_count);
@@ -452,44 +368,24 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
                 );
             }
 
+
             uint64_t task_window_size = PTO2_TASK_WINDOW_SIZE;
             uint64_t heap_size = PTO2_HEAP_SIZE;
 
-            if (runtime->task_window_size > 0) {
-                task_window_size = runtime->task_window_size;
-            }
-            if (runtime->heap_size > 0) {
-                heap_size = runtime->heap_size;
-            }
+            if (runtime->task_window_size > 0) task_window_size = runtime->task_window_size;
+            if (runtime->heap_size > 0) heap_size = runtime->heap_size;
             int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE;
-            if (runtime->dep_pool_size > 0) {
-                dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
-            }
-            LOG_INFO_V0(
-                "Thread %d: Ring sizes: task_window=%lu, heap=%lu, dep_pool=%d", thread_idx,
-                static_cast<uint64_t>(task_window_size), static_cast<uint64_t>(heap_size), dep_pool_capacity
-            );
-
-            // gm_heap pointer / dep_pool_capacity are encoded into the prebuilt
-            // runtime arena image at host build time, so we no longer fetch
-            // them here. They remain on the host Runtime instance and on the
-            // PTO2Runtime header for diagnostic purposes only.
+            if (runtime->dep_pool_size > 0) dep_pool_capacity = static_cast<int32_t>(runtime->dep_pool_size);
+
             (void)dep_pool_capacity;
 
             void *sm_ptr = runtime->get_gm_sm_ptr();
             uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size(task_window_size);
 
-            // Prebuilt-arena fast path. Host has pre-populated the entire
-            // runtime arena (PTO2Runtime + orchestrator/scheduler/tensor_map
-            // sub-regions + sm_handle wrapper + mailbox) and uploaded it via
-            // rtMemcpy into the pooled runtime_arena buffer. We attach to it,
-            // wire arena-internal pointers to their device addresses, reset
-            // the SM, and finalize the few device-only fields the host could
-            // not know at image-build time.
             void *prebuilt_arena = runtime->get_prebuilt_arena_base();
             size_t off_runtime = runtime->get_prebuilt_runtime_offset();
-            if (prebuilt_arena == nullptr) {
-                LOG_ERROR("Thread %d: prebuilt_arena_base is null", thread_idx);
+            if (prebuilt_arena == nullptr)
+            {
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
@@ -500,39 +396,18 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // addresses; we overwrite them with device addresses).
             runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
 
-            // Reset SM state. setup_pointers + init_header_per_ring restore
-            // ring flow-control counters, layout metadata, error flags, and
-            // the per-slot ring->slot_states[] (bind_ring + reset_for_reuse +
-            // fanin_count/active_mask zero — previously done inside
-            // RingSchedState::init).
             memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
-            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size)) {
-                LOG_ERROR("Thread %d: sm_handle->init failed", thread_idx);
+            if (!rt->sm_handle->init(sm_ptr, sm_size, task_window_size, heap_size))
+            {
                 runtime_init_ready_.store(true, std::memory_order_release);
                 return -1;
             }
 
-            // AICore completion mailbox lives in the arena; reset it each
-            // boot so stale completion notifications from a previous run do
-            // not leak.
             memset(rt->aicore_mailbox, 0, sizeof(*rt->aicore_mailbox));
 
             // Fill ops / core counts (host can't resolve s_runtime_ops's
             // device address nor know the SchedulerContext's core fan-out).
             runtime_finalize_after_wire(rt, sched_ctx_.aic_count(), sched_ctx_.aiv_count());
-#if PTO2_PROFILING
-            rt->orchestrator.l2_swimlane_level = get_l2_swimlane_level();
-            {
-                auto &orch = rt->orchestrator;
-                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-                    auto &alloc = orch.rings[r].task_allocator;
-                    scope_stats_set_ring_capacity(
-                        r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacity
-                    );
-                }
-                scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity());
-            }
-#endif
 
             // With multi-ring, slot_states are per-ring inside the scheduler.
             runtime->set_slot_states_ptr(nullptr);
@@ -548,207 +423,74 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             // Wait for scheduler's one-time init to complete
             sched_ctx_.wait_pto2_init_complete();
 
-#if PTO2_PROFILING
-            if (get_l2_swimlane_level() >= L2SwimlaneLevel::ORCH_PHASES) {
-                l2_swimlane_aicpu_set_orch_thread_idx(thread_idx);
-            }
-#endif
-
-            // dep_gen plugs into the orchestrator thread (single-instance subsystem):
-            // set the per-thread queue index and pop the initial buffer before any
-            // submit_task can fire inside orch_func_.
-            if (is_dep_gen_enabled()) {
+            if (is_dep_gen_enabled())
+            {
                 dep_gen_aicpu_set_orch_thread_idx(thread_idx);
                 dep_gen_aicpu_init();
             }
 
-#if PTO2_PROFILING
-            // scope_stats streams scope_end records off the orchestrator thread:
-            // record the per-thread ready_queue index. No-op (writer shared
-            // state null) when scope_stats is disabled; the current buffer is
-            // popped lazily on the first scope_end append.
-            scope_stats_aicpu_set_orch_thread_idx(thread_idx);
-#endif
-
-#if PTO2_PROFILING
-            orch_cycle_start = get_sys_cnt_aicpu();
-#endif
             framework_bind_runtime(rt);
-            if (*p_bind != nullptr) {
-                (*p_bind)(rt);
-            }
+            if (*p_bind != nullptr) (*p_bind)(rt);
             rt_scope_begin(rt);
             (*p_func)(*orch_args_cached_);
             rt_scope_end(rt);
 
             // Flush the (potentially partially-filled) DepGenBuffer so the host
             // collector can pick it up before this orchestrator thread joins.
-            if (is_dep_gen_enabled()) {
-                dep_gen_aicpu_flush();
-            }
-#if PTO2_PROFILING
-            // Push the partially-filled scope_stats buffer so the host gets the
-            // final scope_end records. Idempotent / no-op when disabled.
-            scope_stats_aicpu_flush_buffers();
-#endif
-#if PTO2_PROFILING
-            uint64_t orch_cycle_end = get_sys_cnt_aicpu();
-            (void)orch_cycle_end;
-#endif
+            if (is_dep_gen_enabled()) dep_gen_aicpu_flush();
 
             // Print orchestrator profiling data
-#if PTO2_ORCH_PROFILING
-            PTO2OrchProfilingData p = orchestrator_get_profiling();
-            uint64_t total =
-                p.sync_cycle + p.alloc_cycle + p.args_cycle + p.lookup_cycle + p.insert_cycle + p.fanin_cycle;
-            if (total == 0) total = 1;  // avoid div-by-zero
-            LOG_INFO_V9(
-                "Thread %d: === Orchestrator Profiling: %" PRId64 " tasks, total=%.3fus ===", thread_idx,
-                static_cast<int64_t>(p.submit_count), cycles_to_us(total)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   task+heap_alloc: %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "",
-                thread_idx, cycles_to_us(p.alloc_cycle), p.alloc_cycle * 100.0 / total,
-                cycles_to_us(p.alloc_cycle - p.alloc_wait_cycle), cycles_to_us(p.alloc_wait_cycle),
-                static_cast<uint64_t>(p.alloc_atomic_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   sync_tensormap : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.sync_cycle),
-                p.sync_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   lookup+dep     : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.lookup_cycle),
-                p.lookup_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   tensormap_ins  : %.3fus (%.1f%%)", thread_idx, cycles_to_us(p.insert_cycle),
-                p.insert_cycle * 100.0 / total
-            );
-            LOG_INFO_V9(
-                "Thread %d:   param_copy     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-                cycles_to_us(p.args_cycle), p.args_cycle * 100.0 / total, static_cast<uint64_t>(p.args_atomic_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   fanin+ready    : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus", thread_idx,
-                cycles_to_us(p.fanin_cycle), p.fanin_cycle * 100.0 / total,
-                cycles_to_us(p.fanin_cycle - p.fanin_wait_cycle), cycles_to_us(p.fanin_wait_cycle)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   avg/task       : %.3fus", thread_idx,
-                p.submit_count > 0 ? cycles_to_us(total) / p.submit_count : 0.0
-            );
-
-#if PTO2_TENSORMAP_PROFILING
-            PTO2TensorMapProfilingData tp = pto2_tensormap_get_profiling();
-            LOG_INFO_V9("Thread %d: === TensorMap Lookup Stats ===", thread_idx);
-            LOG_INFO_V9(
-                "Thread %d:   lookups        : %" PRIu64 ", inserts: %" PRIu64 "", thread_idx,
-                static_cast<uint64_t>(tp.lookup_count), static_cast<uint64_t>(tp.insert_count)
-            );
-            LOG_INFO_V9(
-                "Thread %d:   chain walked   : total=%" PRIu64 ", avg=%.1f, max=%d", thread_idx,
-                static_cast<uint64_t>(tp.lookup_chain_total),
-                tp.lookup_count > 0 ? static_cast<double>(tp.lookup_chain_total) / tp.lookup_count : 0.0,
-                tp.lookup_chain_max
-            );
-            LOG_INFO_V9(
-                "Thread %d:   overlap checks : %" PRIu64 ", hits=%" PRIu64 " (%.1f%%)", thread_idx,
-                static_cast<uint64_t>(tp.overlap_checks), static_cast<uint64_t>(tp.overlap_hits),
-                tp.overlap_checks > 0 ? tp.overlap_hits * 100.0 / tp.overlap_checks : 0.0
-            );
-#endif
-#endif  // PTO2_ORCH_PROFILING
-
-            // Latch task count from PTO2 shared memory to hand off to the
-            // scheduler. The orchestrator's run window (start_time / end_time /
-            // submit_count) is no longer published to shared memory — the
-            // device LOG_INFO_V9 "orch_start=… orch_end=… orch_cost=…" line
-            // below carries the same envelope info for debugging, and
-            // host-side swimlane derives per-phase timing from the per-event
-            // L2SwimlaneAicpuSchedPhaseRecord[] + L2SwimlaneAicpuOrchPhaseRecord[]
-            // streams that already cover everything inside submit_task().
-            int32_t total_tasks = 0;
-            if (rt->orchestrator.sm_header) {
-                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-                    total_tasks +=
-                        rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-                }
-            }
 
-#if PTO2_PROFILING
-            pto2_submitted_tasks = total_tasks;
-#endif
+            int32_t total_tasks = 0;
+            if (rt->orchestrator.sm_header)
+                for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) total_tasks += rt->orchestrator.sm_header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
 
             // Signal completion to the orchestrator state machine
             rt_orchestration_done(rt);
 
-            sched_ctx_.on_orchestration_done(runtime, rt, thread_idx, total_tasks);
-        }
-#if PTO2_PROFILING
-        uint64_t orch_end_ts = get_sys_cnt_aicpu();
-        LOG_INFO_V9(
-            "Thread %d: orch_start=%" PRIu64 " orch_end=%" PRIu64 " orch_cost=%.3fus", thread_idx,
-            static_cast<uint64_t>(orch_cycle_start), static_cast<uint64_t>(orch_end_ts),
-            cycles_to_us(orch_end_ts - orch_cycle_start)
-        );
-        if (pto2_submitted_tasks >= 0) {
-            LOG_INFO_V9(
-                "PTO2 total submitted tasks = %d, already executed %d tasks", pto2_submitted_tasks,
-                sched_ctx_.completed_tasks_count()
-            );
+            sched_ctx_.on_orchestration_done(runtime, rt, total_tasks);
         }
-#endif
-        LOG_INFO_V0("Thread %d: Orchestrator completed", thread_idx);
     }
 
     // Scheduler thread (orchestrator threads skip dispatch when orch_to_sched_ is false)
-    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_)) {
+    if (!sched_ctx_.is_completed() && (thread_idx < sched_thread_num_ || orch_to_sched_))
+    {
         // Device orchestration: wait for the primary orchestrator to initialize the SM header
-        while (!runtime_init_ready_.load(std::memory_order_acquire)) {
-            SPIN_WAIT_HINT();
-        }
-        if (rt == nullptr) {
-            LOG_ERROR("Thread %d: rt is null after orchestrator error, skipping dispatch", thread_idx);
-        } else {
+        while (!runtime_init_ready_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+        if (rt == nullptr)
+        {}
+        else
+        {
             sched_ctx_.bind_runtime(rt);
             int32_t completed = sched_ctx_.resolve_and_dispatch(runtime, thread_idx);
-            if (completed < 0) {
-                LOG_ERROR("Thread %d: Scheduler failed with rc=%d", thread_idx, completed);
+            if (completed < 0)
+            {
                 run_rc = completed;
-            } else {
-                LOG_INFO_V0("Thread %d: Executed %d tasks from runtime", thread_idx, completed);
             }
+            else
+            {}
         }
     }
 
-    // Always shutdown AICore — even if sched_ctx_.completed_ was already true.
-    // platform_deinit_aicore_regs is idempotent; orchestrator threads have
-    // core_trackers_[thread_idx].core_num() == 0 so they skip the loop harmlessly.
     int32_t shutdown_rc = sched_ctx_.shutdown(thread_idx);
-    if (shutdown_rc != 0 && run_rc == 0) {
-        run_rc = shutdown_rc;
-    }
-
-    LOG_INFO_V0("Thread %d: Completed", thread_idx);
+    if (shutdown_rc != 0 && run_rc == 0) run_rc = shutdown_rc;
 
     // Check if this is the last thread to finish
     int32_t prev_finished = finished_count_.fetch_add(1, std::memory_order_acq_rel);
-    if (prev_finished + 1 == aicpu_thread_num_) {
+    if (prev_finished + 1 == aicpu_thread_num_)
+    {
         finished_.store(true, std::memory_order_release);
-        // Destroy PTO2 runtime. sm_handle / rt are recreated every run so we
-        // always tear them down here, but we keep the per-cid orch SO entries
-        // alive for the next run's cache-hit reuse (see run() reload_so branch).
-        if (rt != nullptr) {
+        if (rt != nullptr)
+        {
             // Clear g_current_runtime in this DSO and in the orchestration SO before destroying rt.
             const int32_t callable_id = runtime->get_active_callable_id();
             framework_bind_runtime(nullptr);
-            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS) {
+            if (callable_id >= 0 && callable_id < MAX_REGISTERED_CALLABLE_IDS)
+            {
                 DeviceOrchestrationBindRuntimeFunc bind = orch_so_table_[callable_id].bind;
-                if (bind != nullptr) {
-                    bind(nullptr);
-                }
+                if (bind != nullptr) bind(nullptr);
             }
-            runtime_destroy(rt, runtime_arena_);
+            runtime_destroy(rt);
             rt = nullptr;
         }
     }
@@ -756,10 +498,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
     return run_rc;
 }
 
-void AicpuExecutor::deinit(Runtime *runtime) {
-    // 1. Invalidate AICPU cache for Runtime address range.
-    //    Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but
-    //    bypasses this cache. Invalidating now ensures next round reads from HBM.
+void AicpuExecutor::deinit(Runtime *runtime)
+{
     cache_invalidate_range(runtime, sizeof(Runtime));
 
     // Reset all SchedulerContext-owned state in one place.
@@ -773,9 +513,6 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     orch_to_sched_ = false;
 
     orch_args_cached_ = nullptr;
-    // orch_so_table_ entries are intentionally preserved across deinit: the
-    // next run reuses cached handles when register_new_callable_id() returns
-    // false. The destructor releases them at process teardown.
 
     // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit)
     rt = nullptr;
@@ -783,71 +520,36 @@ void AicpuExecutor::deinit(Runtime *runtime) {
     // Clear dep_gen file-local bookkeeping. No-op when dep_gen is disabled.
     dep_gen_aicpu_finalize();
 
-    LOG_INFO_V0("DeInit: Runtime execution state reset");
-
     initialized_.store(false, std::memory_order_release);
     init_done_.store(false, std::memory_order_release);
     init_failed_.store(false, std::memory_order_release);
     thread_idx_.store(0, std::memory_order_release);
     finished_.store(false, std::memory_order_release);
-
-    LOG_INFO_V0("DeInit: AicpuExecutor reset complete");
 }
 
 // ===== Public Entry Point =====
 
-/**
- * aicpu_execute - Main AICPU kernel execution entry point
- *
- * This is called by DynTileFwkBackendKernelServer in kernel.cpp.
- * Orchestrates the complete task runtime execution:
- * 1. Initialize executor (thread-safe, first thread only)
- * 2. Wait for initialization to complete
- * 3. Execute tasks on managed cores
- * 4. Cleanup when last thread finishes
- *
- * @param runtime Pointer to Runtime structure
- * @return 0 on success, non-zero on error
- */
-extern "C" int32_t aicpu_execute(Runtime *runtime) {
-    if (runtime == nullptr) {
-        LOG_ERROR("%s", "Invalid argument: null Runtime pointer");
-        return -1;
-    }
-
-    LOG_INFO_V0("%s", "aicpu_execute: Starting AICPU kernel execution");
+extern "C" int32_t aicpu_execute(Runtime *runtime)
+{
+    if (runtime == nullptr) return -1;
 
     g_aicpu_executor.init(runtime);
 
-    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire)) {
-        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) {
-            LOG_ERROR("%s", "aicpu_execute: Initialization failed, aborting execution");
-            return -1;
-        }
-    }
+    while (!g_aicpu_executor.init_done_.load(std::memory_order_acquire))
+        if (g_aicpu_executor.init_failed_.load(std::memory_order_acquire)) return -1;
 
     int32_t rc = g_aicpu_executor.run(runtime);
-    if (rc != 0) {
-        LOG_ERROR("aicpu_execute: Thread execution failed with rc=%d", rc);
-    }
+    if (rc != 0)
+    {}
 
     int32_t runtime_rc = read_pto2_runtime_status(runtime);
 
     // Last thread cleans up
-    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) {
-        LOG_INFO_V0("aicpu_execute: Last thread finished, cleaning up");
-        g_aicpu_executor.deinit(runtime);
-    }
+    if (g_aicpu_executor.finished_.load(std::memory_order_acquire)) g_aicpu_executor.deinit(runtime);
 
-    if (runtime_rc != 0) {
-        LOG_ERROR("aicpu_execute: PTO2 runtime failed with rc=%d", runtime_rc);
-        return runtime_rc;
-    }
+    if (runtime_rc != 0) return runtime_rc;
 
-    if (rc != 0) {
-        return rc;
-    }
+    if (rc != 0) return rc;
 
-    LOG_INFO_V0("%s", "aicpu_execute: Kernel execution completed successfully");
     return 0;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h b/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
index 768e6a612..ba83a8b5c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/common/intrinsic.h
@@ -63,7 +63,7 @@
  *     compiled, ran without error, and produced wrong output. Use
  *     `get_sub_block_id(args)` instead, which reads from the runtime's
  *     `GlobalContext.sub_block_id` that the scheduler initializes per
- *     AIV core in `scheduler_cold_path.cpp::SchedulerContext::init`.
+ *     AIV core in `scheduler_context.h::SchedulerContext::init`.
  *
  *   - `get_block_idx()` and `get_block_num()` are not redirected to
  *     simpler's LocalContext either — use the `(args)` variants below
@@ -97,7 +97,7 @@ static constexpr int32_t PTO2_EXT_PARAMS_COUNT = 2;
 
 /**
  * Args[] suffix indices for context pointers.
- * Derived from MAX_TENSOR_ARGS(32) + MAX_SCALAR_ARGS(16).
+ * Derived from MAX_TENSOR_ARGS(16) + MAX_SCALAR_ARGS(32).
  * Users should not depend on these values; use the Get* functions below.
  */
 static constexpr int32_t SPMD_LOCAL_CONTEXT_INDEX = 48;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
index c32a73dc0..0ec9b155f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/MULTI_RING.md
@@ -179,8 +179,9 @@ Each ring's `last_task_alive` advances independently:
 
 ```text
 advance_ring_pointers(ring_id):  // protected by per-ring advance_lock
-    la = ring->fc.last_task_alive
-    while ring->get_slot_state_by_task_id(la).task_state >= CONSUMED:
+    watermark = ring->completed_watermark
+    la = last_task_alive
+    while la <= watermark and watermark >= slot[la].last_consumer_local_id:
         reset slot for reuse
         la++
     sync_to_sm()  // release-store last_task_alive
@@ -235,30 +236,9 @@ AICore uses `last_reg_val` to detect new dispatches — identical values cause s
 | `PTO2_HEAP_SIZE` | 256 MB | 1 GB |
 | `PTO2_DEP_LIST_POOL_SIZE` | 16384 | 65536 |
 
-### 7.2 Runtime Overrides
-
-Precedence per value: **per-task `CallConfig` field > `PTO2_RING_*` env var
-> compile-time default**. Uniform across all rings of that task's runtime.
-
-Per-task via `CallConfig.runtime_env` — different L2 tasks in one launch can
-each carry their own sizes. Invalid values raise at submit time (`validate()`):
-
-```python
-cfg = CallConfig()
-cfg.runtime_env.ring_task_window = 128   # power of 2, >= 4
-cfg.runtime_env.ring_heap = 262144       # bytes/ring, power of 2, >= 1024
-cfg.runtime_env.ring_dep_pool = 256      # 4 .. INT32_MAX
-orchestrator.submit_next_level(handle, args, cfg)
-```
+### 7.2 Runtime Environment Overrides
 
-Scene tests set the same keys under a nested `runtime_env` block in the
-per-case `config` dict:
-
-```python
-"config": {"runtime_env": {"ring_task_window": 128, "ring_heap": 262144, "ring_dep_pool": 256}}
-```
-
-Process-wide env fallback (invalid values are silently ignored):
+Uniform (applies to all rings):
 
 ```bash
 PTO2_RING_TASK_WINDOW=1024
@@ -266,6 +246,16 @@ PTO2_RING_HEAP=1048576
 PTO2_RING_DEP_POOL=1024
 ```
 
+In `kernel_config.py`:
+
+```python
+RUNTIME_ENV = {
+    "PTO2_RING_TASK_WINDOW": "128",
+    "PTO2_RING_HEAP": "262144",
+    "PTO2_RING_DEP_POOL": "256",
+}
+```
+
 ### 7.3 Sizing Guidelines
 
 - `task_window` must be ≥ max tasks in any single scope + headroom for concurrent scopes
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
index a3cc143c6..be0a6e9e1 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/RUNTIME_LOGIC.md
@@ -538,7 +538,7 @@ This is protected by a per-ring try-lock (`advance_lock`) in `RingSchedState`, e
 
 ### 8.5 SchedulerContext
 
-All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and the orchestration SO handle, and delegates everything else to `SchedulerContext`.
+All scheduler-side state and methods live in `SchedulerContext` (`runtime/scheduler_context.h`). It is held as a `sched_ctx_` member of `AicpuExecutor`; `AicpuExecutor` is a thin wrapper that owns the lifecycle atomics and the orchestration SO handle, and delegates everything else to `SchedulerContext`.
 
 Public surface (called from `AicpuExecutor::init/run/deinit`):
 
@@ -552,11 +552,7 @@ Public surface (called from `AicpuExecutor::init/run/deinit`):
 | `deinit()` | once per run | Reset every scheduler-owned field to its post-construction default |
 | Read-only accessors | various | `aic_count()` / `aiv_count()` / `is_completed()` / `completed_tasks_count()` / `wait_pto2_init_complete()` |
 
-Private internals are split across three .cpp files by responsibility:
-
-- `scheduler_completion.cpp` — completion polling, drain protocol
-- `scheduler_dispatch.cpp` — task dispatch loop and helpers
-- `scheduler_cold_path.cpp` — exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`
+Private internals all live inline in `scheduler_context.h`, covering completion polling, drain protocol, task dispatch loop and helpers, exit checks, stall diagnostics, profiling, lifecycle (`init/deinit`), core management (`handshake_all_cores` / `assign_cores_to_threads` / `reassign_cores_for_all_threads` / `emergency_shutdown`), and `on_orchestration_done`.
 
 `AicpuExecutor` calls neither `handshake_*`, `assign_*`, `reassign_*`, nor `emergency_shutdown` directly — they are private, invoked only by `init` and `on_orchestration_done`.
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
index bd93f87da..846cdf377 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/SCALAR_DATA_ACCESS.md
@@ -32,7 +32,7 @@ addr null-check → TensorMap lookup → spin-wait producer COMPLETED → comput
 
 - **addr null-check**: `buffer.addr == 0` means unallocated — log error, return 0
 - **TensorMap lookup**: find producer task by `buffer.addr`
-- **spin-wait**: wait until producer `task_state >= PTO2_TASK_COMPLETED`
+- **spin-wait**: wait until producer's `completion_flags[local_id & mask] == 1`
 - **No producer** (lookup callback never fires): skip waiting, read immediately
 
 ### 3.2 set_tensor_data Flow
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
index af661d440..a5aa05bdd 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/device_log_profiling.md
@@ -52,7 +52,7 @@ Thread 3: PTO2 total submitted tasks = 16704
 
 ### Field Reference
 
-| Field | Source (`pto_orchestrator.cpp`) | Description |
+| Field | Source (`pto_orchestrator.h`) | Description |
 | ----- | ------------------------------- | ----------- |
 | **cost** | Wall-clock around `orch_func()` call | Total time including orchestration logic + scope overhead |
 | **total** | Sum of all sub-steps below | Accumulated time inside `submit_task` across all tasks |
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
index bd669f365..df938ddfa 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/docs/profiling_levels.md
@@ -48,7 +48,7 @@ Each sub-level macro requires `PTO2_PROFILING=1`:
 
 - Debug/diagnostic logs (always present)
 - Progress tracking (`PTO2 progress: completed=...`)
-- Stall detection and dump (triggered after the `SCHEDULER_TIMEOUT_MS` wall-clock no-progress budget)
+- Stall detection and dump (triggered only after `MAX_IDLE_ITERATIONS` idle loops)
 - Deadlock/livelock detection (`diagnose_stuck_state`, called on stall)
 
 **What's NOT compiled:**
@@ -278,7 +278,7 @@ Identity fields the AICPU side used to write at level 1 (`func_id`,
   collector (`L2SwimlaneCollector::set_core_types`).
 
 AICore buffer rotation no longer piggy-backs on `complete_task`. AICPU
-counts dispatches per core in the dispatch path (scheduler_dispatch in
+counts dispatches per core in the dispatch path (scheduler_context in
 tensormap_and_ringbuffer; aicpu_executor in host_build_graph) and rotates
 the AICore buffer when the count is about to cross a
 `PLATFORM_AICORE_BUFFER_SIZE` boundary — strictly before
@@ -451,7 +451,7 @@ definitions to runtime headers.
 ### Code Locations
 
 - Macro defaults and validation: `src/common/task_interface/profiling_config.h`
-- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp` and `scheduler_cold_path.cpp`
+- Scheduler profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h`
 - Orchestrator profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp`
 - TensorMap profiling: `src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h`
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
index 4520ad473..f26bfadeb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/dep_gen_replay.cpp
@@ -554,7 +554,7 @@ dep_gen_replay_emit_deps_json(const DepGenRecord *records, size_t num_records, c
         // `explicit_dep_count` / `over->dep_count` originate from device
         // shared memory and are bounded by the writer to the array sizes, but
         // we clamp on read too so a corrupted record never drives an OOB read
-        // off the end of rec.explicit_deps[64] / over->deps[582].
+        // off the end of rec.explicit_deps[64] / over->deps[326].
         const uint64_t *deps_data;
         int32_t dc;
         if (rec.flags & DEP_GEN_FLAG_HAS_OVERFLOW) {
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 0e121fe47..c0b407b83 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -15,14 +15,12 @@
  * Supports device orchestration where AICPU thread 3 runs the orchestrator.
  *
  * init_runtime_impl:
- *   - Converts host tensor pointers to device pointers (all inputs copied H2D;
- *     only OUTPUT/INOUT tensors are copied back D2H)
+ *   - Converts host tensor pointers to device pointers (all tensors copied both directions)
  *   - Copies orchestration SO to device memory
  *   - Sets up runtime state for device orchestration
  *
  * validate_runtime_impl:
- *   - Copies OUTPUT/INOUT tensors back from device to host (read-only inputs
- *     are skipped)
+ *   - Copies recorded tensors back from device to host
  *   - Frees device memory
  */
 
@@ -163,8 +161,8 @@ prepare_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
  * @return 0 on success, -1 on failure
  */
 extern "C" int bind_callable_to_runtime_impl(
-    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
-    int sig_count, uint64_t ring_task_window, uint64_t ring_heap, uint64_t ring_dep_pool
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr,
+    const ArgDirection * /*signature*/, int /*sig_count*/
 ) {
     if (runtime == nullptr) {
         LOG_ERROR("Runtime pointer is null");
@@ -210,32 +208,13 @@ extern "C" int bind_callable_to_runtime_impl(
             return -1;
         }
 
-        // Pure write-only OUTPUT buffers carry no meaningful host content, so
-        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
-        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
-        // rather than pooled-allocator garbage. INOUT (read-before-write)
-        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
-        // did not wire device_memset.
-        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
-        int rc;
-        if (is_pure_output && runtime->host_api.device_memset != nullptr) {
-            rc = runtime->host_api.device_memset(dev_ptr, 0, size);
-        } else {
-            rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
-        }
+        int rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
         if (rc != 0) {
-            LOG_ERROR("Failed to stage tensor %d to device", i);
+            LOG_ERROR("Failed to copy tensor %d to device", i);
             runtime->host_api.device_free(dev_ptr);
             return -1;
         }
-        // Read-only INPUT tensors are never written by the kernel, so there is
-        // no point copying them back D2H at the end. Index the signature
-        // by the orch tensor index `i` (child_memory tensors are skipped above
-        // but do not consume a separate signature slot — scalars follow the
-        // tensor entries). Anything not provably IN keeps the safe default of
-        // copying back.
-        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
-        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size});
         LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
 
         t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
@@ -255,13 +234,11 @@ extern "C" int bind_callable_to_runtime_impl(
         LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->orch_to_sched ? "enabled" : "disabled");
     }
 
-    // Ring buffer size overrides: per-task CallConfig value wins over the
-    // env var; both fall back to the compile-time default when zero.
+    // Read ring buffer size overrides from environment
     {
-        runtime->task_window_size =
-            ring_task_window ? ring_task_window : parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true);
-        runtime->heap_size = ring_heap ? ring_heap : parse_env_uint64("PTO2_RING_HEAP", 1024, true);
-        runtime->dep_pool_size = ring_dep_pool ? ring_dep_pool : parse_env_uint64("PTO2_RING_DEP_POOL", 4, false);
+        runtime->task_window_size = parse_env_uint64("PTO2_RING_TASK_WINDOW", 4, true);
+        runtime->heap_size = parse_env_uint64("PTO2_RING_HEAP", 1024, true);
+        runtime->dep_pool_size = parse_env_uint64("PTO2_RING_DEP_POOL", 4, false);
         if (runtime->task_window_size || runtime->heap_size || runtime->dep_pool_size) {
             LOG_INFO_V0(
                 "Ring buffer overrides: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%" PRIu64,
@@ -454,14 +431,6 @@ extern "C" int validate_runtime_impl(Runtime *runtime) {
                 continue;
             }
 
-            // Read-only INPUT tensors were uploaded H2D but the kernel never
-            // wrote them — copying them back (potentially ~GB) is pure waste.
-            // They are still device_free'd in the cleanup loop below.
-            if (!pair.needs_copy_back) {
-                LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i);
-                continue;
-            }
-
             void *src_ptr = pair.dev_ptr;
             size_t copy_size = pair.size;
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
index 0a6ab5664..13b4af4fb 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
@@ -11,174 +11,20 @@
 #include "common.h"
 #include "pto_orchestration_api.h"
 
-#ifdef __linux__
-#include <cxxabi.h>
-#include <dlfcn.h>
-#include <execinfo.h>
-#include <unistd.h>
-
-#include <array>
-#include <cstring>
-#include <vector>
-#endif
-
 struct PTO2Runtime;
 
 namespace {
-// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
-// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
-// between execution rounds.  All orchestrator threads bind the same rt
-// value, so per-thread storage is unnecessary.
 PTO2Runtime *g_current_runtime = nullptr;
 }  // namespace
 
-extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt)
+{
     g_current_runtime = rt;
 }
 
 // Keep current_runtime local to this .so so orchestration helpers do not
 // accidentally bind to the AICPU binary's same-named symbol.
-extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
-
-/**
- * Use addr2line to convert an address to file:line information.
- * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
- * If inlining is present, also returns the outer call chain via inline_chain.
- */
-#ifdef __linux__
-static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
-    char cmd[512];
-    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
-
-    std::array<char, 256> buffer;
-    std::string raw_output;
-
-    FILE *pipe = popen(cmd, "r");
-    if (pipe) {
-        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
-            raw_output += buffer.data();
-        }
-        pclose(pipe);
-    }
-
-    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
-        return "";
-    }
-
-    // Split by lines
-    std::vector<std::string> lines;
-    size_t pos = 0;
-    while (pos < raw_output.size()) {
-        size_t nl = raw_output.find('\n', pos);
-        if (nl == std::string::npos) nl = raw_output.size();
-        std::string line = raw_output.substr(pos, nl - pos);
-        while (!line.empty() && line.back() == '\r')
-            line.pop_back();
-        if (!line.empty()) lines.push_back(line);
-        pos = nl + 1;
-    }
-
-    if (lines.empty()) return "";
-
-    // First line is the innermost actual code location; subsequent lines are outer inline callers
-    if (inline_chain && lines.size() > 1) {
-        *inline_chain = "";
-        for (size_t j = 1; j < lines.size(); j++) {
-            *inline_chain += "    [inlined by] " + lines[j] + "\n";
-        }
-    }
-
-    return lines.front();
-}
-#endif
-
-/**
- * Get current stack trace information (including file paths and line numbers).
- * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
- */
-std::string get_stacktrace(int skip_frames) {
-    (void)skip_frames;  // May be unused on non-Linux platforms
-    std::string result;
-#ifdef __linux__
-    const int max_frames = 64;
-    void *buffer[max_frames];
-    int nframes = backtrace(buffer, max_frames);
-    char **symbols = backtrace_symbols(buffer, nframes);
-
-    if (symbols) {
-        result = "Stack trace:\n";
-        for (int i = skip_frames; i < nframes; i++) {
-            std::string frame_info;
-
-            void *addr = (void *)((char *)buffer[i] - 1);
-
-            Dl_info dl_info;
-            std::string inline_chain;
-            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
-                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
-                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
-
-                if (addr2line_result.empty()) {
-                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
-                }
-
-                if (!addr2line_result.empty()) {
-                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
-                }
-            }
-
-            if (frame_info.empty()) {
-                std::string frame(symbols[i]);
-
-                size_t start = frame.find('(');
-                size_t end = frame.find('+', start);
-                if (start != std::string::npos && end != std::string::npos) {
-                    std::string mangled = frame.substr(start + 1, end - start - 1);
-                    int status;
-                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
-                    if (status == 0 && demangled) {
-                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
-                        free(demangled);
-                    }
-                }
-                frame_info = frame;
-            }
-
-            char buf[16];
-            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
-            result += buf + frame_info + "\n";
-            if (!inline_chain.empty()) {
-                result += inline_chain;
-            }
-        }
-        free(symbols);
-    }
-#else
-    result = "(Stack trace is only available on Linux)\n";
-#endif
-    return result;
-}
-
-// AssertionError constructor
-static std::string build_assert_message(const char *condition, const char *file, int line) {
-    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
-    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
-    msg += get_stacktrace(3);
-    return msg;
-}
-
-AssertionError::AssertionError(const char *condition, const char *file, int line) :
-    std::runtime_error(build_assert_message(condition, file, line)),
-    condition_(condition),
-    file_(file),
-    line_(line) {}
-
-[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
-    LOG_ERROR("\n========================================");
-    LOG_ERROR("Assertion failed: %s", condition);
-    LOG_ERROR("Location: %s:%d", file, line);
-    LOG_ERROR("%s", get_stacktrace(2).c_str());
-    LOG_ERROR("========================================\n");
-
-    throw AssertionError(condition, file, line);
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime()
+{
+    return g_current_runtime;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h
index 5ea856487..0a289ef5e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_arg_with_deps.h
@@ -8,31 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Convenience layer over Arg: bundles a fixed-capacity dependency buffer with
- * an Arg and exposes an incremental add_dep(...) API on top of the runtime
- * primitive Arg::set_dependencies(ptr, count).
- *
- * Layering:
- *   - Primitive:   Arg + set_dependencies(ptr, count) in pto_types.h.
- *                  No cap, caller owns the deps buffer.
- *   - Convenience: ArgWithDeps<N> in this header. Owns a stack-sized dep
- *                  buffer of capacity N (default 16); provides add_dep().
- *                  Submitted via the rt_submit_*_task overloads below, which
- *                  forward the bundled deps into the underlying Arg.
- *
- * This file is auto-included at the bottom of pto_orchestration_api.h so
- * orchestration sources see ArgWithDeps after a single `#include
- * "pto_orchestration_api.h"`. The split is purely organizational —
- * orchestration code should not include this header directly. Code generated
- * from pypto can ignore the convenience layer entirely and target Arg +
- * set_dependencies(ptr, count) directly.
- *
- * ArgWithDeps uses private inheritance from Arg so that set_dependencies and
- * the explicit_dep* accessors are NOT reachable on a wrapper instance — users
- * who pick the convenience layer cannot accidentally mix it with the
- * primitive layer's dep API on the same object.
- */
 
 #pragma once
 
@@ -44,7 +19,8 @@
 #include "pto_orchestration_api.h"  // Arg, MixedKernels, rt_submit_* primitives
 
 template <size_t MAX_DEP_COUNT = 16>
-class ArgWithDeps : private Arg {
+class ArgWithDeps : private Arg
+{
 public:
     // Tensor / scalar setters — forward to Arg
     using Arg::add_inout;
@@ -64,50 +40,27 @@ class ArgWithDeps : private Arg {
     using Arg::launch_spec;
     using Arg::set_error;
 
-    // NOT exposed: set_dependencies, explicit_dep_count, explicit_dep,
-    // explicit_deps_data — these are the primitive-layer dep API. Users of
-    // the convenience layer reach dependencies only through add_dep() below.
-
-    /**
-     * Append one or more dependencies to the bundled buffer. May be called
-     * multiple times; deps accumulate. Variadic accepts any non-zero number
-     * of PTO2TaskId arguments.
-     *
-     * Overflow (more than MAX_DEP_COUNT total) records an error on the
-     * underlying Arg; the error surfaces at submit time.
-     */
     template <typename... Ids>
-    void add_dep(Ids... ids) {
+    void add_dep(Ids... ids)
+    {
         static_assert(sizeof...(Ids) >= 1, "add_dep: at least one task id is required");
-        static_assert(
-            (std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId"
-        );
-        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT) {
+        static_assert((std::is_same_v<std::decay_t<Ids>, PTO2TaskId> && ...), "add_dep: all arguments must be PTO2TaskId");
+        if (count_ + sizeof...(Ids) > MAX_DEP_COUNT)
+        {
             Arg::set_error("ArgWithDeps::add_dep: dep count exceeds MAX_DEP_COUNT (bump the template arg)");
             return;
         }
         ((deps_[count_++] = ids), ...);
     }
 
-    /**
-     * Clear the bundled dep buffer and reset the underlying Arg.
-     * Use this to recycle an ArgWithDeps across loop iterations.
-     */
-    void reset() {
+    void reset()
+    {
         Arg::reset();
         count_ = 0;
     }
 
-    /**
-     * Submit-only hook: bind the bundled deps onto the underlying Arg and
-     * return it as Arg&. Called by the rt_submit_*_task overloads below;
-     * orchestration code does not invoke this directly.
-     *
-     * Idempotent: explicitly clears any prior dep binding before re-setting,
-     * so a wrapper can be re-finalized (e.g. resubmitted) without tripping
-     * the primitive layer's single-shot check.
-     */
-    Arg &finalize_for_submit() {
+    Arg &finalize_for_submit()
+    {
         Arg::set_dependencies(nullptr, 0);
         Arg::set_dependencies(deps_, count_);
         return *this;
@@ -118,21 +71,20 @@ class ArgWithDeps : private Arg {
     uint32_t count_ = 0;
 };
 
-// =============================================================================
-// Submit overloads — accept ArgWithDeps<N> transparently
-// =============================================================================
-
 template <size_t N>
-static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, ArgWithDeps<N> &awd) {
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, ArgWithDeps<N> &awd)
+{
     return rt_submit_task(mixed_kernels, awd.finalize_for_submit());
 }
 
 template <size_t N>
-static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, ArgWithDeps<N> &awd) {
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, ArgWithDeps<N> &awd)
+{
     return rt_submit_aic_task(kernel_id, awd.finalize_for_submit());
 }
 
 template <size_t N>
-static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, ArgWithDeps<N> &awd) {
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, ArgWithDeps<N> &awd)
+{
     return rt_submit_aiv_task(kernel_id, awd.finalize_for_submit());
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
index 204b1d7ad..dbecd49f4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/pto_orchestration_api.h
@@ -8,21 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Orchestration API - Slim header for orchestration .so files
- *
- * This header provides everything an orchestration source needs without
- * pulling in runtime implementation headers.  The orchestration .so has
- * zero link dependencies on runtime .cpp files; all runtime calls go
- * through the PTO2RuntimeOps function-pointer table embedded in
- * PTO2Runtime.
- *
- * Orchestration sources include ONLY this header:
- *   #include "pto_orchestration_api.h"
- *
- * Runtime sources continue to use pto_runtime2.h (which defines the
- * full PTO2Runtime struct with all internal fields).
- */
 
 #pragma once
 
@@ -62,28 +47,12 @@ inline Tensor from_tensor_arg(const Tensor &t, bool manual_dep = false, int32_t
     return result;
 }
 
-// =============================================================================
-// Ops Table and Opaque Runtime
-// =============================================================================
-
-/**
- * Forward declaration — the orchestration sees PTO2Runtime as a partial
- * struct whose first field is the ops pointer.  The full definition
- * lives in pto_runtime2.h (used only by runtime .cpp files).
- */
 typedef struct PTO2Runtime PTO2Runtime;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/**
- * Framework-internal TLS bridge.
- *
- * The executor binds the current thread's runtime before invoking
- * aicpu_orchestration_entry(), so orchestration helpers can fetch the
- * current PTO2Runtime without explicit parameter threading.
- */
 PTO2Runtime *framework_current_runtime(void);
 void framework_bind_runtime(PTO2Runtime *rt);
 
@@ -91,11 +60,8 @@ void framework_bind_runtime(PTO2Runtime *rt);
 }
 #endif
 
-/**
- * Function-pointer table for runtime operations.
- * Populated by the runtime; called by orchestration through inline wrappers.
- */
-typedef struct PTO2RuntimeOps {
+typedef struct PTO2RuntimeOps
+{
     TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args);
     void (*scope_begin)(PTO2Runtime *rt);
     void (*scope_end)(PTO2Runtime *rt);
@@ -104,160 +70,119 @@ typedef struct PTO2RuntimeOps {
     void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
 
     // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
     void (*log_info_v)(const char *func, int v, const char *fmt, ...);
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
     uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
-    void (*set_tensor_data)(
-        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
-    );
+    void (*set_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
 
-    // Stash the call-site of the next PTO2ScopeGuard so the [ScopeStats]
-    // collector can log it. Always present to keep ops-table layout stable
-    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
     void (*scope_set_site)(const char *file, int line);
 } PTO2RuntimeOps;
 
-/**
- * Partial PTO2Runtime definition for orchestration.
- *
- * Exposes the ops pointer (for runtime calls) and pending_scope_mode
- * (read directly by inline scope wrappers).  The real struct (in
- * pto_runtime2.h) has the same first fields, so accessing them through
- * this definition is well-defined (C struct layout guarantee).
- */
-struct PTO2Runtime {
+struct PTO2Runtime
+{
     const PTO2RuntimeOps *ops;
     PTO2ScopeMode pending_scope_mode;
 };
 
-// =============================================================================
-// Inline Convenience Wrappers (call through ops table)
-// =============================================================================
-
-static inline PTO2Runtime *current_runtime() { return framework_current_runtime(); }
+static inline PTO2Runtime *current_runtime()
+{
+    return framework_current_runtime();
+}
 
-static inline TaskOutputTensors alloc_tensors(const Arg &args) {
+static inline TaskOutputTensors alloc_tensors(const Arg &args)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     return rt->ops->alloc_tensors(rt, args);
 }
 
-static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count) {
+static inline TaskOutputTensors alloc_tensors(const TensorCreateInfo create_infos[], uint32_t count)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     Arg args;
-    for (uint32_t i = 0; i < count; i++) {
-        args.add_output(create_infos[i]);
-    }
-    if (args.has_error) {
-        rt->ops->report_fatal(
-            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
+    for (uint32_t i = 0; i < count; i++) args.add_output(create_infos[i]);
+    if (args.has_error)
+    {
+        rt->ops->report_fatal(rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
         return TaskOutputTensors{};
     }
     return alloc_tensors(args);
 }
 
 template <typename... CIs>
-static inline TaskOutputTensors alloc_tensors(const CIs &...cis) {
+static inline TaskOutputTensors alloc_tensors(const CIs &...cis)
+{
     static_assert(sizeof...(cis) > 0, "alloc_tensors requires at least one TensorCreateInfo");
-    static_assert(
-        (std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...),
-        "alloc_tensors only accepts TensorCreateInfo arguments"
-    );
+    static_assert((std::is_same_v<std::decay_t<CIs>, TensorCreateInfo> && ...), "alloc_tensors only accepts TensorCreateInfo arguments");
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     Arg args;
     (args.add_output(cis), ...);
-    if (args.has_error) {
-        rt->ops->report_fatal(
-            rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
+    if (args.has_error)
+    {
+        rt->ops->report_fatal(rt, PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
         return TaskOutputTensors{};
     }
     return alloc_tensors(args);
 }
 
-static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const Arg &args) {
+static inline TaskOutputTensors rt_submit_task(const MixedKernels &mixed_kernels, const Arg &args)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     return rt->ops->submit_task(rt, mixed_kernels, args);
 }
 
-/**
- * Convenience wrapper: submit an AIC-only task.
- */
-static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const Arg &args) {
+static inline TaskOutputTensors rt_submit_aic_task(int32_t kernel_id, const Arg &args)
+{
     MixedKernels mk;
     mk.aic_kernel_id = kernel_id;
     return rt_submit_task(mk, args);
 }
 
-/**
- * Convenience wrapper: submit an AIV-only task (uses AIV0 slot).
- */
-static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const Arg &args) {
+static inline TaskOutputTensors rt_submit_aiv_task(int32_t kernel_id, const Arg &args)
+{
     MixedKernels mk;
     mk.aiv0_kernel_id = kernel_id;
     return rt_submit_task(mk, args);
 }
 
-/**
- * Submit a dependency-only task. Accepts the same Arg shape as rt_submit_task
- * (inputs, outputs, inouts, explicit_deps, scalars) but does not run any
- * AICore kernel. The task still participates in the dependency graph: it
- * waits on its fanin and notifies its fanout. Useful as a synchronization
- * barrier or as a placeholder producer for tests / dep-graph wiring.
- */
-static inline TaskOutputTensors rt_submit_dummy_task(const Arg &args) {
+static inline TaskOutputTensors rt_submit_dummy_task(const Arg &args)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return TaskOutputTensors{};
-    }
+    if (rt->ops->is_fatal(rt)) return TaskOutputTensors{};
     return rt->ops->submit_dummy_task(rt, args);
 }
 
-static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO) {
+static inline void rt_scope_begin(PTO2ScopeMode mode = PTO2ScopeMode::AUTO)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return;
-    }
+    if (rt->ops->is_fatal(rt)) return;
     rt->pending_scope_mode = mode;
     rt->ops->scope_begin(rt);
 }
 
-static inline void rt_scope_end() {
+static inline void rt_scope_end()
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return;
-    }
+    if (rt->ops->is_fatal(rt)) return;
     rt->ops->scope_end(rt);
 }
 
-static inline void rt_orchestration_done() {
+static inline void rt_orchestration_done()
+{
     PTO2Runtime *rt = current_runtime();
     rt->ops->orchestration_done(rt);
 }
 
-static inline bool rt_is_fatal() {
+static inline bool rt_is_fatal()
+{
     PTO2Runtime *rt = current_runtime();
     return rt->ops->is_fatal(rt);
 }
@@ -268,111 +193,40 @@ static inline bool rt_is_fatal() {
         _rt->ops->report_fatal(_rt, (code), __FUNCTION__, (fmt), ##__VA_ARGS__); \
     } while (0)
 
-// =============================================================================
-// Logging Macros for Orchestration (call through ops table)
-// =============================================================================
-
-#define LOG_ERROR(fmt, ...) current_runtime()->ops->log_error(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_WARN(fmt, ...) current_runtime()->ops->log_warn(__FUNCTION__, fmt, ##__VA_ARGS__)
-#define LOG_DEBUG(fmt, ...) current_runtime()->ops->log_debug(__FUNCTION__, fmt, ##__VA_ARGS__)
-
 // INFO verbosity tiers. v=0 most verbose, v=9 must-see, v=5 default.
-#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
-#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
 
-// =============================================================================
-// Cross-Layer Data Access
-// =============================================================================
-
-/**
- * Read a value from a tensor at the given multi-dimensional indices.
- *
- * Default T = uint64_t preserves old behavior (raw bits).
- * Specify T to get automatic type conversion:
- *
- *   uint64_t raw = get_tensor_data(tensor, 1, idx);       // old usage unchanged
- *   float val = get_tensor_data<float>(tensor, 1, idx);   // typed read
- *
- * If the tensor has a producer in TensorMap, spin-waits until the producer
- * task completes before reading. External tensors (make_tensor_external)
- * are read immediately without waiting.
- */
 template <typename T = uint64_t>
-static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
+static inline T get_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[])
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return from_u64<T>(0);
-    }
+    if (rt->ops->is_fatal(rt)) return from_u64<T>(0);
     return from_u64<T>(rt->ops->get_tensor_data(rt, tensor, ndims, indices));
 }
 
-/**
- * Write a value to a tensor at the given multi-dimensional indices.
- *
- * Type is deduced from value argument; uint64_t by default:
- *
- *   set_tensor_data(tensor, 1, idx, raw_u64);     // old usage unchanged
- *   set_tensor_data(tensor, 1, idx, 42.0f);       // typed write (T = float)
- *
- * If the tensor has a producer in TensorMap, spin-waits until the producer
- * and all its consumers complete before writing (WAW + WAR safety).
- * External tensors (make_tensor_external) with no TensorMap entry are
- * written immediately without waiting.
- *
- * Limitation: TensorMap only tracks producers (OUTPUT/INOUT), not consumers
- * that used the tensor as INPUT. If a kernel reads this tensor as INPUT
- * (not INOUT) and the tensor has no TensorMap producer entry, set_tensor_data
- * cannot detect the reader and may cause a data race.
- *
- * To ensure WAR safety for all access patterns, use add_inout() instead of
- * add_input() for kernel parameters that may later be written via
- * set_tensor_data. INOUT creates a TensorMap entry that enables automatic
- * consumer tracking via fanout_refcount.
- *
- * The tensor must already have an allocated buffer (addr != 0).
- * For runtime-created outputs, call this only on the Tensor returned by
- * add_output(TensorCreateInfo) after submit returns.
- */
 template <typename T = uint64_t>
-static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value) {
+static inline void set_tensor_data(const Tensor &tensor, uint32_t ndims, const uint32_t indices[], T value)
+{
     PTO2Runtime *rt = current_runtime();
-    if (rt->ops->is_fatal(rt)) {
-        return;
-    }
+    if (rt->ops->is_fatal(rt)) return;
     rt->ops->set_tensor_data(rt, tensor, ndims, indices, to_u64(value));
 }
 
-// =============================================================================
-// C++ Scope Guards and Macros
-// =============================================================================
-
-/**
- * RAII Scope Guard (calls through ops table)
- */
-class PTO2ScopeGuard {
+class PTO2ScopeGuard
+{
 public:
-    explicit PTO2ScopeGuard(
-        PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()
-    ) :
-        rt_(current_runtime()) {
-        if (!rt_->ops->is_fatal(rt_)) {
+    explicit PTO2ScopeGuard(PTO2ScopeMode mode = PTO2ScopeMode::AUTO, const char *file = __builtin_FILE(), int line = __builtin_LINE()) :
+        rt_(current_runtime())
+    {
+        if (!rt_->ops->is_fatal(rt_))
+        {
             rt_->pending_scope_mode = mode;
             if (rt_->ops->scope_set_site) rt_->ops->scope_set_site(file, line);
             rt_->ops->scope_begin(rt_);
         }
     }
-    ~PTO2ScopeGuard() {
-        if (!rt_->ops->is_fatal(rt_)) {
-            rt_->ops->scope_end(rt_);
-        }
+    ~PTO2ScopeGuard()
+    {
+        if (!rt_->ops->is_fatal(rt_)) rt_->ops->scope_end(rt_);
     }
 
 private:
@@ -384,34 +238,28 @@ class PTO2ScopeGuard {
 
 #define PTO2_SCOPE_GUARD() [[maybe_unused]] PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__)
 
-/**
- * Scoped block macro:
- *   PTO2_SCOPE() {
- *       rt_submit_task(...);
- *   }
- */
 #define PTO2_SCOPE(...) if (PTO2ScopeGuard _PTO2_CONCATENATE(scope_guard_, __COUNTER__){__VA_ARGS__}; true)
 
-// =============================================================================
-// Orchestration Config
-// =============================================================================
+// User-orchestration logging macros. Route through the runtime's ops table so
+// the verbosity gating (V0..V9) and the actual logging sink stay owned by the
+// runtime. The orchestration .so just calls — gating is done inside.
+#define LOG_INFO_V0(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 0, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V1(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 1, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V2(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 2, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V3(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 3, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V4(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 4, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V5(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 5, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V6(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 6, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V7(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 7, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V8(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 8, fmt, ##__VA_ARGS__)
+#define LOG_INFO_V9(fmt, ...) current_runtime()->ops->log_info_v(__FUNCTION__, 9, fmt, ##__VA_ARGS__)
 
-/**
- * Configuration exported by orchestration .so via aicpu_orchestration_config().
- * The executor reads these values to set up shared memory and runtime.
- *
- * This struct is defined identically in pto_runtime2.h (with an include
- * guard) so the executor can use the same type without including this header.
- */
 #ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
 #define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
+struct PTO2OrchestrationConfig
+{
     int expected_arg_count;
 };
 #endif
 
-// Convenience layer (ArgWithDeps<N> + matching rt_submit_*_task overloads).
-// Pulled in at the bottom so the wrapper sees Arg, MixedKernels, and the
-// rt_submit_*_task primitives defined above. Orchestration sources include
-// only this single header to access both the primitive and convenience APIs.
 #include "pto_arg_with_deps.h"  // NOLINT(build/include_subdir)
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
index 0f73a043a..d2eb173c2 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox.h
@@ -19,21 +19,10 @@
 #include "pto_constants.h"
 #include "pto_task_id.h"
 
-// AICPU-only MPSC ring used to convey deferred-completion observations from
-// FIN-handling scheduler threads to the dispatch thread. Producers push under
-// CAS on `head`; the single consumer (dispatch thread, under AsyncWaitList::
-// busy) drains in seq order. Kernel-side code never touches this struct —
-// AICore writes go into DeferredCompletionSlab (see
-// aicore_completion_mailbox_types.h), which the FIN thread reads, flattens
-// into messages here, and forwards.
-
 #define AICORE_COMPLETION_MAILBOX_CAPACITY 4096u
 #define AICORE_COMPLETION_MAILBOX_MASK (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)
 
-static_assert(
-    (AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0,
-    "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two"
-);
+static_assert((AICORE_COMPLETION_MAILBOX_CAPACITY & (AICORE_COMPLETION_MAILBOX_CAPACITY - 1u)) == 0, "AICORE_COMPLETION_MAILBOX_CAPACITY must be a power of two");
 
 // Mailbox message discriminator. CONDITION carries one deferred-completion
 // observation flattened from a DeferredCompletionEntry. TASK_NORMAL_DONE
@@ -45,16 +34,10 @@ static_assert(
 #define MSG_KIND_CONDITION 0u
 #define MSG_KIND_TASK_NORMAL_DONE 1u
 
-struct AICoreCompletionMailboxMessage {
-    // Per-slot ready flag. Producer publishes `tail+1` after filling the rest
-    // of the slot with a release store; consumer waits for the matching seq
-    // value with an acquire load. The release-acquire pair publishes all
-    // other fields below as a side effect, so they stay plain.
+struct AICoreCompletionMailboxMessage
+{
     std::atomic<uint64_t> seq;
     PTO2TaskId task_token;
-    // CONDITION: completion observation addr (counter / SDMA event record).
-    // TASK_NORMAL_DONE: PTO2TaskSlotState pointer carried over to the consumer
-    //   so it can finalize the AsyncWaitEntry.slot_state binding.
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -64,19 +47,11 @@ struct AICoreCompletionMailboxMessage {
 };
 
 static_assert(sizeof(AICoreCompletionMailboxMessage) == PTO2_ALIGN_SIZE, "AICoreCompletionMailboxMessage layout drift");
-static_assert(
-    sizeof(std::atomic<uint64_t>) == sizeof(uint64_t),
-    "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold"
-);
-static_assert(
-    std::atomic<uint64_t>::is_always_lock_free,
-    "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target"
-);
-
-// POD view of a drained message. `seq` is the ring's publication flag, not
-// payload, so try_pop copies out only the fields below (and seq is not even
-// copyable — it is a std::atomic).
-struct AICoreCompletionMsgView {
+static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "std::atomic<uint64_t> must be layout-compatible with uint64_t for the message slot layout to hold");
+static_assert(std::atomic<uint64_t>::is_always_lock_free, "AICoreCompletionMailbox requires lock-free uint64_t atomics on every supported target");
+
+struct AICoreCompletionMsgView
+{
     PTO2TaskId task_token{PTO2TaskId::invalid()};
     uint64_t addr{0};
     uint32_t expected_value{0};
@@ -85,7 +60,8 @@ struct AICoreCompletionMsgView {
     uint32_t kind{0};
 };
 
-struct AICoreCompletionMailbox {
+struct AICoreCompletionMailbox
+{
     // head and tail live on their own cache lines so producer CAS contention
     // on head can't false-share with the consumer's tail updates.
     alignas(PTO2_ALIGN_SIZE) std::atomic<uint64_t> head;
@@ -96,32 +72,21 @@ struct AICoreCompletionMailbox {
 
     // Cheap, lock-free pending hint. Callers may invoke this outside the
     // consumer lock; a stale answer only over/under-triggers a drain attempt.
-    bool has_pending() { return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire); }
-
-    // MPSC push for a CONDITION message. Returns false when the ring is full
-    // (head - tail >= CAPACITY); caller should SPIN_WAIT_HINT and retry.
-    // Lock-free: CAS the shared head to claim a slot, write the fields, then
-    // release-store seq so the single consumer observes the publication.
-    //
-    // The head CAS is relaxed: head is a pure ticket counter and carries no
-    // data to the consumer — publication is solely the seq release-store, and
-    // slot-reuse safety rests on the acquire load of tail. The relaxed failure
-    // order is likewise sufficient since a lost CAS just re-reads head and
-    // retries. compare_exchange_weak is used because this loop already re-reads
-    // head and re-checks fullness, so masking LL/SC spurious failures (what
-    // _strong adds on aarch64) would only be a redundant inner retry.
-    //
-    // Safe to call concurrently from any number of producers; structurally
-    // independent of the AsyncWaitList::busy lock.
-    bool try_push_condition(
-        PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type
-    ) {
-        while (true) {
+    bool has_pending()
+    {
+        return tail.load(std::memory_order_acquire) < head.load(std::memory_order_acquire);
+    }
+
+    bool try_push_condition(PTO2TaskId task_token, uint64_t addr, uint32_t expected_value, uint32_t engine, int32_t completion_type)
+    {
+        while (true)
+        {
             uint64_t h = head.load(std::memory_order_relaxed);
             uint64_t t = tail.load(std::memory_order_acquire);
             if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
             uint64_t new_head = h + 1;
-            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed))
+            {
                 AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
                 slot->task_token.raw = task_token.raw;
                 slot->addr = addr;
@@ -136,16 +101,16 @@ struct AICoreCompletionMailbox {
         }
     }
 
-    // MPSC push for a TASK_NORMAL_DONE sentinel. Carries the PTO2TaskSlotState
-    // pointer in the `addr` field so the consumer can finish binding the
-    // AsyncWaitEntry.slot_state without going back to the FIN-handling thread.
-    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr) {
-        while (true) {
+    bool try_push_normal_done(PTO2TaskId task_token, uint64_t slot_state_addr)
+    {
+        while (true)
+        {
             uint64_t h = head.load(std::memory_order_relaxed);
             uint64_t t = tail.load(std::memory_order_acquire);
             if (h - t >= AICORE_COMPLETION_MAILBOX_CAPACITY) return false;
             uint64_t new_head = h + 1;
-            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed)) {
+            if (head.compare_exchange_weak(h, new_head, std::memory_order_relaxed, std::memory_order_relaxed))
+            {
                 AICoreCompletionMailboxMessage *slot = &entries[h & AICORE_COMPLETION_MAILBOX_MASK];
                 slot->task_token.raw = task_token.raw;
                 slot->addr = slot_state_addr;
@@ -159,13 +124,8 @@ struct AICoreCompletionMailbox {
         }
     }
 
-    // Single-consumer transport-level dequeue (caller holds the consumer lock).
-    // Returns false at the first not-yet-published slot (gap) or when empty;
-    // otherwise copies the next message in tail order into `out`, advances
-    // tail, and returns true. tail is consumer-only-written (relaxed read);
-    // head bounds the scan (relaxed); the seq acquire is the real publication
-    // gate; the tail release publishes "slot free" to reusing producers.
-    bool try_pop(AICoreCompletionMsgView &out) {
+    bool try_pop(AICoreCompletionMsgView &out)
+    {
         uint64_t t = tail.load(std::memory_order_relaxed);
         uint64_t h = head.load(std::memory_order_relaxed);
         if (t >= h) return false;
@@ -182,8 +142,6 @@ struct AICoreCompletionMailbox {
     }
 };
 
-static_assert(
-    sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned"
-);
+static_assert(sizeof(AICoreCompletionMailbox) % PTO2_ALIGN_SIZE == 0, "AICoreCompletionMailbox size must be cache-line aligned");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
index da0d89ad7..5617cd6d4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/aicore_completion_mailbox_types.h
@@ -16,16 +16,6 @@
 
 #include "pto_constants.h"
 
-// Types shared across the AICore↔AICPU boundary.
-//
-// This header is reachable from AICore-side translation units (via
-// pto_async_kernel_api.h / pto_completion_token.h / sdma_completion_kernel.h)
-// and must stay parseable by every AICore toolchain configuration: no
-// <atomic>, no __atomic_* intrinsics, no MPSC ring buffer struct.
-//
-// The MPSC ring (AICoreCompletionMailbox) and its push/drain helpers live in
-// aicore_completion_mailbox.h, which is AICPU-only.
-
 inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
 
 #define COMPLETION_ENGINE_SDMA 0u
@@ -36,14 +26,8 @@ inline constexpr int32_t MAX_COMPLETIONS_PER_TASK = 64;
 #define COMPLETION_TYPE_COUNTER 0
 #define COMPLETION_TYPE_SDMA_EVENT_RECORD 1
 
-// DeferredCompletionEntry / DeferredCompletionSlab back the per-task scratch
-// area that AICore writes into to record "this completion has to be observed
-// before the task can retire." The FIN-handling scheduler thread reads the
-// slab, flattens entries into AICoreCompletionMailbox messages, and forwards
-// them to the dispatch thread. `volatile` here is load-bearing: writers live
-// on AICore and readers on AICPU, so the qualifier is the correct way to
-// pin the compiler against caching / reordering on either side.
-struct DeferredCompletionEntry {
+struct DeferredCompletionEntry
+{
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -53,15 +37,13 @@ struct DeferredCompletionEntry {
 
 static_assert(sizeof(DeferredCompletionEntry) == 24, "DeferredCompletionEntry layout drift");
 
-struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab {
+struct alignas(PTO2_ALIGN_SIZE) DeferredCompletionSlab
+{
     volatile uint32_t count;
     volatile int32_t error_code;
     DeferredCompletionEntry entries[MAX_COMPLETIONS_PER_TASK];
 };
 
-static_assert(
-    sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0,
-    "DeferredCompletionSlab size must preserve array element cache-line boundaries"
-);
+static_assert(sizeof(DeferredCompletionSlab) % PTO2_ALIGN_SIZE == 0, "DeferredCompletionSlab size must preserve array element cache-line boundaries");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_AICORE_COMPLETION_MAILBOX_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
index 49ee7cc11..c83bb475e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_kernel.h
@@ -31,24 +31,15 @@
 // <pto/npu/comm/async/sdma/sdma_types.hpp> just to spell their scratch tile.
 inline constexpr uint32_t SDMA_SCRATCH_ALIGNMENT = pto::comm::sdma::UB_ALIGN_SIZE;
 
-enum class SdmaOp : uint8_t {
+enum class SdmaOp : uint8_t
+{
     TGET = 0,
     TPUT = 1,
 };
 
-// SdmaRequestDescriptor bundles everything send_request_entry needs to drive
-// one SDMA transfer + completion registration. It is a template because the
-// destination / source / scratch types carry tensor shape & stride at compile
-// time; the SdmaTget() / SdmaTput() helpers below let callers skip the
-// template arguments.
-//
-// sync_id selects which event-record slot inside the workspace the engine
-// writes into. Concurrent dispatches must use distinct sync_ids; today every
-// caller submits one request per kernel invocation so passing 0 is safe.
-// Future work (see .docs/25.comm-api-refactor/03.implementation-plan.md §5.2)
-// will fold sync_id allocation into the adapter.
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-struct SdmaRequestDescriptor {
+struct SdmaRequestDescriptor
+{
     SdmaOp op;
     DstTensor dst;
     SrcTensor src;
@@ -58,45 +49,38 @@ struct SdmaRequestDescriptor {
 };
 
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(
-    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
-    uint32_t sync_id = 0
-) {
-    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst,       src,
-                                                                     scratch,      workspace, sync_id};
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTget(const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, uint32_t sync_id = 0)
+{
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TGET, dst, src, scratch, workspace, sync_id};
 }
 
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(
-    const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace,
-    uint32_t sync_id = 0
-) {
-    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst,       src,
-                                                                     scratch,      workspace, sync_id};
+inline __aicore__ SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> SdmaTput(const DstTensor &dst, const SrcTensor &src, const ScratchTileT &scratch, __gm__ uint8_t *workspace, uint32_t sync_id = 0)
+{
+    return SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT>{SdmaOp::TPUT, dst, src, scratch, workspace, sync_id};
 }
 
 namespace pto2::detail {
 
-inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr) {
-    CompletionToken token{
-        reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0
-    };
+inline __aicore__ void register_sdma_event_record(AsyncCtx &ctx, volatile __gm__ void *record_addr)
+{
+    CompletionToken token{reinterpret_cast<uint64_t>(record_addr), 0, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_SDMA_EVENT_RECORD, 0};
     (void)register_completion_condition(ctx, token);
 }
 
 template <typename PtoAsyncEvent, typename PtoAsyncSession>
-inline __aicore__ void
-register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session) {
-    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
+inline __aicore__ void register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsyncSession &session)
+{
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr)
+    {
         (void)event.Wait(session);
         return;
     }
-    if (event.handle == 0) {
-        return;
-    }
+    if (event.handle == 0) return;
 
     const uint32_t engine = static_cast<uint32_t>(event.engine);
-    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA)) {
+    if (engine != static_cast<uint32_t>(::pto::comm::DmaEngine::SDMA))
+    {
         defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return;
     }
@@ -105,38 +89,29 @@ register_pto_async_event(AsyncCtx &ctx, const PtoAsyncEvent &event, const PtoAsy
     uint32_t sync_id = 0;
     __gm__ uint8_t *recv_workspace = nullptr;
     uint32_t queue_num = 0;
-    if (!::pto::comm::sdma::detail::PrepareEventCheck(
-            session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num
-        )) {
+    if (!::pto::comm::sdma::detail::PrepareEventCheck(session.sdmaSession, tmp_buf, sync_id, recv_workspace, queue_num))
+    {
         defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return;
     }
-    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) {
-        register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
-    }
+    for (uint32_t queue_id = 0; queue_id < queue_num; ++queue_id) register_sdma_event_record(ctx, ::pto::comm::sdma::detail::GetEventRecord(recv_workspace, queue_id));
 }
 
 }  // namespace pto2::detail
 
-// SDMA overload of the runtime's send_request_entry. Submits the descriptor
-// to PTO-ISA, then registers the resulting AsyncEvent's GM flag(s) into the
-// AsyncCtx deferred-wait slab and flushes. Returns false on submit/session
-// failure (also records the error in ctx.completion_error_code).
 template <typename DstTensor, typename SrcTensor, typename ScratchTileT>
-inline __aicore__ bool
-send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc) {
+inline __aicore__ bool send_request_entry(AsyncCtx &ctx, SdmaRequestDescriptor<DstTensor, SrcTensor, ScratchTileT> desc)
+{
     pto::comm::AsyncSession session;
-    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id)) {
+    if (!pto::comm::BuildAsyncSession(desc.scratch, desc.workspace, session, desc.sync_id))
+    {
         pto2::detail::defer_error(ctx, PTO2_ERROR_ASYNC_COMPLETION_INVALID);
         return false;
     }
 
     pto::comm::AsyncEvent event;
-    if (desc.op == SdmaOp::TGET) {
-        event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
-    } else {
-        event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
-    }
+    if (desc.op == SdmaOp::TGET) event = pto::comm::TGET_ASYNC(desc.dst, desc.src, session);
+    else event = pto::comm::TPUT_ASYNC(desc.dst, desc.src, session);
     pto2::detail::register_pto_async_event(ctx, event, session);
     pto2::detail::defer_flush(ctx);
     return true;
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
index 689219c35..577e5138d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/backend/sdma/sdma_completion_scheduler.h
@@ -19,10 +19,8 @@
 #include "pto_completion_token.h"
 #include "pto_runtime_status.h"
 
-// runtime-side mirror of the PTO-ISA SdmaEventRecord. SDMA backend is the only
-// allowed holder of this ABI knowledge; the generic scheduler dispatches into
-// the helpers below through the completion ops table.
-struct SdmaEventRecord {
+struct SdmaEventRecord
+{
     uint32_t flag;
     uint32_t sq_tail;
     uint64_t channel_info;
@@ -31,25 +29,24 @@ struct SdmaEventRecord {
 static_assert(sizeof(SdmaEventRecord) == 16, "SDMA event record ABI drift");
 static_assert(offsetof(SdmaEventRecord, sq_tail) == 4, "SDMA event record ABI drift");
 
-inline uintptr_t sdma_completion_cache_line(const volatile void *addr) {
+inline uintptr_t sdma_completion_cache_line(const volatile void *addr)
+{
     return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
 }
 
-inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr) {
-    if (record_addr == 0) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
-    volatile SdmaEventRecord *record =
-        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+inline CompletionPollResult poll_sdma_event_record(uint64_t record_addr)
+{
+    if (record_addr == 0) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    volatile SdmaEventRecord *record = reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
     cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
     uint32_t flag = __atomic_load_n(&record->flag, __ATOMIC_ACQUIRE);
     return {flag != 0 ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
 }
 
-inline void retire_sdma_event_record(uint64_t record_addr) {
+inline void retire_sdma_event_record(uint64_t record_addr)
+{
     if (record_addr == 0) return;
-    volatile SdmaEventRecord *record =
-        reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
+    volatile SdmaEventRecord *record = reinterpret_cast<volatile SdmaEventRecord *>(static_cast<uintptr_t>(record_addr));
     cache_invalidate_range(reinterpret_cast<const void *>(sdma_completion_cache_line(record)), PTO2_ALIGN_SIZE);
     uint32_t completed_tail = __atomic_load_n(&record->sq_tail, __ATOMIC_ACQUIRE);
     uint64_t channel_info_addr = __atomic_load_n(&record->channel_info, __ATOMIC_ACQUIRE);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
index 3ee022224..0fb534eb4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto2_dispatch_payload.h
@@ -9,29 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * @file pto2_dispatch_payload.h
- * @brief Per-core dispatch payload for AICore kernel execution
- *
- * PTO2DispatchPayload holds the kernel function address, a per-core args[]
- * array, and embedded SPMD context (LocalContext + GlobalContext).  AICPU
- * maintains a static array of these (one per core).
- *
- * GlobalContext (sub_block_id) is initialized once at runtime startup via
- * init_global_context() and never modified afterwards.
- *
- * LocalContext (block_idx, block_num) and args[] are rebuilt by build_payload()
- * before each dispatch.  Both context struct pointers are written into the
- * args[] suffix on every dispatch (since args[] is rebuilt entirely each time).
- *
- * AICore caches a pointer to its per-core slot at startup and reads from
- * it on each dispatch.  The struct is cache-line aligned to avoid false
- * sharing across concurrently dispatched cores.
- *
- * The DATA_MAIN_BASE register protocol is unchanged from the base runtime:
- * a monotonically increasing reg_task_id signals new work to AICore.
- */
-
 #pragma once
 
 #include <stdint.h>
@@ -39,7 +16,6 @@
 #include "intrinsic.h"
 #include "pto_types.h"
 
-/** Max dispatch arguments: 16 scalars + up to 32 tensor pointers + ext params */
 #ifndef PTO2_DISPATCH_MAX_ARGS
 #define PTO2_DISPATCH_MAX_ARGS (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + PTO2_EXT_PARAMS_COUNT)
 #endif
@@ -49,36 +25,16 @@
 #endif
 
 // Verify hardcoded indices in intrinsic.h match the computed values.
-static_assert(
-    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h"
-);
-static_assert(
-    (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX,
-    "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h"
-);
+static_assert((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) == SPMD_LOCAL_CONTEXT_INDEX, "LOCAL_CONTEXT_INDEX out of sync with intrinsic.h");
+static_assert((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS + 1) == SPMD_GLOBAL_CONTEXT_INDEX, "GLOBAL_CONTEXT_INDEX out of sync with intrinsic.h");
 
-/**
- * Per-core dispatch payload: function address + args[] + SPMD context.
- *
- * AICPU maintains a static array s_payload_per_core[RUNTIME_MAX_WORKER].
- * AICore caches a pointer to its per-core slot at startup (via Handshake.task)
- * and reads from it on each dispatch.
- *
- * The struct is cache-line aligned to prevent false sharing across
- * concurrently dispatched cores.
- */
-struct alignas(64) PTO2DispatchPayload {
-    uint64_t function_bin_addr;            /**< Kernel entry address in GM (set by Scheduler) */
-    uint64_t args[PTO2_DISPATCH_MAX_ARGS]; /**< Kernel arguments (GM pointers + scalars + ext params) */
+struct alignas(64) PTO2DispatchPayload
+{
+    uint64_t function_bin_addr;
+    uint64_t args[PTO2_DISPATCH_MAX_ARGS];
 
-    /** Per-dispatch context: block_idx and block_num.
-     *  Written by build_payload() before each dispatch.
-     *  args[SPMD_LOCAL_CONTEXT_INDEX] points here. */
     LocalContext local_context;
 
-    /** Per-core global context: sub_block_id (AIV lane identity).
-     *  Initialized once by init_global_context() at runtime startup.
-     *  args[SPMD_GLOBAL_CONTEXT_INDEX] points here. */
     GlobalContext global_context;
 
     /** Speculative early-dispatch gate. 0 = ready: AICore executes on pickup.
@@ -88,10 +44,7 @@ struct alignas(64) PTO2DispatchPayload {
     uint8_t reserved_payload_abi_pad[4];
 
     static_assert(sizeof(args[0]) == 8);
-    static_assert(
-        PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) ==
-        (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0])
-    );
+    static_assert(PTO2_ALIGN_UP((MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]), 64) == (MAX_TENSOR_ARGS + MAX_SCALAR_ARGS) * sizeof(args[0]));
 };
 
 static_assert(sizeof(PTO2DispatchPayload) == 512, "PTO2DispatchPayload hardware ABI size drift");
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
index cf6eb4790..357a1fdcf 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_kernel_api.h
@@ -29,13 +29,10 @@
 #define __gm__
 #endif
 
-// Public surface: get_async_ctx, async_ctx_is_deferred,
-// register_completion_condition, send_notification,
-// save_expected_notification_counter. Everything else lives in
-// pto2::detail and is reserved for backend adapters / internal use.
 namespace pto2::detail {
 
-inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
+inline __aicore__ void defer_load_slab(AsyncCtx &ctx)
+{
     if (ctx.completion_count == nullptr) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uintptr_t line = reinterpret_cast<uintptr_t>(ctx.completion_count) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
@@ -45,41 +42,33 @@ inline __aicore__ void defer_load_slab(AsyncCtx &ctx) {
 #endif
 }
 
-inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code) {
-    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) {
-        *ctx.completion_error_code = error_code;
-    }
+inline __aicore__ void defer_error(AsyncCtx &ctx, int32_t error_code)
+{
+    if (ctx.task_token.is_valid() && ctx.completion_error_code != nullptr) *ctx.completion_error_code = error_code;
 }
 
-inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes) {
+inline __aicore__ void defer_flush_range(volatile __gm__ void *addr, uint32_t size_bytes)
+{
     if (addr == nullptr || size_bytes == 0) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uintptr_t start = reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
-    uintptr_t end =
-        (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
-    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) {
-        dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
-    }
+    uintptr_t end = (reinterpret_cast<uintptr_t>(addr) + size_bytes + PTO2_ALIGN_SIZE - 1u) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
+    for (uintptr_t p = start; p < end; p += PTO2_ALIGN_SIZE) dcci((__gm__ int32_t *)p, SINGLE_CACHE_LINE, CACHELINE_OUT);
 #else
     (void)addr;
     (void)size_bytes;
 #endif
 }
 
-inline __aicore__ void defer_flush(AsyncCtx &ctx) {
+inline __aicore__ void defer_flush(AsyncCtx &ctx)
+{
     if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr) return;
 #if defined(__CCE_KT_TEST__) || defined(__CCE_AICORE__) || defined(__DAV_C220__)
     uint32_t count = *ctx.completion_count;
-    if (count > ctx.completion_capacity) {
-        count = ctx.completion_capacity;
-    }
+    if (count > ctx.completion_capacity) count = ctx.completion_capacity;
     uint32_t flush_bytes = static_cast<uint32_t>(sizeof(*ctx.completion_count));
-    if (ctx.completion_error_code != nullptr) {
-        flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
-    }
-    if (ctx.completion_entries != nullptr) {
-        flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
-    }
+    if (ctx.completion_error_code != nullptr) flush_bytes += static_cast<uint32_t>(sizeof(*ctx.completion_error_code));
+    if (ctx.completion_entries != nullptr) flush_bytes += count * static_cast<uint32_t>(sizeof(DeferredCompletionEntry));
     defer_flush_range(ctx.completion_count, flush_bytes);
 #if defined(__CPU_SIM)
     dsb(0);
@@ -95,9 +84,9 @@ inline __aicore__ void defer_flush(AsyncCtx &ctx) {
 
 }  // namespace pto2::detail
 
-inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
-    __gm__ LocalContext *lc =
-        reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
+inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args)
+{
+    __gm__ LocalContext *lc = reinterpret_cast<__gm__ LocalContext *>(static_cast<uintptr_t>(args[PAYLOAD_LOCAL_CONTEXT_INDEX]));
     AsyncCtx ctx{};
     ctx.completion_count = lc->async_ctx.completion_count;
     ctx.completion_error_code = lc->async_ctx.completion_error_code;
@@ -108,23 +97,19 @@ inline __aicore__ AsyncCtx get_async_ctx(__gm__ int64_t *args) {
     return ctx;
 }
 
-inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx) { return ctx.task_token.is_valid(); }
+inline __aicore__ bool async_ctx_is_deferred(const AsyncCtx &ctx)
+{
+    return ctx.task_token.is_valid();
+}
 
-// Canonical writer: backend submit handlers build a CompletionToken and pass
-// it here. Writes one DeferredCompletionEntry to the AsyncCtx slab and
-// bumps completion_count. Returns false on overflow (also stores
-// PTO2_ERROR_ASYNC_WAIT_OVERFLOW in ctx.completion_error_code) or when ctx is
-// not currently a deferred context.
-inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token) {
-    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) {
-        return false;
-    }
+inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const CompletionToken &token)
+{
+    if (ctx.task_token.is_invalid() || ctx.completion_count == nullptr || ctx.completion_entries == nullptr) return false;
 
     uint32_t idx = *ctx.completion_count;
-    if (idx >= ctx.completion_capacity) {
-        if (ctx.completion_error_code != nullptr) {
-            *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
-        }
+    if (idx >= ctx.completion_capacity)
+    {
+        if (ctx.completion_error_code != nullptr) *ctx.completion_error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
         return false;
     }
 
@@ -138,18 +123,16 @@ inline __aicore__ bool register_completion_condition(AsyncCtx &ctx, const Comple
     return true;
 }
 
-inline __aicore__ void
-send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op) {
+inline __aicore__ void send_notification(volatile __gm__ void *remote_counter_addr, int32_t value, pto::comm::NotifyOp notify_op)
+{
     __gm__ int32_t *counter = reinterpret_cast<__gm__ int32_t *>(const_cast<__gm__ void *>(remote_counter_addr));
     pto::comm::Signal signal(counter);
     pto::comm::TNOTIFY(signal, value, notify_op);
 }
 
-inline __aicore__ void
-save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value) {
-    CompletionToken token{
-        reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0
-    };
+inline __aicore__ void save_expected_notification_counter(AsyncCtx &ctx, volatile __gm__ void *counter_addr, uint32_t expected_value)
+{
+    CompletionToken token{reinterpret_cast<uint64_t>(counter_addr), expected_value, COMPLETION_ENGINE_SDMA, COMPLETION_TYPE_COUNTER, 0};
     (void)register_completion_condition(ctx, token);
     pto2::detail::defer_flush(ctx);
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
index 65608ad2f..7c0d891ee 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_async_wait.h
@@ -29,12 +29,8 @@ struct CompletionStats;
 
 inline constexpr int32_t MAX_ASYNC_WAITS = 64;
 
-// The mailbox transport (has_pending / try_push_condition /
-// try_push_normal_done / try_pop) lives as AICoreCompletionMailbox member
-// functions in aicore_completion_mailbox.h. This file only holds the
-// application layer: translating drained messages into wait-list state.
-
-inline uintptr_t mailbox_cache_line(const volatile void *addr) {
+inline uintptr_t mailbox_cache_line(const volatile void *addr)
+{
     return reinterpret_cast<uintptr_t>(addr) & ~(uintptr_t(PTO2_ALIGN_SIZE) - 1u);
 }
 
@@ -43,12 +39,14 @@ struct CompletionCondition;
 using CompletionPollFn = CompletionPollResult (*)(const CompletionCondition &);
 using CompletionRetireFn = void (*)(CompletionCondition &);
 
-struct CompletionBackendOps {
+struct CompletionBackendOps
+{
     CompletionPollFn poll;
     CompletionRetireFn retire;
 };
 
-struct CompletionCondition {
+struct CompletionCondition
+{
     AsyncEngine engine{ASYNC_ENGINE_SDMA};
     int32_t completion_type{COMPLETION_TYPE_COUNTER};
     bool satisfied{false};
@@ -61,28 +59,27 @@ struct CompletionCondition {
     void retire();
 };
 
-// Per-completion-type ops. SDMA_EVENT_RECORD detail lives in
-// backend/sdma/sdma_completion_scheduler.h; the op wrappers below are thin
-// glue mapping CompletionCondition.addr into the backend's raw-addr helpers.
-inline CompletionPollResult counter_poll_op(const CompletionCondition &cond) {
-    if (cond.counter_addr == nullptr) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
-    return {
-        *cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING,
-        PTO2_ERROR_NONE
-    };
+inline CompletionPollResult counter_poll_op(const CompletionCondition &cond)
+{
+    if (cond.counter_addr == nullptr) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
+    return {*cond.counter_addr >= cond.expected_value ? CompletionPollState::READY : CompletionPollState::PENDING, PTO2_ERROR_NONE};
 }
 
-inline void counter_retire_op(CompletionCondition & /*cond*/) {}
+inline void counter_retire_op(CompletionCondition &)
+{}
 
-inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond) {
+inline CompletionPollResult sdma_event_record_poll_op(const CompletionCondition &cond)
+{
     return poll_sdma_event_record(cond.addr);
 }
 
-inline void sdma_event_record_retire_op(CompletionCondition &cond) { retire_sdma_event_record(cond.addr); }
+inline void sdma_event_record_retire_op(CompletionCondition &cond)
+{
+    retire_sdma_event_record(cond.addr);
+}
 
-inline const CompletionBackendOps *completion_backend_ops_for(int completion_type) {
+inline const CompletionBackendOps *completion_backend_ops_for(int completion_type)
+{
     static const CompletionBackendOps kOps[] = {
         {counter_poll_op, counter_retire_op},                      // COMPLETION_TYPE_COUNTER = 0
         {sdma_event_record_poll_op, sdma_event_record_retire_op},  // COMPLETION_TYPE_SDMA_EVENT_RECORD = 1
@@ -92,27 +89,24 @@ inline const CompletionBackendOps *completion_backend_ops_for(int completion_typ
     return &kOps[completion_type];
 }
 
-inline CompletionPollResult CompletionCondition::test() const {
-    if (satisfied) {
-        return {CompletionPollState::READY, PTO2_ERROR_NONE};
-    }
+inline CompletionPollResult CompletionCondition::test() const
+{
+    if (satisfied) return {CompletionPollState::READY, PTO2_ERROR_NONE};
     const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
-    if (ops == nullptr || ops->poll == nullptr) {
-        return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
-    }
+    if (ops == nullptr || ops->poll == nullptr) return {CompletionPollState::FAILED, PTO2_ERROR_ASYNC_COMPLETION_INVALID};
     return ops->poll(*this);
 }
 
-inline void CompletionCondition::retire() {
+inline void CompletionCondition::retire()
+{
     if (retired) return;
     const CompletionBackendOps *ops = completion_backend_ops_for(completion_type);
-    if (ops != nullptr && ops->retire != nullptr) {
-        ops->retire(*this);
-    }
+    if (ops != nullptr && ops->retire != nullptr) ops->retire(*this);
     retired = true;
 }
 
-struct AsyncWaitEntry {
+struct AsyncWaitEntry
+{
     PTO2TaskSlotState *slot_state{nullptr};
     PTO2TaskId task_token{PTO2TaskId::invalid()};
     CompletionCondition conditions[MAX_COMPLETIONS_PER_TASK];
@@ -121,14 +115,17 @@ struct AsyncWaitEntry {
     bool normal_done{false};
 };
 
-struct AsyncPollResult {
+struct AsyncPollResult
+{
     int32_t completed{0};
     int32_t error_code{PTO2_ERROR_NONE};
     PTO2TaskSlotState *failed_slot_state{nullptr};
 };
 
-inline const char *async_engine_name(AsyncEngine engine) {
-    switch (engine) {
+inline const char *async_engine_name(AsyncEngine engine)
+{
+    switch (engine)
+    {
     case ASYNC_ENGINE_SDMA:
         return "SDMA";
     case ASYNC_ENGINE_ROCE:
@@ -142,75 +139,62 @@ inline const char *async_engine_name(AsyncEngine engine) {
     }
 }
 
-struct AsyncWaitList {
+struct AsyncWaitList
+{
     std::atomic<int32_t> busy{0};
     AsyncWaitEntry entries[MAX_ASYNC_WAITS];
     int32_t count{0};
-    // Diagnostic: counts every FIN-side try_push that hit a full mailbox.
-    // Expected to stay zero on real workloads (ring is 4096 entries); a
-    // non-zero value means consumers are too slow or the ring is undersized.
-    // Read by scheduler shutdown / l2 perf summary; not on the hot path.
     std::atomic<uint64_t> mpsc_skipped_count{0};
 
-    bool try_lock() {
+    bool try_lock()
+    {
         int32_t expected = 0;
         return busy.compare_exchange_strong(expected, 1, std::memory_order_acquire, std::memory_order_relaxed);
     }
 
-    void unlock() { busy.store(0, std::memory_order_release); }
+    void unlock()
+    {
+        busy.store(0, std::memory_order_release);
+    }
 
-    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token) {
-        for (int32_t i = 0; i < count; i++) {
+    AsyncWaitEntry *find_entry_by_token(PTO2TaskId token)
+    {
+        for (int32_t i = 0; i < count; i++)
             if (entries[i].task_token == token) return &entries[i];
-        }
         return nullptr;
     }
 
-    // Captures the side-channel a scheduler-aware drain needs to complete
-    // NotDeferred tasks inline (without storing a transient entry in
-    // entries[]).
-    struct DrainCompletionSink {
+    struct DrainCompletionSink
+    {
         PTO2SchedulerState *sched{nullptr};
-        PTO2LocalReadyBuffer *local_bufs{nullptr};
-        PTO2TaskSlotState **deferred_release_slot_states{nullptr};
-        int32_t *deferred_release_count{nullptr};
-        int32_t deferred_release_capacity{0};
         int32_t inline_completed{0};
-#if PTO2_SCHED_PROFILING
-        int32_t thread_idx{0};
-#endif
 
-        bool can_inline_complete() const { return sched != nullptr; }
+        bool can_inline_complete() const
+        {
+            return sched != nullptr;
+        }
     };
 
-    // Inline-complete a NotDeferred task during drain. Returns false on
-    // deferred_release_slot_states overflow.
+    // Inline-complete a NotDeferred task during drain.
     bool try_inline_complete_locked(DrainCompletionSink &sink, PTO2TaskSlotState &slot_state);
 
-    // Single-consumer drain: pop each published message in tail order and
-    // translate it into wait-list state. An empty sink (sched == nullptr) just
-    // materializes entries; a sched-aware sink additionally inline-completes
-    // lonely NotDeferred NORMAL_DONEs without ever growing entries[].
-    int32_t drain_aicore_completion_mailbox_locked(
-        AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code
-    ) {
+    int32_t drain_aicore_completion_mailbox_locked(AICoreCompletionMailbox *aicore_mailbox, DrainCompletionSink &sink, int32_t &error_code)
+    {
         error_code = PTO2_ERROR_NONE;
         if (aicore_mailbox == nullptr) return 0;
 
         int32_t drained = 0;
         AICoreCompletionMsgView msg;
-        // try_pop is the transport layer (seq-gated, in-order dequeue); this
-        // loop is the application layer (translate each message into wait-list
-        // state). try_pop returns false at the first gap or when empty.
-        while (aicore_mailbox->try_pop(msg)) {
+        while (aicore_mailbox->try_pop(msg))
+        {
             drained++;
-            if (msg.kind == MSG_KIND_CONDITION) {
+            if (msg.kind == MSG_KIND_CONDITION)
+            {
                 AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
-                if (entry == nullptr) {
-                    // First message for this task — materialize the entry here.
-                    // slot_state stays null until the matching TASK_NORMAL_DONE
-                    // sentinel arrives.
-                    if (count >= MAX_ASYNC_WAITS) {
+                if (entry == nullptr)
+                {
+                    if (count >= MAX_ASYNC_WAITS)
+                    {
                         error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
                         return drained;
                     }
@@ -221,28 +205,21 @@ struct AsyncWaitList {
                     entry->waiting_completion_count = 0;
                     entry->normal_done = false;
                 }
-                if (!append_condition_locked(
-                        *entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type,
-                        error_code
-                    )) {
-                    return drained;
-                }
-            } else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE) {
-                PTO2TaskSlotState *slot_state_ptr =
-                    reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
+                if (!append_condition_locked(*entry, msg.addr, msg.expected_value, static_cast<AsyncEngine>(msg.engine), msg.completion_type, error_code)) return drained;
+            }
+            else if (msg.kind == MSG_KIND_TASK_NORMAL_DONE)
+            {
+                PTO2TaskSlotState *slot_state_ptr = reinterpret_cast<PTO2TaskSlotState *>(static_cast<uintptr_t>(msg.addr));
                 AsyncWaitEntry *entry = find_entry_by_token(msg.task_token);
-                if (entry == nullptr) {
-                    // Producers strictly order: all CONDITIONs for token T are
-                    // pushed before the matching NORMAL_DONE (the acq_rel on
-                    // on_subtask_complete enforces this across producers). So
-                    // observing NORMAL_DONE first => the task registered no
-                    // conditions => NotDeferred. Complete it inline when the
-                    // sink allows; otherwise fall back to the entry-store path.
-                    if (sink.can_inline_complete()) {
+                if (entry == nullptr)
+                {
+                    if (sink.can_inline_complete())
+                    {
                         (void)try_inline_complete_locked(sink, *slot_state_ptr);
                         continue;
                     }
-                    if (count >= MAX_ASYNC_WAITS) {
+                    if (count >= MAX_ASYNC_WAITS)
+                    {
                         error_code = PTO2_ERROR_ASYNC_WAIT_OVERFLOW;
                         return drained;
                     }
@@ -252,13 +229,15 @@ struct AsyncWaitList {
                     entry->condition_count = 0;
                     entry->waiting_completion_count = 0;
                     entry->normal_done = true;
-                } else {
-                    if (entry->slot_state == nullptr) {
-                        entry->slot_state = slot_state_ptr;
-                    }
+                }
+                else
+                {
+                    if (entry->slot_state == nullptr) entry->slot_state = slot_state_ptr;
                     entry->normal_done = true;
                 }
-            } else {
+            }
+            else
+            {
                 error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
                 return drained;
             }
@@ -266,11 +245,10 @@ struct AsyncWaitList {
         return drained;
     }
 
-    bool append_condition_locked(
-        AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type,
-        int32_t &error_code
-    ) {
-        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK) {
+    bool append_condition_locked(AsyncWaitEntry &entry, uint64_t addr, uint32_t expected_value, AsyncEngine engine, int32_t completion_type, int32_t &error_code)
+    {
+        if (entry.condition_count >= MAX_COMPLETIONS_PER_TASK)
+        {
             error_code = PTO2_ERROR_ASYNC_REGISTRATION_FAILED;
             return false;
         }
@@ -280,24 +258,14 @@ struct AsyncWaitList {
         cond.satisfied = false;
         cond.retired = false;
         cond.addr = addr;
-        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ?
-                                reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) :
-                                nullptr;
+        cond.counter_addr = completion_type == COMPLETION_TYPE_COUNTER ? reinterpret_cast<volatile uint32_t *>(static_cast<uintptr_t>(addr)) : nullptr;
         cond.expected_value = expected_value;
         entry.waiting_completion_count++;
         return true;
     }
 
     template <bool Profiling>
-    AsyncPollResult poll_and_complete(
-        AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
-        PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count,
-        int32_t deferred_release_capacity
-#if PTO2_SCHED_PROFILING
-        ,
-        int thread_idx
-#endif
-    );
+    AsyncPollResult poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched);
 };
 
 #endif  // PTO_ASYNC_WAIT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
index c5a8c345f..d017f8597 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_completion_token.h
@@ -17,13 +17,8 @@
 #include "aicore_completion_mailbox_types.h"
 #include "pto_runtime_status.h"
 
-// CompletionToken is the runtime-internal POD that backend submit handlers
-// produce and the generic register_completion_condition() consumes. It is the
-// ABI contract for "this is one completion to wait on" — independent of which
-// backend (SDMA, RoCE, notification counter, ...) generated it. Each backend's
-// (poll, retire) pair is registered in pto_async_wait.h's ops table, keyed by
-// completion_type.
-struct CompletionToken {
+struct CompletionToken
+{
     uint64_t addr;
     uint32_t expected_value;
     uint32_t engine;
@@ -31,13 +26,15 @@ struct CompletionToken {
     uint64_t backend_cookie;
 };
 
-enum class CompletionPollState : uint8_t {
+enum class CompletionPollState : uint8_t
+{
     PENDING = 0,
     READY = 1,
     FAILED = 2,
 };
 
-struct CompletionPollResult {
+struct CompletionPollResult
+{
     CompletionPollState state{CompletionPollState::PENDING};
     int32_t error_code{PTO2_ERROR_NONE};
 };
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
index 0f5bad413..e3ff8ba6e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_dep_compute.h
@@ -9,37 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * @file pto_dep_compute.h
- * @brief Dependency computation primitives shared by runtime submit_task and dep_gen replay.
- *
- * Two header-only template entry points:
- *
- *   compute_task_fanin     — STEP 3 in submit_task: per-tensor creator retention (Step A)
- *                            + tensormap.lookup for INPUT/INOUT (Step B). Calls back into
- *                            user-supplied `emit` for each producer it identifies.
- *
- *   register_task_outputs  — STEP 4 in submit_task: tensormap.insert for INOUT and
- *                            OUTPUT_EXISTING tensors. No callbacks.
- *
- * STEP 1 (explicit_deps) is intentionally left at the runtime call site because its
- * `last_task_alive` shortcut + unchecked slot lookup is subtly different from the
- * `slot_state->task->task_id == producer` reuse check in STEP 3. Unifying them would
- * require two emit semantics or a marginal behavior change in transients — not worth
- * the minor structural overlap. Replay handles STEP 1 with a one-line loop of its own.
- *
- * The Emit callback contract:
- *   bool emit(PTO2TaskId producer);
- *     - return true to continue (whether or not the producer was actually recorded —
- *       producer-not-alive / dedup-hit / etc. all return true silently)
- *     - return false to signal fatal (e.g. fanin spill overflow); caller bails
- *
- * Performance: Emit is a template parameter, not std::function. Both runtime
- * (lambda capturing fanin_builder + sm_header) and replay (lambda capturing edge
- * vector) instantiate at the call site and inline through. Do NOT replace with
- * std::function — it would break the inlining and add ~5 ns/call to the orch hot path.
- */
-
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_DEP_COMPUTE_H_
 
@@ -50,14 +19,8 @@
 #include "pto_types.h"  // TensorRef
 #include "tensor.h"
 
-/**
- * View struct for inputs to compute_task_fanin / register_task_outputs.
- *
- * Both runtime and replay assemble one of these from their own data sources
- * (runtime: from Arg accessors; replay: from SubmitTraceEntry fields). All
- * pointer arrays must remain valid for the duration of the call.
- */
-struct DepInputs {
+struct DepInputs
+{
     int32_t tensor_count;
     const TensorRef *tensors;        // length = tensor_count (union; OUTPUT slots' .ptr is unused)
     const TensorArgType *arg_types;  // length = tensor_count
@@ -65,28 +28,16 @@ struct DepInputs {
     const PTO2TaskId *explicit_deps;  // length = explicit_dep_count (validity checked by caller)
 };
 
-/**
- * Compute fanin for a task being submitted (STEP 3: Step A creator retention +
- * Step B tensormap modifier lookup).
- *
- * For each non-OUTPUT tensor:
- *   - If owner_task_id is valid, emit(owner)
- *   - For INPUT/INOUT (and not manual_dep), tensor_map.lookup(*tensor) and emit
- *     each matching producer. INOUT+COVERED triggers tensor_map.remove_entry(entry).
- *
- * @return true on success (or producer-skipped-silently); false if emit signaled
- *         fatal — caller should propagate (after any fatal bookkeeping done by emit).
- */
 template <typename Emit>
-[[nodiscard]] inline bool
-compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit) {
-    if (in_manual_scope) {
-        return true;
-    }
+[[nodiscard]] inline bool compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_manual_scope, Emit emit)
+{
+    if (in_manual_scope) return true;
 
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+    for (int32_t i = 0; i < inputs.tensor_count; i++)
+    {
         TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::OUTPUT) {
+        if (ptype == TensorArgType::OUTPUT)
+        {
             // Runtime-created OUTPUT tensors are not looked up in the TensorMap since
             // they have no dependencies.
             continue;
@@ -96,58 +47,40 @@ compute_task_fanin(const DepInputs &inputs, PTO2TensorMap &tensor_map, bool in_m
 
         // Step A: creator retention — all existing tensors extend their creator lifetime.
         PTO2TaskId owner = tensor->owner_task_id;
-        if (owner.is_valid()) {
-            if (!emit(owner)) {
-                return false;
-            }
+        if (owner.is_valid())
+        {
+            if (!emit(owner)) return false;
         }
 
         // Step B: only INPUT/INOUT need modifier dependency lookup.
-        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) {
-            continue;
-        }
-        if (tensor->manual_dep) {
-            continue;
-        }
+        if (ptype != TensorArgType::INPUT && ptype != TensorArgType::INOUT) continue;
+        if (tensor->manual_dep) continue;
 
         bool fatal = false;
         tensor_map.lookup(*tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus overlap_status) -> bool {
-            if (!emit(entry.producer_task_id)) {
+            if (!emit(entry.producer_task_id))
+            {
                 fatal = true;
                 return false;  // stop iteration
             }
-            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) {
-                tensor_map.remove_entry(entry);
-            }
+            if (ptype == TensorArgType::INOUT && overlap_status == OverlapStatus::COVERED) tensor_map.remove_entry(entry);
             return true;
         });
-        if (fatal) {
-            return false;
-        }
+        if (fatal) return false;
     }
     return true;
 }
 
-/**
- * Register a task's outputs in the tensormap (STEP 4 in submit_task).
- *
- * For INOUT and OUTPUT_EXISTING tensors (excluding manual_dep), inserts the
- * tensor into tensor_map keyed by its buffer.addr with `task_id` as producer.
- *
- * No-op when in_manual_scope.
- */
-inline void
-register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope) {
-    if (in_manual_scope) {
-        return;
-    }
-    for (int32_t i = 0; i < inputs.tensor_count; i++) {
+inline void register_task_outputs(const DepInputs &inputs, PTO2TaskId task_id, PTO2TensorMap &tensor_map, bool in_manual_scope)
+{
+    if (in_manual_scope) return;
+    for (int32_t i = 0; i < inputs.tensor_count; i++)
+    {
         TensorArgType ptype = inputs.arg_types[i];
-        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
+        if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING)
+        {
             const Tensor *tensor = inputs.tensors[i].ptr;
-            if (!tensor->manual_dep) {
-                tensor_map.insert(*tensor, task_id);
-            }
+            if (!tensor->manual_dep) tensor_map.insert(*tensor, task_id);
         }
     }
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
deleted file mode 100644
index 116525076..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ /dev/null
@@ -1,972 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Orchestrator Implementation
- *
- * Implements orchestrator state management, scope handling, and task submission.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_orchestrator.h"
-
-#include <assert.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aicpu/dep_gen_collector_aicpu.h"
-#include "common/dep_gen.h"
-#include "common/unified_log.h"
-#include "pto_dep_compute.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-#include "pto_tensormap.h"
-#include "pto_types.h"
-#include "tensor.h"
-
-#if PTO2_PROFILING
-#include "aicpu/scope_stats_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-#endif
-
-// Verify the captured Tensor blob size in DepGenRecord matches the runtime
-// Tensor layout. The platform header defines DEP_GEN_TENSOR_SIZE without
-// including runtime/tensor.h, so this check lives at the orch callsite.
-static_assert(sizeof(Tensor) == DEP_GEN_TENSOR_SIZE, "DepGenRecord::tensors slot size out of sync with sizeof(Tensor)");
-// DEP_GEN_MAX_EXPLICIT_DEPS is a diagnostic-side capture cap only; the runtime
-// imposes no hard cap on explicit dep count. If a submit exceeds this cap,
-// dep_gen_aicpu_record_submit() logs and truncates — runtime correctness is
-// unaffected, only the captured replay record is truncated.
-
-// Weak fallbacks: dep_gen_collector_aicpu.cpp provides the strong symbols in
-// AICPU builds. Host builds (host_build_graph runtime, future dep_gen replay)
-// link these no-op stubs so the runtime translation unit is self-contained.
-// Visibility is hidden so the HOST .so doesn't export them into the global
-// dynamic symbol table where they'd shadow the AICPU .so's strong symbols
-// (same pattern as get_sys_cnt_aicpu / l2_swimlane_aicpu_record_orch_phase below).
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_dep_gen_enabled() { return false; }
-__attribute__((weak, visibility("hidden"))) void dep_gen_aicpu_record_submit(
-    uint64_t, bool, int, const void *const *, const uint8_t *, int, const uint64_t *, const int32_t[3]
-) {}
-
-// Scope_stats enable gate, queried via the same predicate idiom as
-// is_dep_gen_enabled above. The AICPU collector links the strong definition;
-// host builds fall back to this weak `false`. Gating here still skips the
-// cross-agent occupancy reads that feed the sample when scope_stats is disabled.
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
-
-// Heap-ring wrap report, called from the allocator (pto_ring_buffer.h) on each
-// wrap. Strong definition lives in the AICPU collector; host builds fall back to
-// this weak no-op so the runtime translation unit stays self-contained.
-extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
-
-// =============================================================================
-// Orchestrator Profiling (compile-time toggle)
-// =============================================================================
-#if PTO2_ORCH_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-// Weak fallback for builds that don't link device_time.cpp (e.g. host).
-// The strong symbol from platform/.../device_time.cpp wins in the AICPU build.
-//
-// IMPORTANT: visibility("hidden") is required to prevent the HOST .so from
-// exporting this weak fallback into the global dynamic symbol table via
-// RTLD_GLOBAL. Without it, when the AICPU .so is loaded and its PLT entry
-// for get_sys_cnt_aicpu is resolved, the dynamic linker finds the HOST .so's
-// weak definition first (already in global table) and uses it — returning 0.
-// With hidden visibility, the HOST .so does not export this symbol globally,
-// so the AICPU .so's PLT resolves to its own strong definition from
-// device_time.cpp.
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-// Weak fallback for builds that don't link l2_swimlane_collector_aicpu.cpp.
-// The strong symbol from the AICPU build wins when profiling is available.
-// Also hidden to prevent HOST .so from polluting the global symbol table.
-__attribute__((weak, visibility("hidden"))) void
-l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
-// Accumulated cycles per sub-step (only needed for ORCH_PROFILING export)
-static uint64_t g_orch_sync_cycle = 0;       // tensormap sync
-static uint64_t g_orch_alloc_cycle = 0;      // unified task+heap alloc
-static uint64_t g_orch_args_cycle = 0;       // param copy
-static uint64_t g_orch_lookup_cycle = 0;     // tensormap lookup + dep building
-static uint64_t g_orch_insert_cycle = 0;     // tensormap insert
-static uint64_t g_orch_fanin_cycle = 0;      // fanin list + early-return check
-static uint64_t g_orch_scope_end_cycle = 0;  // scope_end overhead
-static int64_t g_orch_submit_count = 0;
-static uint32_t g_orch_submit_idx = 0;
-uint64_t g_orch_alloc_wait_cycle = 0;
-uint64_t g_orch_fanin_wait_cycle = 0;
-uint64_t g_orch_alloc_atomic_count = 0;
-uint64_t g_orch_args_atomic_count = 0;
-uint64_t g_orch_scope_end_atomic_count = 0;
-// Cycle accumulation is unconditional under PTO2_ORCH_PROFILING (that's what
-// the flag is for) and feeds the per-sub-step `g_orch_*_cycle` cumulatives
-// printed in the cold-path log.
-//
-// Per-submit ORCH_SUBMIT record is the only swim-lane emit on the orch
-// path — one record per submit_task() / alloc_tensors() call spanning
-// the entire [start, end] window. Per-sub-step phase records were dropped
-// in favour of the cumulatives + per-submit envelope; the dispatcher
-// already inserts one record at the end of each submit path via
-// CYCLE_COUNT_ORCH_SUBMIT_RECORD.
-#define CYCLE_COUNT_START()                                                        \
-    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
-    uint64_t _t0 = get_sys_cnt_aicpu(), _t1;                                       \
-    uint64_t _submit_start_ts = _t0
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
-    do {                                                                                          \
-        if (_prof_active) {                                                                       \
-            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
-        }                                                                                         \
-    } while (0)
-#elif PTO2_PROFILING
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-__attribute__((weak, visibility("hidden"))) void
-l2_swimlane_aicpu_record_orch_phase(uint64_t, uint64_t, uint64_t, uint32_t) {}
-// submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level)
-static uint32_t g_orch_submit_idx = 0;
-#define CYCLE_COUNT_START()                                                        \
-    bool _prof_active = (orch->l2_swimlane_level >= L2SwimlaneLevel::ORCH_PHASES); \
-    uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0;                \
-    uint64_t _submit_start_ts = _t0
-#define CYCLE_COUNT_LAP(acc) \
-    do {                     \
-    } while (0)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)                                                       \
-    do {                                                                                          \
-        if (_prof_active) {                                                                       \
-            _t1 = get_sys_cnt_aicpu();                                                            \
-            l2_swimlane_aicpu_record_orch_phase(_submit_start_ts, _t1, (tid), g_orch_submit_idx); \
-        }                                                                                         \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#define CYCLE_COUNT_ORCH_SUBMIT_RECORD(tid)
-#endif
-
-static int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code) {
-    always_assert(orch != nullptr);
-    orch->fatal = true;
-    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) {
-        return PTO2_ERROR_NONE;
-    }
-
-    int32_t expected = PTO2_ERROR_NONE;
-    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
-    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
-        return error_code;
-    }
-    return expected;
-}
-
-static void
-orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *func, const char *fmt, va_list args) {
-    int32_t latched_code = orch_mark_fatal(orch, error_code);
-
-#if PTO2_PROFILING
-    // Flush the current scope's peaks BEFORE the FATAL log line, so the
-    // diagnostic context (which pool/window filled up) appears right next to
-    // the failure reason. on_fatal is latched, so duplicate fatals from
-    // different layers don't print multiple stats lines.
-    scope_stats_on_fatal();
-#endif
-
-    if (fmt == nullptr || fmt[0] == '\0') {
-        if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
-            unified_log_error(func, "FATAL(code=%d, latched=%d)", error_code, latched_code);
-        } else {
-            unified_log_error(func, "FATAL(code=%d)", error_code);
-        }
-        return;
-    }
-
-    char message[1024];
-    vsnprintf(message, sizeof(message), fmt, args);
-    if (latched_code != PTO2_ERROR_NONE && latched_code != error_code) {
-        unified_log_error(func, "FATAL(code=%d, latched=%d): %s", error_code, latched_code, message);
-        return;
-    }
-    unified_log_error(func, "FATAL(code=%d): %s", error_code, message);
-}
-
-void PTO2OrchestratorState::report_fatal(int32_t error_code, const char *func, const char *fmt, ...) {
-    auto *orch = this;
-    va_list args;
-    va_start(args, fmt);
-    orch_report_fatal_v(orch, error_code, func, fmt, args);
-    va_end(args);
-}
-
-static uint32_t next_fanin_seen_epoch(PTO2OrchestratorState *orch) {
-    uint32_t next = orch->fanin_seen_current_epoch + 1;
-    if (next == 0) {
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            memset(
-                orch->fanin_seen_epoch[r], 0,
-                static_cast<size_t>(orch->sm_header->rings[r].task_window_size) * sizeof(uint32_t)
-            );
-        }
-        next = 1;
-    }
-    orch->fanin_seen_current_epoch = next;
-    return next;
-}
-
-struct PTO2FaninBuilder {
-    PTO2FaninBuilder(PTO2OrchestratorState *orch, PTO2FaninPool &spill_pool, uint32_t seen_epoch) :
-        count(0),
-        spill_start(0),
-        orch(orch),
-        seen_epoch(seen_epoch),
-        spill_pool(spill_pool) {}
-    int32_t count{0};
-    int32_t spill_start{0};
-    PTO2OrchestratorState *orch{nullptr};
-    uint32_t seen_epoch{0};
-    PTO2FaninPool &spill_pool;
-    PTO2TaskSlotState *inline_slots[PTO2_FANIN_INLINE_CAP];
-
-    template <typename Fn>
-    PTO2FaninForEachReturn<Fn> for_each(Fn &&fn) const {
-        return for_each_fanin_storage(inline_slots, count, spill_start, spill_pool, static_cast<Fn &&>(fn));
-    }
-
-    bool mark_seen(uint8_t prod_ring, int32_t prod_slot) {
-        if (prod_ring >= PTO2_MAX_RING_DEPTH || prod_slot < 0) {
-            return false;
-        }
-        uint32_t *seen = orch->fanin_seen_epoch[prod_ring];
-        uint32_t slot = static_cast<uint32_t>(prod_slot);
-        if (seen[slot] == seen_epoch) {
-            return true;
-        }
-        seen[slot] = seen_epoch;
-        return false;
-    }
-};
-
-static bool append_fanin_or_fail(
-    PTO2OrchestratorState *orch, uint8_t prod_ring, int32_t prod_slot, PTO2TaskSlotState *prod_state,
-    PTO2FaninBuilder *fanin_builder, uint8_t ring_id
-) {
-    if (fanin_builder->mark_seen(prod_ring, prod_slot)) {
-        return true;
-    }
-
-    if (fanin_builder->count < PTO2_FANIN_INLINE_CAP) {
-        fanin_builder->inline_slots[fanin_builder->count++] = prod_state;
-        return true;
-    }
-
-    PTO2FaninPool &fanin_pool = fanin_builder->spill_pool;
-    if (!fanin_pool.ensure_space(orch->sm_header->rings[ring_id], 1)) {
-        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
-        return false;
-    }
-    int32_t spill_idx = fanin_pool.top;
-    PTO2FaninSpillEntry *entry = fanin_pool.alloc();
-    if (entry == nullptr) {
-        orch_mark_fatal(orch, PTO2_ERROR_DEP_POOL_OVERFLOW);
-        return false;
-    }
-    if (fanin_builder->count == PTO2_FANIN_INLINE_CAP) {
-        fanin_builder->spill_start = spill_idx;
-    }
-    entry->slot_state = prod_state;
-    fanin_builder->count++;
-    return true;
-}
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
-
-struct PTO2PreparedTask {
-    PTO2TaskId task_id = PTO2TaskId::invalid();
-    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
-    PTO2TaskDescriptor *task = nullptr;
-    PTO2TaskPayload *payload = nullptr;
-    PTO2TaskSlotState *slot_state = nullptr;
-};
-
-static PTO2OutputLayout calculate_output_layout(const Arg &args) {
-    PTO2OutputLayout layout;
-    for (int32_t i = 0; i < args.tensor_count(); i++) {
-        if (args.tag(i) != TensorArgType::OUTPUT) {
-            continue;
-        }
-        layout.offsets[i] = layout.total_output_size;
-        layout.buffer_sizes[i] =
-            PTO2_ALIGN_UP(args.tensor(i).create_info->buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
-        layout.total_output_size += layout.buffer_sizes[i];
-    }
-    return layout;
-}
-
-static bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator, uint8_t ring_id) {
-    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
-
-    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
-    if (scope_task_count < allocator.window_size() - 1) {
-        return true;
-    }
-
-    int32_t active_count = allocator.active_count();
-
-    LOG_ERROR("========================================");
-    LOG_ERROR("FATAL: Scope Deadlock Detected! (ring %d)", ring_id);
-    LOG_ERROR("========================================");
-    LOG_ERROR("Tasks in current scope (%d) >= task_window_size (%d).", scope_task_count, allocator.window_size());
-    LOG_ERROR("  scope_depth:        %d", orch->scope_stack_top + 1);
-    LOG_ERROR("  ring_id:            %d", ring_id);
-    LOG_ERROR("  scope_task_count:   %d", scope_task_count);
-    LOG_ERROR("  active_tasks:       %d / %d", active_count, allocator.window_size());
-    LOG_ERROR("Root Cause:");
-    LOG_ERROR("  Tasks within a scope hold a fanout_count reference that is only");
-    LOG_ERROR("  released at scope_end. When scope task count >= window_size,");
-    LOG_ERROR("  no slots can be reclaimed -> deadlock.");
-    LOG_ERROR("Solution:");
-    LOG_ERROR("  1. Reduce tasks per scope (use batching/unroll)");
-    LOG_ERROR("  2. Increase task window (current: %d)", allocator.window_size());
-    LOG_ERROR("     Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
-    LOG_ERROR("     Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2>");
-    LOG_ERROR("  3. Split work across multiple scopes");
-    LOG_ERROR("========================================");
-    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
-    return false;
-}
-
-static bool prepare_task(
-    PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask,
-    PTO2PreparedTask *out
-) {
-    uint8_t ring_id = orch->current_ring_id();
-    auto &allocator = orch->rings[ring_id].task_allocator;
-
-    if (!check_scope_can_accept_task(orch, allocator, ring_id)) {
-        return false;
-    }
-
-    out->alloc_result = allocator.alloc(total_output_size);
-    if (out->alloc_result.failed()) {
-        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
-        return false;
-    }
-
-    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
-    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
-    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
-    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
-
-    out->payload->prefetch(args.tensor_count(), args.scalar_count());
-
-    // Re-bind payload/task pointers each submit. Value is per-slot constant
-    // (same as &task_payloads[slot] / &task_descriptors[slot]), but writing
-    // here lets RingSchedState::init() skip the O(window_size) bind loop.
-    // Both writes hit the same 64B slot_state cache line we're about to
-    // dirty below, so the extra cost is two stores on an already-hot line.
-    // Must precede the scheduler wiring.queue.push at the end of
-    // submit_task_common — that push is the first read of slot_state->task /
-    // slot_state->payload by another thread.
-    out->slot_state->bind_buffers(out->payload, out->task);
-
-    // prepare_task does NO payload writes: all payload content (tensors/scalars +
-    // early-dispatch spec fields) is initialized in PTO2TaskPayload::init, the
-    // single payload-init point, which runs before the scheduler wiring push.
-
-    // Fields already reset by advance_ring_pointers (eager reset after CONSUMED):
-    //   fanout_lock=0, fanout_count=1, fanout_head=nullptr,
-    //   fanin_refcount=0, fanout_refcount=0, completed_subtasks=0, next_block_idx=0
-    // Fields immutable after RingSchedState::init():
-    //   ring_id
-    // task_state left as CONSUMED by eager reset (safe for stale wait_for_tensor
-    // observers); set to PENDING here when orchestrator actually reuses the slot.
-    out->slot_state->task_state.store(PTO2_TASK_PENDING, std::memory_order_relaxed);
-    int16_t block_num = args.launch_spec.block_num();
-    out->slot_state->total_required_subtasks =
-        static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
-    out->slot_state->logical_block_num = block_num;
-    out->slot_state->active_mask = active_mask;
-    // fanin_count is set by scheduler during wiring
-    scope_tasks_push(orch, out->slot_state);
-
-    return true;
-}
-
-// =============================================================================
-// Scope Management
-// =============================================================================
-
-static void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state) {
-    if (orch->scope_tasks_size >= orch->scope_tasks_capacity) {
-        // scope_tasks lives in the per-Worker arena (single backing allocation),
-        // so realloc is not legal. Capacity == PTO2_SCOPE_TASKS_CAP ==
-        // PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH, the total in-flight slot
-        // budget — hitting it means every ring is saturated, so no further push
-        // could succeed regardless of buffer growth.
-        orch->report_fatal(
-            PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__,
-            "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity
-        );
-        return;
-    }
-    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
-}
-
-void PTO2OrchestratorState::begin_scope(PTO2ScopeMode mode) {
-    auto *orch = this;
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
-    if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope()) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
-        return;
-    }
-
-    bool already_in_manual_scope = orch->in_manual_scope();
-    ++orch->scope_stack_top;
-    orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
-    if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) {
-        orch->manual_begin_depth = orch->scope_stack_top;
-    }
-#if PTO2_PROFILING
-    // Gate via is_scope_stats_enabled() (weak-false in host builds) BEFORE the
-    // collector call: when disabled we pay nothing. Sample the current ring's
-    // task/heap start-end and tensormap usage at the scope boundary.
-    if (is_scope_stats_enabled()) {
-        uint8_t ring_id = orch->current_ring_id();
-        auto &alloc = orch->rings[ring_id].task_allocator;
-        int32_t dep_pool_tail = 0;
-        int32_t dep_pool_top = 0;
-        if (orch->scheduler) {
-            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
-        }
-        scope_stats_begin(
-            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
-            dep_pool_top, orch->tensor_map.current_used()
-        );
-    }
-#endif
-}
-
-void PTO2OrchestratorState::end_scope() {
-    auto *orch = this;
-    if (orch->fatal) {
-        return;
-    }
-    assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
-
-    // Snapshot the ring start/end BEFORE the orchestrator drains pending tasks
-    // via scheduler->on_scope_end, so the end record reflects the scope's
-    // occupancy at close, not the residual after teardown.
-#if PTO2_PROFILING
-    // Gate via is_scope_stats_enabled() (see begin_scope). One collector call
-    // emits the end-boundary record and tears down bookkeeping.
-    if (is_scope_stats_enabled()) {
-        uint8_t ring_id = orch->current_ring_id();
-        auto &alloc = orch->rings[ring_id].task_allocator;
-        int32_t dep_pool_tail = 0;
-        int32_t dep_pool_top = 0;
-        if (orch->scheduler) {
-            orch->scheduler->ring_sched_states[ring_id].read_dep_pool_snapshot(dep_pool_tail, dep_pool_top);
-        }
-        scope_stats_end(
-            ring_id, alloc.task_tail(), alloc.task_head(), alloc.heap_tail(), alloc.heap_top(), dep_pool_tail,
-            dep_pool_top, orch->tensor_map.current_used()
-        );
-    }
-#endif
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se0 = get_sys_cnt_aicpu();
-#endif
-
-    bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
-    int32_t begin = orch->scope_begins[orch->scope_stack_top--];
-    int32_t count = orch->scope_tasks_size - begin;
-    if (ending_manual_scope) {
-        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-    }
-
-    if (orch->scheduler && count > 0) {
-        orch->scheduler->on_scope_end(&orch->scope_tasks[begin], count);
-    }
-
-    // Rewind the task buffer — these entries are no longer needed
-    orch->scope_tasks_size = begin;
-
-#if PTO2_ORCH_PROFILING
-    uint64_t _se1 = get_sys_cnt_aicpu();
-    g_orch_scope_end_cycle += (_se1 - _se0);
-#endif
-}
-
-// =============================================================================
-// Task Submission
-// =============================================================================
-
-// Shared body for submit_task / submit_dummy_task. Caller has already validated
-// args.has_error, decided active_mask (empty for dummy), and resolved the per-slot
-// kernel_ids (all INVALID_KERNEL_ID for dummy). Performs tensormap sync, fanin
-// computation (explicit_deps + auto), output registration, slot init, and pushes
-// to the scheduler wiring queue.
-static TaskOutputTensors submit_task_common(
-    PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id,
-    int32_t aiv1_kernel_id
-) {
-    CYCLE_COUNT_START();
-    TaskOutputTensors result;
-    PTO2OutputLayout layout = calculate_output_layout(args);
-    PTO2PreparedTask prepared;
-    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) {
-        return result;
-    }
-    uint8_t ring_id = prepared.task_id.ring();
-    PTO2SchedulerState *sched = orch->scheduler;
-    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
-    PTO2TaskId task_id = prepared.task_id;
-    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
-    PTO2TaskDescriptor &task = *prepared.task;
-    PTO2TaskPayload &payload = *prepared.payload;
-    result.set_task_id(task_id);
-
-    // dep_gen capture point: snapshot the orch submit_task inputs while the
-    // tensormap is still in its pre-lookup state for this task. Replay reads
-    // these records offline to reconstruct the complete dep graph — the sole
-    // source of truth for fanout now that the swimlane hot path no longer
-    // records it.
-    if (is_dep_gen_enabled()) {
-        const void *tensor_ptrs[MAX_TENSOR_ARGS];
-        // TensorArgType is `enum class : int32_t` (4 bytes); the on-disk record
-        // packs arg_types as uint8_t[16] (5-value enum fits in a byte). Narrow
-        // each tag here rather than letting the AICPU writer reinterpret a
-        // 4×-wider array as bytes — that path silently lost two of every three
-        // tags on little-endian and synthesized phantom self-edges in replay.
-        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
-        // Clamp to MAX_TENSOR_ARGS even though the Arg builder caps adds at
-        // MAX_TENSOR_ARGS: defensive against any future builder bypass /
-        // shared-memory bit-flip that could otherwise overrun the two
-        // MAX_TENSOR_ARGS-sized stack buffers above.
-        const int tc_raw = args.tensor_count();
-        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
-        for (int i = 0; i < tc; i++) {
-            // OUTPUT slots carry create_info (not yet a Tensor); skip them —
-            // they have no producer to look up and replay's per-tensor loop
-            // also skips OUTPUT.
-            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : args.tensor(i).ptr;
-            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
-        }
-        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
-        dep_gen_aicpu_record_submit(
-            task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8,
-            static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()),
-            kernel_ids_capture
-        );
-    }
-
-    PTO2FaninBuilder fanin_builder(orch, orch->rings[ring_id].fanin_pool, next_fanin_seen_epoch(orch));
-
-    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
-
-#if PTO2_PROFILING
-    if (layout.total_output_size > 0) {
-        orch->buffers_allocated++;
-        orch->bytes_allocated += layout.total_output_size;
-    }
-#endif
-
-    // === STEP 2: Sync TensorMap validity and optional cleanup ===
-    // Read current last_task_alive from shared memory for this ring
-    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
-
-    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
-
-    CYCLE_COUNT_LAP(g_orch_sync_cycle);
-
-    for (uint32_t i = 0; i < args.explicit_dep_count(); i++) {
-        PTO2TaskId dep_task_id = args.explicit_dep(i);
-        if (!dep_task_id.is_valid()) {
-            orch->report_fatal(
-                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids"
-            );
-            return result;
-        }
-        uint8_t dep_ring_id = dep_task_id.ring();
-        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_ring_id];
-        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
-        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (dep_local_task_id < dep_last_task_alive) {
-            continue;
-        }
-        int32_t dep_slot = dep_ring.get_slot_by_task_id(dep_local_task_id);
-        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_slot(dep_slot);
-        if (!append_fanin_or_fail(orch, dep_ring_id, dep_slot, producer_slot_state, &fanin_builder, ring_id)) {
-            return result;
-        }
-    }
-
-    // === STEP 3: Lookup inputs (creator retention + tensormap modifier lookup) ===
-    DepInputs dep_inputs{
-        args.tensor_count(),       args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()),
-        args.explicit_deps_data(),
-    };
-
-    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
-        uint8_t prod_ring = producer_task_id.ring();
-        PTO2SharedMemoryRingHeader &producer_ring = orch->sm_header->rings[prod_ring];
-        int32_t prod_slot = producer_ring.get_slot_by_task_id(static_cast<int32_t>(producer_task_id.local()));
-        PTO2TaskSlotState *prod_state = &producer_ring.get_slot_state_by_slot(prod_slot);
-        return append_fanin_or_fail(orch, prod_ring, prod_slot, prod_state, &fanin_builder, ring_id);
-    };
-
-    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) {
-        return result;
-    }
-
-    CYCLE_COUNT_LAP(g_orch_lookup_cycle);
-
-    // === STEP 4: Register outputs/inouts in TensorMap (must be separate from lookup) ===
-    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
-
-    CYCLE_COUNT_LAP(g_orch_insert_cycle);
-
-    // === STEP 5: Batch-write to GM (single cache line burst) ===
-    // Deferred from allocation phase to avoid scattered GM writes that get
-    // evicted by TensorMap lookup/insert cache pressure.
-    __builtin_prefetch(&task, 1, 1);
-    task.task_id = task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
-    task.packed_buffer_base = prepared.alloc_result.packed_base;
-    task.packed_buffer_end = prepared.alloc_result.packed_end;
-
-    // Increment fanout_count on each producer (no lock — only orch writes this field).
-    // Prevents premature CONSUMED: scope_end's release_producer checks fanout_refcount == fanout_count.
-    for_each_fanin_storage(
-        fanin_builder.inline_slots, fanin_builder.count, fanin_builder.spill_start, fanin_builder.spill_pool,
-        [](PTO2TaskSlotState *producer) {
-            producer->fanout_count++;
-        }
-    );
-
-    int32_t inline_count = std::min(fanin_builder.count, PTO2_FANIN_INLINE_CAP);
-    // Store fanin metadata in payload for scheduler to iterate
-    payload.fanin_actual_count = fanin_builder.count;
-    payload.fanin_spill_start = fanin_builder.spill_start;
-    payload.fanin_spill_pool = &fanin_builder.spill_pool;
-    for (int i = 0; i < inline_count; i++) {
-        payload.fanin_inline_slot_states[i] = fanin_builder.inline_slots[i];
-    }
-
-    payload.init(args, result, prepared.alloc_result, layout);
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        if (args.scalar_count() > 0) {
-            set_dump_args_task_scalar_dtypes(
-                task_id.raw, static_cast<uint32_t>(args.scalar_count()), args.scalar_dtypes()
-            );
-        }
-        // Selective vs full dump is latched at dump_args_init from DumpDataHeader
-        // (host-decided before any dispatch), so it is race-free regardless of
-        // submission order. Here we only record each marked task's arg mask and
-        // metadata flags, which selective collection consults.
-        if (args.dump_arg_mask() != 0) {
-            set_dump_args_task_mask(task_id.raw, args.dump_arg_mask(), args.dump_arg_index_ambiguous_mask());
-        }
-    }
-#endif
-
-    CYCLE_COUNT_LAP(g_orch_args_cycle);
-#if PTO2_ORCH_PROFILING
-    g_orch_args_atomic_count += 2;  // fanout_lock.store + fanout_count.store
-#endif
-
-    // === STEP 6: push to wiring queue ===
-    // Deferred wiring: orchestrator only stores dependency metadata and increments
-    // fanout_count. The actual fanout_head wiring (lock + dep_pool + early_finished)
-    // is handled asynchronously by scheduler thread 0 via the wiring queue.
-    // Push to global wiring queue — scheduler sets fanin_count, wires fanout, checks readiness
-    while (!sched->wiring.queue.push(&cur_slot_state)) {
-        SPIN_WAIT_HINT();
-    }
-
-    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
-    CYCLE_COUNT_ORCH_SUBMIT_RECORD(task_id.raw);
-
-#if PTO2_PROFILING
-    orch->tasks_submitted++;
-#if PTO2_ORCH_PROFILING
-    g_orch_submit_count++;
-#endif
-    g_orch_submit_idx++;
-#endif
-    return result;
-}
-
-TaskOutputTensors PTO2OrchestratorState::submit_task(const MixedKernels &mixed_kernels, const Arg &args) {
-    auto *orch = this;
-
-    // Orchestration API should short-circuit after fatal, but keep this entry
-    // robust as a no-op in case a caller reaches it directly.
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    // Validate Arg construction (errors recorded by add_input/add_output/etc.)
-    if (args.has_error) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Invalid Arg Detected!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
-        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
-        LOG_ERROR("This is a bug in the orchestration code.");
-        LOG_ERROR("========================================");
-        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
-        return TaskOutputTensors{};
-    }
-    always_assert(orch->scheduler != nullptr);
-    // === Validate submit inputs ===
-    ActiveMask active_mask = mixed_kernels.to_active_mask();
-    always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
-
-    int16_t block_num = args.launch_spec.block_num();
-    always_assert(block_num >= 1 && "block_num must be >= 1");
-
-    // Normalize single-AIV tasks: if only aiv1 is set (no aic, no aiv0), move
-    // it to the aiv0 slot.  This guarantees the dispatch path can always use
-    // PTO2SubtaskSlot::AIV0 for single-AIV shapes without inspecting active_mask.
-    // Mixed tasks (AIC+AIV) keep their original AIV identity so the correct
-    // hardware channel (AIV0→AIC vs AIV1→AIC) is used at dispatch time.
-    MixedKernels normalized = mixed_kernels;
-    bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
-    bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
-    bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
-    if (!has_aic && has_aiv1 && !has_aiv0) {
-        normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
-        normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
-        active_mask = normalized.to_active_mask();
-    }
-
-    // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
-    if (block_num > 1 && args.launch_spec.require_sync_start()) {
-        // Deadlock check: block_num >= total available slots of the required type.
-        // For MIX/AIC: limit is total_cluster_count (one AIC per cluster).
-        // For AIV:     limit is total_aiv_count.
-        PTO2ResourceShape shape = active_mask.to_shape();
-        int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
-        if (limit > 0 && block_num > limit) {
-            report_fatal(
-                PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__,
-                "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit
-            );
-            return TaskOutputTensors{};
-        }
-        active_mask.set_sync_start();
-    }
-
-    return submit_task_common(
-        orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id
-    );
-}
-
-// Submit a dependency-only task: full dependency graph participation
-// (tensormap lookup/insert, explicit_deps, manual_dep, manual_scope) but no
-// AICore dispatch. Empty active_mask routes the slot to the DUMMY ready
-// bucket; dispatch loop short-circuits to completion. Accepts the same Arg
-// shape as submit_task; scalars are permitted but never consumed.
-TaskOutputTensors PTO2OrchestratorState::submit_dummy_task(const Arg &args) {
-    auto *orch = this;
-
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    if (args.has_error) {
-        LOG_ERROR("========================================");
-        LOG_ERROR("FATAL: Invalid Arg in submit_dummy_task!");
-        LOG_ERROR("========================================");
-        LOG_ERROR("Error: %s", args.error_msg ? args.error_msg : "(unknown)");
-        LOG_ERROR("  tensor_count: %d, scalar_count: %d", args.tensor_count(), args.scalar_count());
-        LOG_ERROR("========================================");
-        orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
-        return TaskOutputTensors{};
-    }
-    always_assert(orch->scheduler != nullptr);
-
-    return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
-}
-
-TaskOutputTensors PTO2OrchestratorState::alloc_tensors(const Arg &args) {
-    auto *orch = this;
-    // Orchestration API should short-circuit after fatal, but keep this entry
-    // robust as a no-op in case a caller reaches it directly.
-    if (orch->fatal) {
-        return TaskOutputTensors{};
-    }
-
-    if (args.tensor_count() <= 0) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
-        return TaskOutputTensors{};
-    }
-    if (args.scalar_count() != 0) {
-        report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
-        return TaskOutputTensors{};
-    }
-    for (int32_t i = 0; i < args.tensor_count(); i++) {
-        if (args.tag(i) != TensorArgType::OUTPUT) {
-            report_fatal(
-                PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args"
-            );
-            return TaskOutputTensors{};
-        }
-    }
-
-    CYCLE_COUNT_START();
-
-    if (args.has_error) {
-        report_fatal(
-            PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s",
-            args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg"
-        );
-        return TaskOutputTensors{};
-    }
-
-    PTO2OutputLayout layout = calculate_output_layout(args);
-    PTO2PreparedTask prepared;
-    if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) {
-        return TaskOutputTensors{};
-    }
-
-    PTO2TaskDescriptor &task = *prepared.task;
-    PTO2TaskPayload &payload = *prepared.payload;
-
-    CYCLE_COUNT_LAP(g_orch_alloc_cycle);
-
-#if PTO2_PROFILING
-    if (layout.total_output_size > 0) {
-        orch->buffers_allocated++;
-        orch->bytes_allocated += layout.total_output_size;
-    }
-#endif
-
-    task.task_id = prepared.task_id;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
-    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
-    task.packed_buffer_base = prepared.alloc_result.packed_base;
-    task.packed_buffer_end = prepared.alloc_result.packed_end;
-
-    TaskOutputTensors outputs;
-    outputs.set_task_id(prepared.task_id);
-    payload.init(args, outputs, prepared.alloc_result, layout);
-    payload.fanin_actual_count = 0;
-    payload.fanin_spill_start = 0;
-    payload.fanin_spill_pool = &orch->rings[prepared.task_id.ring()].fanin_pool;
-    CYCLE_COUNT_LAP(g_orch_args_cycle);
-
-    if (prepared.slot_state != nullptr) {
-        // Hidden alloc tasks complete inline in the orchestrator before any
-        // consumer can exist, so they have no fanout to notify and no worker
-        // subtasks to retire. Running the full on_task_complete path
-        // would only pay unnecessary fanout_lock / traversal overhead here.
-        // The generic slot initialization done in prepare_task() is still
-        // required so scope_end can release the producer-side reference and
-        // drive the slot to CONSUMED, but worker dispatch fields are never
-        // observed for hidden alloc tasks.
-        prepared.slot_state->task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-    }
-    orch->inline_completed_tasks++;
-
-    CYCLE_COUNT_LAP(g_orch_fanin_cycle);
-    CYCLE_COUNT_ORCH_SUBMIT_RECORD(prepared.task_id.raw);
-
-#if PTO2_PROFILING
-    orch->tasks_submitted++;
-#if PTO2_ORCH_PROFILING
-    g_orch_submit_count++;
-#endif
-    g_orch_submit_idx++;
-#endif
-
-    return outputs;
-}
-
-// =============================================================================
-// Flow Control
-// =============================================================================
-
-void PTO2OrchestratorState::mark_done() {
-    auto *orch = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        int32_t total_tasks = orch->rings[r].task_allocator.active_count();
-        if (total_tasks > 0) {
-            LOG_INFO_V0("=== [Orchestrator] ring %d: total_tasks=%d ===", r, total_tasks);
-        }
-        auto &fanin_pool = orch->rings[r].fanin_pool;
-        if (fanin_pool.top > 1) {
-            LOG_INFO_V0(
-                "=== [FaninPool %d] top=%d tail=%d used=%d high_water=%d capacity=%d ===", r, fanin_pool.top,
-                fanin_pool.tail, fanin_pool.top - fanin_pool.tail, fanin_pool.high_water, fanin_pool.capacity
-            );
-        }
-    }
-    orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
-    orch->scope_tasks_size = 0;
-    orch->scope_stack_top = -1;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-#if !PTO2_ORCH_PROFILING && PTO2_PROFILING
-    g_orch_submit_idx = 0;
-#endif
-}
-
-#if PTO2_ORCH_PROFILING
-PTO2OrchProfilingData orchestrator_get_profiling() {
-    PTO2OrchProfilingData d;
-    d.sync_cycle = g_orch_sync_cycle;
-    d.alloc_cycle = g_orch_alloc_cycle;
-    d.args_cycle = g_orch_args_cycle;
-    d.lookup_cycle = g_orch_lookup_cycle;
-    d.insert_cycle = g_orch_insert_cycle;
-    d.fanin_cycle = g_orch_fanin_cycle;
-    d.scope_end_cycle = g_orch_scope_end_cycle;
-    d.submit_count = g_orch_submit_count;
-    d.alloc_wait_cycle = g_orch_alloc_wait_cycle;
-    d.fanin_wait_cycle = g_orch_fanin_wait_cycle;
-    d.alloc_atomic_count = g_orch_alloc_atomic_count;
-    d.args_atomic_count = g_orch_args_atomic_count;
-    d.scope_end_atomic_count = g_orch_scope_end_atomic_count;
-
-    // Reset
-    g_orch_sync_cycle = g_orch_alloc_cycle = g_orch_args_cycle = 0;
-    g_orch_lookup_cycle = g_orch_insert_cycle = 0;
-    g_orch_fanin_cycle = g_orch_scope_end_cycle = 0;
-    g_orch_submit_count = 0;
-    g_orch_submit_idx = 0;
-    g_orch_alloc_wait_cycle = 0;
-    g_orch_fanin_wait_cycle = 0;
-    g_orch_alloc_atomic_count = 0;
-    g_orch_args_atomic_count = 0;
-    g_orch_scope_end_atomic_count = 0;
-    return d;
-}
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
index 1471f6a2f..41a949a3d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h
@@ -8,22 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Orchestrator Interface
- *
- * The Orchestrator is responsible for:
- * 1. Executing the orchestration function (Turing-complete control flow)
- * 2. Allocating intermediate buffers from the heap
- * 3. Submitting tasks via async InCore function calls
- * 4. Building the dependency graph using TensorMap
- * 5. Managing buffer scopes for lifecycle control
- *
- * The Orchestrator can run on either:
- * - Host CPU (lower latency for complex control, easier debugging)
- * - Device AI_CPU (lower latency for task submission)
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #ifndef PTO_ORCHESTRATOR_H
 #define PTO_ORCHESTRATOR_H
@@ -33,19 +17,59 @@
 #include "pto_ring_buffer.h"
 #include "pto_runtime2_types.h"
 #include "pto_submit_types.h"
-#include "scheduler/pto_scheduler.h"
+#include "pto_scheduler.h"
 #include "pto_shared_memory.h"
 #include "pto_tensormap.h"
 #include "pto_types.h"
 
-/**
- * Layout descriptor produced by PTO2OrchestratorState::reserve_layout(). Holds
- * arena offsets for every sub-region the orchestrator owns (per-ring fanin
- * pools, scope arrays, plus the nested PTO2TensorMap layout).
- */
-struct PTO2OrchestratorLayout {
-    size_t off_fanin_pool[PTO2_MAX_RING_DEPTH];
-    size_t off_fanin_seen_epoch[PTO2_MAX_RING_DEPTH];
+#include <stdarg.h>
+#include <stdio.h>
+#include "aicpu/dep_gen_collector_aicpu.h"
+#include "common/dep_gen.h"
+#include "pto_dep_compute.h"
+#include "tensor.h"
+
+struct PTO2OrchestratorState;
+
+// Full definitions of helper aggregate types that the inline methods on
+// PTO2OrchestratorState (and the helpers below) construct by value.
+struct PTO2PreparedTask
+{
+    PTO2TaskId task_id = PTO2TaskId::invalid();
+    PTO2TaskAllocResult alloc_result = {-1, 0, nullptr, nullptr};
+    PTO2TaskDescriptor *task = nullptr;
+    PTO2TaskPayload *payload = nullptr;
+    PTO2TaskSlotState *slot_state = nullptr;
+};
+
+struct PTO2FaninBuilder
+{
+    int32_t count{0};
+    PTO2TaskSlotState *slots[PTO2_MAX_FANIN];
+    int32_t local_ids[PTO2_MAX_FANIN];
+
+    bool contains(PTO2TaskSlotState *prod_state) const
+    {
+        for (int32_t i = 0; i < count; i++)
+            if (slots[i] == prod_state) return true;
+        return false;
+    }
+};
+
+// Forward declarations of helpers defined below — needed because the inline
+// methods on PTO2OrchestratorState reference them.
+inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code);
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *fmt, va_list args);
+inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state);
+inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out);
+inline PTO2OutputLayout calculate_output_layout(const Arg &args);
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder);
+inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator);
+inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count);
+inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id);
+
+struct PTO2OrchestratorLayout
+{
     size_t off_scope_tasks;
     size_t off_scope_begins;
     PTO2TensorMapLayout tensor_map;
@@ -54,16 +78,8 @@ struct PTO2OrchestratorLayout {
     uint64_t scope_stack_capacity;
 };
 
-// =============================================================================
-// Orchestrator State
-// =============================================================================
-
-/**
- * Orchestrator state structure (private to Orchestrator)
- *
- * Contains all state needed for task graph construction and buffer management.
- */
-struct PTO2OrchestratorState {
+struct PTO2OrchestratorState
+{
     // === SHARED MEMORY ACCESS ===
     PTO2SharedMemoryHeader *sm_header;
 
@@ -75,10 +91,6 @@ struct PTO2OrchestratorState {
     // === TENSOR MAP (Private) ===
     PTO2TensorMap tensor_map;  // Producer lookup
 
-    // === SCOPE STACK (Private) ===
-    // Single contiguous buffer of task IDs, partitioned by scope level.
-    // scope_begins[i] is the index into scope_tasks where scope i starts.
-    // Tasks for the top scope occupy [scope_begins[top], scope_tasks_size).
     PTO2TaskSlotState **scope_tasks;  // Flat buffer of taskSlotState (all scopes concatenated)
     int32_t scope_tasks_size;         // Number of task IDs currently in the buffer
     int32_t scope_tasks_capacity;     // Allocated capacity of scope_tasks
@@ -87,115 +99,496 @@ struct PTO2OrchestratorState {
     uint64_t scope_stack_capacity;    // Max nesting depth (PTO2_MAX_SCOPE_DEPTH)
     int32_t manual_begin_depth{PTO2_MAX_SCOPE_DEPTH};
 
-    // === SCHEDULER REFERENCE ===
-    // Note: In simulated mode, orchestrator and scheduler share address space
-    // In real mode, they communicate via shared memory only
     PTO2SchedulerState *scheduler;  // For simulated mode only
 
     // Total core counts set once at executor init; used for submit-time deadlock detection.
     int32_t total_cluster_count{0};  // AIC cores = MIX clusters
     int32_t total_aiv_count{0};      // AIV cores (= 2 × clusters on standard hardware)
-#if PTO2_PROFILING
-    // L2 swimlane_level copied from get_l2_swimlane_level().
-    L2SwimlaneLevel l2_swimlane_level{L2SwimlaneLevel::DISABLED};
-#endif
 
     // === GM HEAP (for output buffers) ===
     void *gm_heap_base;     // Base address of GM heap
     uint64_t gm_heap_size;  // Total size of GM heap (all rings)
 
-    // === FATAL ERROR ===
-    // Fatal error flag (single-thread access by orchestrator, no atomic needed)
-    // Cross-thread notification uses shared memory orch_error_code (atomic)
     bool fatal;
 
-    // Hidden alloc tasks complete synchronously inside the orchestrator and
-    // therefore bypass the executor's normal worker-completion counter path.
-    // The executor adds this count into its completed_tasks_ progress counter
-    // after orchestration finishes so shutdown/profiling totals remain closed.
     int64_t inline_completed_tasks{0};
 
     // === STATISTICS ===
-#if PTO2_PROFILING
-    int64_t tasks_submitted;
-    int64_t buffers_allocated;
-    int64_t bytes_allocated;
-#endif
-
-    /**
-     * Get current ring index from scope depth.
-     * Maps scope depth to ring_id: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
-     */
-    uint8_t current_ring_id() const {
+
+    uint8_t current_ring_id() const
+    {
         int32_t depth = scope_stack_top;
         if (depth < 0) depth = 0;
         return depth < PTO2_MAX_RING_DEPTH ? static_cast<uint8_t>(depth) : PTO2_MAX_RING_DEPTH - 1;
     }
 
-    bool in_manual_scope() const { return scope_stack_top >= manual_begin_depth; }
+    bool in_manual_scope() const
+    {
+        return scope_stack_top >= manual_begin_depth;
+    }
+
+    // === Cold-path API ===
+
+    static PTO2OrchestratorLayout reserve_layout(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity)
+    {
+        PTO2OrchestratorLayout layout{};
+        layout.dep_pool_capacity = dep_pool_capacity;
+        layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
+        layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
+
+        layout.off_scope_tasks = arena.reserve(static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *));
+        layout.off_scope_begins = arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
+        layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
+        return layout;
+    }
+
+    bool init_data_from_layout(const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size, uint64_t task_window_size)
+    {
+        auto *orch = this;
+        *orch = PTO2OrchestratorState{};
+
+        orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+        orch->gm_heap_base = gm_heap;
+        orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
+        orch->fatal = false;
+
+        // Mirror the SM API's per-ring window-size shape so a future per-ring
+        // SM layout cannot silently disagree with the addresses we compute here.
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+
+        auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
+            auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
+            auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
+            auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
+
+            orch->rings[r].task_allocator.init(task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base, heap_size, orch_err);
+        }
 
-    // === Cold-path API (defined in pto_orchestrator.cpp) ===
+        if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) return false;
 
-    // Phase 1: declare every sub-region (per-ring fanin pool, scope arrays,
-    // tensor_map sub-layout) on the supplied arena. task_window_sizes feeds
-    // the nested tensor_map layout. Returned layout is consumed by
-    // init_from_layout.
-    static PTO2OrchestratorLayout reserve_layout(
-        DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH],
-        int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
-    );
+        orch->scope_tasks_size = 0;
+        orch->scope_tasks_capacity = layout.scope_tasks_cap;
+        orch->scope_stack_top = -1;
+        orch->scope_stack_capacity = layout.scope_stack_capacity;
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
 
-    // Phase 3a: write everything *except* arena-internal pointer fields.
-    // sm_dev_base is the SM device address (only stored, never dereferenced);
-    // task_window_size feeds the per-ring SM address arithmetic. Safe to call
-    // on a host arena that holds the prebuilt image.
-    bool init_data_from_layout(
-        const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
-        uint64_t task_window_size
-    );
+        return true;
+    }
 
-    // Phase 3b: write the arena-internal pointer fields (scope_tasks,
-    // scope_begins, rings[].fanin_pool.base, tensor_map.{buckets,entry_pool,
-    // free_entry_list,task_entry_heads}, scheduler reference).
-    // Idempotent — host runs once on the image, AICPU runs once after attach.
-    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler);
+    void wire_arena_pointers(const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg)
+    {
+        auto *orch = this;
+        orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
+        orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
+        orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
+        orch->scheduler = scheduler_arg;
+    }
 
     // Forget pointers; arena owns the backing buffers.
-    void destroy();
-    void set_scheduler(PTO2SchedulerState *scheduler);
-    void report_fatal(int32_t error_code, const char *func, const char *fmt, ...);
-    void begin_scope(PTO2ScopeMode mode = PTO2ScopeMode::AUTO);
-    void end_scope();
-    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const Arg &args);
-    TaskOutputTensors submit_dummy_task(const Arg &args);
-    TaskOutputTensors alloc_tensors(const Arg &args);
-    void mark_done();
-};
+    void destroy()
+    {
+        auto *orch = this;
+        orch->tensor_map.destroy();
+        orch->scope_tasks = nullptr;
+        orch->scope_begins = nullptr;
+    }
+    void set_scheduler(PTO2SchedulerState *scheduler)
+    {
+        this->scheduler = scheduler;
+    }
+    void report_fatal(int32_t error_code, [[maybe_unused]] const char *func, const char *fmt, ...)
+    {
+        auto *orch = this;
+        va_list args;
+        va_start(args, fmt);
+        orch_report_fatal_v(orch, error_code, fmt, args);
+        va_end(args);
+    }
+    void begin_scope(PTO2ScopeMode mode)
+    {
+        auto *orch = this;
+        if (orch->fatal) return;
+        assert(orch->scope_stack_top < static_cast<int32_t>(orch->scope_stack_capacity - 1) && "Scope stack overflow");
+        if (mode == PTO2ScopeMode::AUTO && orch->in_manual_scope())
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "auto scope nested inside manual scope is not supported");
+            return;
+        }
+
+        bool already_in_manual_scope = orch->in_manual_scope();
+        ++orch->scope_stack_top;
+        orch->scope_begins[orch->scope_stack_top] = orch->scope_tasks_size;
+        if (mode == PTO2ScopeMode::MANUAL && !already_in_manual_scope) orch->manual_begin_depth = orch->scope_stack_top;
+    }
+    void end_scope()
+    {
+        auto *orch = this;
+        if (orch->fatal) return;
+        assert(orch->scope_stack_top >= 0 && "Scope stack underflow");
+
+        bool ending_manual_scope = orch->scope_stack_top == orch->manual_begin_depth;
+        int32_t begin = orch->scope_begins[orch->scope_stack_top--];
+        if (ending_manual_scope) orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+
+        // Watermark-based reclamation: scope-end has no work to do — consumers
+        // no longer need to notify producers.
+        orch->scope_tasks_size = begin;
+    }
+    TaskOutputTensors submit_task(const MixedKernels &mixed_kernels, const Arg &args)
+    {
+        auto *orch = this;
+
+        // Orchestration API should short-circuit after fatal, but keep this entry
+        // robust as a no-op in case a caller reaches it directly.
+        if (orch->fatal) return TaskOutputTensors{};
+
+        // Validate Arg construction (errors recorded by add_input/add_output/etc.)
+        if (args.has_error)
+        {
+            orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+            return TaskOutputTensors{};
+        }
+        always_assert(orch->scheduler != nullptr);
+        // === Validate submit inputs ===
+        ActiveMask active_mask = mixed_kernels.to_active_mask();
+        always_assert(static_cast<bool>(active_mask) && "MixedKernels must have at least one active slot");
+
+        int16_t block_num = args.launch_spec.block_num();
+        always_assert(block_num >= 1 && "block_num must be >= 1");
+
+        MixedKernels normalized = mixed_kernels;
+        bool has_aic = active_mask.has_mask(PTO2_SUBTASK_MASK_AIC);
+        bool has_aiv0 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV0);
+        bool has_aiv1 = active_mask.has_mask(PTO2_SUBTASK_MASK_AIV1);
+        if (!has_aic && has_aiv1 && !has_aiv0)
+        {
+            normalized.aiv0_kernel_id = normalized.aiv1_kernel_id;
+            normalized.aiv1_kernel_id = INVALID_KERNEL_ID;
+            active_mask = normalized.to_active_mask();
+        }
+
+        // Encode require_sync_start into active_mask bit 3 (only meaningful for tasks with block_num > 1)
+        if (block_num > 1 && args.launch_spec.require_sync_start())
+        {
+            PTO2ResourceShape shape = active_mask.to_shape();
+            int32_t limit = (shape == PTO2ResourceShape::AIV) ? orch->total_aiv_count : orch->total_cluster_count;
+            if (limit > 0 && block_num > limit)
+            {
+                report_fatal(PTO2_ERROR_REQUIRE_SYNC_START_INVALID, __FUNCTION__, "require_sync_start block_num=%d > limit=%d (deadlock guaranteed)", block_num, limit);
+                return TaskOutputTensors{};
+            }
+            active_mask.set_sync_start();
+        }
+
+        return submit_task_common(orch, args, active_mask, normalized.aic_kernel_id, normalized.aiv0_kernel_id, normalized.aiv1_kernel_id);
+    }
+    TaskOutputTensors submit_dummy_task(const Arg &args)
+    {
+        auto *orch = this;
+
+        if (orch->fatal) return TaskOutputTensors{};
+
+        if (args.has_error)
+        {
+            orch_mark_fatal(orch, PTO2_ERROR_INVALID_ARGS);
+            return TaskOutputTensors{};
+        }
+        always_assert(orch->scheduler != nullptr);
+
+        return submit_task_common(orch, args, ActiveMask{}, INVALID_KERNEL_ID, INVALID_KERNEL_ID, INVALID_KERNEL_ID);
+    }
+    TaskOutputTensors alloc_tensors(const Arg &args)
+    {
+        auto *orch = this;
+        // Orchestration API should short-circuit after fatal, but keep this entry
+        // robust as a no-op in case a caller reaches it directly.
+        if (orch->fatal) return TaskOutputTensors{};
+
+        if (args.tensor_count() <= 0)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors requires at least one TensorCreateInfo");
+            return TaskOutputTensors{};
+        }
+        if (args.scalar_count() != 0)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+            return TaskOutputTensors{};
+        }
+        for (int32_t i = 0; i < args.tensor_count(); i++)
+        {
+            if (args.tag(i) != TensorArgType::OUTPUT)
+            {
+                report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "alloc_tensors only accepts output TensorCreateInfo args");
+                return TaskOutputTensors{};
+            }
+        }
+
+        if (args.has_error)
+        {
+            report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "%s", args.error_msg ? args.error_msg : "alloc_tensors failed to construct output-only Arg");
+            return TaskOutputTensors{};
+        }
 
-// =============================================================================
-// Orchestrator Profiling Data
-// =============================================================================
-
-#if PTO2_ORCH_PROFILING
-struct PTO2OrchProfilingData {
-    uint64_t sync_cycle;
-    uint64_t alloc_cycle;  // Combined task slot + heap allocation
-    uint64_t args_cycle;
-    uint64_t lookup_cycle;
-    uint64_t insert_cycle;
-    uint64_t fanin_cycle;
-    uint64_t scope_end_cycle;
-    int64_t submit_count;
-    // Wait time tracking for blocking phases
-    uint64_t alloc_wait_cycle;  // Cycles spent waiting in unified alloc
-    uint64_t fanin_wait_cycle;  // Cycles spent waiting in fanout_lock
-    // Atomic operation counts per phase
-    uint64_t alloc_atomic_count;
-    uint64_t args_atomic_count;
-    uint64_t scope_end_atomic_count;
+        PTO2OutputLayout layout = calculate_output_layout(args);
+        PTO2PreparedTask prepared;
+        if (!prepare_task(orch, args, layout.total_output_size, ActiveMask{}, &prepared)) return TaskOutputTensors{};
+
+        PTO2TaskDescriptor &task = *prepared.task;
+        PTO2TaskPayload &payload = *prepared.payload;
+
+        task.task_id = prepared.task_id;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = INVALID_KERNEL_ID;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = INVALID_KERNEL_ID;
+        task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = INVALID_KERNEL_ID;
+        task.packed_buffer_base = prepared.alloc_result.packed_base;
+        task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+        TaskOutputTensors outputs;
+        outputs.set_task_id(prepared.task_id);
+        payload.init(args, outputs, prepared.alloc_result, layout);
+        payload.fanin_count = 0;
+
+        if (prepared.slot_state != nullptr)
+        {
+            // (m) Inline completion uses completion_flags only.
+            uint8_t ring_id = prepared.task_id.ring();
+            orch->sm_header->rings[ring_id].completion_flags[prepared.alloc_result.slot].store(1, std::memory_order_release);
+        }
+        orch->inline_completed_tasks++;
+
+        return outputs;
+    }
+    void mark_done()
+    {
+        auto *orch = this;
+        orch->sm_header->orchestrator_done.store(1, std::memory_order_release);
+        orch->scope_tasks_size = 0;
+        orch->scope_stack_top = -1;
+        orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
+    }
 };
 
-PTO2OrchProfilingData orchestrator_get_profiling();
-#endif
+// -----------------------------------------------------------------------------
+// Helpers
+// -----------------------------------------------------------------------------
+
+inline int32_t orch_mark_fatal(PTO2OrchestratorState *orch, int32_t error_code)
+{
+    always_assert(orch != nullptr);
+    orch->fatal = true;
+    if (error_code == PTO2_ERROR_NONE || orch->sm_header == nullptr) return PTO2_ERROR_NONE;
+
+    int32_t expected = PTO2_ERROR_NONE;
+    std::atomic<int32_t> &orch_error_code = orch->sm_header->orch_error_code;
+    if (orch_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) return error_code;
+    return expected;
+}
+
+inline void orch_report_fatal_v(PTO2OrchestratorState *orch, int32_t error_code, const char *, va_list)
+{
+    // fmt + args are accepted for future logging-sink wiring but are not yet
+    // routed anywhere — the error_code is latched in shared memory via
+    // orch_mark_fatal and that's what callers actually observe.
+    orch_mark_fatal(orch, error_code);
+}
+
+inline bool append_fanin_or_fail(PTO2OrchestratorState *orch, PTO2TaskSlotState *prod_state, int32_t prod_local_id, PTO2FaninBuilder *fanin_builder)
+{
+    if (fanin_builder->contains(prod_state)) return true;
+    if (fanin_builder->count >= PTO2_MAX_FANIN)
+    {
+        orch_mark_fatal(orch, PTO2_ERROR_DEPENDENCY_OVERFLOW);
+        return false;
+    }
+    int32_t idx = fanin_builder->count++;
+    fanin_builder->slots[idx] = prod_state;
+    fanin_builder->local_ids[idx] = prod_local_id;
+    return true;
+}
+
+inline PTO2OutputLayout calculate_output_layout(const Arg &args)
+{
+    PTO2OutputLayout layout;
+    for (int32_t i = 0; i < args.tensor_count(); i++)
+    {
+        if (args.tag(i) != TensorArgType::OUTPUT) continue;
+        layout.offsets[i] = layout.total_output_size;
+        layout.buffer_sizes[i] = PTO2_ALIGN_UP(args.tensor(i).create_info->buffer_size_bytes(), PTO2_PACKED_OUTPUT_ALIGN);
+        layout.total_output_size += layout.buffer_sizes[i];
+    }
+    return layout;
+}
+
+inline bool check_scope_can_accept_task(PTO2OrchestratorState *orch, PTO2TaskAllocator &allocator)
+{
+    always_assert(orch->scope_stack_top >= 0 && "Cannot submit task outside a scope");
+
+    int32_t scope_task_count = orch->scope_tasks_size - orch->scope_begins[orch->scope_stack_top];
+    if (scope_task_count < allocator.window_size() - 1) return true;
+
+    orch_mark_fatal(orch, PTO2_ERROR_SCOPE_DEADLOCK);
+    return false;
+}
+
+inline void prefetch_payload(PTO2TaskPayload *payload, int32_t tensor_count, int32_t scalar_count)
+{
+    for (int32_t i = 0; i < tensor_count; i++)
+    {
+        __builtin_prefetch(&payload->tensors[i], 1, 3);
+        __builtin_prefetch(reinterpret_cast<char *>(&payload->tensors[i]) + 64, 1, 3);
+    }
+    for (int32_t i = 0; i < scalar_count; i += 8) __builtin_prefetch(&payload->scalars[i], 1, 3);
+    __builtin_prefetch(payload, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 64, 1, 3);
+    __builtin_prefetch(reinterpret_cast<char *>(payload) + 128, 1, 3);
+}
+
+inline bool prepare_task(PTO2OrchestratorState *orch, const Arg &args, int32_t total_output_size, ActiveMask active_mask, PTO2PreparedTask *out)
+{
+    uint8_t ring_id = orch->current_ring_id();
+    auto &allocator = orch->rings[ring_id].task_allocator;
+
+    if (!check_scope_can_accept_task(orch, allocator)) return false;
+
+    out->alloc_result = allocator.alloc(total_output_size);
+    if (out->alloc_result.failed())
+    {
+        orch_mark_fatal(orch, PTO2_ERROR_HEAP_RING_DEADLOCK);
+        return false;
+    }
+
+    out->task_id = PTO2TaskId::make(ring_id, static_cast<uint32_t>(out->alloc_result.task_id));
+    out->slot_state = &orch->sm_header->rings[ring_id].get_slot_state_by_slot(out->alloc_result.slot);
+    out->task = &orch->sm_header->rings[ring_id].task_descriptors[out->alloc_result.slot];
+    out->payload = &orch->sm_header->rings[ring_id].task_payloads[out->alloc_result.slot];
+
+    prefetch_payload(out->payload, args.tensor_count(), args.scalar_count());
+
+    out->slot_state->bind_buffers(out->payload, out->task);
+
+    // Clear the polling-fast completion byte for the newly-allocated slot.
+    // The previous incarnation's completer set this byte to 1; we publish 0
+    // before this task can be added as a fanin to any consumer (single-
+    // orchestrator-thread guarantee) and before the wiring-queue push
+    // (release-acquire) makes the slot visible to thread 0.
+    orch->sm_header->rings[ring_id].completion_flags[out->alloc_result.slot].store(0, std::memory_order_relaxed);
+    // Seed last_consumer_local_id to self — with no consumers, the slot is
+    // safe to reclaim as soon as the watermark reaches this task itself.
+    out->slot_state->last_consumer_local_id = out->alloc_result.task_id;
+    int16_t block_num = args.launch_spec.block_num();
+    out->slot_state->total_required_subtasks = static_cast<int16_t>(block_num * __builtin_popcount(active_mask.core_mask()));
+    out->slot_state->logical_block_num = block_num;
+    out->slot_state->active_mask = active_mask;
+    scope_tasks_push(orch, out->slot_state);
+
+    return true;
+}
+
+inline void scope_tasks_push(PTO2OrchestratorState *orch, PTO2TaskSlotState *task_slot_state)
+{
+    if (orch->scope_tasks_size >= orch->scope_tasks_capacity)
+    {
+        orch->report_fatal(PTO2_ERROR_SCOPE_TASKS_OVERFLOW, __FUNCTION__, "scope_tasks buffer saturated at %d entries (all rings full)", orch->scope_tasks_capacity);
+        return;
+    }
+    orch->scope_tasks[orch->scope_tasks_size++] = task_slot_state;
+}
+
+inline TaskOutputTensors submit_task_common(PTO2OrchestratorState *orch, const Arg &args, ActiveMask active_mask, int32_t aic_kernel_id, int32_t aiv0_kernel_id, int32_t aiv1_kernel_id)
+{
+    TaskOutputTensors result;
+    PTO2OutputLayout layout = calculate_output_layout(args);
+    PTO2PreparedTask prepared;
+    if (!prepare_task(orch, args, layout.total_output_size, active_mask, &prepared)) return result;
+    uint8_t ring_id = prepared.task_id.ring();
+    PTO2SchedulerState *sched = orch->scheduler;
+    PTO2RingFlowControl &fc = orch->sm_header->rings[ring_id].fc;
+    PTO2TaskId task_id = prepared.task_id;
+    PTO2TaskSlotState &cur_slot_state = *prepared.slot_state;
+    PTO2TaskDescriptor &task = *prepared.task;
+    PTO2TaskPayload &payload = *prepared.payload;
+    result.set_task_id(task_id);
+
+    if (is_dep_gen_enabled())
+    {
+        const void *tensor_ptrs[MAX_TENSOR_ARGS];
+        uint8_t arg_types_u8[MAX_TENSOR_ARGS];
+        const int tc_raw = args.tensor_count();
+        const int tc = tc_raw > MAX_TENSOR_ARGS ? MAX_TENSOR_ARGS : tc_raw;
+        for (int i = 0; i < tc; i++)
+        {
+            tensor_ptrs[i] = (args.tag(i) == TensorArgType::OUTPUT) ? nullptr : args.tensor(i).ptr;
+            arg_types_u8[i] = static_cast<uint8_t>(args.tag(i));
+        }
+        const int32_t kernel_ids_capture[3] = {aic_kernel_id, aiv0_kernel_id, aiv1_kernel_id};
+        dep_gen_aicpu_record_submit(task_id.raw, orch->in_manual_scope(), tc, tensor_ptrs, arg_types_u8, static_cast<int>(args.explicit_dep_count()), reinterpret_cast<const uint64_t *>(args.explicit_deps_data()), kernel_ids_capture);
+    }
+
+    PTO2FaninBuilder fanin_builder;
+
+    int32_t sm_last_task_alive = fc.last_task_alive.load(std::memory_order_acquire);
+    orch->tensor_map.sync_tensormap(task_id, sm_last_task_alive);
+
+    for (uint32_t i = 0; i < args.explicit_dep_count(); i++)
+    {
+        PTO2TaskId dep_task_id = args.explicit_dep(i);
+        if (!dep_task_id.is_valid())
+        {
+            orch->report_fatal(PTO2_ERROR_INVALID_ARGS, __FUNCTION__, "Arg.set_dependencies(...) requires valid task ids");
+            return result;
+        }
+        PTO2SharedMemoryRingHeader &dep_ring = orch->sm_header->rings[dep_task_id.ring()];
+        int32_t dep_local_task_id = static_cast<int32_t>(dep_task_id.local());
+        int32_t dep_last_task_alive = dep_ring.fc.last_task_alive.load(std::memory_order_acquire);
+        if (dep_local_task_id < dep_last_task_alive) continue;
+        PTO2TaskSlotState *producer_slot_state = &dep_ring.get_slot_state_by_task_id(dep_local_task_id);
+        if (!append_fanin_or_fail(orch, producer_slot_state, dep_local_task_id, &fanin_builder)) return result;
+    }
+
+    DepInputs dep_inputs{
+        args.tensor_count(), args.tensor_data(), args.tag_data(), static_cast<int32_t>(args.explicit_dep_count()), args.explicit_deps_data(),
+    };
+
+    auto runtime_emit = [&](PTO2TaskId producer_task_id) -> bool {
+        int32_t prod_local = static_cast<int32_t>(producer_task_id.local());
+        PTO2TaskSlotState *prod_state = &orch->sm_header->rings[producer_task_id.ring()].get_slot_state_by_task_id(prod_local);
+        return append_fanin_or_fail(orch, prod_state, prod_local, &fanin_builder);
+    };
+
+    if (!compute_task_fanin(dep_inputs, orch->tensor_map, orch->in_manual_scope(), runtime_emit)) return result;
+
+    register_task_outputs(dep_inputs, task_id, orch->tensor_map, orch->in_manual_scope());
+
+    __builtin_prefetch(&task, 1, 1);
+    task.task_id = task_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIC)] = aic_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV0)] = aiv0_kernel_id;
+    task.kernel_id[static_cast<int>(PTO2SubtaskSlot::AIV1)] = aiv1_kernel_id;
+    task.packed_buffer_base = prepared.alloc_result.packed_base;
+    task.packed_buffer_end = prepared.alloc_result.packed_end;
+
+    // Push this consumer's local_id into each producer's last_consumer high-
+    // water-mark, replacing the per-completion fanout_refcount notification.
+    // Reclamation gates on the global completed_watermark reaching this value.
+    const int32_t self_local = static_cast<int32_t>(task_id.local());
+    for (int32_t i = 0; i < fanin_builder.count; i++)
+    {
+        PTO2TaskSlotState *prod = fanin_builder.slots[i];
+        if (self_local > prod->last_consumer_local_id) prod->last_consumer_local_id = self_local;
+    }
+
+    payload.fanin_count = fanin_builder.count;
+    for (int32_t i = 0; i < fanin_builder.count; i++) payload.fanin_local_ids[i] = fanin_builder.local_ids[i];
+
+    payload.init(args, result, prepared.alloc_result, layout);
+
+    while (!sched->wiring.queue.push(&cur_slot_state)) SPIN_WAIT_HINT();
+
+    return result;
+}
 
 #endif  // PTO_ORCHESTRATOR_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
deleted file mode 100644
index f6009dc57..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - Ring Buffer Implementation
- *
- * Implements DepListPool ring buffer for zero-overhead dependency management.
- * TaskAllocator methods are defined inline in pto_ring_buffer.h.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_ring_buffer.h"
-#include <inttypes.h>
-#include <string.h>
-#include "common/unified_log.h"
-#include "scheduler/pto_scheduler.h"
-
-static void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code) {
-    if (error_code_ptr == nullptr) {
-        return;
-    }
-    int32_t expected = PTO2_ERROR_NONE;
-    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
-}
-
-// =============================================================================
-// Fanin Spill Pool Implementation
-// =============================================================================
-void PTO2FaninPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
-    if (sm_last_task_alive <= reclaim_task_cursor) return;
-
-    int32_t scan_end = sm_last_task_alive;
-    for (int32_t task_id = reclaim_task_cursor; task_id < scan_end; ++task_id) {
-        PTO2TaskPayload &payload = ring.get_payload_by_task_id(task_id);
-        if (payload.fanin_spill_pool != this) {
-            continue;
-        }
-
-        int32_t inline_count = std::min(payload.fanin_actual_count, PTO2_FANIN_INLINE_CAP);
-        int32_t spill_edge_count = payload.fanin_actual_count - inline_count;
-        if (spill_edge_count > 0) {
-            advance_tail(payload.fanin_spill_start + spill_edge_count);
-        }
-    }
-    reclaim_task_cursor = scan_end;
-}
-
-bool PTO2FaninPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
-    if (available() >= needed) return true;
-
-    int spin_count = 0;
-    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-    while (available() < needed) {
-        reclaim(ring, prev_last_alive);
-        if (available() >= needed) return true;
-
-        spin_count++;
-
-        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (cur_last_alive > prev_last_alive) {
-            spin_count = 0;
-            prev_last_alive = cur_last_alive;
-        }
-
-        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
-            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Fanin Spill Pool Deadlock Detected!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("Fanin spill pool cannot reclaim space after %d spins (no progress).", spin_count);
-            LOG_ERROR(
-                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
-                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
-            );
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("  - Needed:        %d entries", needed);
-            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
-            LOG_ERROR("  - current_task:    %d", current);
-            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
-            LOG_ERROR("Diagnosis:");
-            LOG_ERROR("  last_task_alive is not advancing, so fanin spill pool tail");
-            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
-            LOG_ERROR("========================================");
-            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
-            return false;
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
-
-// =============================================================================
-// Dependency List Pool Implementation
-// =============================================================================
-void PTO2DepListPool::reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive) {
-    if (sm_last_task_alive >= last_reclaimed + PTO2_DEP_POOL_CLEANUP_INTERVAL && sm_last_task_alive > 0) {
-        int32_t mark = ring.get_slot_state_by_task_id(sm_last_task_alive - 1).dep_pool_mark;
-        if (mark > 0) {
-            advance_tail(mark);
-        }
-        last_reclaimed = sm_last_task_alive;
-    }
-}
-
-bool PTO2DepListPool::ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed) {
-    if (available() >= needed) return true;
-
-    int spin_count = 0;
-    int32_t prev_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-    while (available() < needed) {
-        reclaim(ring, prev_last_alive);
-        if (available() >= needed) return true;
-
-        spin_count++;
-
-        // Progress detection: reset spin counter if last_task_alive advances
-        int32_t cur_last_alive = ring.fc.last_task_alive.load(std::memory_order_acquire);
-        if (cur_last_alive > prev_last_alive) {
-            spin_count = 0;
-            prev_last_alive = cur_last_alive;
-        }
-
-        if (spin_count >= PTO2_DEP_POOL_SPIN_LIMIT) {
-            int32_t current = ring.fc.current_task_index.load(std::memory_order_acquire);
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Dependency Pool Deadlock Detected!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("DepListPool cannot reclaim space after %d spins (no progress).", spin_count);
-            LOG_ERROR(
-                "  - Pool used:     %d / %d (%.1f%%)", used(), capacity,
-                (capacity > 0) ? (100.0 * used() / capacity) : 0.0
-            );
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("  - Needed:        %d entries", needed);
-            LOG_ERROR("  - last_task_alive: %d (stuck here)", cur_last_alive);
-            LOG_ERROR("  - current_task:    %d", current);
-            LOG_ERROR("  - In-flight tasks: %d", current - cur_last_alive);
-            LOG_ERROR("Diagnosis:");
-            LOG_ERROR("  last_task_alive is not advancing, so dep pool tail");
-            LOG_ERROR("  cannot reclaim. Check TaskRing diagnostics for root cause.");
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d)", capacity, high_water * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", high_water * 2);
-            LOG_ERROR("========================================");
-            latch_pool_error(error_code_ptr, PTO2_ERROR_DEP_POOL_OVERFLOW);
-            return false;
-        }
-        SPIN_WAIT_HINT();
-    }
-    return true;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
index 4e04dc832..3faef6b4c 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_ring_buffer.h
@@ -8,28 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Ring Buffer Data Structures
- *
- * Implements ring buffer designs for zero-overhead memory management:
- *
- * 1. TaskAllocator - Unified task slot + output buffer allocation
- *    - Combines task ring (slot allocation) and heap ring (output buffer allocation)
- *    - Single spin-wait loop with unified back-pressure and deadlock detection
- *    - O(1) bump allocation for both task slots and heap buffers
- *
- * 2. FaninPool - Fanin spill entry allocation
- *    - Ring buffer for spilled fanin entries
- *    - O(1) append allocation
- *    - Implicit reclamation with task ring
- *
- * 3. DepListPool - Dependency list entry allocation
- *    - Ring buffer for linked list entries
- *    - O(1) prepend operation
- *    - Implicit reclamation with task ring
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #ifndef PTO_RING_BUFFER_H
 #define PTO_RING_BUFFER_H
@@ -40,14 +18,6 @@
 
 #include "pto_runtime2_types.h"
 #include "pto_shared_memory.h"
-#include "common/unified_log.h"
-
-#if PTO2_PROFILING
-// Heap-ring wrap reporting — the allocator is the only place each individual
-// wrap is observable, so it notifies the scope_stats collector here. Gated:
-// pays nothing (no include, no call) when profiling is compiled out.
-#include "aicpu/scope_stats_collector_aicpu.h"
-#endif
 
 // Block notification interval (in spin counts)
 #define PTO2_BLOCK_NOTIFY_INTERVAL 10000
@@ -57,41 +27,18 @@
 // Dep pool spin limit - if exceeded, dep pool capacity too small for workload
 #define PTO2_DEP_POOL_SPIN_LIMIT 100000
 
-// =============================================================================
-// Task Allocator (unified task slot + heap buffer allocation)
-// =============================================================================
+inline void latch_pool_error(std::atomic<int32_t> *error_code_ptr, int32_t error_code)
+{
+    if (error_code_ptr == nullptr) return;
+    int32_t expected = PTO2_ERROR_NONE;
+    error_code_ptr->compare_exchange_strong(expected, error_code, std::memory_order_acq_rel);
+}
 
-/**
- * Unified task slot + heap buffer allocator.
- *
- * Since task and heap are always allocated together and the orchestrator is
- * single-threaded, both pointers (task index, heap top) are tracked locally
- * and published to shared memory via plain store — no fetch_add or CAS needed.
- *
- * The alloc() method checks both resources BEFORE committing to either,
- * eliminating the need for rollback on partial failure.
- */
-class PTO2TaskAllocator {
+class PTO2TaskAllocator
+{
 public:
-    /**
-     * Initialize the allocator with task ring and heap ring resources.
-     *
-     * All pointer arguments are device addresses (live in SM / GM heap); this
-     * function only stores them, no dereferences, so it is safe to invoke
-     * from host code that constructs a prebuilt arena image.
-     *
-     * Production callers leave `initial_local_task_id` at 0: the SM ring
-     * flow-control counters that current_index_ptr / last_alive_ptr point at
-     * start at zero (PTO2RingFlowControl::init() runs on the AICPU during SM
-     * reset), so we keep local_task_id_ aligned with that without reading the
-     * SM. Tests that drive SM state directly may pass a non-zero seed to
-     * exercise corner cases like task IDs near INT32_MAX.
-     */
-    void init(
-        PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr,
-        std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr,
-        int32_t initial_local_task_id = 0
-    ) {
+    void init(PTO2TaskDescriptor *descriptors, int32_t window_size, std::atomic<int32_t> *current_index_ptr, std::atomic<int32_t> *last_alive_ptr, void *heap_base, uint64_t heap_size, std::atomic<int32_t> *error_code_ptr, int32_t initial_local_task_id = 0)
+    {
         descriptors_ = descriptors;
         window_size_ = window_size;
         window_mask_ = window_size - 1;
@@ -106,69 +53,50 @@ class PTO2TaskAllocator {
         last_alive_seen_ = 0;
     }
 
-    /**
-     * Allocate a task slot and its associated output buffer in one call.
-     *
-     * Both task index and heap top are maintained as local counters and
-     * published to shared memory only on success. Since the orchestrator is
-     * single-threaded, no CAS or fetch_add is needed — just check-then-commit.
-     *
-     * @param output_size  Total packed output size in bytes (0 = no heap needed)
-     * @return Allocation result; check failed() for errors
-     */
-    PTO2TaskAllocResult alloc(int32_t output_size) {
-        uint64_t aligned_size =
-            output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
+    PTO2TaskAllocResult alloc(int32_t output_size)
+    {
+        uint64_t aligned_size = output_size > 0 ? PTO2_ALIGN_UP(static_cast<uint64_t>(output_size), PTO2_ALIGN_SIZE) : 0;
 
         int spin_count = 0;
         int32_t prev_last_alive = last_alive_ptr_->load(std::memory_order_acquire);
         int32_t last_alive = prev_last_alive;
         update_heap_tail(last_alive);
         bool blocked_on_heap = false;
-#if PTO2_ORCH_PROFILING
-        uint64_t wait_start = 0;
-        bool waiting = false;
-#endif
 
-        while (true) {
+        while (true)
+        {
             // Check both resources; commit only if both available
-            if (local_task_id_ - last_alive + 1 < window_size_) {
+            if (local_task_id_ - last_alive + 1 < window_size_)
+            {
                 void *heap_ptr = try_bump_heap(aligned_size);
-                if (heap_ptr) {
+                if (heap_ptr)
+                {
                     int32_t task_id = commit_task();
-#if PTO2_ORCH_PROFILING
-                    record_wait(spin_count, wait_start, waiting);
-#endif
                     return {task_id, task_id & window_mask_, heap_ptr, static_cast<char *>(heap_ptr) + aligned_size};
                 }
                 blocked_on_heap = true;
-            } else {
+            }
+            else
+            {
                 blocked_on_heap = false;
             }
 
             // Spin: wait for scheduler to advance last_task_alive
             spin_count++;
-#if PTO2_ORCH_PROFILING
-            if (!waiting) {
-                wait_start = get_sys_cnt_aicpu();
-                waiting = true;
-            }
-#endif
             last_alive = last_alive_ptr_->load(std::memory_order_acquire);
             update_heap_tail(last_alive);
-            if (last_alive > prev_last_alive) {
+            if (last_alive > prev_last_alive)
+            {
                 spin_count = 0;
                 prev_last_alive = last_alive;
-            } else {
-                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0) {
-                    LOG_WARN(
-                        "[TaskAllocator] BLOCKED: tasks=%d/%d, heap=%" PRIu64 "/%" PRIu64 ", on=%s, spins=%d",
-                        local_task_id_ - last_alive, window_size_, heap_top_, heap_size_,
-                        blocked_on_heap ? "heap" : "task", spin_count
-                    );
-                }
-                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT) {
-                    report_deadlock(output_size, blocked_on_heap);
+            }
+            else
+            {
+                if (spin_count % PTO2_BLOCK_NOTIFY_INTERVAL == 0)
+                {}
+                if (spin_count >= PTO2_ALLOC_SPIN_LIMIT)
+                {
+                    report_deadlock(blocked_on_heap);
                     return {-1, -1, nullptr, nullptr};
                 }
             }
@@ -176,25 +104,33 @@ class PTO2TaskAllocator {
         }
     }
 
-    // =========================================================================
-    // State queries
-    // =========================================================================
-
-    int32_t active_count() const {
+    int32_t active_count() const
+    {
         int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
         return local_task_id_ - last_alive;
     }
 
     // Task ring start/end: tail = oldest live task (last_task_alive), head =
     // next task id to allocate. head - tail == active_count().
-    int32_t task_tail() const { return last_alive_ptr_->load(std::memory_order_acquire); }
-    int32_t task_head() const { return local_task_id_; }
+    int32_t task_tail() const
+    {
+        return last_alive_ptr_->load(std::memory_order_acquire);
+    }
+    int32_t task_head() const
+    {
+        return local_task_id_;
+    }
 
-    int32_t window_size() const { return window_size_; }
+    int32_t window_size() const
+    {
+        return window_size_;
+    }
 
-    uint64_t heap_available() const {
+    uint64_t heap_available() const
+    {
         uint64_t tail = heap_tail_;
-        if (heap_top_ >= tail) {
+        if (heap_top_ >= tail)
+        {
             uint64_t at_end = heap_size_ - heap_top_;
             uint64_t at_begin = tail;
             return at_end > at_begin ? at_end : at_begin;
@@ -202,12 +138,22 @@ class PTO2TaskAllocator {
         return tail - heap_top_;
     }
 
-    uint64_t heap_top() const { return heap_top_; }
+    uint64_t heap_top() const
+    {
+        return heap_top_;
+    }
     // Heap ring start: reclaim pointer (oldest byte still live). heap_top() is
     // the end (next allocation). heap_top - heap_tail == heap_used_bytes().
-    uint64_t heap_tail() const { return heap_tail_; }
-    uint64_t heap_capacity() const { return heap_size_; }
-    uint64_t heap_used_bytes() const {
+    uint64_t heap_tail() const
+    {
+        return heap_tail_;
+    }
+    uint64_t heap_capacity() const
+    {
+        return heap_size_;
+    }
+    uint64_t heap_used_bytes() const
+    {
         if (heap_size_ == 0) return 0;
         return (heap_top_ + heap_size_ - heap_tail_) % heap_size_;
     }
@@ -233,461 +179,73 @@ class PTO2TaskAllocator {
     // --- Shared ---
     std::atomic<int32_t> *error_code_ptr_ = nullptr;
 
-    // =========================================================================
-    // Internal helpers
-    // =========================================================================
-
-    /**
-     * Commit a task slot: bump local counter and publish to shared memory.
-     * Must only be called after space check has passed.
-     */
-    int32_t commit_task() {
+    int32_t commit_task()
+    {
         int32_t task_id = local_task_id_++;
         current_index_ptr_->store(local_task_id_, std::memory_order_release);
         return task_id;
     }
 
-    /**
-     * Derive heap_tail_ from the last consumed task's packed_buffer_end.
-     *
-     * Every task has a valid packed_buffer_end (equal to packed_buffer_base
-     * for zero-size allocations), so the last consumed task always determines
-     * the correct heap_tail — no backward scan needed.
-     */
-    void update_heap_tail(int32_t last_alive) {
+    void update_heap_tail(int32_t last_alive)
+    {
         if (last_alive <= last_alive_seen_) return;
         last_alive_seen_ = last_alive;
 
         PTO2TaskDescriptor &desc = descriptors_[(last_alive - 1) & window_mask_];
-        uint64_t old_tail = heap_tail_;
-        heap_tail_ =
-            static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
-#if PTO2_PROFILING
-        // Reclaim pointer moves forward monotonically in ring order; a decrease
-        // means it wrapped past heap_size_ (occupancy < heap_size_ guarantees at
-        // most one wrap per call). Report it so scope_stats can unroll.
-        if (is_scope_stats_enabled() && heap_tail_ < old_tail) {
-            scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_RECLAIM);
-        }
-#else
-        (void)old_tail;
-#endif
+        heap_tail_ = static_cast<uint64_t>(static_cast<char *>(desc.packed_buffer_end) - static_cast<char *>(heap_base_));
     }
 
-    /**
-     * Bump the heap pointer for the given allocation size.
-     * Returns the allocated pointer, or nullptr if insufficient space.
-     * When alloc_size == 0, returns current position without advancing.
-     */
-    void *try_bump_heap(uint64_t alloc_size) {
+    void *try_bump_heap(uint64_t alloc_size)
+    {
         uint64_t top = heap_top_;
-        if (alloc_size == 0) {
-            return static_cast<char *>(heap_base_) + top;
-        }
+        if (alloc_size == 0) return static_cast<char *>(heap_base_) + top;
         uint64_t tail = heap_tail_;
         void *result;
 
-        if (top >= tail) {
+        if (top >= tail)
+        {
             uint64_t space_at_end = heap_size_ - top;
-            if (space_at_end >= alloc_size) {
+            if (space_at_end >= alloc_size)
+            {
                 result = static_cast<char *>(heap_base_) + top;
                 heap_top_ = top + alloc_size;
-            } else if (tail > alloc_size) {
-                LOG_DEBUG(
-                    "try_bump_heap wrap-around alloc: top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64, top, tail,
-                    alloc_size
-                );
+            }
+            else if (tail > alloc_size)
+            {
                 result = heap_base_;
                 heap_top_ = alloc_size;
-#if PTO2_PROFILING
-                // Allocation pointer just wrapped past heap_size_; report it so
-                // scope_stats can unroll the wrapping offset into a monotonic value.
-                // The collector attributes the wrap to the current scope's ring.
-                if (is_scope_stats_enabled()) scope_stats_note_heap_wrap(SCOPE_STATS_HEAP_SIDE_ALLOC);
-#endif
-            } else {
-                LOG_DEBUG(
-                    "try_bump_heap failed (top>=tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
-                    ", heap_size=%" PRIu64,
-                    top, tail, alloc_size, heap_size_
-                );
-                return nullptr;
             }
-        } else {
-            if (tail - top > alloc_size) {
-                result = static_cast<char *>(heap_base_) + top;
-                heap_top_ = top + alloc_size;
-            } else {
-                LOG_DEBUG(
-                    "try_bump_heap failed (top<tail): top=%" PRIu64 ", tail=%" PRIu64 ", alloc=%" PRIu64
-                    ", free_gap=%" PRIu64,
-                    top, tail, alloc_size, tail - top
-                );
+            else
+            {
                 return nullptr;
             }
         }
-
-        return result;
-    }
-
-#if PTO2_ORCH_PROFILING
-    void record_wait(int spin_count, uint64_t wait_start, bool waiting) {
-        if (waiting) {
-            extern uint64_t g_orch_alloc_wait_cycle;
-            g_orch_alloc_wait_cycle += (get_sys_cnt_aicpu() - wait_start);
-        }
+        else if (tail - top > alloc_size)
         {
-            extern uint64_t g_orch_alloc_atomic_count;
-            g_orch_alloc_atomic_count += spin_count + 1;
-        }
-    }
-#endif
-
-    /**
-     * Report deadlock with targeted diagnostics.
-     */
-    void report_deadlock(int32_t requested_output_size, bool heap_blocked) {
-        int32_t last_alive = last_alive_ptr_->load(std::memory_order_acquire);
-        int32_t active_tasks = local_task_id_ - last_alive;
-        uint64_t htail = heap_tail_;
-
-        LOG_ERROR("========================================");
-        if (heap_blocked) {
-            LOG_ERROR("FATAL: Task Allocator Deadlock - Heap Exhausted!");
-        } else {
-            LOG_ERROR("FATAL: Task Allocator Deadlock - Task Ring Full!");
-        }
-        LOG_ERROR("========================================");
-        LOG_ERROR("No progress after %d spins.", PTO2_ALLOC_SPIN_LIMIT);
-        LOG_ERROR(
-            "  Task ring:  current=%d, last_alive=%d, active=%d/%d (%.1f%%)", local_task_id_, last_alive, active_tasks,
-            window_size_, 100.0 * active_tasks / window_size_
-        );
-        LOG_ERROR(
-            "  Heap ring:  top=%" PRIu64 ", tail=%" PRIu64 ", size=%" PRIu64 ", available=%" PRIu64, heap_top_, htail,
-            heap_size_, heap_available()
-        );
-        if (heap_blocked) {
-            LOG_ERROR("  Requested:  %d bytes", requested_output_size);
+            result = static_cast<char *>(heap_base_) + top;
+            heap_top_ = top + alloc_size;
         }
-        LOG_ERROR("Diagnosis:");
-        LOG_ERROR("  last_task_alive is stuck at %d, meaning task %d", last_alive, last_alive);
-        LOG_ERROR("  cannot transition to CONSUMED. Possible causes:");
-        LOG_ERROR("  1. Task %d still executing (subtasks not complete)", last_alive);
-        LOG_ERROR("  2. Task %d fanout not fully released (downstream not done)", last_alive);
-        LOG_ERROR("  3. Scope reference not released (scope_end not called)");
-        LOG_ERROR("  4. Orchestrator blocked here -> can't call scope_end -> circular wait");
-        LOG_ERROR("Solution:");
-        if (heap_blocked) {
-            LOG_ERROR(
-                "  Increase heap size (current: %" PRIu64 ", recommended: %" PRIu64 ")", heap_size_, heap_size_ * 2
-            );
-            LOG_ERROR("  Compile-time: PTO2_HEAP_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_HEAP=<power-of-2 bytes> (e.g. %" PRIu64 ")", heap_size_ * 2);
-        } else {
-            LOG_ERROR("  Increase task window size (current: %d, recommended: %d)", window_size_, active_tasks * 2);
-            LOG_ERROR("  Compile-time: PTO2_TASK_WINDOW_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_TASK_WINDOW=<power-of-2> (e.g. %d)", active_tasks * 2);
-        }
-        LOG_ERROR("========================================");
-        if (error_code_ptr_) {
-            int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
-            error_code_ptr_->store(code, std::memory_order_release);
-        }
-    }
-};
-
-// =============================================================================
-// Fanin Spill Pool
-// =============================================================================
-
-/**
- * Fanin spill pool structure
- *
- * True ring buffer for allocating spilled fanin entries.
- * Entries are reclaimed when their consumer tasks become CONSUMED.
- *
- * Linear counters (top, tail) grow monotonically; the physical index
- * is obtained via modulo: base[linear_index % capacity].
- */
-struct PTO2FaninPool {
-    PTO2FaninSpillEntry *base;       // Pool base address
-    int32_t capacity;                // Total number of entries
-    int32_t top;                     // Linear next-allocation counter (starts from 1)
-    int32_t tail;                    // Linear first-alive counter (entries before this are dead)
-    int32_t high_water;              // Peak concurrent usage (top - tail)
-    int32_t reclaim_task_cursor{0};  // Last task id scanned for reclaim on this pool
-
-    std::atomic<int32_t> *error_code_ptr = nullptr;
-
-    void init(PTO2FaninSpillEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
-        base = in_base;
-        capacity = in_capacity;
-        top = 1;
-        tail = 1;
-        high_water = 0;
-        reclaim_task_cursor = 0;
-        base[0].slot_state = nullptr;
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
-
-    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
-
-    PTO2FaninSpillEntry *alloc() {
-        int32_t used = top - tail;
-        if (used >= capacity) {
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Fanin Spill Pool Overflow!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("Fanin spill pool exhausted: %d entries alive (capacity=%d).", used, capacity);
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase fanin spill pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
-            LOG_ERROR("========================================");
-            if (error_code_ptr) {
-                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
-            }
+        else
+        {
             return nullptr;
         }
-        int32_t idx = top % capacity;
-        top++;
-        used++;
-        if (used > high_water) high_water = used;
-        return &base[idx];
-    }
-
-    void advance_tail(int32_t new_tail) {
-        if (new_tail > tail) {
-            tail = new_tail;
-        }
-    }
-
-    int32_t used() const { return top - tail; }
-
-    int32_t available() const { return capacity - used(); }
-};
-
-template <typename Fn>
-using PTO2FaninCallbackResult = std::invoke_result_t<Fn &, PTO2TaskSlotState *>;
-
-template <typename Fn>
-using PTO2FaninForEachReturn = std::conditional_t<std::is_same_v<PTO2FaninCallbackResult<Fn>, void>, void, bool>;
-
-template <typename InlineSlots, typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_storage(
-    InlineSlots &&inline_slot_states, int32_t fanin_count, int32_t spill_start, PTO2FaninPool &spill_pool, Fn &&fn
-) {
-    using FaninCallbackResult = PTO2FaninCallbackResult<Fn>;
-    static_assert(
-        std::is_same_v<FaninCallbackResult, void> || std::is_same_v<FaninCallbackResult, bool>,
-        "fanin callback must return void or bool"
-    );
-
-    if constexpr (std::is_void_v<FaninCallbackResult>) {
-        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
-        for (int32_t i = 0; i < inline_count; i++) {
-            fn(inline_slot_states[i]);
-        }
-
-        int32_t spill_count = fanin_count - inline_count;
-        if (spill_count <= 0) {
-            return;
-        }
-
-        int32_t start_idx = spill_start % spill_pool.capacity;
-        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
-        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
-        for (int32_t i = 0; i < first_count; i++) {
-            fn(first[i].slot_state);
-        }
-
-        int32_t second_count = spill_count - first_count;
-        for (int32_t i = 0; i < second_count; i++) {
-            fn(spill_pool.base[i].slot_state);
-        }
-        return;
-    } else {
-        int32_t inline_count = std::min(fanin_count, PTO2_FANIN_INLINE_CAP);
-        for (int32_t i = 0; i < inline_count; i++) {
-            if (!fn(inline_slot_states[i])) {
-                return false;
-            }
-        }
-
-        int32_t spill_count = fanin_count - inline_count;
-        if (spill_count <= 0) {
-            return true;
-        }
-
-        int32_t start_idx = spill_start % spill_pool.capacity;
-        int32_t first_count = std::min(spill_count, spill_pool.capacity - start_idx);
-        PTO2FaninSpillEntry *first = spill_pool.base + start_idx;
-        for (int32_t i = 0; i < first_count; i++) {
-            if (!fn(first[i].slot_state)) {
-                return false;
-            }
-        }
-
-        int32_t second_count = spill_count - first_count;
-        for (int32_t i = 0; i < second_count; i++) {
-            if (!fn(spill_pool.base[i].slot_state)) {
-                return false;
-            }
-        }
-        return true;
-    }
-}
-
-template <typename Fn>
-inline PTO2FaninForEachReturn<Fn> for_each_fanin_slot_state(const PTO2TaskPayload &payload, Fn &&fn) {
-    return for_each_fanin_storage(
-        payload.fanin_inline_slot_states, payload.fanin_actual_count, payload.fanin_spill_start,
-        *payload.fanin_spill_pool, static_cast<Fn &&>(fn)
-    );
-}
-
-// =============================================================================
-// Dependency List Pool
-// =============================================================================
-
-/**
- * Dependency list pool structure
- *
- * True ring buffer for allocating linked list entries.
- * Entries are reclaimed when their producer tasks become CONSUMED,
- * as tracked by the orchestrator via dep_pool_mark per task.
- *
- * Linear counters (top, tail) grow monotonically; the physical index
- * is obtained via modulo: base[linear_index % capacity].
- */
-struct PTO2DepListPool {
-    PTO2DepListEntry *base;     // Pool base address
-    int32_t capacity;           // Total number of entries
-    int32_t top;                // Linear next-allocation counter (starts from 1)
-    int32_t tail;               // Linear first-alive counter (entries before this are dead)
-    int32_t high_water;         // Peak concurrent usage (top - tail)
-    int32_t last_reclaimed{0};  // last_task_alive at last successful reclamation
-
-    // Error code pointer for fatal error reporting (→ sm_header->orch_error_code)
-    std::atomic<int32_t> *error_code_ptr = nullptr;
-
-    /**
-     *
-     * Initialize dependency list pool
-     * @param base      Pool base address from shared memory
-     * @param capacity  Total number of entries
-     */
-    void init(PTO2DepListEntry *in_base, int32_t in_capacity, std::atomic<int32_t> *in_error_code_ptr) {
-        base = in_base;
-        capacity = in_capacity;
-        top = 1;   // Start from 1, 0 means NULL/empty
-        tail = 1;  // Match initial top (no reclaimable entries yet)
-        high_water = 0;
-        last_reclaimed = 0;
-
-        // Initialize entry 0 as NULL marker
-        base[0].slot_state = nullptr;
-        base[0].next = nullptr;
-
-        error_code_ptr = in_error_code_ptr;
-    }
-
-    /**
-     * Reclaim dead entries based on scheduler's slot state dep_pool_mark.
-     * Safe to call multiple times — only advances tail forward.
-     *
-     * @param ring             Ring header (for reading slot dep_pool_mark)
-     * @param sm_last_task_alive Current last_task_alive from shared memory
-     */
-    void reclaim(PTO2SharedMemoryRingHeader &ring, int32_t sm_last_task_alive);
-
-    /**
-     * Ensure dep pool for a specific ring has at least `needed` entries available.
-     * Spin-waits for reclamation if under pressure. Detects deadlock if no progress.
-     */
-    bool ensure_space(PTO2SharedMemoryRingHeader &ring, int32_t needed);
 
-    /**
-     * Allocate a single entry from the pool (single-thread per pool instance)
-     *
-     * @return Pointer to allocated entry, or nullptr on fatal error
-     */
-    PTO2DepListEntry *alloc() {
-        int32_t used = top - tail;
-        if (used >= capacity) {
-            LOG_ERROR("========================================");
-            LOG_ERROR("FATAL: Dependency Pool Overflow!");
-            LOG_ERROR("========================================");
-            LOG_ERROR("DepListPool exhausted: %d entries alive (capacity=%d).", used, capacity);
-            LOG_ERROR("  - Pool top:      %d (linear)", top);
-            LOG_ERROR("  - Pool tail:     %d (linear)", tail);
-            LOG_ERROR("  - High water:    %d", high_water);
-            LOG_ERROR("Solution:");
-            LOG_ERROR("  Increase dep pool capacity (current: %d, recommended: %d).", capacity, capacity * 2);
-            LOG_ERROR("  Compile-time: PTO2_DEP_LIST_POOL_SIZE in pto_runtime2_types.h");
-            LOG_ERROR("  Runtime env:  PTO2_RING_DEP_POOL=%d", capacity * 2);
-            LOG_ERROR("========================================");
-            if (error_code_ptr) {
-                error_code_ptr->store(PTO2_ERROR_DEP_POOL_OVERFLOW, std::memory_order_release);
-            }
-            return nullptr;
-        }
-        int32_t idx = top % capacity;
-        top++;
-        used++;
-        if (used > high_water) high_water = used;
-        return &base[idx];
+        return result;
     }
 
-    /**
-     * Advance the tail pointer, reclaiming dead entries.
-     * Called by the orchestrator based on last_task_alive advancement.
-     */
-    void advance_tail(int32_t new_tail) {
-        if (new_tail > tail) {
-            tail = new_tail;
+    void report_deadlock(bool heap_blocked)
+    {
+        if (error_code_ptr_)
+        {
+            int32_t code = heap_blocked ? PTO2_ERROR_HEAP_RING_DEADLOCK : PTO2_ERROR_FLOW_CONTROL_DEADLOCK;
+            error_code_ptr_->store(code, std::memory_order_release);
         }
     }
-
-    /**
-     * Prepend a task ID to a dependency list
-     *
-     * O(1) operation: allocates new entry and links to current head.
-     *
-     * @param current_head  Current list head offset (0 = empty list)
-     * @param task_slot     Task slot to prepend
-     * @return New head offset
-     */
-    PTO2DepListEntry *prepend(PTO2DepListEntry *cur, PTO2TaskSlotState *slot_state) {
-        PTO2DepListEntry *new_entry = alloc();
-        if (!new_entry) return nullptr;
-        new_entry->slot_state = slot_state;
-        new_entry->next = cur;
-        return new_entry;
-    }
-
-    int32_t used() const { return top - tail; }
-
-    int32_t available() const { return capacity - used(); }
 };
 
-// =============================================================================
-// Ring Set (per-depth aggregate)
-// =============================================================================
-
-/**
- * Groups a TaskAllocator and DepPool into one per-depth unit.
- * PTO2_MAX_RING_DEPTH instances provide independent reclamation per scope depth.
- */
-struct PTO2RingSet {
+struct PTO2RingSet
+{
     PTO2TaskAllocator task_allocator;
-    PTO2FaninPool fanin_pool;
 };
 
 #endif  // PTO_RING_BUFFER_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
deleted file mode 100644
index 8aee802b1..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.cpp
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Main Implementation
- *
- * Implements the unified runtime API that combines orchestrator and scheduler.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_runtime2.h"
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <algorithm>
-
-#include "aicpu/device_time.h"
-#include "common/unified_log.h"
-#if PTO2_PROFILING
-#include "aicpu/scope_stats_collector_aicpu.h"
-#endif
-
-// Weak fallback for HOST .so builds (never called, but satisfies linker).
-// The AICPU build links the strong symbol from platform/.../device_time.cpp.
-// Hidden visibility prevents HOST .so from polluting global symbol table.
-__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu() { return 0; }
-
-// =============================================================================
-// Orchestration Ops Table (function-pointer dispatch for orchestration .so)
-// =============================================================================
-
-static TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args) {
-    return rt->orchestrator.submit_task(mixed_kernels, args);
-}
-
-static TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const Arg &args) {
-    return rt->orchestrator.alloc_tensors(args);
-}
-
-static TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const Arg &args) {
-    return rt->orchestrator.submit_dummy_task(args);
-}
-
-void rt_scope_begin(PTO2Runtime *rt) {
-    PTO2ScopeMode mode = rt->pending_scope_mode;
-    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
-    rt->orchestrator.begin_scope(mode);
-}
-
-void rt_scope_end(PTO2Runtime *rt) { rt->orchestrator.end_scope(); }
-
-void rt_orchestration_done(PTO2Runtime *rt) { rt->orchestrator.mark_done(); }
-
-static bool is_fatal_impl(PTO2Runtime *rt) { return rt->orchestrator.fatal; }
-
-void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...) {
-    va_list args;
-    va_start(args, fmt);
-    if (fmt == nullptr || fmt[0] == '\0') {
-        rt->orchestrator.report_fatal(error_code, func, nullptr);
-    } else {
-        char message[1024];
-        vsnprintf(message, sizeof(message), fmt, args);
-        rt->orchestrator.report_fatal(error_code, func, "%s", message);
-    }
-    va_end(args);
-}
-
-// Wait for all producers of this tensor to be safe for data access.
-// Checks owner metadata (lifecycle anchor) and OverlapMap (modifier writers).
-// For reads: wait until each producer COMPLETED (done writing).
-// For writes: also wait until all consumers done reading
-//   (fanout_refcount >= fanout_count - 1, excluding scope reference).
-// Uses cycle-based timeout (checked every 1024 spins).
-// Returns false on timeout (sets orch.fatal).
-MAYBE_UNINITIALIZED_BEGIN
-static bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller) {
-    PTO2TaskId owner = tensor.owner_task_id;
-    PTO2OrchestratorState &orch = rt->orchestrator;
-
-    // Segmented wait: collect up to kSegmentCap producer slots, then flush by
-    // spinning on each. When the segment fills, we wait for the accumulated
-    // batch before continuing to gather more. Dedup is per-segment only; a
-    // producer that appears in two segments is waited on twice, which is
-    // idempotent (task_state is monotonic) and only adds one atomic load on
-    // the second encounter.
-    constexpr int kSegmentCap = 64;
-    const PTO2TaskSlotState *seg[kSegmentCap];
-    int seg_count = 0;
-    bool signaled = false;
-    bool failed = false;
-
-    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
-        uint8_t ring_id = slot.ring_id;
-        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
-        uint64_t t0 = get_sys_cnt_aicpu();
-        int32_t spin_count = 0;
-        while (slot.task_state.load(std::memory_order_acquire) < PTO2_TASK_COMPLETED) {
-            SPIN_WAIT_HINT();
-            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
-                orch.report_fatal(
-                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
-                    "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed",
-                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
-                );
-                failed = true;
-                return;
-            }
-        }
-    };
-
-    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
-        uint8_t ring_id = slot.ring_id;
-        int32_t local_id = slot.task->task_id.local();
-        uint64_t t0 = get_sys_cnt_aicpu();
-        int32_t spin_count = 0;
-        while (slot.fanout_refcount.load(std::memory_order_acquire) < slot.fanout_count - 1) {
-            SPIN_WAIT_HINT();
-            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES) {
-                orch.report_fatal(
-                    PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller,
-                    "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done",
-                    (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id
-                );
-                failed = true;
-                return;
-            }
-        }
-    };
-
-    auto flush_segment = [&]() {
-        for (int i = 0; i < seg_count; i++) {
-            wait_one_producer(*seg[i]);
-            if (failed) return;
-            if (!wait_for_consumers) continue;
-            wait_one_consumers(*seg[i]);
-            if (failed) return;
-        }
-        seg_count = 0;
-    };
-
-    auto try_push = [&](const PTO2TaskSlotState &s) {
-        for (int j = 0; j < seg_count; j++) {
-            if (seg[j] == &s) return;  // per-segment dedup
-        }
-        if (seg_count == kSegmentCap) {
-            flush_segment();
-            if (failed) return;
-        }
-        seg[seg_count++] = &s;
-        if (!signaled) {
-            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
-            signaled = true;
-        }
-    };
-
-    auto do_wait = [&]() {
-        // Step A: creator retention — read owner directly from tensor metadata
-        if (owner.is_valid()) {
-            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
-            try_push(s);
-            if (failed) return;
-        }
-
-        // Step B: modifier writer lookup (OverlapMap), direct callback
-        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
-            PTO2TaskId pid = entry.producer_task_id;
-            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
-            try_push(s);
-            return !failed;
-        });
-        if (failed) return;
-        flush_segment();
-    };
-
-    do_wait();
-    if (signaled) {
-        orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
-    }
-    return !failed;
-}
-MAYBE_UNINITIALIZED_END
-
-uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]) {
-    if (tensor.buffer.addr == 0) {
-        unified_log_error(
-            __FUNCTION__, "get_tensor_data: buffer not allocated (addr=0). "
-                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
-        );
-        return 0;
-    }
-
-    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) {
-        return 0;
-    }
-
-    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
-    uint64_t elem_size = get_element_size(tensor.dtype);
-    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
-    uint64_t result = 0;
-    memcpy(&result, ptr, elem_size);
-    return result;
-}
-
-void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value) {
-    if (tensor.buffer.addr == 0) {
-        unified_log_error(
-            __FUNCTION__, "set_tensor_data: buffer not allocated (addr=0). "
-                          "Use the Tensor returned by add_output(TensorCreateInfo) after submit returns."
-        );
-        return;
-    }
-
-    // Wait for producer + all consumers before writing (WAW + WAR safety)
-    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) {
-        return;
-    }
-
-    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
-    uint64_t elem_size = get_element_size(tensor.dtype);
-    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
-    memcpy(ptr, &value, elem_size);
-}
-
-// Ops-table entry that hands the call-site captured by PTO2ScopeGuard to the
-// [ScopeStats] collector. The slot is always present in the struct to keep
-// the layout stable; at PTO2_PROFILING=0 we fill nullptr so the orchestration
-// .so's null-check skips it.
-#if PTO2_PROFILING
-static void scope_set_site_impl(const char *file, int line) { scope_stats_set_pending_site(file, line); }
-#endif
-
-static const PTO2RuntimeOps s_runtime_ops = {
-    .submit_task = submit_task_impl,
-    .scope_begin = rt_scope_begin,
-    .scope_end = rt_scope_end,
-    .orchestration_done = rt_orchestration_done,
-    .is_fatal = is_fatal_impl,
-    .report_fatal = rt_report_fatal,
-    .log_error = unified_log_error,
-    .log_warn = unified_log_warn,
-    .log_debug = unified_log_debug,
-    .log_info_v = unified_log_info_v,
-    .get_tensor_data = get_tensor_data,
-    .set_tensor_data = set_tensor_data,
-    .alloc_tensors = alloc_tensors_impl,
-    .submit_dummy_task = submit_dummy_task_impl,
-#if PTO2_PROFILING
-    .scope_set_site = scope_set_site_impl,
-#else
-    .scope_set_site = nullptr,
-#endif
-};
-
-// =============================================================================
-// Runtime Lifecycle (AICPU-only fixup)
-// =============================================================================
-//
-// Layout / init_data / wire / destroy live in
-// runtime/shared/pto_runtime2_init.cpp so the host build can pre-populate the
-// prebuilt arena image. The pieces below — wiring the ops table and the
-// SPMD core counts — depend on the device-side s_runtime_ops global and the
-// AICPU SchedulerContext respectively, so they remain in the AICPU build.
-
-void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count) {
-    rt->ops = &s_runtime_ops;
-    rt->orchestrator.total_cluster_count = aic_count;
-    rt->orchestrator.total_aiv_count = aiv_count;
-}
-
-void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode) {
-    if (rt) {
-        rt->mode = mode;
-    }
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
index 155809365..d73b8859e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
@@ -8,29 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Main Interface
- *
- * This is the main header for the PTO Runtime2 system.
- * It provides a unified API for task graph construction and execution.
- *
- * Key Features:
- * - Ring buffer based memory management (zero allocation overhead)
- * - Lazy invalidation TensorMap for dependency discovery
- * - Scope-based buffer lifecycle management
- * - Per-task spinlocks for concurrent fanout updates
- * - Orchestrator-Scheduler decoupling via shared memory
- *
- * Usage:
- *   1. Create runtime: PTO2Runtime create methods
- *   2. Build task graph in orchestration function:
- *      - begin_scope() / end_scope()
- *      - submit_task()
- *   3. Mark orchestration complete: mark_done()
- *   4. Destroy runtime
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #pragma once
 
@@ -40,33 +17,29 @@
 #include "pto_shared_memory.h"
 #include "pto_ring_buffer.h"
 #include "pto_tensormap.h"
-#include "scheduler/pto_scheduler.h"
+#include "pto_scheduler.h"
 #include "pto_orchestrator.h"
 #include "aicore_completion_mailbox.h"
 
-// =============================================================================
-// Runtime Context
-// =============================================================================
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include "aicpu/device_time.h"
+#include "common/unified_log.h"
 
-/**
- * Runtime execution mode
- */
-enum PTO2RuntimeMode {
+__attribute__((weak, visibility("hidden"))) uint64_t get_sys_cnt_aicpu();
+
+enum PTO2RuntimeMode
+{
     PTO2_MODE_EXECUTE = 0,    // Execute tasks on workers
     PTO2_MODE_SIMULATE = 1,   // Simulate task execution with cycle counting
     PTO2_MODE_GRAPH_ONLY = 2  // Build graph only, no execution
 };
 
-/**
- * Function-pointer ops table for runtime operations.
- *
- * The orchestration .so calls runtime functions through this table
- * (via pto_orchestration_api.h inline wrappers), so it has zero link
- * dependencies on runtime .cpp files.
- */
 typedef struct PTO2Runtime PTO2Runtime;  // forward declare for ops signatures
 
-struct PTO2RuntimeOps {
+struct PTO2RuntimeOps
+{
     TaskOutputTensors (*submit_task)(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args);
     void (*scope_begin)(PTO2Runtime *rt);
     void (*scope_end)(PTO2Runtime *rt);
@@ -75,34 +48,20 @@ struct PTO2RuntimeOps {
     void (*report_fatal)(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
 
     // Logging (populated by runtime, called by orchestration)
-    void (*log_error)(const char *func, const char *fmt, ...);
-    void (*log_warn)(const char *func, const char *fmt, ...);
-    void (*log_debug)(const char *func, const char *fmt, ...);
-    // INFO with explicit verbosity tier (v ∈ [0,9]; gating done inside).
+    // INFO with explicit verbosity tier (v ∈ [0, 9]; gating done inside).
     void (*log_info_v)(const char *func, int v, const char *fmt, ...);
 
     // Cross-layer data access (orchestration reads/writes tensor values via runtime)
     // Placed after logging to avoid shifting hot-path field offsets.
     uint64_t (*get_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
-    void (*set_tensor_data)(
-        PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value
-    );
+    void (*set_tensor_data)(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
     TaskOutputTensors (*alloc_tensors)(PTO2Runtime *rt, const Arg &args);
     TaskOutputTensors (*submit_dummy_task)(PTO2Runtime *rt, const Arg &args);
-    // Stash the call-site captured by PTO2ScopeGuard into the [ScopeStats]
-    // collector. Always present in the struct to keep ops-table layout stable
-    // across PTO2_PROFILING settings; set to nullptr at PTO2_PROFILING=0.
     void (*scope_set_site)(const char *file, int line);
 };
 
-/**
- * Layout descriptor for the prebuilt runtime arena. Holds all sub-region
- * offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
- * AICore mailbox) plus the layout-defining capacities. Produced once on the
- * host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
- * and runtime_wire_arena_pointers.
- */
-struct PTO2RuntimeArenaLayout {
+struct PTO2RuntimeArenaLayout
+{
     size_t off_sm_handle{0};
     PTO2OrchestratorLayout orch;
     PTO2SchedulerLayout sched;
@@ -119,13 +78,8 @@ struct PTO2RuntimeArenaLayout {
     size_t arena_size{0};
 };
 
-/**
- * PTO Runtime2 context
- *
- * Contains all state for orchestration and scheduling.
- * In simulated mode, runs in single process with shared address space.
- */
-struct PTO2Runtime {
+struct PTO2Runtime
+{
     // Ops table (first field — used by orchestration .so via function pointers)
     const PTO2RuntimeOps *ops;
     PTO2ScopeMode pending_scope_mode;
@@ -147,136 +101,304 @@ struct PTO2Runtime {
     // Statistics
     int64_t total_cycles;
 
-    // Prebuilt-arena fast path metadata. Carries every offset
-    // wire_arena_pointers needs at AICPU boot so the AICPU can reconstruct
-    // all arena-internal pointer fields without re-running init_data. The
-    // device base of the runtime arena travels separately on the host-side
-    // Runtime (Runtime::prebuilt_arena_base_), since the AICPU needs it
-    // *before* dereferencing this image. Populated on host by
-    // runtime_init_data_from_layout + runtime_wire_arena_pointers; read by
-    // aicpu_executor.cpp.
     PTO2RuntimeArenaLayout prebuilt_layout;
 };
 
-// =============================================================================
-// Runtime Lifecycle API
-// =============================================================================
-
-/**
- * Phase 1 — declare every sub-region (sm_handle wrapper, orchestrator /
- * scheduler / tensor_map / mailbox / PTO2Runtime header) on the supplied
- * arena. Pure arithmetic; does not touch device memory and may run on host.
- * Returns the layout descriptor; caller commits/attaches the arena before
- * Phase 2/3.
- */
-PTO2RuntimeArenaLayout runtime_reserve_layout(
-    DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE
-);
-
-/**
- * Phase 2 — write the data half of the runtime arena: standalone fields,
- * memset'd arena regions, sub-structure initializers, and SM-side device
- * pointers. The arena must already be committed (or attached); writes go
- * into arena.base() + sub-region offsets.
- *
- * `sm_dev_base` / `gm_heap_dev_base` are device addresses; we only store
- * them (never dereference). Safe to run on a host arena that owns a host
- * mirror of the runtime image — the resulting buffer is rtMemcpy-ready.
- *
- * Returns the PTO2Runtime* that sits at layout.off_runtime within the arena.
- * Caller must follow up with runtime_wire_arena_pointers; rt->ops and the
- * AICore-side count fields are left untouched and must be filled by the
- * AICPU at boot.
- */
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t sm_size,
-    void *gm_heap_dev_base, uint64_t heap_size
-);
-
-/**
- * Phase 3 — wire every arena-internal pointer field (rt->sm_handle,
- * rt->aicore_mailbox, orchestrator.{scope_tasks, scope_begins, scheduler,
- * tensor_map.*, rings[].fanin_pool.base}, scheduler.{ready_queues, dep_pool,
- * wiring.queue}) so each holds arena.base() + offset. Idempotent — runs on
- * both host (writing host-mirror addresses) and AICPU (writing device
- * addresses) sides.
- */
-void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt);
-
-/**
- * AICPU-only Phase 4 — fill in the few fields the host could not know at
- * prebuilt-image build time: the ops table (s_runtime_ops is a device-side
- * file-local global, host cannot resolve its device address) and the
- * orchestrator's core counts (depend on the executor's scheduler context).
- * Call once per boot after runtime_wire_arena_pointers.
- */
-void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count);
-
-/**
- * Destroy runtime. With the prebuilt-arena fast path the arena buffer is
- * pooled across runs by DeviceRunner, so we never call arena.release()
- * here — the destructor only forgets sub-structure pointers (idempotent
- * cleanup).
- */
-void runtime_destroy(PTO2Runtime *rt, DeviceArena &arena);
-
-/**
- * Set execution mode
- */
-void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode);
-
-// =============================================================================
-// Orchestration API (called by orchestration function)
-// =============================================================================
-
-/**
- * Begin a new scope
- *
- * All tasks submitted within this scope will have their lifetime
- * bounded by the scope. When scope_end() is called, the scope
- * releases its reference to all enclosed tasks.
- */
-void rt_scope_begin(PTO2Runtime *rt);
-
-/**
- * End current scope
- *
- * Releases scope reference for all tasks submitted since scope_begin().
- * Tasks whose refcount reaches zero will have their buffers released.
- */
-void rt_scope_end(PTO2Runtime *rt);
-
-/**
- * Mark orchestration as complete
- *
- * Signals that no more tasks will be submitted.
- */
-void rt_orchestration_done(PTO2Runtime *rt);
-
-/**
- * Enter fatal state explicitly from orchestration.
- */
-void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...);
-
-/**
- * Cross-layer data access: read a tensor value by waiting for its producer.
- */
-uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[]);
+inline PTO2RuntimeArenaLayout runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity)
+{
+    PTO2RuntimeArenaLayout layout{};
+    layout.task_window_size = task_window_size;
+    layout.dep_pool_capacity = dep_pool_capacity;
+
+    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = static_cast<int32_t>(task_window_size);
+
+    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
+    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
+    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
+    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
+
+    layout.arena_size = arena.total_size();
+    return layout;
+}
+
+inline PTO2Runtime *runtime_init_data_from_layout(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base, uint64_t, void *gm_heap_dev_base, uint64_t heap_size)
+{
+    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
+    memset(rt, 0, sizeof(*rt));
+
+    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    memset(sm_wrap, 0, sizeof(*sm_wrap));
+
+    // rt->ops is filled by the AICPU at boot.
+    rt->mode = mode;
+    rt->gm_heap = gm_heap_dev_base;
+    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
+    rt->gm_heap_owned = false;
+    rt->total_cycles = 0;
+
+    if (!rt->orchestrator.init_data_from_layout(layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size)) return nullptr;
+    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) return nullptr;
+
+    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    memset(mailbox, 0, sizeof(*mailbox));
+
+    return rt;
+}
+
+inline void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt)
+{
+    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
+    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
+    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
+    rt->scheduler.wire_arena_pointers(layout.sched, arena);
+}
+
+inline void runtime_destroy(PTO2Runtime *rt)
+{
+    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
+    if (!rt) return;
+    rt->scheduler.destroy();
+    rt->orchestrator.destroy();
+    rt->aicore_mailbox = nullptr;
+    rt->sm_handle = nullptr;
+}
+
+inline void runtime_set_mode(PTO2Runtime *rt, PTO2RuntimeMode mode)
+{
+    if (rt) rt->mode = mode;
+}
+
+inline void rt_scope_begin(PTO2Runtime *rt)
+{
+    PTO2ScopeMode mode = rt->pending_scope_mode;
+    rt->pending_scope_mode = PTO2ScopeMode::AUTO;
+    rt->orchestrator.begin_scope(mode);
+}
+
+inline void rt_scope_end(PTO2Runtime *rt)
+{
+    rt->orchestrator.end_scope();
+}
+
+inline void rt_orchestration_done(PTO2Runtime *rt)
+{
+    rt->orchestrator.mark_done();
+}
+
+inline void rt_report_fatal(PTO2Runtime *rt, int32_t error_code, const char *func, const char *fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+    if (fmt == nullptr || fmt[0] == '\0')
+    {
+        rt->orchestrator.report_fatal(error_code, func, nullptr);
+    }
+    else
+    {
+        char message[1024];
+        vsnprintf(message, sizeof(message), fmt, args);
+        rt->orchestrator.report_fatal(error_code, func, "%s", message);
+    }
+    va_end(args);
+}
+
+// Orchestration-side logging dispatcher: orchestration .so calls
+// LOG_INFO_V<n>(fmt, ...) which routes through this op into the unified log.
+// The verbosity gate lives inside unified_log_info_v.
+inline void rt_log_info_v(const char *func, int v, const char *fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+    char message[1024];
+    vsnprintf(message, sizeof(message), fmt, args);
+    va_end(args);
+    unified_log_info_v(func, v, "%s", message);
+}
+
+MAYBE_UNINITIALIZED_BEGIN
+inline bool wait_for_tensor_ready(PTO2Runtime *rt, const Tensor &tensor, bool wait_for_consumers, const char *caller)
+{
+    PTO2TaskId owner = tensor.owner_task_id;
+    PTO2OrchestratorState &orch = rt->orchestrator;
+
+    constexpr int kSegmentCap = 64;
+    const PTO2TaskSlotState *seg[kSegmentCap];
+    int seg_count = 0;
+    bool signaled = false;
+    bool failed = false;
+
+    auto wait_one_producer = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = static_cast<int32_t>(slot.task->task_id.local());
+        auto &ring_hdr = orch.sm_header->rings[ring_id];
+        const int32_t mask = ring_hdr.task_window_mask;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        // (m) Use completion_flags as the single completion signal.
+        while (ring_hdr.completion_flags[local_id & mask].load(std::memory_order_acquire) == 0)
+        {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
+            {
+                orch.report_fatal(PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, "Timeout (%llu cycles): producer (ring=%d, local=%d) not completed", (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id);
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto wait_one_consumers = [&](const PTO2TaskSlotState &slot) {
+        uint8_t ring_id = slot.ring_id;
+        int32_t local_id = slot.task->task_id.local();
+        // With watermark-based reclamation, "all consumers done" means the
+        // per-ring completed_watermark has reached this slot's recorded
+        // last_consumer_local_id.
+        PTO2SharedMemoryRingHeader &ring_hdr = rt->orchestrator.sm_header->rings[ring_id];
+        int32_t target = slot.last_consumer_local_id;
+        uint64_t t0 = get_sys_cnt_aicpu();
+        int32_t spin_count = 0;
+        while (ring_hdr.completed_watermark.load(std::memory_order_acquire) < target)
+        {
+            SPIN_WAIT_HINT();
+            if ((++spin_count & 1023) == 0 && get_sys_cnt_aicpu() - t0 > PTO2_TENSOR_DATA_TIMEOUT_CYCLES)
+            {
+                orch.report_fatal(PTO2_ERROR_TENSOR_WAIT_TIMEOUT, caller, "Timeout (%llu cycles): consumers of producer (ring=%d, local=%d) not done", (unsigned long long)PTO2_TENSOR_DATA_TIMEOUT_CYCLES, ring_id, local_id);
+                failed = true;
+                return;
+            }
+        }
+    };
+
+    auto flush_segment = [&]() {
+        for (int i = 0; i < seg_count; i++)
+        {
+            wait_one_producer(*seg[i]);
+            if (failed) return;
+            if (!wait_for_consumers) continue;
+            wait_one_consumers(*seg[i]);
+            if (failed) return;
+        }
+        seg_count = 0;
+    };
+
+    auto try_push = [&](const PTO2TaskSlotState &s) {
+        for (int j = 0; j < seg_count; j++)
+            if (seg[j] == &s) return;
+        if (seg_count == kSegmentCap)
+        {
+            flush_segment();
+            if (failed) return;
+        }
+        seg[seg_count++] = &s;
+        if (!signaled)
+        {
+            orch.scheduler->wiring.orch_needs_drain.store(true, std::memory_order_release);
+            signaled = true;
+        }
+    };
+
+    auto do_wait = [&]() {
+        if (owner.is_valid())
+        {
+            auto &s = orch.sm_header->rings[owner.ring()].get_slot_state_by_task_id(owner.local());
+            try_push(s);
+            if (failed) return;
+        }
+
+        orch.tensor_map.lookup(tensor, [&](PTO2TensorMapEntry &entry, OverlapStatus) -> bool {
+            PTO2TaskId pid = entry.producer_task_id;
+            auto &s = orch.sm_header->rings[pid.ring()].get_slot_state_by_task_id(pid.local());
+            try_push(s);
+            return !failed;
+        });
+        if (failed) return;
+        flush_segment();
+    };
+
+    do_wait();
+    if (signaled) orch.scheduler->wiring.orch_needs_drain.store(false, std::memory_order_release);
+    return !failed;
+}
+MAYBE_UNINITIALIZED_END
+
+inline uint64_t get_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[])
+{
+    if (tensor.buffer.addr == 0) return 0;
+
+    if (!wait_for_tensor_ready(rt, tensor, false, __FUNCTION__)) return 0;
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    const void *ptr = reinterpret_cast<const void *>(tensor.buffer.addr + flat_offset * elem_size);
+    uint64_t result = 0;
+    memcpy(&result, ptr, elem_size);
+    return result;
+}
+
+inline void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value)
+{
+    if (tensor.buffer.addr == 0) return;
+
+    // Wait for producer + all consumers before writing (WAW + WAR safety)
+    if (!wait_for_tensor_ready(rt, tensor, true, __FUNCTION__)) return;
+
+    uint64_t flat_offset = tensor.compute_flat_offset(indices, ndims);
+    uint64_t elem_size = get_element_size(tensor.dtype);
+    void *ptr = reinterpret_cast<void *>(tensor.buffer.addr + flat_offset * elem_size);
+    memcpy(ptr, &value, elem_size);
+}
+
+// Function-pointer ops table backing — moved from pto_runtime2.cpp so that
+// the inline runtime_finalize_after_wire above can refer to it.
+
+inline TaskOutputTensors submit_task_impl(PTO2Runtime *rt, const MixedKernels &mixed_kernels, const Arg &args)
+{
+    return rt->orchestrator.submit_task(mixed_kernels, args);
+}
+
+inline TaskOutputTensors alloc_tensors_impl(PTO2Runtime *rt, const Arg &args)
+{
+    return rt->orchestrator.alloc_tensors(args);
+}
+
+inline TaskOutputTensors submit_dummy_task_impl(PTO2Runtime *rt, const Arg &args)
+{
+    return rt->orchestrator.submit_dummy_task(args);
+}
+
+inline bool is_fatal_impl(PTO2Runtime *rt)
+{
+    return rt->orchestrator.fatal;
+}
+
+inline const PTO2RuntimeOps s_runtime_ops = {
+    .submit_task = submit_task_impl,
+    .scope_begin = rt_scope_begin,
+    .scope_end = rt_scope_end,
+    .orchestration_done = rt_orchestration_done,
+    .is_fatal = is_fatal_impl,
+    .report_fatal = rt_report_fatal,
+    .log_info_v = rt_log_info_v,
+    .get_tensor_data = get_tensor_data,
+    .set_tensor_data = set_tensor_data,
+    .alloc_tensors = alloc_tensors_impl,
+    .submit_dummy_task = submit_dummy_task_impl,
+    .scope_set_site = nullptr,
+};
 
-/**
- * Cross-layer data access: write a value to a tensor at given indices.
- * Waits for producer completion (WAW) and all consumers (WAR) via TensorMap.
- * See set_tensor_data in pto_orchestration_api.h for full documentation.
- */
-void set_tensor_data(PTO2Runtime *rt, const Tensor &tensor, uint32_t ndims, const uint32_t indices[], uint64_t value);
+inline void runtime_finalize_after_wire(PTO2Runtime *rt, int32_t aic_count, int32_t aiv_count)
+{
+    rt->ops = &s_runtime_ops;
+    rt->orchestrator.total_cluster_count = aic_count;
+    rt->orchestrator.total_aiv_count = aiv_count;
+}
 
-/**
- * Slim config struct exported by orchestration .so via aicpu_orchestration_config().
- * Shared definition with pto_orchestration_api.h (same layout, guarded).
- */
 #ifndef PTO2_ORCHESTRATION_CONFIG_DEFINED
 #define PTO2_ORCHESTRATION_CONFIG_DEFINED
-struct PTO2OrchestrationConfig {
+struct PTO2OrchestrationConfig
+{
     int expected_arg_count;
 };
 #endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 587a44dff..602abf83e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -9,19 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - Core Type Definitions
- *
- * This header defines all fundamental types used by the PTO Runtime2 system:
- * - Configuration constants
- * - Worker types and task states
- * - Tensor regions and task parameters
- * - Task descriptors with fanin/fanout tracking
- * - Dependency list entries
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
 
@@ -40,11 +27,6 @@
 #include "pto_task_id.h"
 #include "pto_types.h"
 
-// Spin-wait hint for AICPU threads.  On real hardware the AICPU has dedicated
-// ARM A55 cores — no OS yield is needed, so the hint is a no-op.  In simulation
-// all threads share host CPU cores, so we yield to prevent starvation.
-// This header is also compiled into the Host .so (for struct definitions only),
-// where the hint is never called — the fallback no-op keeps Host builds clean.
 #if __has_include("spin_hint.h")
 #include "spin_hint.h"
 #else
@@ -65,9 +47,8 @@
 // Use pto2_task_slot(sched, task_id) for slot calculation.
 #define PTO2_TASK_WINDOW_SIZE 16384  // Default per-ring task window size (power of 2)
 
-// Multi-ring: number of independent ring layers (HeapRing + TaskRing + DepPool per layer)
-// Scope depth maps to ring index via: min(scope_depth, PTO2_MAX_RING_DEPTH - 1)
-#define PTO2_MAX_RING_DEPTH 4
+// Step 1 of static-N migration: single-ring layout. All scopes map to ring 0.
+#define PTO2_MAX_RING_DEPTH 1
 
 // Memory pools (per-ring defaults; total = value × PTO2_MAX_RING_DEPTH)
 #define PTO2_HEAP_SIZE (256 * 1024 * 1024)  // 256MB per ring (1GB total)
@@ -77,11 +58,6 @@
 
 // Scope management
 #define PTO2_MAX_SCOPE_DEPTH 64  // Maximum nesting depth
-// Hard cap for the scope_tasks buffer. Equals the total in-flight ring slot
-// budget (PTO2_TASK_WINDOW_SIZE × PTO2_MAX_RING_DEPTH): once every ring slot
-// is in flight, no more tasks can ever be pushed regardless of buffer size.
-// scope_tasks_push fatals on overflow rather than growing the arena-owned
-// buffer (which would be UB on the arena's malloc'd backing).
 #define PTO2_SCOPE_TASKS_CAP (PTO2_TASK_WINDOW_SIZE * PTO2_MAX_RING_DEPTH)
 
 // Ready queue
@@ -93,8 +69,8 @@
 // Wiring queue
 #define PTO2_WRIRING_QUEUE_SIZE 1024  // Per-shape queue size
 
-// Fanin storage
-#define PTO2_FANIN_INLINE_CAP 64
+// Fanin storage — absolute max number of unique fanin dependencies per task.
+#define PTO2_MAX_FANIN 16
 
 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
@@ -104,87 +80,38 @@
 // ~10s on hardware (1.5 GHz counter), ~10s on simulation (chrono-based).
 constexpr uint64_t PTO2_TENSOR_DATA_TIMEOUT_CYCLES = 15 * 1000 * 1000 * 1000ULL;
 
-// =============================================================================
-// Task States
-// =============================================================================
-
-/**
- * Task state enumeration
- *
- * State transitions:
- *   PENDING -> COMPLETED -> CONSUMED
- *
- * The slot stays in PENDING from submit through "ready in queue" and "running
- * on a worker"; readiness and running-vs-idle are derived from fanin_refcount
- * and per-core running_slot_state respectively, not from task_state itself.
- *
- * Conditions:
- *   PENDING->COMPLETED:   all subtasks finish (set by scheduler) or task is a
- *                         hidden alloc completed inline by the orchestrator
- *   COMPLETED->CONSUMED:  fanout_refcount == fanout_count && state == COMPLETED
- */
-typedef enum {
-    PTO2_TASK_PENDING = 0,    // Submitted; awaiting fanin, queued, or dispatched
-    PTO2_TASK_COMPLETED = 1,  // Execution finished, output may still be in use
-    PTO2_TASK_CONSUMED = 2    // Output fully consumed, buffers can be released
+typedef enum
+{
+    PTO2_TASK_PENDING = 0,   // Submitted; awaiting fanin, queued, or dispatched
+    PTO2_TASK_COMPLETED = 1  // Execution finished; per-ring completed_watermark
+                             // advances past this slot's last_consumer_local_id
+                             // to make its heap chunk reclaimable.
 } PTO2TaskState;
 
-/**
- * Result of a unified task allocation.
- */
-struct PTO2TaskAllocResult {
+struct PTO2TaskAllocResult
+{
     int32_t task_id;    // Absolute task ID (not wrapped)
     int32_t slot;       // task_id & (window_size - 1)
     void *packed_base;  // Heap allocation result (nullptr if failure)
     void *packed_end;   // packed_base + aligned output_size
 
-    bool failed() const { return task_id < 0; }
+    bool failed() const
+    {
+        return task_id < 0;
+    }
 };
 
-struct PTO2OutputLayout {
+struct PTO2OutputLayout
+{
     uint64_t offsets[MAX_TENSOR_ARGS] = {};
     uint64_t buffer_sizes[MAX_TENSOR_ARGS] = {};
     int32_t total_output_size = 0;
 };
 
-// =============================================================================
-// Dependency List Entry
-// =============================================================================
-
-/**
- * Fanin spill entry
- * Stored in the dedicated fanin spill ring buffer.
- */
 struct PTO2TaskSlotState;  // Forward declaration
-struct PTO2FaninPool;      // Forward declaration
-struct PTO2FaninSpillEntry {
-    PTO2TaskSlotState *slot_state;
-};
-static_assert(sizeof(PTO2FaninSpillEntry) == sizeof(PTO2TaskSlotState *));
-
-/**
- * Dependency list entry (singly-linked list node)
- * Stored in DepListPool ring buffer.
- */
-struct PTO2DepListEntry {
-    PTO2TaskSlotState *slot_state;  // Consumer slot state (direct pointer)
-    PTO2DepListEntry *next;         // next entry
-};
-
-// =============================================================================
-// Task Descriptor
-// =============================================================================
 
-/**
- * Task descriptor structure (shared memory)
- *
- * Stored in the TaskDescriptor ring buffer in shared memory.
- * Contains static identification and buffer pointers only.
- * Dynamic scheduling state (fanin/fanout/task_state) is in PTO2TaskSlotState.
- *
- * Fields set by Orchestrator at submission, read by Scheduler for dispatch.
- */
-struct PTO2TaskDescriptor {
+struct PTO2TaskDescriptor
+{
     // Mixed-task identification (encodes ring_id in upper 32 bits)
     PTO2TaskId task_id;  // raw: (ring_id << 32) | local_id
 
@@ -225,53 +152,38 @@ enum PTO2SpecState : uint8_t {
 inline constexpr int PTO2_SPEC_CORE_MASK_WORDS = 2;
 
 struct PTO2TaskPayload {
-    // === Cache lines 0-8 (576B) — metadata + inline fanin ===
+    // === Cache lines 0-2 (192B) — metadata + fanin (wireless model) ===
     int32_t tensor_count{0};
     int32_t scalar_count{0};
-    int32_t fanin_actual_count{0};  // Actual fanin count (without the +1 redundance)
-    int32_t fanin_spill_start{0};   // Linear start index in fanin spill pool (0 = no spill)
+    // wireless: flat fanin_local_ids[] populated at submit. The thread-0
+    // pending poll indexes a compact ring-level completion_flags byte array
+    // via these ids — avoids a pointer chase per fanin into a 128B-aligned
+    // slot_state.
+    int32_t fanin_count{0};
+    int32_t fanin_local_ids[PTO2_MAX_FANIN];
+    // ---- Upstream spec-dispatch coexistence (compatibility layer) ----
+    // Speculative early-dispatch (#1079) was built on a fanin_refcount /
+    // fanin_slot_states model. The wireless poller doesn't read these
+    // fields, but the spec-dispatch code paths still do — keep the storage
+    // so that code links. Populated alongside fanin_local_ids[].
+    int32_t fanin_actual_count{0};
+    int32_t fanin_spill_start{0};
     PTO2FaninPool *fanin_spill_pool{nullptr};
     PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
-    // Speculative early-dispatch metadata (AICPU-side only). Ordered by descending
-    // alignment (8B mask, 4B fanin, then 1B flags) so the block packs with no
-    // internal padding. Kept here after the fanin array (not moved up front): on
-    // cache line 8 it shares only with the rarely-touched fanin tail, whereas in
-    // line 0 the spec atomics (written during staging) would false-share with
-    // tensor_count/scalar_count (read by build_payload at dispatch). Fits in the 40B
-    // between the fanin array (offset 536) and the 64B-aligned tensors[] (offset
-    // 576), so sizeof and tensors[] are unchanged.
-    //
-    // Bitmask of global core_ids this consumer is pre-staged (gated) on. Set with
-    // atomic fetch_or by concurrent stagers; read by release. (Re)initialized in
-    // PTO2TaskPayload::init before the slot can be staged again.
     std::atomic<uint64_t> staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]{};
-    // Early-dispatch CANDIDATE detection (event-driven, dual of fanin_refcount):
-    // seeded at wiring with producers already complete, then a flagged producer's
-    // DISPATCH bumps each consumer's dispatch_fanin. dispatch_fanin ==
-    // fanin_actual_count  <=>  every producer is flagged-and-dispatched or was
-    // pre-completed  =>  this task is an early-dispatch candidate (push early_dispatch_queue).
-    std::atomic<int32_t> dispatch_fanin{0};  // CONSUMER side: flagged-dispatched + pre-completed producers
-    bool allow_early_resolve{false};         // codegen hint copied from Arg in PTO2TaskPayload::init
-    // Lock-free claim state shared by the stagers (Hook 1, possibly several AICPU
-    // threads concurrently) and the completion-path release: 0=NONE, 1=STAGING,
-    // 3=DISPATCHED (2=STAGED is unused now). STAGING is the STABLE gated state —
-    // many threads stage blocks concurrently while it holds, each claiming a block
-    // via the atomic next_block_idx and OR-ing its cores into staged_core_mask.
-    // Release does STAGING->DISPATCHED then rings the mask; a thread that stages a
-    // block AFTER release flipped DISPATCHED rings that block's doorbell itself
-    // (self-ring), so no doorbell is ever missed.
+    std::atomic<int32_t> dispatch_fanin{0};
+    bool allow_early_resolve{false};
     std::atomic<uint8_t> spec_state{0};
-    std::atomic<uint8_t> dispatch_propagated{0};  // PRODUCER side: once-guard for fanout propagation
-    std::atomic<uint8_t> spec_chain_active{0};    // inherited early-dispatch flag (auto-chain past codegen flag)
-    uint8_t spec_chain_depth{0};                  // auto-chain depth; inherited = parent+1, capped
-    // === Cache lines 9-72 (4096B) — tensors (alignas(64) forces alignment) ===
+    std::atomic<uint8_t> dispatch_propagated{0};
+    std::atomic<uint8_t> spec_chain_active{0};
+    uint8_t spec_chain_depth{0};
+    // === Tensors (Tensor is alignas(64); array is naturally aligned) ===
     Tensor tensors[MAX_TENSOR_ARGS];
-    // === Cache lines 73-74 (128B) — scalars ===
+    // === Scalars ===
     uint64_t scalars[MAX_SCALAR_ARGS];
 
-    // Layout verification (size checks that don't need offsetof).
     static_assert(sizeof(Tensor) == 128, "Tensor must be 2 cache lines");
-    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == 128, "scalar region must be 128B (2 cache lines)");
+    static_assert(MAX_SCALAR_ARGS * sizeof(uint64_t) == MAX_SCALAR_ARGS * 8, "scalar region size matches MAX_SCALAR_ARGS");
 
     /**
      * Prefetch (for write) the regions init() is about to fill so the stores land
@@ -309,8 +221,10 @@ struct PTO2TaskPayload {
         scalar_count = args.scalar_count();
 
         // int32_t out_idx = 0;
-        for (int32_t i = 0; i < args.tensor_count(); i++) {
-            if (args.tag(i) != TensorArgType::OUTPUT) {
+        for (int32_t i = 0; i < args.tensor_count(); i++)
+        {
+            if (args.tag(i) != TensorArgType::OUTPUT)
+            {
                 tensors[i].copy(*args.tensor(i).ptr);
             } else {
                 init_tensor_from_create_info(
@@ -350,70 +264,38 @@ struct PTO2TaskPayload {
 };
 
 // PTO2TaskPayload layout verification (offsetof requires complete type).
-static_assert(offsetof(PTO2TaskPayload, fanin_spill_pool) == 16, "spill pool pointer layout drift");
-static_assert(
-    offsetof(PTO2TaskPayload, fanin_inline_slot_states) == 24, "inline fanin array must follow spill metadata"
-);
-static_assert(offsetof(PTO2TaskPayload, tensors) == 576, "tensors must start at byte 576 (cache line 9)");
-static_assert(
-    offsetof(PTO2TaskPayload, scalars) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor),
-    "scalars must immediately follow tensors"
-);
-static_assert(
-    sizeof(PTO2TaskPayload) == 576 + MAX_TENSOR_ARGS * sizeof(Tensor) + MAX_SCALAR_ARGS * sizeof(uint64_t),
-    "PTO2TaskPayload size must stay on the baseline cache-line footprint"
-);
+static_assert(offsetof(PTO2TaskPayload, fanin_local_ids) == 12, "fanin array must follow metadata words");
+static_assert(offsetof(PTO2TaskPayload, scalars) == offsetof(PTO2TaskPayload, tensors) + MAX_TENSOR_ARGS * sizeof(Tensor), "scalars must immediately follow tensors");
+static_assert(sizeof(PTO2TaskPayload) == offsetof(PTO2TaskPayload, scalars) + MAX_SCALAR_ARGS * sizeof(uint64_t), "no trailing padding after scalars");
+
+struct alignas(64) PTO2TaskSlotState
+{
+    // Highest local task id among this slot's consumers. Set to this slot's
+    // own local_id in prepare_task; bumped via max() in submit_task_common for
+    // each consumer that has this slot as a fanin. The slot's heap chunk is
+    // safe to reclaim when the per-ring completed_watermark reaches at least
+    // this id (i.e. every task up to and including the last consumer has
+    // transitioned to COMPLETED). Single-writer (orchestrator) at submit time.
+    int32_t last_consumer_local_id;
 
-/**
- * Per-task slot scheduling state (scheduler-private, NOT in shared memory)
- *
- * Consolidates all hot-path scheduling fields into a single cache-friendly
- * structure (32 bytes = half a cache line). Accessing any field of a task's
- * slot state brings all related fields into the same cache line.
- *
- * Concurrency notes:
- * - fanout_head, fanout_count protected by fanout_lock (per-task spinlock)
- * - fanin_count set once at submission, read-only after (hot path for ready check)
- * - task_state, fanin_refcount, fanout_refcount updated atomically
- */
-struct alignas(64) PTO2TaskSlotState {
-    // Fanout lock + list (accessed together under lock in on_task_complete)
-    std::atomic<int32_t> fanout_lock;  // Per-task spinlock (0=unlocked, 1=locked)
-    int32_t fanout_count;              // 1 (owning scope) + number of consumers
-
-    PTO2DepListEntry *fanout_head;  // Pointer to first fanout entry (nullptr = empty)
-
-    // Task state (completion, consumed check, ready check)
-    std::atomic<PTO2TaskState> task_state;  // PENDING/COMPLETED/CONSUMED
-
-    // Fanin (accessed together in release_fanin_and_check_ready)
-    std::atomic<int32_t> fanin_refcount;  // Dynamic: counts completed producers
-    int32_t fanin_count;                  // Number of producer dependencies (set once by wiring)
-
-    // Fanout refcount (accessed with fanout_count in check_and_handle_consumed)
-    std::atomic<int32_t> fanout_refcount;  // Dynamic: counts released references
-
-    // --- Per-slot constant, re-bound by orch::prepare_task each submit ---
-    // Value is the same on every reuse (&task_payloads[slot] / &task_descriptors[slot]),
-    // but written here per-submit instead of in an O(window_size) init loop —
-    // these are the only "scale-dependent" pointers in this struct, so moving
-    // them out of init makes startup cost independent of task_window_size.
     PTO2TaskPayload *payload;
     PTO2TaskDescriptor *task;
 
+    // --- (e) Wake-list: lightweight last-fanin notification ---
+    // When a pending consumer's fanin scan finds exactly ONE unmet fanin,
+    // it registers itself on the producer's wake list (CAS push). On producer
+    // completion, the producer atomic-exchanges wake_list_head to the
+    // SENTINEL value and pushes every waiter to the ready queues. Consumers
+    // that observe SENTINEL during registration push themselves directly
+    // (producer already completed). Reset to nullptr on slot reuse.
+    std::atomic<PTO2TaskSlotState *> wake_list_head{nullptr};
+    PTO2TaskSlotState *next_in_wake_list{nullptr};
+
     // --- Set per-submit (depend on task inputs) ---
     ActiveMask active_mask;  // Bitmask of active subtask slots (set once)
     uint8_t ring_id;         // Ring layer (immutable after init)
-    // Set by any subtask FIN that pushed deferred-completion CONDITIONs to
-    // the runtime mailbox; read by the last subtask FIN to decide whether
-    // the task needs MPSC-deferred completion or can complete inline on this
-    // thread. Carved out of the otherwise-padding byte between ring_id and
-    // dep_pool_mark to keep PTO2TaskSlotState at 64 bytes. The write is
-    // sequenced before on_subtask_complete's acq_rel fetch_add and the read
-    // after, so all earlier subtasks' writes are visible to the last subtask.
     std::atomic<bool> any_subtask_deferred{false};
     uint8_t _async_pad{0};
-    int32_t dep_pool_mark{0};  // Dep pool top after wiring (thread-0-only)
 
     std::atomic<int16_t> completed_subtasks{0};  // Each core completion increments by 1
     int16_t total_required_subtasks{0};          // = logical_block_num * popcount(active_mask)
@@ -424,41 +306,19 @@ struct alignas(64) PTO2TaskSlotState {
     // happens before release; normal dispatch of the remainder happens after).
     std::atomic<int16_t> next_block_idx{0};
 
-    /**
-     * Bind the slot-invariant ring id. Called once per slot during
-     * RingSchedState::init(); ring_id never changes across reuses.
-     */
-    void bind_ring(uint8_t rid) { ring_id = rid; }
+    void bind_ring(uint8_t rid)
+    {
+        ring_id = rid;
+    }
 
-    /**
-     * Re-bind the per-slot payload/task pointers. Called by
-     * orch::prepare_task on every submit. Value is constant for a given
-     * slot, but we pay the cheap re-write each submit (both fields land on
-     * the same 64B slot_state cache line that prepare_task is already
-     * dirtying) to avoid the init-time per-slot loop.
-     */
-    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t) {
+    void bind_buffers(PTO2TaskPayload *p, PTO2TaskDescriptor *t)
+    {
         payload = p;
         task = t;
     }
 
-    /**
-     * Reset dynamic scheduling fields for slot reuse.
-     * Called by advance_ring_pointers() after a slot transitions to CONSUMED
-     * and last_task_alive advances past it, but before sync_to_sm() publishes
-     * the new last_task_alive to the orchestrator.
-     *
-     * Skips payload, task, ring_id (immutable, bound once at init).
-     * Skips task_state: left as CONSUMED so that wait_for_tensor_ready()
-     * callers holding stale owner_task_id still observe a completed state.
-     * task_state is set to PENDING by the orchestrator when it reuses the slot.
-     */
-    void reset_for_reuse() {
-        fanout_lock.store(0, std::memory_order_relaxed);
-        fanout_count = 1;
-        fanout_head = nullptr;
-        fanin_refcount.store(0, std::memory_order_relaxed);
-        fanout_refcount.store(0, std::memory_order_relaxed);
+    void reset_for_reuse()
+    {
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx.store(0, std::memory_order_relaxed);
         any_subtask_deferred.store(false, std::memory_order_relaxed);
@@ -466,57 +326,19 @@ struct alignas(64) PTO2TaskSlotState {
         // spec_chain_*) are NOT reset here — this method skips the payload by
         // contract. They are (re)initialized in PTO2TaskPayload::init on every
         // submit, before the slot becomes visible to the scheduler.
-    }
-
-    // === Per-task fanout spinlock ===
-    //
-    // Used by BOTH the orchestrator and the scheduler. The fanout_lock MUST
-    // be held whenever reading or writing fanout_head / fanout_count, because
-    // the orchestrator adds consumers concurrently with the scheduler
-    // traversing the list after task completion.
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void lock_fanout(uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-
-        for (;;) {
-            while (fanout_lock.load(std::memory_order_acquire) != 0) {
-                contended = true;
-                atomic_ops++;
-                SPIN_WAIT_HINT();
-            }
-            int32_t expected = 0;
-            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
-                atomic_ops++;
-                atomic_count += atomic_ops;
-                if (contended) {
-                    wait_cycle += (get_sys_cnt_aicpu() - t0);
-                }
-                return;
-            }
-            contended = true;
-            atomic_ops++;
-        }
-    }
-#endif
 
-    void lock_fanout() {
-        for (;;) {
-            while (fanout_lock.load(std::memory_order_acquire) != 0) {
-                SPIN_WAIT_HINT();
-            }
-            int32_t expected = 0;
-            if (fanout_lock.compare_exchange_weak(expected, 1, std::memory_order_acquire, std::memory_order_relaxed)) {
-                return;
-            }
-        }
+        // (e) Wake list: clear for the next incarnation. Previous incarnation
+        // left it at WAKE_LIST_SENTINEL (set by its on_mixed_task_complete).
+        wake_list_head.store(nullptr, std::memory_order_relaxed);
+        next_in_wake_list = nullptr;
+        // last_consumer_local_id is reset in prepare_task once the task_id is known.
     }
-
-    void unlock_fanout() { fanout_lock.store(0, std::memory_order_release); }
 };
 
-static_assert(sizeof(PTO2TaskSlotState) == 64);
+// (e) Sentinel marking a wake list as "owner already completed; no more
+// registrations accepted". Distinct from any real slot_state pointer.
+inline PTO2TaskSlotState *const WAKE_LIST_SENTINEL = reinterpret_cast<PTO2TaskSlotState *>(uintptr_t{1});
+
+static_assert(sizeof(PTO2TaskSlotState) <= 128, "slot state should fit in two cache lines");
 
 #endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_RUNTIME2_TYPES_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
new file mode 100644
index 000000000..6305ad10b
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_scheduler.h
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#pragma once
+
+#include <atomic>
+
+#include "common/core_type.h"
+#include "utils/device_arena.h"
+#include "pto_async_wait.h"
+#include "pto_ring_buffer.h"
+#include "pto_runtime2_types.h"
+#include "pto_shared_memory.h"
+
+// Forward declaration so this header can compile under both AICPU and host
+// builds. The actual definition is provided by aicpu/device_time.cpp (AICPU)
+// or a weak stub in pto_runtime2.h (host). Used only for sub-phase profiling.
+uint64_t get_sys_cnt_aicpu();
+
+struct PTO2ReadyQueueSlot
+{
+    std::atomic<int64_t> sequence;
+    PTO2TaskSlotState *slot_state;
+};
+
+// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
+static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
+
+struct PTO2LocalReadyBuffer
+{
+    PTO2TaskSlotState **slot_states = nullptr;
+    int count = 0;
+    int capacity = 0;
+
+    void reset(PTO2TaskSlotState **buf, int cap)
+    {
+        slot_states = buf;
+        count = 0;
+        capacity = cap;
+    }
+
+    bool try_push(PTO2TaskSlotState *s)
+    {
+        if (slot_states && count < capacity)
+        {
+            slot_states[count++] = s;
+            return true;
+        }
+        return false;
+    }
+
+    PTO2TaskSlotState *pop()
+    {
+        return (count > 0) ? slot_states[--count] : nullptr;
+    }
+};
+
+struct alignas(64) PTO2ReadyQueue
+{
+    PTO2ReadyQueueSlot *slots;
+    uint64_t capacity;
+    uint64_t mask;        // capacity - 1
+    char _pad0[64 - 24];  // Pad to own cache line
+
+    std::atomic<uint64_t> enqueue_pos;
+    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    std::atomic<uint64_t> dequeue_pos;
+    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
+
+    uint64_t size()
+    {
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        return (e >= d) ? (e - d) : 0;
+    }
+
+    bool push(PTO2TaskSlotState *slot_state)
+    {
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true)
+        {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos);
+            if (diff == 0)
+            {
+                if (enqueue_pos.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+            }
+            else if (diff < 0)
+            {
+                return false;  // Queue full
+            }
+        }
+
+        slot->slot_state = slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
+        return true;
+    }
+
+    // Batch push: reserve count slots with a single CAS after confirming
+    // every target slot is available under the usual Vyukov sequence check.
+    void push_batch(PTO2TaskSlotState **items, int count)
+    {
+        if (count == 0) return;
+
+        uint64_t pos;
+        while (true)
+        {
+            pos = enqueue_pos.load(std::memory_order_relaxed);
+            bool ready = true;
+            for (int i = 0; i < count; i++)
+            {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + i);
+                if (diff != 0)
+                {
+                    ready = false;
+                    break;
+                }
+            }
+            if (!ready) continue;
+            if (enqueue_pos.compare_exchange_weak(pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+        }
+
+        for (int i = 0; i < count; i++)
+        {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            slot->slot_state = items[i];
+            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
+        }
+    }
+
+    PTO2TaskSlotState *pop()
+    {
+        // Fast-path: skip slot load when queue is clearly empty
+        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
+        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
+        if (d >= e) return nullptr;
+
+        uint64_t pos;
+        PTO2ReadyQueueSlot *slot;
+        while (true)
+        {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            slot = &slots[pos & mask];
+            int64_t seq = slot->sequence.load(std::memory_order_acquire);
+            int64_t diff = seq - static_cast<int64_t>(pos + 1);
+            if (diff == 0)
+            {
+                if (dequeue_pos.compare_exchange_weak(pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+            }
+            else if (diff < 0)
+            {
+                return nullptr;  // Queue empty
+            }
+        }
+
+        PTO2TaskSlotState *result = slot->slot_state;
+        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
+        return result;
+    }
+
+    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
+    // Returns actual number of items popped (may be less than max_count).
+    int pop_batch(PTO2TaskSlotState **out, int max_count)
+    {
+        uint64_t pos;
+        int count;
+        while (true)
+        {
+            pos = dequeue_pos.load(std::memory_order_relaxed);
+            count = 0;
+            while (count < max_count)
+            {
+                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
+                int64_t seq = slot->sequence.load(std::memory_order_acquire);
+                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
+                if (diff == 0)
+                {
+                    count++;
+                    continue;
+                }
+                if (diff < 0) break;
+                count = -1;
+                break;
+            }
+            if (count == 0) return 0;
+            if (count < 0) continue;
+            if (dequeue_pos.compare_exchange_weak(pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed)) break;
+        }
+
+        for (int i = 0; i < count; i++)
+        {
+            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
+            out[i] = slot->slot_state;
+            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
+        }
+        return count;
+    }
+};
+
+inline size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity)
+{
+    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
+}
+inline bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity)
+{
+    // Address the slots region for data writes without storing the pointer in
+    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
+    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+    queue->capacity = capacity;
+    queue->mask = capacity - 1;
+    queue->enqueue_pos.store(0, std::memory_order_relaxed);
+    queue->dequeue_pos.store(0, std::memory_order_relaxed);
+
+    for (uint64_t i = 0; i < capacity; i++)
+    {
+        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
+        slots_arena[i].slot_state = nullptr;
+    }
+
+    return true;
+}
+// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
+inline void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off)
+{
+    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
+}
+inline void ready_queue_destroy(PTO2ReadyQueue *queue)
+{
+    // Arena owns the slots[] buffer; just forget the pointer.
+    queue->slots = nullptr;
+}
+
+struct alignas(64) PTO2SpscQueue
+{
+    // --- Producer cache lines (orchestrator thread) ---
+    alignas(64) std::atomic<uint64_t> head_{0};
+    alignas(64) uint64_t tail_cached_{0};
+
+    // --- Consumer cache lines (scheduler thread 0) ---
+    alignas(64) std::atomic<uint64_t> tail_{0};
+    alignas(64) uint64_t head_cached_{0};
+
+    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
+    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
+    uint64_t mask_{0};
+
+    // Padding to exactly 5 cache lines
+    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
+
+    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity)
+    {
+        return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
+    }
+
+    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity)
+    {
+        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
+        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+        // calloc'd-equivalent: zero the slot pointers so spurious early pops
+        // observe nullptr.
+        for (uint64_t i = 0; i < capacity; i++) buf[i] = nullptr;
+        mask_ = capacity - 1;
+        head_.store(0, std::memory_order_relaxed);
+        tail_.store(0, std::memory_order_relaxed);
+        tail_cached_ = 0;
+        head_cached_ = 0;
+        return true;
+    }
+
+    // Wire the arena-internal pointer. Called by both host (with host arena)
+    // and AICPU (with device arena attached to the prebuilt image).
+    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off)
+    {
+        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
+    }
+
+    // Arena owns the buffer; here we only forget our pointer.
+    void destroy()
+    {
+        buffer_ = nullptr;
+    }
+
+    bool push(PTO2TaskSlotState *item)
+    {
+        uint64_t h = head_.load(std::memory_order_relaxed);
+        uint64_t next_h = h + 1;
+        if (next_h - tail_cached_ > mask_)
+        {
+            tail_cached_ = tail_.load(std::memory_order_acquire);
+            if (next_h - tail_cached_ > mask_) return false;
+        }
+        buffer_[h & mask_] = item;
+        head_.store(next_h, std::memory_order_release);
+        return true;
+    }
+
+    // Pop up to max_count items (consumer only). Returns actual count.
+    int pop_batch(PTO2TaskSlotState **out, int max_count)
+    {
+        uint64_t t = tail_.load(std::memory_order_relaxed);
+        uint64_t avail = head_cached_ - t;
+        if (avail < static_cast<uint64_t>(max_count))
+        {
+            head_cached_ = head_.load(std::memory_order_acquire);
+            avail = head_cached_ - t;
+            if (avail == 0) return 0;
+        }
+        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
+        for (int i = 0; i < count; i++) out[i] = buffer_[(t + i) & mask_];
+        tail_.store(t + count, std::memory_order_release);
+        return count;
+    }
+
+    // Approximate size (used for backoff decisions, not exact).
+    uint64_t size() const
+    {
+        uint64_t h = head_.load(std::memory_order_acquire);
+        uint64_t t = tail_.load(std::memory_order_acquire);
+        return h - t;
+    }
+};
+
+static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
+// =============================================================================
+
+struct CompletionStats
+{
+    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
+    int32_t tasks_enqueued;     // Number of consumers that became READY
+    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
+    bool mixed_task_completed;  // True only when this callback completed a mixed task
+};
+
+struct PTO2SchedulerLayout
+{
+    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
+    size_t off_dummy_ready_queue_slots;
+    size_t off_pending_spsc_buffer;
+    size_t off_pending_buffer;
+    uint64_t ready_queue_capacity;
+    uint64_t spsc_capacity;
+    uint64_t pending_capacity;
+};
+
+struct PTO2SchedulerState
+{
+    // Shared memory access
+    PTO2SharedMemoryHeader *sm_header;
+
+    // Per-ring state
+    struct alignas(64) RingSchedState
+    {
+        PTO2SharedMemoryRingHeader *ring;
+        int32_t last_task_alive;
+        std::atomic<int32_t> advance_lock;  // multi-thread CAS
+
+        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id)
+        {
+            ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
+            last_task_alive = 0;
+            advance_lock.store(0, std::memory_order_relaxed);
+            return true;
+        }
+
+        void destroy() { ring = nullptr; }
+
+        void sync_to_sm()
+        {
+            ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release);
+        }
+
+        void advance_ring_pointers()
+        {
+            const int32_t watermark = ring->completed_watermark.load(std::memory_order_acquire);
+            int32_t old_last_task_alive = last_task_alive;
+
+            // Retire any slot at the tail whose last consumer is at or below
+            // the global completed watermark — i.e. every consumer of this
+            // producer has reached COMPLETED. Implies this slot itself is
+            // COMPLETED because the seed value of last_consumer_local_id is
+            // the slot's own local_id.
+            while (last_task_alive <= watermark)
+            {
+                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
+                if (watermark < slot_state.last_consumer_local_id) break;
+                last_task_alive++;
+            }
+
+            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) ring->get_slot_state_by_task_id(id).reset_for_reuse();
+
+            sync_to_sm();
+        }
+    } ring_sched_states[PTO2_MAX_RING_DEPTH];
+
+    // Ready queues remain global (scheduling is ring-agnostic)
+    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
+
+    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
+    // the dispatch loop and completed inline -- never goes to AICore.
+    PTO2ReadyQueue dummy_ready_queue;
+
+    // Thread 0 exclusive: circular FIFO of tasks awaiting fanin readiness.
+    // SPSC queue receives slot_states from the orchestrator; thread 0 drains
+    // them into the pending ring and polls fanin readiness. Storing the FIFO
+    // out of band (instead of intrusively in PTO2TaskSlotState) keeps the
+    // task struct free of scheduler-private state.
+    struct alignas(64) PendingState
+    {
+        static constexpr int BACKOFF_LIMIT = 32;
+        static constexpr int DRAIN_BATCH = 30;
+        static constexpr int POLL_MAX_PER_ITER = 128;
+
+        // --- Thread 0 exclusive ---
+        PTO2TaskSlotState **pending_buf{nullptr};  // capacity slots, arena-owned
+        uint32_t pending_cap{0};
+        uint32_t pending_mask{0};
+        uint32_t pending_head_idx{0};  // next pop
+        uint32_t pending_tail_idx{0};  // next push
+        int backoff_counter{0};
+        PTO2TaskSlotState *drain_buf[DRAIN_BATCH];
+
+        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
+        PTO2SpscQueue queue;
+
+        // --- Orchestrator write, thread 0 read ---
+        alignas(64) std::atomic<bool> orch_needs_drain{false};
+
+        uint32_t pending_count() const { return pending_tail_idx - pending_head_idx; }
+        bool pending_empty() const { return pending_tail_idx == pending_head_idx; }
+    } wiring;
+
+    alignas(64) AsyncWaitList async_wait_list;
+
+    void push_ready_routed(PTO2TaskSlotState *slot_state)
+    {
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        if (shape == PTO2ResourceShape::DUMMY) dummy_ready_queue.push(slot_state);
+        else ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+    }
+
+    // Append slot to the tail of the pending FIFO.
+    void pending_push_back(PTO2TaskSlotState *s)
+    {
+        wiring.pending_buf[wiring.pending_tail_idx & wiring.pending_mask] = s;
+        wiring.pending_tail_idx++;
+    }
+
+    // Pop the head of the pending FIFO (or nullptr).
+    PTO2TaskSlotState *pending_pop_front()
+    {
+        if (wiring.pending_empty()) return nullptr;
+        PTO2TaskSlotState *s = wiring.pending_buf[wiring.pending_head_idx & wiring.pending_mask];
+        wiring.pending_head_idx++;
+        return s;
+    }
+
+    bool fanin_satisfied(PTO2TaskSlotState *s) const
+    {
+        const PTO2TaskPayload &p = *s->payload;
+        const auto &ring = *ring_sched_states[s->ring_id].ring;
+        const int32_t mask = ring.task_window_mask;
+        std::atomic<uint8_t> *flags = ring.completion_flags;
+        for (int32_t i = 0; i < p.fanin_count; i++)
+            if (flags[p.fanin_local_ids[i] & mask].load(std::memory_order_acquire) == 0) return false;
+        return true;
+    }
+
+    // (e) Single-pass fanin classification used by the pending poll. Returns:
+    //   -2: all fanins met (route directly to ready)
+    //   -1: 2+ fanins unmet (push back to pending FIFO)
+    //   ≥0: exactly 1 fanin unmet, returned index identifies which fanin
+    //       (register on that producer's wake list).
+    int classify_fanin_state(PTO2TaskSlotState *s) const
+    {
+        const PTO2TaskPayload &p = *s->payload;
+        const auto &ring = *ring_sched_states[s->ring_id].ring;
+        const int32_t mask = ring.task_window_mask;
+        std::atomic<uint8_t> *flags = ring.completion_flags;
+        int unmet_idx = -2;
+        for (int32_t i = 0; i < p.fanin_count; i++)
+        {
+            if (flags[p.fanin_local_ids[i] & mask].load(std::memory_order_acquire) == 0)
+            {
+                if (unmet_idx >= 0) return -1;  // 2+ unmet
+                unmet_idx = i;
+            }
+        }
+        return unmet_idx;
+    }
+
+    // (e) Register `consumer` on `producer`'s wake list. If producer has
+    // already completed (head == WAKE_LIST_SENTINEL), push consumer directly
+    // to ready_queues. Otherwise CAS push-onto the head.
+    void register_wake(PTO2TaskSlotState *producer, PTO2TaskSlotState *consumer)
+    {
+        PTO2TaskSlotState *expected = producer->wake_list_head.load(std::memory_order_relaxed);
+        while (true)
+        {
+            if (expected == WAKE_LIST_SENTINEL)
+            {
+                // Producer already completed and drained its wake list. The
+                // last unmet fanin is now satisfied; push consumer to ready.
+                push_ready_routed(consumer);
+                return;
+            }
+            consumer->next_in_wake_list = expected;
+            if (producer->wake_list_head.compare_exchange_weak(expected, consumer, std::memory_order_acq_rel, std::memory_order_relaxed))
+            {
+                return;  // registered
+            }
+            // CAS failed: expected was updated by load on retry. Loop.
+        }
+    }
+
+    // Thread 0 entry point: drain SPSC into pending list, then poll pending
+    // for newly-ready tasks. Not-ready tasks rotate to the tail.
+    // Returns >0 if anything moved (SPSC drained OR tasks routed to ready);
+    // 0 signals no productive work.
+    //
+    // Sub-phase timing pointers (optional). If non-null, cumulative cycle/
+    // iteration counters for Stage 1 (SPSC drain) and Stage 2 (pending poll)
+    // are accumulated into them.
+    int drain_wiring_queue(bool force_drain = false,
+                           uint64_t *spsc_cyc_out = nullptr, uint64_t *spsc_iters_out = nullptr,
+                           uint64_t *poll_cyc_out = nullptr, uint64_t *poll_iters_out = nullptr)
+    {
+        // Stage 1: drain SPSC → pending FIFO tail
+        uint64_t t0 = spsc_cyc_out ? get_sys_cnt_aicpu() : 0;
+        int drained = wiring.queue.pop_batch(wiring.drain_buf, PendingState::DRAIN_BATCH);
+        for (int i = 0; i < drained; i++) pending_push_back(wiring.drain_buf[i]);
+        if (spsc_cyc_out)
+        {
+            *spsc_cyc_out += get_sys_cnt_aicpu() - t0;
+            if (spsc_iters_out) (*spsc_iters_out)++;
+        }
+
+        // Backoff when nothing to do and orchestrator isn't pressing
+        if (drained == 0 && wiring.pending_empty())
+        {
+            if (!force_drain && !wiring.orch_needs_drain.load(std::memory_order_acquire) && wiring.backoff_counter < PendingState::BACKOFF_LIMIT)
+            {
+                wiring.backoff_counter++;
+                return 0;
+            }
+        }
+        wiring.backoff_counter = 0;
+
+        // Stage 2: poll pending FIFO. Three-way classification:
+        //   - all fanins met → push to ready_queues
+        //   - exactly 1 unmet → register on that producer's wake list (no
+        //     more polling for this task; producer wakes it on completion)
+        //   - 2+ unmet → push back to FIFO for the next poll cycle
+        uint64_t t1 = poll_cyc_out ? get_sys_cnt_aicpu() : 0;
+        int routed = 0;
+        int to_visit = static_cast<int>(wiring.pending_count());
+        if (to_visit > PendingState::POLL_MAX_PER_ITER) to_visit = PendingState::POLL_MAX_PER_ITER;
+        for (int i = 0; i < to_visit; i++)
+        {
+            PTO2TaskSlotState *s = pending_pop_front();
+            if (s == nullptr) break;
+            int state = classify_fanin_state(s);
+            if (state == -2)
+            {
+                push_ready_routed(s);
+                routed++;
+            }
+            else if (state == -1)
+            {
+                pending_push_back(s);  // 2+ missing, re-check next cycle
+            }
+            else
+            {
+                // exactly 1 unmet at index `state`; register and remove from FIFO
+                int32_t prod_local = s->payload->fanin_local_ids[state];
+                auto &ring = *ring_sched_states[s->ring_id].ring;
+                PTO2TaskSlotState *producer = &ring.get_slot_state_by_task_id(prod_local);
+                register_wake(producer, s);
+                routed++;  // count as routed since it's no longer in FIFO
+            }
+        }
+        if (poll_cyc_out)
+        {
+            *poll_cyc_out += get_sys_cnt_aicpu() - t1;
+            if (poll_iters_out) (*poll_iters_out)++;
+        }
+
+        return drained + routed;
+    }
+
+    int get_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
+    {
+        int count = 0;
+        while (count < max_count && local_buf.count > 0) out[count++] = local_buf.slot_states[--local_buf.count];
+        int remaining = max_count - count;
+        if (remaining > 0) count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
+        return count;
+    }
+
+    bool on_subtask_complete(PTO2TaskSlotState &slot_state)
+    {
+        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
+        return (prev + 1) == slot_state.total_required_subtasks;
+    }
+
+    // Publish this slot as COMPLETED, then advance the per-ring monotonic
+    // completed_watermark — the highest local_id W such that every task
+    // 0..W has reached COMPLETED. Reclamation in advance_ring_pointers gates
+    // on watermark >= producer.last_consumer_local_id, so no consumer→producer
+    // notification edge is needed.
+    void on_mixed_task_complete(PTO2TaskSlotState &slot_state)
+    {
+        // (m) Skip slot_state.task_state.store here; completion_flags below is
+        // the single source of truth. Saves one atomic release store per task.
+        const int32_t my_id = static_cast<int32_t>(slot_state.task->task_id.local());
+        int32_t ring_id = slot_state.ring_id;
+        auto &rss = ring_sched_states[ring_id];
+        auto &ring = *rss.ring;
+
+        // Publish to the polling-fast completion array. Release ordering
+        // makes the producer's output writes visible to consumers that
+        // acquire-load this byte in fanin_satisfied.
+        ring.completion_flags[my_id & ring.task_window_mask].store(1, std::memory_order_release);
+
+        // (e) Drain the wake list. Any consumer registered on this slot was
+        // waiting on us as their last unmet fanin. After completion_flag is
+        // set above, atomic-exchange wake_list_head to SENTINEL (refusing
+        // any future registrations) and push every waiter to the ready
+        // queues. Ordering: completion_flag is set BEFORE the exchange, so
+        // any consumer that races a registration against our exchange and
+        // observes a SENTINEL during retry will see completion_flag=1 and
+        // push itself directly.
+        PTO2TaskSlotState *waiter = slot_state.wake_list_head.exchange(WAKE_LIST_SENTINEL, std::memory_order_acq_rel);
+        while (waiter != nullptr && waiter != WAKE_LIST_SENTINEL)
+        {
+            PTO2TaskSlotState *next = waiter->next_in_wake_list;
+            waiter->next_in_wake_list = nullptr;
+            push_ready_routed(waiter);
+            waiter = next;
+        }
+
+        // CAS-advance the watermark, bounded by my_id (which we know is
+        // published since we just completed it). If a forward task we observe
+        // as COMPLETED is also published, but a gap remains, we stop — the
+        // task filling the gap will resume the walk when it completes.
+        int32_t w = ring.completed_watermark.load(std::memory_order_acquire);
+        while (w < my_id)
+        {
+            int32_t next = w + 1;
+            // (m) Read completion_flags (already published by the candidate's
+            // completer) instead of cand.task_state — one fewer atomic store
+            // per task in the common path.
+            if (ring.completion_flags[next & ring.task_window_mask].load(std::memory_order_acquire) == 0) break;
+            if (ring.completed_watermark.compare_exchange_weak(w, next, std::memory_order_acq_rel, std::memory_order_acquire))
+            {
+                w = next;
+            }
+        }
+
+        // Try to retire slots whose last consumer has reached COMPLETED.
+        int32_t expected_lock = 0;
+        if (rss.advance_lock.compare_exchange_strong(expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed))
+        {
+            rss.advance_ring_pointers();
+            rss.advance_lock.store(0, std::memory_order_release);
+        }
+    }
+
+    // === Cold-path API ===
+
+    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t /*dep_pool_capacity*/)
+    {
+        PTO2SchedulerLayout layout{};
+        layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
+        layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
+        layout.pending_capacity = PTO2_TASK_WINDOW_SIZE;  // bounded by per-ring slot window
+
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+        layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
+        layout.off_pending_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
+        layout.off_pending_buffer = arena.reserve(layout.pending_capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
+        return layout;
+    }
+
+    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base)
+    {
+        PTO2SchedulerState *sched = this;
+        sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) return false;
+
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++)
+            if (!ready_queue_init_data_from_layout(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity)) return false;
+        if (!ready_queue_init_data_from_layout(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity)) return false;
+
+        if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_pending_spsc_buffer, layout.spsc_capacity)) return false;
+
+        if (layout.pending_capacity == 0 || (layout.pending_capacity & (layout.pending_capacity - 1)) != 0) return false;
+        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
+        sched->wiring.pending_cap = static_cast<uint32_t>(layout.pending_capacity);
+        sched->wiring.pending_mask = sched->wiring.pending_cap - 1;
+        sched->wiring.pending_head_idx = 0;
+        sched->wiring.pending_tail_idx = 0;
+        sched->wiring.backoff_counter = 0;
+
+        return true;
+    }
+
+    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena)
+    {
+        PTO2SchedulerState *sched = this;
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
+        ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
+        sched->wiring.queue.wire_arena_pointers(arena, layout.off_pending_spsc_buffer);
+        sched->wiring.pending_buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_pending_buffer));
+    }
+
+    // Forget per-region pointers; arena owns the backing memory.
+    void destroy()
+    {
+        PTO2SchedulerState *sched = this;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) sched->ring_sched_states[r].destroy();
+        sched->wiring.queue.destroy();
+        sched->wiring.pending_buf = nullptr;
+        for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) ready_queue_destroy(&sched->ready_queues[i]);
+        ready_queue_destroy(&sched->dummy_ready_queue);
+    }
+};
+
+// Scheduler cold-path API is declared as PTO2SchedulerState member functions.
+// See init()/destroy() below the struct definition.
+
+inline bool AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state)
+{
+    sink.sched->on_mixed_task_complete(slot_state);
+    sink.inline_completed++;
+    return true;
+}
+
+template <bool Profiling>
+inline AsyncPollResult AsyncWaitList::poll_and_complete(AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched)
+{
+    AsyncPollResult result;
+    if (!try_lock()) return result;
+
+    AsyncWaitList::DrainCompletionSink sink{};
+    sink.sched = sched;
+
+    int32_t drain_err = PTO2_ERROR_NONE;
+    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
+    if (drain_err != PTO2_ERROR_NONE)
+    {
+        result.error_code = drain_err;
+        unlock();
+        return result;
+    }
+    result.completed += sink.inline_completed;
+
+    for (int32_t i = count - 1; i >= 0; --i)
+    {
+        AsyncWaitEntry &entry = entries[i];
+        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
+        for (int32_t c = 0; c < entry.condition_count; c++)
+        {
+            CompletionCondition &cond = entry.conditions[c];
+            if (cond.satisfied) continue;
+            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr)
+            {
+                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
+                if (counter_line != last_invalidated_counter_line)
+                {
+                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
+                    last_invalidated_counter_line = counter_line;
+                }
+            }
+            CompletionPollResult poll = cond.test();
+            if (poll.state == CompletionPollState::FAILED)
+            {
+                result.error_code = poll.error_code;
+                result.failed_slot_state = entry.slot_state;
+                unlock();
+                return result;
+            }
+            if (poll.state == CompletionPollState::READY)
+            {
+                cond.satisfied = true;
+                cond.retire();
+                entry.waiting_completion_count--;
+            }
+        }
+
+        if (entry.normal_done && entry.waiting_completion_count <= 0)
+        {
+            sched->on_mixed_task_complete(*entry.slot_state);
+            result.completed++;
+
+            int32_t last = count - 1;
+            if (i != last) entries[i] = entries[last];
+            count = last;
+        }
+    }
+
+    unlock();
+    return result;
+}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
index 25e4bcfeb..a5e029ee8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_shared_memory.h
@@ -8,64 +8,24 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * PTO Runtime2 - Shared Memory Layout
- *
- * Defines the shared memory structure for Orchestrator-Scheduler communication.
- *
- * Memory Layout (per-ring sections repeat for each ring 0..PTO2_MAX_RING_DEPTH-1):
- *   +---------------------------+
- *   | SharedMemoryHeader        |  (per-ring flow control + sync)
- *   +---------------------------+
- *   | Ring 0: TaskDescriptor[]  |
- *   | Ring 0: TaskPayload[]     |
- *   | Ring 0: TaskSlotState[]   |
- *   +---------------------------+
- *   | Ring 1: TaskDescriptor[]  |
- *   | Ring 1: TaskPayload[]     |
- *   | Ring 1: TaskSlotState[]   |
- *   +---------------------------+
- *   | ...                       |
- *   +---------------------------+
- *
- * Design principles:
- * - Only data needed for Orchestrator<->Scheduler communication is here
- * - TensorMap, scope_stack, ready_queues, dep_pool are in private memory
- * - Flow control via atomic counters/flags (no locks needed for single-word R/W)
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
 
 #pragma once
 
 #include "utils/device_arena.h"
 #include "pto_runtime2_types.h"
 
-// =============================================================================
-// Shared Memory Header
-// =============================================================================
-
 struct PTO2SharedMemoryHandle;
 
-/**
- * Per-ring flow control state in shared memory.
- * Written/read by Orchestrator and Scheduler for synchronization.
- */
-struct alignas(64) PTO2RingFlowControl {
+struct alignas(64) PTO2RingFlowControl
+{
     // === Cache Line 0: Written by Orchestrator, Read by Scheduler ===
     alignas(64) std::atomic<int32_t> current_task_index;  // Task ring head (next to allocate)
 
     // === Cache Line 1: Written by Scheduler, Read by Orchestrator (for back-pressure) ===
     alignas(64) std::atomic<int32_t> last_task_alive;  // Task ring tail (oldest active task)
 
-    // Per-boot SM reset. PTO2TaskAllocator::init() seeds its private
-    // local_task_id_ from initial_local_task_id (default 0 in production)
-    // *without* dereferencing current_task_index — it relies on this reset
-    // running on every AICPU boot so 0 stays in sync. If you ever change
-    // the initial fc value or the boot ordering, update the default in
-    // PTO2TaskAllocator::init (pto_ring_buffer.h) in the same change, or
-    // submit IDs will be off by the divergence.
-    void init() {
+    void init()
+    {
         current_task_index.store(0, std::memory_order_relaxed);
         last_task_alive.store(0, std::memory_order_relaxed);
     }
@@ -75,15 +35,16 @@ struct alignas(64) PTO2RingFlowControl {
 
 static_assert(sizeof(PTO2RingFlowControl) == 128, "PTO2RingFlowControl must be exactly 2 cache lines (128B)");
 
-/**
- * Per-ring shared memory header section.
- *
- * Groups flow-control, layout info, and per-ring data pointers for a single ring.
- * Pointers are host-side only (set by setup_pointers, invalid on device).
- */
-struct alignas(64) PTO2SharedMemoryRingHeader {
+struct alignas(64) PTO2SharedMemoryRingHeader
+{
     PTO2RingFlowControl fc;
 
+    // Highest task_id such that every task with id in [0, completed_watermark]
+    // has reached COMPLETED. Maintained at task-completion time. Used to gate
+    // slot reclamation: a producer slot P is safe to retire when
+    // completed_watermark >= P.last_consumer_local_id.
+    alignas(64) std::atomic<int32_t> completed_watermark;
+
     // Layout metadata (set once at init)
     uint64_t task_window_size;
     int32_t task_window_mask;
@@ -95,31 +56,48 @@ struct alignas(64) PTO2SharedMemoryRingHeader {
     PTO2TaskPayload *task_payloads;
     PTO2TaskSlotState *slot_states;
 
-    int32_t get_slot_by_task_id(int32_t local_task_id) { return local_task_id & task_window_mask; }
-
-    PTO2TaskDescriptor &get_task_by_slot(int32_t slot) { return task_descriptors[slot]; }
+    // Compact contiguous array (one byte per slot) holding the polling-fast
+    // "task X completed?" flag. 0 = pending, 1 = completed. Indexed by
+    // local_id & task_window_mask. Writer: the task's completer at
+    // on_mixed_task_complete; Resetter: orchestrator in prepare_task for the
+    // newly-allocated slot. Reader: thread-0 fanin polling. Replaces a chain
+    // of 128B-aligned slot_state pointer derefs with byte reads into a single
+    // array — typically condenses 16 fanin checks into 1-2 cache lines.
+    std::atomic<uint8_t> *completion_flags;
+
+    PTO2TaskDescriptor &get_task_by_slot(int32_t slot)
+    {
+        return task_descriptors[slot];
+    }
 
-    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id) {
-        return task_descriptors[get_slot_by_task_id(local_id)];
+    PTO2TaskDescriptor &get_task_by_task_id(int32_t local_id)
+    {
+        return task_descriptors[local_id & task_window_mask];
     }
 
-    PTO2TaskPayload &get_payload_by_slot(int32_t slot) { return task_payloads[slot]; }
+    PTO2TaskPayload &get_payload_by_slot(int32_t slot)
+    {
+        return task_payloads[slot];
+    }
 
-    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id) { return task_payloads[get_slot_by_task_id(local_id)]; }
+    PTO2TaskPayload &get_payload_by_task_id(int32_t local_id)
+    {
+        return task_payloads[local_id & task_window_mask];
+    }
 
-    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot) { return slot_states[slot]; }
+    PTO2TaskSlotState &get_slot_state_by_slot(int32_t slot)
+    {
+        return slot_states[slot];
+    }
 
-    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id) {
-        return slot_states[get_slot_by_task_id(local_id)];
+    PTO2TaskSlotState &get_slot_state_by_task_id(int32_t local_id)
+    {
+        return slot_states[local_id & task_window_mask];
     }
 };
 
-/**
- * Shared memory header structure
- *
- * Contains per-ring flow control and global layout information.
- */
-struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
+struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader
+{
     // === PER-RING FLOW CONTROL + LAYOUT INFO (set once at init) ===
     PTO2SharedMemoryRingHeader rings[PTO2_MAX_RING_DEPTH];
 
@@ -147,20 +125,10 @@ struct alignas(PTO2_ALIGN_SIZE) PTO2SharedMemoryHeader {
     std::atomic<int32_t> sched_error_thread;   // Thread index of last error writer
 };
 
-static_assert(
-    (sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096),
-    "PTO2SharedMemoryHeader should be reasonably sized"
-);
+static_assert((sizeof(PTO2SharedMemoryHeader) % PTO2_ALIGN_SIZE == 0) && (sizeof(PTO2SharedMemoryHeader) < 4096), "PTO2SharedMemoryHeader should be reasonably sized");
 
-// =============================================================================
-// Shared Memory Handle
-// =============================================================================
-
-/**
- * Handle for shared memory lifecycle management (create/destroy).
- * Runtime components (orchestrator, scheduler) use PTO2SharedMemoryHeader* directly.
- */
-struct PTO2SharedMemoryHandle {
+struct PTO2SharedMemoryHandle
+{
     void *sm_base;     // Base address of shared memory
     uint64_t sm_size;  // Total size of shared memory
 
@@ -171,91 +139,212 @@ struct PTO2SharedMemoryHandle {
 
     // === Static helpers ===
 
-    static uint64_t calculate_size(uint64_t task_window_size);
-    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+    static uint64_t calculate_size(uint64_t task_window_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+        return calculate_size_per_ring(task_window_sizes);
+    }
+    static uint64_t calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        uint64_t size = 0;
+
+        // Header (aligned to cache line)
+        size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+        // Per-ring task descriptors and payloads
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+            size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
+        }
+
+        return size;
+    }
 
-    // UT convenience: reserve wrapper + sm_base on `arena`, commit, and init
-    // using default PTO2_TASK_WINDOW_SIZE / PTO2_HEAP_SIZE. Only valid when the
-    // arena is otherwise empty (the call performs the single commit). All
-    // memory is owned by the arena — caller must not call destroy().
-    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena);
+    static PTO2SharedMemoryHandle *create_and_init_default(DeviceArena &arena)
+    {
+        const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
+        const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
+        const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
+        if (arena.commit() == nullptr) return nullptr;
+
+        auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
+        memset(handle, 0, sizeof(*handle));
+        void *buffer = arena.region_ptr(off_buffer);
+        memset(buffer, 0, static_cast<size_t>(buffer_size));
+        if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
+        return handle;
+    }
 
     // === Instance methods ===
 
-    // In-place init for caller-provided wrapper storage (e.g. a region carved
-    // out of a DeviceArena). Sets is_owner = false, calls setup_pointers and
-    // init_header. Returns false when `sm_size` is too small for the requested
-    // `task_window_size`.
-    bool init(void *sm_base, uint64_t sm_size, uint64_t task_window_size, uint64_t heap_size);
+    bool init(void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size)
+    {
+        if (!sm_base_arg || sm_size_arg == 0) return false;
+        if (sm_size_arg < calculate_size(task_window_size)) return false;
+
+        sm_base = sm_base_arg;
+        sm_size = sm_size_arg;
+        is_owner = false;
+        setup_pointers(task_window_size);
+        init_header(task_window_size, heap_size);
+        return true;
+    }
+
+    void destroy()
+    {
+        // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
+        // calling destroy on them is a no-op so existing callers stay safe.
+        if (is_owner && sm_base)
+        {
+            free(sm_base);
+            free(this);
+        }
+    }
+    void print_layout()
+    {
+        if (!header) return;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {}
+    }
+    bool validate()
+    {
+        if (!sm_base) return false;
+        if (!header) return false;
+
+        PTO2SharedMemoryHeader *h = header;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            if (!h->rings[r].fc.validate(this, r)) return false;
 
-    void destroy();
-    void print_layout();
-    bool validate();
+        return true;
+    }
 
 private:
-    void init_header(uint64_t task_window_size, uint64_t heap_size);
-    void init_header_per_ring(
-        const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-    );
-    void setup_pointers(uint64_t task_window_size);
-    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
+    void init_header(uint64_t task_window_size, uint64_t heap_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            task_window_sizes[r] = task_window_size;
+            heap_sizes[r] = heap_size;
+        }
+        init_header_per_ring(task_window_sizes, heap_sizes);
+    }
+    void init_header_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        // Per-ring flow control (start at 0)
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            header->rings[r].fc.init();
+            // -1 = "no task completed yet"; first task to complete (local_id 0)
+            // will advance the watermark to 0.
+            header->rings[r].completed_watermark.store(-1, std::memory_order_relaxed);
+        }
+
+        header->orchestrator_done.store(0, std::memory_order_relaxed);
+
+        // Per-ring layout info
+        uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            header->rings[r].task_window_size = task_window_sizes[r];
+            header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
+            header->rings[r].heap_size = heap_sizes[r];
+            header->rings[r].task_descriptors_offset = offset;
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+            offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+        }
+
+        header->total_size = sm_size;
+        header->graph_output_ptr.store(0, std::memory_order_relaxed);
+        header->graph_output_size.store(0, std::memory_order_relaxed);
+
+        // Error reporting
+        header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        header->sched_error_bitmap.store(0, std::memory_order_relaxed);
+        header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
+        header->sched_error_thread.store(-1, std::memory_order_relaxed);
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto &ring = header->rings[r];
+            for (uint64_t i = 0; i < task_window_sizes[r]; i++)
+            {
+                ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
+                ring.slot_states[i].reset_for_reuse();
+                ring.slot_states[i].active_mask = ActiveMask{};
+            }
+        }
+    }
+    void setup_pointers(uint64_t task_window_size)
+    {
+        uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_window_sizes[r] = task_window_size;
+        setup_pointers_per_ring(task_window_sizes);
+    }
+    void setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        char *ptr = (char *)sm_base;
+
+        // Header
+        header = (PTO2SharedMemoryHeader *)ptr;
+        ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
+
+        // Per-ring task descriptors, payloads, and slot states
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto &ring = header->rings[r];
+            ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
+
+            ring.task_payloads = (PTO2TaskPayload *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
+
+            ring.slot_states = (PTO2TaskSlotState *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
+
+            ring.completion_flags = (std::atomic<uint8_t> *)ptr;
+            ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(std::atomic<uint8_t>), PTO2_ALIGN_SIZE);
+        }
+    }
 };
 
-// =============================================================================
-// SM Device Layout Helpers
-// =============================================================================
-//
-// When the host pre-builds a runtime-arena image, it needs the device-side
-// addresses of several SM sub-fields (ring flow-control counters,
-// task_descriptors arrays, orch_error_code) so it can wire them into the
-// orchestrator / scheduler init_data path without dereferencing the SM —
-// the SM lives in device memory and cannot be touched from host.
-//
-// These helpers compute those addresses by offset arithmetic on the SM
-// device base. Pure pointer math, no loads/stores; safe to call from host.
-// The same arithmetic happens on AICPU too (via PTO2SharedMemoryHandle's
-// own setup_pointers), so values are guaranteed consistent across sides.
 namespace pto2_sm_layout {
 
-inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code)
-    );
+inline std::atomic<int32_t> *orch_error_code_addr(void *sm_dev_base) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, orch_error_code));
 }
 
-inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(
-        static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) +
-        static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader)
-    );
+inline PTO2SharedMemoryRingHeader *ring_header_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<PTO2SharedMemoryRingHeader *>(static_cast<char *>(sm_dev_base) + offsetof(PTO2SharedMemoryHeader, rings) + static_cast<size_t>(ring_id) * sizeof(PTO2SharedMemoryRingHeader));
 }
 
-inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
-        offsetof(PTO2RingFlowControl, current_task_index)
-    );
+inline std::atomic<int32_t> *ring_current_task_index_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + offsetof(PTO2RingFlowControl, current_task_index));
 }
 
-inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept {
-    return reinterpret_cast<std::atomic<int32_t> *>(
-        reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) +
-        offsetof(PTO2RingFlowControl, last_task_alive)
-    );
+inline std::atomic<int32_t> *ring_last_task_alive_addr(void *sm_dev_base, int ring_id) noexcept
+{
+    return reinterpret_cast<std::atomic<int32_t> *>(reinterpret_cast<char *>(ring_header_addr(sm_dev_base, ring_id)) + offsetof(PTO2SharedMemoryRingHeader, fc) + offsetof(PTO2RingFlowControl, last_task_alive));
 }
 
-// Walk the per-ring SM layout (same arithmetic as setup_pointers_per_ring)
-// to compute ring `ring_id`'s task_descriptors device address. Accepts a
-// per-ring window-size array so the helper's signature mirrors
-// `PTO2SharedMemoryHandle::setup_pointers_per_ring` and cannot silently
-// disagree with the SM layout when (hypothetically) ring sizes diverge.
-inline PTO2TaskDescriptor *ring_task_descriptors_addr(
-    void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id
-) noexcept {
+inline PTO2TaskDescriptor *ring_task_descriptors_addr(void *sm_dev_base, const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], int ring_id) noexcept
+{
     assert(ring_id >= 0 && ring_id < PTO2_MAX_RING_DEPTH && "pto2_sm_layout: ring_id out of range");
     char *p = static_cast<char *>(sm_dev_base);
     p += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-    for (int r = 0; r < ring_id; r++) {
+    for (int r = 0; r < ring_id; r++)
+    {
         p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
         p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
         p += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
index 21c77fce2..f70af0a23 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_submit_types.h
@@ -9,36 +9,21 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Submit Types - Shared submit-contract definitions
- *
- * Header-only definitions shared by orchestration-facing and runtime-facing
- * headers. Keeps orchestration slim (no dependency on pto_runtime2_types.h).
- */
-
 #pragma once
 
 #include <stdint.h>
 
 inline constexpr int32_t INVALID_KERNEL_ID = -1;
 
-/**
- * Subtask slot count: AIC, AIV0, AIV1
- */
 inline constexpr int32_t PTO2_SUBTASK_SLOT_COUNT = 3;
 
-/**
- * Subtask slot indices
- */
-enum class PTO2SubtaskSlot : uint8_t {
+enum class PTO2SubtaskSlot : uint8_t
+{
     AIC = 0,
     AIV0 = 1,
     AIV1 = 2,
 };
 
-/**
- * Subtask mask bits (for ActiveMask)
- */
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIC = (1u << 0);         // 0x1
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV0 = (1u << 1);        // 0x2
 inline constexpr uint8_t PTO2_SUBTASK_MASK_AIV1 = (1u << 2);        // 0x4
@@ -57,36 +42,46 @@ inline constexpr uint8_t PTO2_SUBTASK_FLAG_SYNC_START = (1u << 3);  // 0x8: all
  * with an empty core_mask route to a dedicated DUMMY ready queue and are
  * completed inline by the scheduler dispatch loop, bypassing core allocation.
  */
-enum class PTO2ResourceShape : uint8_t {
+enum class PTO2ResourceShape : uint8_t
+{
     AIC = 0,    // Single AIC
     AIV = 1,    // Single AIV
     MIX = 2,    // Full cluster (dispatch uses active_mask)
     DUMMY = 3,  // Dependency-only (no AICore dispatch)
 };
 
-// Number of *dispatchable* resource shapes (AIC, AIV, MIX). DUMMY does not
-// allocate a per-shape ready_queue entry / local buffer — it lives in a
-// dedicated queue inside PTO2SchedulerState.
 inline constexpr int32_t PTO2_NUM_RESOURCE_SHAPES = 3;
 
-/**
- * Bitmask of active subtask slots + flags, sizeof == 1.
- */
-class ActiveMask {
+class ActiveMask
+{
 public:
     constexpr ActiveMask() = default;
     constexpr explicit ActiveMask(uint8_t raw) :
-        raw_(raw) {}
+        raw_(raw)
+    {}
 
-    uint8_t raw() const { return raw_; }
+    uint8_t raw() const
+    {
+        return raw_;
+    }
 
-    bool subtask_active(PTO2SubtaskSlot slot) const { return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0; }
+    bool subtask_active(PTO2SubtaskSlot slot) const
+    {
+        return (raw_ & (1u << static_cast<uint8_t>(slot))) != 0;
+    }
 
-    uint8_t core_mask() const { return raw_ & 0x07u; }
+    uint8_t core_mask() const
+    {
+        return raw_ & 0x07u;
+    }
 
-    bool requires_sync_start() const { return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0; }
+    bool requires_sync_start() const
+    {
+        return (raw_ & PTO2_SUBTASK_FLAG_SYNC_START) != 0;
+    }
 
-    PTO2ResourceShape to_shape() const {
+    PTO2ResourceShape to_shape() const
+    {
         uint8_t cmask = core_mask();
         if (cmask == 0) return PTO2ResourceShape::DUMMY;
         int bit_count = __builtin_popcount(cmask);
@@ -95,22 +90,44 @@ class ActiveMask {
         return PTO2ResourceShape::AIV;
     }
 
-    void set_sync_start() { raw_ |= PTO2_SUBTASK_FLAG_SYNC_START; }
+    void set_sync_start()
+    {
+        raw_ |= PTO2_SUBTASK_FLAG_SYNC_START;
+    }
 
-    bool operator==(ActiveMask other) const { return raw_ == other.raw_; }
-    bool operator!=(ActiveMask other) const { return raw_ != other.raw_; }
+    bool operator==(ActiveMask other) const
+    {
+        return raw_ == other.raw_;
+    }
+    bool operator!=(ActiveMask other) const
+    {
+        return raw_ != other.raw_;
+    }
 
-    ActiveMask operator|(ActiveMask other) const { return ActiveMask(raw_ | other.raw_); }
-    ActiveMask &operator|=(ActiveMask other) {
+    ActiveMask operator|(ActiveMask other) const
+    {
+        return ActiveMask(raw_ | other.raw_);
+    }
+    ActiveMask &operator|=(ActiveMask other)
+    {
         raw_ |= other.raw_;
         return *this;
     }
 
-    ActiveMask operator&(uint8_t mask) const { return ActiveMask(raw_ & mask); }
+    ActiveMask operator&(uint8_t mask) const
+    {
+        return ActiveMask(raw_ & mask);
+    }
 
-    bool has_mask(uint8_t mask) const { return (raw_ & mask) != 0; }
+    bool has_mask(uint8_t mask) const
+    {
+        return (raw_ & mask) != 0;
+    }
 
-    explicit operator bool() const { return raw_ != 0; }
+    explicit operator bool() const
+    {
+        return raw_ != 0;
+    }
 
 private:
     uint8_t raw_{0};
@@ -118,18 +135,14 @@ class ActiveMask {
 
 static_assert(sizeof(ActiveMask) == 1, "ActiveMask must be exactly 1 byte");
 
-/**
- * Mixed-task submit contract.
- *
- * Each field holds either a valid kernel ID or INVALID_KERNEL_ID (inactive).
- * At least one slot must be valid.
- */
-struct MixedKernels {
+struct MixedKernels
+{
     int32_t aic_kernel_id{INVALID_KERNEL_ID};
     int32_t aiv0_kernel_id{INVALID_KERNEL_ID};
     int32_t aiv1_kernel_id{INVALID_KERNEL_ID};
 
-    ActiveMask to_active_mask() const {
+    ActiveMask to_active_mask() const
+    {
         uint8_t mask = 0;
         if (aic_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIC;
         if (aiv0_kernel_id != INVALID_KERNEL_ID) mask |= PTO2_SUBTASK_MASK_AIV0;
@@ -138,22 +151,28 @@ struct MixedKernels {
     }
 };
 
-/**
- * SPMD launch parameters carried inside Arg.
- *
- * Controls how many logical blocks (SPMD dimension) a single task
- * is expanded into at dispatch time.  Each block receives a unique
- * block_idx in [0, block_num) via the per-dispatch LocalContext.
- */
-class PTO2LaunchSpec {
+class PTO2LaunchSpec
+{
 public:
     constexpr PTO2LaunchSpec() = default;
 
-    int16_t block_num() const { return block_num_; }
-    void set_block_num(int16_t n) { block_num_ = n; }
+    int16_t block_num() const
+    {
+        return block_num_;
+    }
+    void set_block_num(int16_t n)
+    {
+        block_num_ = n;
+    }
 
-    bool require_sync_start() const { return require_sync_start_; }
-    void set_require_sync_start(bool v) { require_sync_start_ = v; }
+    bool require_sync_start() const
+    {
+        return require_sync_start_;
+    }
+    void set_require_sync_start(bool v)
+    {
+        require_sync_start_ = v;
+    }
 
 private:
     int16_t block_num_{1};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 30017fadd..732ac02da 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -9,37 +9,6 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO Runtime2 - TensorMap Interface
- *
- * TensorMap provides producer lookup for dependency discovery:
- * - Maps Tensor -> producer task ID
- * - Used by pto_submit_task() to find dependencies
- *
- * Key design features:
- * 1. Ring buffer pool for entries (no malloc/free)
- * 2. Lazy invalidation (entries become stale when producer retires)
- * 3. Per-task per-ring entry tracking for efficient cleanup
- * 4. OVERLAP DETECTION: Detects dependencies for overlapping sub-regions
- *
- * Hash table with chaining:
- * - buckets[] array of head offsets
- * - Entries linked via next_in_bucket
- * - Insert at head (newest first) for sorted chains
- *
- * CRITICAL: Hash only by base_ptr
- * ==============================
- * For overlap detection to work, ALL sub-regions of the same base tensor
- * MUST be in the SAME hash bucket. This allows lookup to compare all
- * potentially overlapping regions.
- *
- * Overlap detection: Two regions create a dependency if:
- *   1. Same base_ptr (raw tensor pointer)
- *   2. Byte ranges [offset, offset+size) intersect
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
 #pragma once
 
 #include "common.h"
@@ -72,7 +41,8 @@ struct Segment {
  *
  * All offsets are relative to the arena's base.
  */
-struct PTO2TensorMapLayout {
+struct PTO2TensorMapLayout
+{
     size_t off_buckets;
     size_t off_entry_pool;
     size_t off_free_entry_list;
@@ -122,119 +92,86 @@ extern uint64_t g_insert_count;
  *
  * Entry size: 128B (2 cache lines), matches Tensor.
  */
-struct alignas(64) PTO2TensorMapEntry {
+struct alignas(64) PTO2TensorMapEntry
+{
     // === Cache line 1 (64B) — lookup hot path; mirrors Tensor line 1 from byte 16 ===
-    uint64_t buffer_addr;                // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
-    PTO2TensorMapEntry *next_in_bucket;  // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
-    PTO2TaskId producer_task_id;         // 8B [16,24):  mirrors Tensor::owner_task_id slot
-    uint64_t start_offset;               // 8B [24,32):  mirrors Tensor::start_offset (element offset)
-    int32_t version;                     // 4B [32,36):  mirrors Tensor::version
-    uint32_t ndims;                      // 4B [36,40):  mirrors Tensor::ndims
-    DataType dtype;                      // 1B [40,41):  mirrors Tensor::dtype
-    bool manual_dep;                     // 1B [41,42):  mirrors Tensor::manual_dep
-    bool is_contiguous;                  // 1B [42,43):  mirrors Tensor::is_contiguous
-    uint8_t __padding1__;                // 1B [43,44):  mirrors Tensor padding
-    uint32_t shapes[MAX_TENSOR_DIMS];    // 20B [44,64): mirrors Tensor::shapes
+    uint64_t buffer_addr;                      // 8B [0, 8):   tensor base address (hash key, mirrors Tensor::buffer.addr)
+    PTO2TensorMapEntry *next_in_bucket;        // 8B [8, 16):  next entry in hash bucket chain (overlays Tensor::buffer.size)
+    PTO2TaskId producer_task_id;               // 8B [16, 24):  mirrors Tensor::owner_task_id slot
+    uint64_t start_offset;                     // 8B [24, 32):  mirrors Tensor::start_offset (element offset)
+    int32_t version;                           // 4B [32, 36):  mirrors Tensor::version
+    uint32_t ndims;                            // 4B [36, 40):  mirrors Tensor::ndims
+    DataType dtype;                            // 1B [40, 41):  mirrors Tensor::dtype
+    bool manual_dep;                           // 1B [41, 42):  mirrors Tensor::manual_dep
+    bool is_contiguous;                        // 1B [42, 43):  mirrors Tensor::is_contiguous
+    uint8_t __padding1__;                      // 1B [43, 44):  mirrors Tensor padding
+    uint32_t shapes[MAX_TENSOR_DIMS];          // 20B [44, 64): mirrors Tensor::shapes
 
     // === Cache line 2 (64B) — chain manipulation + non-contiguous overlap data ===
-    PTO2TensorMapEntry *prev_in_bucket;  // 8B [64, 72)
-    PTO2TensorMapEntry *next_in_task;    // 8B [72, 80)
-    PTO2TensorMapEntry *prev_in_task;    // 8B [80, 88)
-    int32_t bucket_index;                // 4B [88, 92): -1 when unlinked
-    uint32_t __padding2__;               // 4B [92, 96)
-    uint64_t extent_elem_cache;          // 8B [96,104): non-contiguous extent (mirrors Tensor)
-    uint32_t strides[MAX_TENSOR_DIMS];   // 20B [104,124): element strides, mirrors Tensor::strides
-    uint8_t __padding3__[4];             // 4B [124,128)
-
-    /**
-     * Copy overlap-relevant fields from a Tensor into this entry.
-     *
-     * 64B memcpy of Tensor cache line 1 populates buffer_addr (byte [0,8)),
-     * producer_task_id, start_offset, version, ndims, dtype, manual_dep,
-     * is_contiguous and shapes[]. Byte [8,16) holds Tensor::buffer.size in
-     * the source and gets written into next_in_bucket; that's harmless
-     * because link_entry() overwrites next_in_bucket immediately after.
-     *
-     * Cache line 2 (stride / extent_elem_cache) is derived from line 1 when
-     * the source is canonically contiguous (is_contiguous && start_offset==0),
-     * so the producer Tensor's cache line 2 stays cold during insert. Only
-     * non-contiguous producers pay one extra line 2 read.
-     */
-    void copy_from_tensor(const Tensor &tensor) {
+    PTO2TensorMapEntry *prev_in_bucket;         // 8B [64, 72)
+    PTO2TensorMapEntry *next_in_task;           // 8B [72, 80)
+    PTO2TensorMapEntry *prev_in_task;           // 8B [80, 88)
+    int32_t bucket_index;                       // 4B [88, 92): -1 when unlinked
+    uint32_t __padding2__;                      // 4B [92, 96)
+    uint64_t extent_elem_cache;                 // 8B [96, 104): non-contiguous extent (mirrors Tensor)
+    uint32_t strides[MAX_TENSOR_DIMS];          // 20B [104, 124): element strides, mirrors Tensor::strides
+    uint8_t __padding3__[4];                    // 4B [124, 128)
+
+    void copy_from_tensor(const Tensor &tensor)
+    {
         memcpy(this, &tensor, 64);
-        if (tensor.is_contiguous && tensor.start_offset == 0) {
+        if (tensor.is_contiguous && tensor.start_offset == 0)
+        {
             uint64_t numel = 1;
-            for (uint32_t i = 0; i < tensor.ndims; i++)
-                numel *= tensor.shapes[i];
+            for (uint32_t i = 0; i < tensor.ndims; i++) numel *= tensor.shapes[i];
             extent_elem_cache = numel;
             uint32_t s = 1;
-            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--) {
+            for (int32_t i = static_cast<int32_t>(tensor.ndims) - 1; i >= 0; i--)
+            {
                 strides[i] = s;
                 s *= tensor.shapes[i];
             }
-        } else {
+        }
+        else
+        {
             extent_elem_cache = tensor.extent_elem_cache;
-            for (uint32_t i = 0; i < tensor.ndims; i++) {
-                strides[i] = tensor.strides[i];
-            }
+            for (uint32_t i = 0; i < tensor.ndims; i++) strides[i] = tensor.strides[i];
         }
     }
 
-    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr) {
+    void copy_tensor_create_info(const TensorCreateInfo &tensor_create_info, uint64_t addr)
+    {
         memcpy(this, &tensor_create_info, 64);
         buffer_addr = addr;
         // Create-info outputs are always contiguous with start_offset = 0;
         // extent_elem = prod(shapes); stride is row-major.
         uint64_t numel = 1;
-        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) {
-            numel *= tensor_create_info.shapes[i];
-        }
+        for (uint32_t i = 0; i < tensor_create_info.ndims; i++) numel *= tensor_create_info.shapes[i];
         extent_elem_cache = numel;
         uint32_t s = 1;
-        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--) {
+        for (int32_t i = static_cast<int32_t>(tensor_create_info.ndims) - 1; i >= 0; i--)
+        {
             strides[i] = s;
             s *= tensor_create_info.shapes[i];
         }
     }
 
-    /**
-     * Effective element extent of this entry.
-     * Contiguous-aligned views compute it from shapes alone (line 1 hit only);
-     * non-contiguous views read the cached value from line 2.
-     */
-    uint64_t effective_extent_elem() const {
-        if (is_contiguous) {
+    uint64_t effective_extent_elem() const
+    {
+        if (is_contiguous)
+        {
             uint64_t n = 1;
-            for (uint32_t i = 0; i < ndims; i++)
-                n *= shapes[i];
+            for (uint32_t i = 0; i < ndims; i++) n *= shapes[i];
             return n;
         }
         return extent_elem_cache;
     }
 
-    /**
-     * Check overlap between input tensor and this entry (the producer output).
-     *
-     * Three-level cascade:
-     *   L1 — O(1) byte-range intersection. Disjoint -> NO_OVERLAP.
-     *   L2 — O(ndims) hyper-rectangle precise check, eligible only when both
-     *        sides share the same canonical row-major axis layout (same
-     *        dtype/ndims/strides[], stride descends as integer multiples,
-     *        start_offset decomposes cleanly under the reference shape).
-     *        Yields NO_OVERLAP / COVERED / OTHER per-dim.
-     *   L3 — Non-hyper-rectangle pairs (transpose/permute mismatch, slice
-     *        with step, etc): conservative OTHER. Exact enumeration via
-     *        contiguous-segment merge is scheduled for a follow-up.
-     *
-     * COVERED is returned when `input` completely contains `entry` per-dim
-     * — dep_compute uses this to retire the now-redundant entry.
-     */
-    OverlapStatus check_overlap(const Tensor &input) const {
+    OverlapStatus check_overlap(const Tensor &input) const
+    {
         debug_assert(input.buffer.addr == buffer_addr);
         debug_assert(input.version >= version);
-        if (input.version > version) {
-            return OverlapStatus::OTHER;
-        }
+        if (input.version > version) return OverlapStatus::OTHER;
 
         // -------- L1: byte-range intersection (O(1) fast reject) --------
         const uint64_t in_begin = input.start_offset;
@@ -243,27 +180,15 @@ struct alignas(64) PTO2TensorMapEntry {
         const uint64_t ent_end = start_offset + effective_extent_elem();
         Segment in_range_bytes{in_begin, in_end};
         Segment ent_range_bytes{ent_begin, ent_end};
-        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) {
-            return OverlapStatus::NO_OVERLAP;
-        }
+        if (!in_range_bytes.line_segment_intersection(ent_range_bytes)) return OverlapStatus::NO_OVERLAP;
 
         // -------- L2 prereqs: same axis layout? --------
-        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) {
-            return OverlapStatus::OTHER;
-        }
-        for (uint32_t i = 0; i < ndims; i++) {
+        if (input.dtype != dtype || input.ndims != ndims || ndims == 0) return OverlapStatus::OTHER;
+        for (uint32_t i = 0; i < ndims; i++)
             if (input.strides[i] != strides[i]) return OverlapStatus::OTHER;
-        }
-        // strides[ndims-1] must be 1 and strides[i-1] must be an integer
-        // multiple of strides[i] for the row-major reference-shape derivation
-        // below to hold. This rejects slice-with-step (strides[d] != prev factor)
-        // and any view chain that scrambles the axis order. (strides is
-        // uint32_t with the > 0 invariant enforced at construction, so no
-        // sign check needed.)
         if (strides[ndims - 1] != 1) return OverlapStatus::OTHER;
-        for (uint32_t i = 1; i < ndims; i++) {
+        for (uint32_t i = 1; i < ndims; i++)
             if (strides[i - 1] % strides[i] != 0) return OverlapStatus::OTHER;
-        }
 
         // Derive reference shape A from stride. By construction stride is
         // row-major over A: strides[i] = prod(A[i+1..ndims-1]). So
@@ -301,7 +226,8 @@ struct alignas(64) PTO2TensorMapEntry {
         uint32_t ent_offsets[MAX_TENSOR_DIMS] = {};
         uint64_t in_remain = input.start_offset;
         uint64_t ent_remain = start_offset;
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             const uint32_t s = strides[i];
             in_offsets[i] = static_cast<uint32_t>(in_remain / s);
             ent_offsets[i] = static_cast<uint32_t>(ent_remain / s);
@@ -312,22 +238,20 @@ struct alignas(64) PTO2TensorMapEntry {
 
         // Validate that each side fits within ref_shapes (defense in depth —
         // a well-formed view always satisfies this).
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             if (static_cast<uint64_t>(in_offsets[i]) + input.shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
             if (static_cast<uint64_t>(ent_offsets[i]) + shapes[i] > ref_shapes[i]) return OverlapStatus::OTHER;
         }
 
         // -------- L2 core: per-dim line-segment intersection --------
         bool input_contains_entry = true;
-        for (uint32_t i = 0; i < ndims; i++) {
+        for (uint32_t i = 0; i < ndims; i++)
+        {
             Segment in_seg{in_offsets[i], static_cast<uint64_t>(in_offsets[i]) + input.shapes[i]};
             Segment ent_seg{ent_offsets[i], static_cast<uint64_t>(ent_offsets[i]) + shapes[i]};
-            if (!in_seg.line_segment_intersection(ent_seg)) {
-                return OverlapStatus::NO_OVERLAP;
-            }
-            if (!in_seg.contains(ent_seg)) {
-                input_contains_entry = false;
-            }
+            if (!in_seg.line_segment_intersection(ent_seg)) return OverlapStatus::NO_OVERLAP;
+            if (!in_seg.contains(ent_seg)) input_contains_entry = false;
         }
         return input_contains_entry ? OverlapStatus::COVERED : OverlapStatus::OTHER;
     }
@@ -343,20 +267,10 @@ static_assert(offsetof(PTO2TensorMapEntry, dtype) == offsetof(Tensor, dtype));
 static_assert(offsetof(PTO2TensorMapEntry, manual_dep) == offsetof(Tensor, manual_dep));
 static_assert(offsetof(PTO2TensorMapEntry, is_contiguous) == offsetof(Tensor, is_contiguous));
 static_assert(offsetof(PTO2TensorMapEntry, shapes) == offsetof(Tensor, shapes));
-static_assert(
-    offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)"
-);
+static_assert(offsetof(PTO2TensorMapEntry, prev_in_bucket) == 64, "TensorMapEntry must be exactly 2 cache lines (128 bytes)");
 
-// =============================================================================
-// TensorMap Lookup Chain Length Statistics (compile-time toggle)
-// =============================================================================
-
-/**
- * TensorMap structure
- *
- * Hash table with ring buffer entry pool and lazy invalidation.
- */
-struct PTO2TensorMap {
+struct PTO2TensorMap
+{
     // Hash table buckets (fixed size, power of 2)
     PTO2TensorMapEntry **buckets;  // Array of offsets into entry_pool (-1 = empty)
     int32_t num_buckets;           // Must be power of 2 for fast modulo
@@ -379,20 +293,25 @@ struct PTO2TensorMap {
     // Per-ring cleanup progress (for periodic cleanup_retired)
     int32_t last_cleanup[PTO2_MAX_RING_DEPTH]{};
 
-    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const {
+    uint32_t get_task_local_id_slot(uint8_t ring_id, uint32_t task_local_id) const
+    {
         return task_local_id & (task_window_sizes[ring_id] - 1);
     }
 
-    // Accessors read by scope_stats_collector. Declared unconditionally so the
-    // collector .cpp compiles at PTO2_PROFILING=0 (collector is unconditional —
-    // setter symbols must export for host dlsym; the probe call sites that use
-    // these accessors stay gated by PTO2_PROFILING).
-    int32_t current_used() const { return next_entry_idx - free_num; }
-    int32_t pool_capacity() const { return pool_size; }
+    int32_t current_used() const
+    {
+        return next_entry_idx - free_num;
+    }
+    int32_t pool_capacity() const
+    {
+        return pool_size;
+    }
 
     // new_entry only allocates memory, does not assign attributes
-    PTO2TensorMapEntry *new_entry() {
-        if (free_num > 0) {
+    PTO2TensorMapEntry *new_entry()
+    {
+        if (free_num > 0)
+        {
             PTO2TensorMapEntry *res = free_entry_list[--free_num];
             debug_assert(res->bucket_index == -1);
             return res;
@@ -403,22 +322,24 @@ struct PTO2TensorMap {
         return res;
     }
 
-    void free_entry(PTO2TensorMapEntry &entry) {
+    void free_entry(PTO2TensorMapEntry &entry)
+    {
         always_assert(entry.bucket_index != -1);  // must still be in a bucket
 
         // Update predecessor's next pointer (O(1) via prev_in_bucket)
-        if (entry.prev_in_bucket == nullptr) {
+        if (entry.prev_in_bucket == nullptr)
+        {
             // Entry is the head of its bucket chain, update bucket head
             // Must compute hash BEFORE clearing tensor
             buckets[entry.bucket_index] = entry.next_in_bucket;
-        } else {
+        }
+        else
+        {
             entry.prev_in_bucket->next_in_bucket = entry.next_in_bucket;
         }
 
         // Update successor's prev pointer
-        if (entry.next_in_bucket != nullptr) {
-            entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
-        }
+        if (entry.next_in_bucket != nullptr) entry.next_in_bucket->prev_in_bucket = entry.prev_in_bucket;
 
         free_entry_list[free_num++] = &entry;
         entry.bucket_index = -1;
@@ -428,164 +349,144 @@ struct PTO2TensorMap {
         entry.prev_in_task = nullptr;
     }
 
-    // =============================================================================
-    // TensorMap API
-    // =============================================================================
-
-    /**
-     * Phase 1: reserve every sub-region (buckets, entry_pool, free list, per-ring
-     * task_entry_heads) on the supplied arena. Records the resulting offsets in
-     * the returned layout descriptor. Must be called before the arena is
-     * committed.
-     */
-    static PTO2TensorMapLayout reserve_layout(
-        DeviceArena &arena, int32_t num_buckets, int32_t pool_size, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]
-    );
-
-    /**
-     * Same as reserve_layout() with default sizes (PTO2_TENSORMAP_NUM_BUCKETS,
-     * PTO2_TENSORMAP_POOL_SIZE).
-     */
-    static PTO2TensorMapLayout
-    reserve_layout_default(DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH]);
-
-    /**
-     * Phase 3a: write everything *except* arena-internal pointer fields
-     * (buckets, entry_pool, free_entry_list, task_entry_heads[r]).
-     * Uses arena.region_ptr to address the arena regions for data writes,
-     * but does not store those addresses in struct fields. Safe to call on
-     * a host arena that holds the prebuilt image.
-     */
-    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena);
-
-    /**
-     * Phase 3b: write the arena-internal pointer fields. Idempotent;
-     * called once on the host arena and once on the AICPU after attach.
-     */
-    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena);
-
-    /**
-     * Tear down state. Does not free memory — the arena owns the backing
-     * buffer. Pointers are set to nullptr so accidental reuse traps.
-     */
-    void destroy();
-
-    /**
-     * Update validity threshold from shared memory
-     * Called periodically to refresh the lazy invalidation threshold.
-     *
-     * @param last_task_alive  Current value from shared memory
-     */
-    void sync_validity(int32_t ring_id, int32_t last_task_alive) { this->last_task_alives[ring_id] = last_task_alive; }
-
-    /**
-     * Lookup producer for a tensor region
-     *
-     * Searches the hash table for matching regions and invokes the callback
-     * for each overlapping valid entry.
-     * Stale entries from different rings are skipped (not truncated).
-     *
-     * The callback receives (PTO2TensorMapEntry &, OverlapStatus) and should
-     * return true to continue iteration, false to stop early. It is safe for
-     * the callback to call remove_entry() on the current entry: next_in_bucket
-     * is latched before invocation.
-     *
-     * @param tensor    Tensor to look up
-     * @param on_match  Callback invoked for each overlapping entry
-     */
+    static PTO2TensorMapLayout reserve_layout(DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        // num_buckets must be a power of two for the hash truncation to work.
+        always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
+
+        PTO2TensorMapLayout layout{};
+        layout.num_buckets = new_num_buckets;
+        layout.pool_size = new_pool_size;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.task_window_sizes[r] = new_task_window_sizes[r];
+
+        layout.off_buckets = arena.reserve(static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        layout.off_entry_pool = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
+        layout.off_free_entry_list = arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) layout.off_task_entry_heads[r] = arena.reserve(static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
+        return layout;
+    }
+
+    static PTO2TensorMapLayout reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH])
+    {
+        return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
+    }
+
+    bool init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena)
+    {
+        num_buckets = layout.num_buckets;
+        pool_size = layout.pool_size;
+
+        // Address arena regions for data writes; do not store these in struct
+        // fields (wire_arena_pointers does that).
+        auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+        auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+
+        // buckets[]: empty == nullptr.
+        for (int32_t i = 0; i < num_buckets; i++) buckets_arena[i] = nullptr;
+
+        memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
+        for (int32_t i = 0; i < pool_size; i++)
+        {
+            entry_pool_arena[i].bucket_index = -1;
+            entry_pool_arena[i].next_in_bucket = nullptr;
+            entry_pool_arena[i].prev_in_bucket = nullptr;
+            entry_pool_arena[i].next_in_task = nullptr;
+            entry_pool_arena[i].prev_in_task = nullptr;
+            entry_pool_arena[i].producer_task_id = PTO2TaskId{};
+        }
+
+        // free_entry_list: zeroed (was calloc'd before); contents become meaningful
+        // only after entries are freed back, so the body of the array stays as 0.
+        memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
+
+        next_entry_idx = 0;
+        free_num = 0;
+
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+        {
+            auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+            for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) heads_arena[i] = nullptr;
+            task_window_sizes[r] = layout.task_window_sizes[r];
+            last_task_alives[r] = 0;
+            last_cleanup[r] = 0;
+        }
+
+        return true;
+    }
+
+    void wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena)
+    {
+        buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
+        entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
+        free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
+    }
+
+    void destroy()
+    {
+        buckets = nullptr;
+        entry_pool = nullptr;
+        free_entry_list = nullptr;
+        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) task_entry_heads[r] = nullptr;
+    }
+
+    void sync_validity(int32_t ring_id, int32_t last_task_alive)
+    {
+        this->last_task_alives[ring_id] = last_task_alive;
+    }
+
     template <typename Fn>
-    void lookup(const Tensor &tensor, Fn &&on_match) {
+    void lookup(const Tensor &tensor, Fn &&on_match)
+    {
         uint32_t bucket_index = hash(tensor.buffer.addr);
         PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
 
-#if PTO2_TENSORMAP_PROFILING
-        g_lookup_count++;
-        int32_t chain_len = 0;
-#endif
-
-        while (cur_entry != nullptr) {
+        while (cur_entry != nullptr)
+        {
             PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
 
-#if PTO2_TENSORMAP_PROFILING
-            chain_len++;
-#endif
-            // Skip stale entries (no chain truncation — entries from different
-            // rings can be interleaved, so a stale entry from one ring does NOT
-            // imply subsequent entries from other rings are also stale)
-            if (!entry_valid(*cur_entry)) {
+            if (!entry_valid(*cur_entry))
+            {
                 cur_entry = next_entry;
                 continue;
             }
 
-            // Entry is valid - check if regions OVERLAP (not just exact match)
-            // Since we hash only by base_ptr, all entries in this bucket have
-            // potential to overlap. We must check actual byte-range overlap.
-            if (tensor.buffer.addr == cur_entry->buffer_addr) {
-#if PTO2_TENSORMAP_PROFILING
-                g_lookup_overlap_checks++;
-#endif
+            if (tensor.buffer.addr == cur_entry->buffer_addr)
+            {
                 auto overlap_status = cur_entry->check_overlap(tensor);
-                if (overlap_status != OverlapStatus::NO_OVERLAP) {
-#if PTO2_TENSORMAP_PROFILING
-                    g_lookup_overlap_hits++;
-#endif
-                    if (!on_match(*cur_entry, overlap_status)) {
-#if PTO2_TENSORMAP_PROFILING
-                        g_lookup_chain_total += chain_len;
-                        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
-#endif
-                        return;
-                    }
+                if (overlap_status != OverlapStatus::NO_OVERLAP)
+                {
+                    if (!on_match(*cur_entry, overlap_status)) return;
                 }
             }
 
             // Move to next entry
             cur_entry = next_entry;
         }
-#if PTO2_TENSORMAP_PROFILING
-        g_lookup_chain_total += chain_len;
-        if (chain_len > g_lookup_chain_max) g_lookup_chain_max = chain_len;
-#endif
     }
 
-    /**
-     * Insert a new entry (called when task produces output)
-     *
-     * Allocates from ring buffer pool, may overwrite stale entries.
-     * Inserts at head of hash bucket chain (maintains task_id ordering).
-     *
-     * @param tensor            Tensor produced
-     * @param producer_task_id  Task ID of producer
-     */
-    void insert(const Tensor &tensor, PTO2TaskId producer_task_id) {
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id)
+    {
         PTO2TensorMapEntry *entry = new_entry();
         entry->copy_from_tensor(tensor);
         link_entry(entry, tensor.buffer.addr, producer_task_id);
     }
 
-    /**
-     * Cleanup stale entries for retired tasks
-     *
-     * Called periodically by Orchestrator when last_task_alive advances.
-     * Removes entries from bucket chains for tasks in [old, new) range.
-     *
-     * @param old_last_task_alive  Previous threshold
-     * @param new_last_task_alive  New threshold
-     */
-    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive) {
+    void cleanup_retired(int32_t ring_id, int32_t old_last_task_alive, int32_t new_last_task_alive)
+    {
         // Iterate through retired tasks on this ring and remove their entries
-        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++) {
+        for (int32_t local_id = old_last_task_alive; local_id < new_last_task_alive; local_id++)
+        {
             int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
             PTO2TensorMapEntry *cur_entry = task_entry_heads[ring_id][task_slot];
 
-            while (cur_entry != nullptr) {
+            while (cur_entry != nullptr)
+            {
                 PTO2TensorMapEntry *next_entry = cur_entry->next_in_task;  // Save before clearing
                 // Only remove if this entry belongs to the retiring task
                 // (slot may have been reused by a newer task)
-                debug_assert(
-                    cur_entry->producer_task_id ==
-                    PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id))
-                );
+                debug_assert(cur_entry->producer_task_id == PTO2TaskId::make(static_cast<uint8_t>(ring_id), static_cast<uint32_t>(local_id)));
                 free_entry(*cur_entry);
                 cur_entry = next_entry;
             }
@@ -595,30 +496,14 @@ struct PTO2TensorMap {
         }
     }
 
-    // =============================================================================
-    // Internal Helpers (exposed for testing)
-    // =============================================================================
-
-    /**
-     * Compute hash for tensor addr
-     *
-     * Multiplicative hash using the golden-ratio constant.  Multiplication
-     * mixes ALL input bits into the high bits of the product, so aligned
-     * addresses (low bits all-zero) still distribute evenly.  We extract
-     * the top log2(num_buckets) bits which carry the most entropy.
-     */
-    uint32_t hash(uint64_t key) {
+    uint32_t hash(uint64_t key)
+    {
         key *= 0x9E3779B97F4A7C15ULL;
         return static_cast<uint32_t>(key >> (64 - __builtin_ctz(num_buckets)));
     }
 
-    /**
-     * Link an initialized entry into bucket and task chains.
-     */
-    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
-#if PTO2_TENSORMAP_PROFILING
-        g_insert_count++;
-#endif
+    void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id)
+    {
         uint32_t bucket_index = hash(addr);
         auto ring_id = producer_task_id.ring();
         auto local_id = producer_task_id.local();
@@ -629,95 +514,75 @@ struct PTO2TensorMap {
         // Insert at head of hash bucket
         entry->bucket_index = bucket_index;
         entry->next_in_bucket = buckets[bucket_index];
-        if (entry->next_in_bucket != nullptr) {
-            entry->next_in_bucket->prev_in_bucket = entry;
-        }
+        if (entry->next_in_bucket != nullptr) entry->next_in_bucket->prev_in_bucket = entry;
         buckets[bucket_index] = entry;
         entry->prev_in_bucket = nullptr;
 
         // Link to task's entry list
         entry->next_in_task = task_entry_heads[ring_id][task_slot];
         entry->prev_in_task = nullptr;
-        if (entry->next_in_task != nullptr) {
-            entry->next_in_task->prev_in_task = entry;
-        }
+        if (entry->next_in_task != nullptr) entry->next_in_task->prev_in_task = entry;
         task_entry_heads[ring_id][task_slot] = entry;
     }
 
-    /**
-     * Check if entry is valid (producer has not retired)
-     */
-    bool entry_valid(const PTO2TensorMapEntry &entry) const {
+    bool entry_valid(const PTO2TensorMapEntry &entry) const
+    {
         return static_cast<int32_t>(entry.producer_task_id.local()) >= last_task_alives[entry.producer_task_id.ring()];
     }
 
-    void remove_entry(PTO2TensorMapEntry &entry) {
+    void remove_entry(PTO2TensorMapEntry &entry)
+    {
         remove_from_task(entry);
         free_entry(entry);
     }
 
-    /**
-     * Remove entry from its task chain (O(1) with prev pointer)
-     * Called during pool wrap-around to unlink reused entries.
-     */
-    void remove_from_task(PTO2TensorMapEntry &entry) {
+    void remove_from_task(PTO2TensorMapEntry &entry)
+    {
         always_assert(entry.bucket_index != -1);  // must still be in a bucket
         // Update predecessor's next pointer (O(1) via prev_in_task)
-        if (entry.prev_in_task == nullptr) {
+        if (entry.prev_in_task == nullptr)
+        {
             // Entry is the head of its task chain, update task_entry_heads
             int32_t ring_id = entry.producer_task_id.ring();
             int32_t local_id = static_cast<int32_t>(entry.producer_task_id.local());
             int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
             task_entry_heads[ring_id][task_slot] = entry.next_in_task;
-        } else {
+        }
+        else
+        {
             entry.prev_in_task->next_in_task = entry.next_in_task;
         }
 
         // Update successor's prev pointer
-        if (entry.next_in_task != nullptr) {
-            entry.next_in_task->prev_in_task = entry.prev_in_task;
-        }
+        if (entry.next_in_task != nullptr) entry.next_in_task->prev_in_task = entry.prev_in_task;
 
         entry.next_in_task = nullptr;
         entry.prev_in_task = nullptr;
     }
 
-    // =============================================================================
-    // Debug Utilities
-    // =============================================================================
-
-    /**
-     * Print TensorMap statistics
-     */
-    void print_stats();
-
-    /**
-     * Get count of valid entries
-     */
-    int32_t valid_count();
-
-    // =============================================================================
-    // TensorMap Synchronization
-    // =============================================================================
-
-    /**
-     * Sync TensorMap validity threshold from shared memory
-     *
-     * Called periodically to refresh the lazy invalidation threshold.
-     * Also triggers cleanup if threshold has advanced significantly.
-     */
-    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive);
-};
+    int32_t valid_count()
+    {
+        int32_t count = 0;
 
-#if PTO2_TENSORMAP_PROFILING
-struct PTO2TensorMapProfilingData {
-    uint64_t lookup_chain_total;
-    uint64_t lookup_count;
-    int32_t lookup_chain_max;
-    uint64_t overlap_checks;
-    uint64_t overlap_hits;
-    uint64_t insert_count;
-};
+        for (int32_t i = 0; i < pool_size; i++)
+            if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) count++;
 
-PTO2TensorMapProfilingData pto2_tensormap_get_profiling();
-#endif
+        return count;
+    }
+
+    void sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive)
+    {
+        auto ring_id = task_id.ring();
+        auto local_id = task_id.local();
+        sync_validity(ring_id, sm_last_task_alive);
+
+        // Only attempt cleanup when last_task_alive has actually advanced;
+        // otherwise cleanup_retired would empty-loop and we'd spin forever.
+        auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
+        if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap)
+        {
+            cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
+            last_cleanup[ring_id] = sm_last_task_alive;
+        }
+    }
+};
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index 635b893f3..6fd795702 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -8,23 +8,6 @@
  * See LICENSE in the root of the software repository for the full text of the License.
  * -----------------------------------------------------------------------------------------------------------
  */
-/**
- * Runtime Class - Device Execution and Handshake Control
- *
- * This class manages device-side execution through AICPU-AICore handshake
- * protocol. Task graph construction is handled by PTO2Runtime; this class
- * only handles:
- * - Handshake buffers for AICPU-AICore communication
- * - Execution parameters (block_dim, aicpu_thread_num)
- * - Tensor pair management for host-device memory tracking
- * - Device orchestration state (gm_sm_ptr_, orch_args_)
- * - Function address mapping (func_id_to_addr_)
- *
- * Task dispatch uses a per-core PTO2DispatchPayload written by the scheduler.
- * At dispatch time, build_payload() copies tensor pointers and scalars from
- * the task payload into the per-core args[], populates SPMD context, then
- * signals AICore via DATA_MAIN_BASE.
- */
 
 #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
 #define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_RUNTIME_H_
@@ -42,10 +25,6 @@
 #include "pto2_dispatch_payload.h"
 #include "task_args.h"
 
-// =============================================================================
-// Configuration Macros
-// =============================================================================
-
 #define RUNTIME_MAX_ARGS 128
 #define RUNTIME_MAX_WORKER 72  // 24 AIC + 48 AIV cores
 #define RUNTIME_MAX_FUNC_ID 1024
@@ -55,42 +34,8 @@
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
 constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
 
-// =============================================================================
-// Data Structures
-// =============================================================================
-
-/**
- * Handshake Structure - Shared between Host, AICPU, and AICore
- *
- * This structure facilitates communication and synchronization between
- * AICPU and AICore during task execution.
- *
- * Protocol State Machine:
- * 1. Initialization: AICPU sets aicpu_ready=1
- * 2. Acknowledgment: AICore sets aicore_done=core_id+1
- * 3. Task Dispatch: AICPU writes DATA_MAIN_BASE after updating the per-core payload
- * 4. Task Execution: AICore reads the cached PTO2DispatchPayload and executes
- * 5. Task Completion: AICore writes FIN to COND; AICPU observes completion
- * 6. Shutdown: AICPU sets control=1, AICore exits
- *
- * Each AICore instance has its own handshake buffer to enable concurrent
- * task execution across multiple cores.
- */
-
-/**
- * Handshake buffer for AICPU-AICore communication
- *
- * Each AICore has its own handshake buffer for synchronization with AICPU.
- * The structure is cache-line aligned (64 bytes) to prevent false sharing
- * between cores and optimize cache coherency operations.
- *
- * Field Access Patterns:
- * - aicpu_ready: Written by AICPU, read by AICore
- * - aicore_done: Written by AICore, read by AICPU
- * - task: Written by AICPU, read by AICore (0 = not ready, non-zero = PTO2DispatchPayload*)
- * - core_type: Written by AICPU, read by AICore (CoreType::AIC or CoreType::AIV)
- */
-struct Handshake {
+struct Handshake
+{
     volatile uint32_t aicpu_ready;        // AICPU ready signal: 0=not ready, 1=ready
     volatile uint32_t aicore_done;        // AICore ready signal: 0=not ready, core_id+1=ready
     volatile uint64_t task;               // Init: PTO2DispatchPayload* (set before aicpu_ready); runtime: unused
@@ -100,104 +45,40 @@ struct Handshake {
     volatile uint32_t aicore_regs_ready;  // AICore ID reported: 0=pending, 1=done
 } __attribute__((aligned(64)));
 
-/**
- * Tensor pair for tracking host-device memory mappings.
- * Used for copy-back during finalize.
- */
-struct TensorPair {
+struct TensorPair
+{
     void *host_ptr;
     void *dev_ptr;
     size_t size;
-    // false for read-only INPUT tensors: they are never written by the kernel,
-    // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown
-    // keep the safe default of copying back.
-    bool needs_copy_back = true;
 };
 
-/**
- * Host API function pointers for device memory operations.
- * Allows runtime to use pluggable device memory backends.
- */
-struct HostApi {
+struct HostApi
+{
     void *(*device_malloc)(size_t size);
     void (*device_free)(void *dev_ptr);
     int (*copy_to_device)(void *dev_ptr, const void *host_ptr, size_t size);
     int (*copy_from_device)(void *host_ptr, const void *dev_ptr, size_t size);
-    // Set a device buffer to a byte value (device-side, no PCIe). Used to
-    // zero-init pure OUTPUT buffers in lieu of an H2D copy-in. May be
-    // null on backends that don't wire it; callers must fall back to
-    // copy_to_device.
     int (*device_memset)(void *dev_ptr, int value, size_t size);
-    // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared
-    // memory, trb prebuilt runtime arena) as three independent device
-    // allocations. `runtime_arena_size == 0` skips the third region (hbg
-    // path: hbg has no prebuilt runtime arena). Idempotent on identical
-    // sizes; returns 0 on success, -1 on allocation failure.
     int (*setup_static_arena)(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size);
-    // Return the per-Worker pooled pointer for the PTO2 GM heap / shared
-    // memory / prebuilt runtime arena. setup_static_arena must have already
-    // committed the relevant region; the returned pointer is owned by the
-    // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it
-    // to device_free or record it in `tensor_pairs_`.
-    //
-    // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is
-    // only committed when setup_static_arena was invoked with
-    // runtime_arena_size > 0. Calling it on the hbg path
-    // (setup_static_arena(...,0)) returns nullptr (not undefined).
     void *(*acquire_pooled_gm_heap)();
     void *(*acquire_pooled_gm_sm)();
     void *(*acquire_pooled_runtime_arena)();
-    // Single-shot upload of the entire ChipCallable buffer. `callable` is a
-    // `const ChipCallable *` (declared void* to avoid pulling task_interface
-    // headers into runtime.h). DeviceRunner walks child_offsets_ to compute
-    // total byte size, allocates device GM once, fixes up each child's
-    // resolved_addr_ in an internal host scratch (onboard: device addr; sim:
-    // dlopen function pointer), H2D's once, and returns the device-side
-    // address of the ChipCallable header. Pool-managed: identical buffer
-    // contents (FNV-1a 64-bit) hit the dedup cache; all chip buffers are
-    // bulk-freed in DeviceRunner::finalize(). Returns 0 on error or when
-    // child_count() == 0. Caller computes child addrs as
-    //     chip_dev + offsetof(ChipCallable, storage_) + child_offset(i)
-    // and stores them via runtime->set_function_bin_addr(fid, child_dev).
     uint64_t (*upload_chip_callable_buffer)(const void *callable);
 };
 
-/**
- * Task structure - Compatibility stub for platform layer
- *
- * RT2 uses PTO2DispatchPayload instead of Task for task dispatch.
- * This stub exists only for API compatibility with device_runner.cpp.
- * Since get_task_count() returns 0, this struct is never actually used.
- */
-struct Task {
+struct Task
+{
     int func_id;
     uint64_t function_bin_addr;
 };
 
-// =============================================================================
-// Runtime Class
-// =============================================================================
-
-/**
- * Runtime class for device execution and handshake control
- *
- * This class manages AICPU-AICore communication through handshake buffers.
- * Task graph construction is handled by PTO2Runtime; this class only handles
- * execution control and device orchestration state.
- */
-class Runtime {
+class Runtime
+{
 public:
     // Handshake buffers for AICPU-AICore communication
     Handshake workers[RUNTIME_MAX_WORKER];  // Worker (AICore) handshake buffers
     int worker_count;                       // Number of active workers
 
-    // Execution parameters for AICPU scheduling.
-    //
-    // aicpu_thread_num is the *total* AICPU thread count launched on this run
-    // (= orch + schedulers). AicpuExecutor splits this into one orchestrator
-    // thread (highest idx, runs aicpu_orchestration_entry) and the remaining
-    // aicpu_thread_num-1 scheduler threads that dispatch tasks to AICore.
-    // The orch thread also dispatches when env PTO2_ORCH_TO_SCHED is set.
     int aicpu_thread_num;
     int ready_queue_shards;  // Number of ready queue shards (1..MAX_AICPU_THREADS, default MAX-1)
 
@@ -210,10 +91,6 @@ class Runtime {
     // NOTE: Made public for direct access from aicore code
     uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID];
 
-    // Orchestrator-to-scheduler transition control
-    // When true, orchestrator threads convert to scheduler threads after orchestration completes.
-    // When false (default), orchestrator threads exit after orchestration without dispatching tasks.
-    // Controlled via PTO2_ORCH_TO_SCHED environment variable.
     bool orch_to_sched;
 
 private:
@@ -226,114 +103,207 @@ class Runtime {
     void *slot_states_ptr_;                  // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling)
     ChipStorageTaskArgs orch_args_storage_;  // Copy of args for device
 
-    // Prebuilt-arena fast path (trb only). Set by the host before rtMemcpy'ing
-    // Runtime to device; AICPU reads them in the boot path to skip
-    // runtime_create_from_sm and reuse the pooled, prebuilt arena buffer
-    // (already populated by runtime_init_data_from_layout + wire on host).
     void *prebuilt_arena_base_;
     size_t prebuilt_runtime_offset_;
 
-    // Device orchestration SO (for dlopen on AICPU thread 3).
-    // The SO bytes themselves live in a separately-allocated device buffer
-    // owned by DeviceRunner; only the metadata below travels inside Runtime.
     uint64_t dev_orch_so_addr_;
     uint64_t dev_orch_so_size_;
-    // Per-callable_id dispatch. AICPU dispatches via
-    // `orch_so_table_[active_callable_id_]`; `register_new_callable_id_`
-    // signals whether the host is delivering a freshly-registered
-    // callable_id (write+dlopen) or reusing an already-loaded one.
     int32_t active_callable_id_;
     bool register_new_callable_id_;
     char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
     char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
 
 public:
-    /**
-     * Constructor - zero-initialize all arrays
-     */
-    Runtime();
-
-    // =========================================================================
-    // Performance Profiling
-    // =========================================================================
-
-    // =========================================================================
-    // Device orchestration (for AICPU thread 3)
-    // =========================================================================
-
-    void *get_gm_sm_ptr() const;
-    void *get_gm_heap_ptr() const;
-    const ChipStorageTaskArgs &get_orch_args() const;
-    void set_gm_sm_ptr(void *p);
-    void set_gm_heap(void *p);
-    void set_slot_states_ptr(void *p);
-    void set_orch_args(const ChipStorageTaskArgs &args);
-
-    // Prebuilt-arena fast path (trb only). Set by host's
-    // bind_callable_to_runtime_impl; consumed by AICPU at boot to attach a
-    // DeviceArena to `prebuilt_arena_base_` and pick up the PTO2Runtime at
-    // `prebuilt_arena_base_ + prebuilt_runtime_offset_`. Both stay zero on
-    // first construction (Runtime() ctor zeros them) so a non-prebuilt boot
-    // path can still detect "no prebuilt image set" via nullptr.
-    void set_prebuilt_arena(void *arena_base, size_t runtime_off);
-    void *get_prebuilt_arena_base() const;
-    size_t get_prebuilt_runtime_offset() const;
+    Runtime()
+    {
+        // NOTE: host_api is initialized in InitRuntime() (host-only code)
+        // because the CApi functions don't exist when compiled for device.
+
+        // Initialize handshake buffers
+        memset(workers, 0, sizeof(workers));
+        worker_count = 0;
+        aicpu_thread_num = 1;
+        ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
+        task_window_size = 0;
+        heap_size = 0;
+        dep_pool_size = 0;
+        orch_to_sched = false;
+
+        // Initialize device orchestration state
+        gm_sm_ptr_ = nullptr;
+        gm_heap_ptr_ = nullptr;
+        slot_states_ptr_ = nullptr;
+        orch_args_storage_.clear();
+        prebuilt_arena_base_ = nullptr;
+        prebuilt_runtime_offset_ = 0;
+
+        // Initialize device orchestration SO binary
+        dev_orch_so_addr_ = 0;
+        dev_orch_so_size_ = 0;
+        active_callable_id_ = -1;
+        register_new_callable_id_ = false;
+        device_orch_func_name_[0] = '\0';
+        device_orch_config_name_[0] = '\0';
+
+        // Initialize kernel binary tracking
+        registered_kernel_count_ = 0;
+
+        // Initialize function address mapping
+        for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) func_id_to_addr_[i] = 0;
+    }
+
+    void *get_gm_sm_ptr() const
+    {
+        return gm_sm_ptr_;
+    }
+    void *get_gm_heap_ptr() const
+    {
+        return gm_heap_ptr_;
+    }
+    const ChipStorageTaskArgs &get_orch_args() const
+    {
+        return orch_args_storage_;
+    }
+    void set_gm_sm_ptr(void *p)
+    {
+        gm_sm_ptr_ = p;
+    }
+    void set_gm_heap(void *p)
+    {
+        gm_heap_ptr_ = p;
+    }
+    void set_slot_states_ptr(void *p)
+    {
+        slot_states_ptr_ = p;
+    }
+    void set_orch_args(const ChipStorageTaskArgs &args)
+    {
+        orch_args_storage_ = args;
+    }
+
+    void set_prebuilt_arena(void *arena_base, size_t runtime_off)
+    {
+        prebuilt_arena_base_ = arena_base;
+        prebuilt_runtime_offset_ = runtime_off;
+    }
+    void *get_prebuilt_arena_base() const
+    {
+        return prebuilt_arena_base_;
+    }
+    size_t get_prebuilt_runtime_offset() const
+    {
+        return prebuilt_runtime_offset_;
+    }
 
     // Device orchestration SO binary (for dlopen on AICPU thread 3)
-    void set_dev_orch_so(uint64_t dev_addr, uint64_t size);
-    uint64_t get_dev_orch_so_addr() const;
-    uint64_t get_dev_orch_so_size() const;
-    // Per-callable_id dispatch. callable_id must be in
-    // [0, MAX_REGISTERED_CALLABLE_IDS); register_new_callable_id_ tells AICPU
-    // whether to (re)load the orch SO into orch_so_table_[callable_id] or
-    // reuse the cached entry.
-    void set_active_callable_id(int32_t callable_id, bool is_new);
-    int32_t get_active_callable_id() const;
-    bool register_new_callable_id() const;
-    void set_device_orch_func_name(const char *name);
-    const char *get_device_orch_func_name() const;
-    void set_device_orch_config_name(const char *name);
-    const char *get_device_orch_config_name() const;
-
-    uint64_t get_function_bin_addr(int func_id) const;
-    void set_function_bin_addr(int func_id, uint64_t addr);
-    /**
-     * Replay a previously-uploaded kernel address onto a fresh Runtime
-     * without recording it in registered_kernel_func_ids_. Used by
-     * DeviceRunner::bind_callable_to_runtime so prepared kernel
-     * binaries are not freed by validate_runtime_impl across runs.
-     */
-    void replay_function_bin_addr(int func_id, uint64_t addr);
-
-    int get_registered_kernel_count() const;
-    int get_registered_kernel_func_id(int index) const;
-    void clear_registered_kernels();
-
-    // =========================================================================
-    // Deprecated API (for platform compatibility, always returns 0/nullptr)
-    // Task graph is now managed by PTO2Runtime, not Runtime
-    // =========================================================================
-
-    /** @deprecated Task count is now in PTO2 shared memory */
-    int get_task_count() const { return 0; }
-
-    /** @deprecated RT2 uses PTO2DispatchPayload, not Task. Always returns nullptr. */
-    Task *get_task(int) { return nullptr; }
-
-    // =========================================================================
-    // Host API (host-only, not copied to device)
-    // =========================================================================
+    void set_dev_orch_so(uint64_t dev_addr, uint64_t size)
+    {
+        dev_orch_so_addr_ = dev_addr;
+        dev_orch_so_size_ = size;
+    }
+    uint64_t get_dev_orch_so_addr() const
+    {
+        return dev_orch_so_addr_;
+    }
+    uint64_t get_dev_orch_so_size() const
+    {
+        return dev_orch_so_size_;
+    }
+    void set_active_callable_id(int32_t callable_id, bool is_new)
+    {
+        active_callable_id_ = callable_id;
+        register_new_callable_id_ = is_new;
+    }
+    int32_t get_active_callable_id() const
+    {
+        return active_callable_id_;
+    }
+    bool register_new_callable_id() const
+    {
+        return register_new_callable_id_;
+    }
+    void set_device_orch_func_name(const char *name)
+    {
+        if (name == nullptr)
+        {
+            device_orch_func_name_[0] = '\0';
+            return;
+        }
+        std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+        device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+    }
+    const char *get_device_orch_func_name() const
+    {
+        return device_orch_func_name_;
+    }
+    void set_device_orch_config_name(const char *name)
+    {
+        if (name == nullptr)
+        {
+            device_orch_config_name_[0] = '\0';
+            return;
+        }
+        std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
+        device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
+    }
+    const char *get_device_orch_config_name() const
+    {
+        return device_orch_config_name_;
+    }
+
+    uint64_t get_function_bin_addr(int func_id) const
+    {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+        return func_id_to_addr_[func_id];
+    }
+    void set_function_bin_addr(int func_id, uint64_t addr)
+    {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
+        if (addr != 0 && func_id_to_addr_[func_id] == 0)
+        {
+            if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID)
+            {
+                registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
+            }
+            else
+            {}
+        }
+        func_id_to_addr_[func_id] = addr;
+    }
+    void replay_function_bin_addr(int func_id, uint64_t addr)
+    {
+        if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return;
+        func_id_to_addr_[func_id] = addr;
+    }
+
+    int get_registered_kernel_count() const
+    {
+        return registered_kernel_count_;
+    }
+    int get_registered_kernel_func_id(int index) const
+    {
+        if (index < 0 || index >= registered_kernel_count_) return -1;
+        return registered_kernel_func_ids_[index];
+    }
+    void clear_registered_kernels()
+    {
+        registered_kernel_count_ = 0;
+    }
+
+    int get_task_count() const
+    {
+        return 0;
+    }
+
+    Task *get_task([[maybe_unused]] int taskId)
+    {
+        return nullptr;
+    }
 
     // Host API function pointers for device memory operations
     // NOTE: Placed at end of class to avoid affecting device memory layout
     HostApi host_api;
 
-    // Host-side tensor ledger for D2H copy-back at finalize. Populated by
-    // runtime_maker.cpp from orch_args at bind time, then iterated in
-    // validate_runtime_impl. Not read by AICPU/AICore — the device-side
-    // Runtime image carries the std::vector control block as harmless
-    // garbage, identical to host_api above. No fixed cap — grows with the
-    // chip-level entry-tensor count.
     std::vector<TensorPair> tensor_pairs_;
 };
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
deleted file mode 100644
index 4b7484bc9..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - Scheduler Implementation
- *
- * Implements scheduler state management, ready queues, and task lifecycle.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_scheduler.h"
-#include <inttypes.h>
-#include <stdlib.h>
-#include "common/unified_log.h"
-
-#if PTO2_PROFILING
-// Weak fallbacks for host/UT builds that don't link the scope_stats collector.
-extern "C" __attribute__((weak, visibility("hidden"))) bool is_scope_stats_enabled() { return false; }
-extern "C" __attribute__((weak, visibility("hidden"))) void scope_stats_note_heap_wrap(int) {}
-#endif
-
-// =============================================================================
-// Scheduler Profiling Counters
-// =============================================================================
-
-#if PTO2_SCHED_PROFILING
-#include "common/platform_config.h"
-
-uint64_t g_sched_lock_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanout_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanin_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_self_consumed_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_lock_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_push_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_pop_wait_cycle[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_lock_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanout_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_fanin_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_self_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_pop_atomic_count[PLATFORM_MAX_AICPU_THREADS] = {};
-uint64_t g_sched_complete_count[PLATFORM_MAX_AICPU_THREADS] = {};
-
-PTO2SchedProfilingData scheduler_get_profiling(int thread_idx) {
-    PTO2SchedProfilingData d;
-    d.lock_cycle = std::exchange(g_sched_lock_cycle[thread_idx], 0);
-    d.fanout_cycle = std::exchange(g_sched_fanout_cycle[thread_idx], 0);
-    d.fanin_cycle = std::exchange(g_sched_fanin_cycle[thread_idx], 0);
-    d.self_consumed_cycle = std::exchange(g_sched_self_consumed_cycle[thread_idx], 0);
-    d.lock_wait_cycle = std::exchange(g_sched_lock_wait_cycle[thread_idx], 0);
-    d.push_wait_cycle = std::exchange(g_sched_push_wait_cycle[thread_idx], 0);
-    d.pop_wait_cycle = std::exchange(g_sched_pop_wait_cycle[thread_idx], 0);
-    d.lock_atomic_count = std::exchange(g_sched_lock_atomic_count[thread_idx], 0);
-    d.fanout_atomic_count = std::exchange(g_sched_fanout_atomic_count[thread_idx], 0);
-    d.fanin_atomic_count = std::exchange(g_sched_fanin_atomic_count[thread_idx], 0);
-    d.self_atomic_count = std::exchange(g_sched_self_atomic_count[thread_idx], 0);
-    d.pop_atomic_count = std::exchange(g_sched_pop_atomic_count[thread_idx], 0);
-    d.complete_count = std::exchange(g_sched_complete_count[thread_idx], 0);
-    return d;
-}
-#endif
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2SchedulerState::print_stats() {
-    PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Scheduler Statistics ===");
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (sched->ring_sched_states[r].last_task_alive > 0) {
-            LOG_INFO_V0("Ring %d:", r);
-            LOG_INFO_V0("  last_task_alive: %d", sched->ring_sched_states[r].last_task_alive);
-            auto &dp = sched->ring_sched_states[r].dep_pool;
-            if (dp.top > 0) {
-                LOG_INFO_V0(
-                    "  dep_pool: top=%d tail=%d used=%d high_water=%d capacity=%d", dp.top, dp.tail, dp.top - dp.tail,
-                    dp.high_water, dp.capacity
-                );
-            }
-        }
-    }
-#if PTO2_SCHED_PROFILING
-    LOG_INFO_V0("tasks_completed:   %lld", (long long)sched->tasks_completed.load(std::memory_order_relaxed));
-    LOG_INFO_V0("tasks_consumed:    %lld", (long long)sched->tasks_consumed.load(std::memory_order_relaxed));
-#endif
-    LOG_INFO_V0("============================");
-}
-
-void PTO2SchedulerState::print_queues() {
-    PTO2SchedulerState *sched = this;
-    LOG_INFO_V0("=== Ready Queues ===");
-
-    const char *shape_names[] = {"AIC", "AIV", "MIX"};
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        LOG_INFO_V0("  %s: count=%" PRIu64, shape_names[i], sched->ready_queues[i].size());
-    }
-    LOG_INFO_V0("  DUMMY: count=%" PRIu64, sched->dummy_ready_queue.size());
-
-    LOG_INFO_V0("====================");
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
deleted file mode 100644
index f5213dca7..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/pto_scheduler.h
+++ /dev/null
@@ -1,1483 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-
-/**
- * PTO Runtime2 - Scheduler Interface
- *
- * The Scheduler is responsible for:
- * 1. Maintaining per-resource-shape ready queues
- * 2. Tracking task state (PENDING -> COMPLETED -> CONSUMED)
- * 3. Managing fanin/fanout refcounts for dependency resolution
- * 4. Advancing last_task_alive for heap reclamation
- * 5. Two-stage mixed-task completion (subtask done bits → mixed-task complete)
- *
- * The Scheduler runs on Device AI_CPU and processes:
- * - Task state transitions based on fanin_refcount
- * - Buffer lifecycle based on fanout_refcount
- * - Ring pointer advancement for flow control
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#pragma once
-
-#include <atomic>
-
-#include "common/core_type.h"
-#include "utils/device_arena.h"
-#include "aicpu/platform_regs.h"  // get_reg_ptr / RegId for the speculative doorbell
-#include "pto_async_wait.h"
-#include "pto_ring_buffer.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-
-#include "aicpu/device_time.h"  // get_sys_cnt_aicpu (weak; used by spec doorbell timing too)
-#if PTO2_SCHED_PROFILING
-#define PTO2_SCHED_CYCLE_START() uint64_t _st0 = get_sys_cnt_aicpu(), _st1
-#define PTO2_SCHED_CYCLE_LAP(acc)   \
-    do {                            \
-        _st1 = get_sys_cnt_aicpu(); \
-        acc += (_st1 - _st0);       \
-        _st0 = _st1;                \
-    } while (0)
-#endif
-
-// =============================================================================
-// Ready Queue (Lock-free bounded MPMC — Vyukov design)
-// =============================================================================
-
-/**
- * Per-slot entry: sequence counter for ABA safety + task payload
- */
-struct PTO2ReadyQueueSlot {
-    std::atomic<int64_t> sequence;
-    PTO2TaskSlotState *slot_state;
-};
-
-/**
- * Thread-local ready buffer for local-first dispatch optimization.
- *
- * Two buffers per scheduling thread, one per CoreType (AIC=0, AIV=1).
- * Initialized once before the scheduling loop; must be empty at
- * the start of each iteration (verified by always_assert).
- *
- * Phase 1 fills per-CoreType buffers via on_task_complete().
- * The dispatch stage drains them local-first via get_ready_tasks_batch,
- * with any remaining tasks pushed to the global ready queue.
- */
-// Number of CoreType values eligible for local dispatch (AIC=0, AIV=1)
-static constexpr int PTO2_LOCAL_DISPATCH_TYPE_NUM = 2;
-
-struct PTO2LocalReadyBuffer {
-    PTO2TaskSlotState **slot_states = nullptr;
-    int count = 0;
-    int capacity = 0;
-
-    void reset(PTO2TaskSlotState **buf, int cap) {
-        slot_states = buf;
-        count = 0;
-        capacity = cap;
-    }
-
-    bool try_push(PTO2TaskSlotState *s) {
-        if (slot_states && count < capacity) {
-            slot_states[count++] = s;
-            return true;
-        }
-        return false;
-    }
-
-    PTO2TaskSlotState *pop() { return (count > 0) ? slot_states[--count] : nullptr; }
-};
-
-/**
- * Lock-free bounded MPMC queue (Dmitry Vyukov design)
- *
- * Key properties:
- * - enqueue_pos and dequeue_pos on separate cache lines (no false sharing)
- * - Per-slot sequence counter prevents ABA problem
- * - Empty queue pop returns immediately (single atomic load, no lock)
- * - CAS contention is split: producers only touch enqueue_pos,
- *   consumers only touch dequeue_pos
- */
-struct alignas(64) PTO2ReadyQueue {
-    PTO2ReadyQueueSlot *slots;
-    uint64_t capacity;
-    uint64_t mask;        // capacity - 1
-    char _pad0[64 - 24];  // Pad to own cache line
-
-    std::atomic<uint64_t> enqueue_pos;
-    char _pad1[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
-
-    std::atomic<uint64_t> dequeue_pos;
-    char _pad2[64 - sizeof(std::atomic<uint64_t>)];  // Own cache line
-
-    uint64_t size() {
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        return (e >= d) ? (e - d) : 0;
-    }
-
-    bool push(PTO2TaskSlotState *slot_state) {
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos);
-            if (diff == 0) {
-                if (enqueue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    break;
-                }
-            } else if (diff < 0) {
-                return false;  // Queue full
-            }
-        }
-
-        slot->slot_state = slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
-        return true;
-    }
-
-    // Batch push: reserve count slots with a single CAS after confirming
-    // every target slot is available under the usual Vyukov sequence check.
-    void push_batch(PTO2TaskSlotState **items, int count) {
-        if (count == 0) return;
-
-        uint64_t pos;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            bool ready = true;
-            for (int i = 0; i < count; i++) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + i);
-                if (diff != 0) {
-                    ready = false;
-                    break;
-                }
-            }
-            if (!ready) {
-                continue;
-            }
-            if (enqueue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                break;
-            }
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            slot->slot_state = items[i];
-            slot->sequence.store(static_cast<int64_t>(pos + i + 1), std::memory_order_release);
-        }
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    bool push(PTO2TaskSlotState *slot_state, uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = enqueue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos);
-            atomic_ops += 2;  // enqueue_pos.load + sequence.load
-            if (diff == 0) {
-                if (enqueue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    atomic_ops++;  // successful CAS
-                    break;
-                }
-                contended = true;
-                atomic_ops++;  // failed CAS
-            } else if (diff < 0) {
-                return false;  // Queue full
-            } else {
-                contended = true;  // diff > 0: slot not yet released, spin
-            }
-        }
-        atomic_ops++;  // final sequence.store
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-
-        slot->slot_state = slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + 1), std::memory_order_release);
-        return true;
-    }
-#endif
-
-    PTO2TaskSlotState *pop() {
-        // Fast-path: skip slot load when queue is clearly empty
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        if (d >= e) {
-            return nullptr;
-        }
-
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos + 1);
-            if (diff == 0) {
-                if (dequeue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    ))
-                    break;
-            } else if (diff < 0) {
-                return nullptr;  // Queue empty
-            }
-        }
-
-        PTO2TaskSlotState *result = slot->slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
-        return result;
-    }
-
-#if PTO2_SCHED_PROFILING
-    PTO2TaskSlotState *pop(uint64_t &atomic_count, uint64_t &wait_cycle) {
-        // Fast-path: skip slot load when queue is clearly empty
-        uint64_t d = dequeue_pos.load(std::memory_order_relaxed);
-        uint64_t e = enqueue_pos.load(std::memory_order_relaxed);
-        atomic_count += 2;  // dequeue_pos.load + enqueue_pos.load
-        if (d >= e) {
-            return nullptr;
-        }
-
-        uint64_t pos;
-        PTO2ReadyQueueSlot *slot;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            slot = &slots[pos & mask];
-            int64_t seq = slot->sequence.load(std::memory_order_acquire);
-            int64_t diff = seq - static_cast<int64_t>(pos + 1);
-            atomic_ops += 2;  // dequeue_pos.load + sequence.load
-            if (diff == 0) {
-                if (dequeue_pos.compare_exchange_weak(
-                        pos, pos + 1, std::memory_order_relaxed, std::memory_order_relaxed
-                    )) {
-                    atomic_ops++;  // successful CAS
-                    break;
-                }
-                contended = true;
-                atomic_ops++;  // failed CAS
-            } else if (diff < 0) {
-                atomic_count += atomic_ops;
-                return nullptr;  // Queue empty
-            } else {
-                contended = true;
-            }
-        }
-        atomic_ops++;  // final sequence.store
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-
-        PTO2TaskSlotState *result = slot->slot_state;
-        slot->sequence.store(static_cast<int64_t>(pos + mask + 1), std::memory_order_release);
-        return result;
-    }
-#endif
-
-    // Batch pop: reserve a contiguous run of ready slots with a single CAS.
-    // Returns actual number of items popped (may be less than max_count).
-    int pop_batch(PTO2TaskSlotState **out, int max_count) {
-        uint64_t pos;
-        int count;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            count = 0;
-            while (count < max_count) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
-                if (diff == 0) {
-                    count++;
-                    continue;
-                }
-                if (diff < 0) {
-                    break;
-                }
-                count = -1;
-                break;
-            }
-            if (count == 0) return 0;
-            if (count < 0) continue;
-            if (dequeue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                break;
-            }
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            out[i] = slot->slot_state;
-            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
-        }
-        return count;
-    }
-
-#if PTO2_SCHED_PROFILING
-    int pop_batch(PTO2TaskSlotState **out, int max_count, uint64_t &atomic_count, uint64_t &wait_cycle) {
-        uint64_t pos;
-        int count;
-        uint64_t t0 = get_sys_cnt_aicpu();
-        bool contended = false;
-        uint32_t atomic_ops = 0;
-        while (true) {
-            pos = dequeue_pos.load(std::memory_order_relaxed);
-            atomic_ops++;  // dequeue_pos.load
-            count = 0;
-            while (count < max_count) {
-                PTO2ReadyQueueSlot *slot = &slots[(pos + count) & mask];
-                int64_t seq = slot->sequence.load(std::memory_order_acquire);
-                int64_t diff = seq - static_cast<int64_t>(pos + count + 1);
-                atomic_ops++;  // sequence.load
-                if (diff == 0) {
-                    count++;
-                    continue;
-                }
-                if (diff < 0) {
-                    break;
-                }
-                contended = true;
-                count = -1;
-                break;
-            }
-            if (count == 0) {
-                atomic_count += atomic_ops;
-                return 0;
-            }
-            if (count < 0) {
-                continue;
-            }
-            if (dequeue_pos.compare_exchange_weak(
-                    pos, pos + count, std::memory_order_relaxed, std::memory_order_relaxed
-                )) {
-                atomic_ops++;  // successful CAS
-                break;
-            }
-            contended = true;
-            atomic_ops++;  // failed CAS
-        }
-
-        for (int i = 0; i < count; i++) {
-            PTO2ReadyQueueSlot *slot = &slots[(pos + i) & mask];
-            out[i] = slot->slot_state;
-            slot->sequence.store(static_cast<int64_t>(pos + i + mask + 1), std::memory_order_release);
-            atomic_ops++;  // sequence.store
-        }
-        atomic_count += atomic_ops;
-        if (contended) {
-            wait_cycle += (get_sys_cnt_aicpu() - t0);
-        }
-        return count;
-    }
-#endif
-};
-
-// Cold-path ready queue operations (defined in pto_scheduler.cpp). Declared
-// as non-member so PTO2ReadyQueue stays a POD-like struct with cache-line
-// alignment. Storage is owned by the caller-supplied arena.
-//   reserve_layout: declare the slots[] region on the arena (must precede commit)
-//   init_from_layout: bind slots pointer from arena.region_ptr(off) and
-//                     initialize sequence counters
-//   destroy: forget the slots pointer (arena owns the buffer)
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity);
-// Writes everything *except* the arena-internal `slots` pointer field
-// (sequences/positions on the slot array, capacity, mask). Uses
-// arena.region_ptr(slots_off) only to address the slot array for writes;
-// does NOT store the pointer in `queue->slots`. Call
-// `ready_queue_wire_arena_pointers` afterwards to set the field itself.
-bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity);
-// Stores queue->slots = arena.region_ptr(slots_off). Idempotent.
-void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off);
-void ready_queue_destroy(PTO2ReadyQueue *queue);
-
-// =============================================================================
-// SPSC Queue (Single-Producer Single-Consumer, wait-free)
-// =============================================================================
-//
-// Bounded ring buffer optimized for the wiring queue use case:
-//   - Producer: orchestrator thread (push)
-//   - Consumer: scheduler thread 0 (pop_batch)
-//
-// Design based on Rigtorp's cached-index technique: each side caches
-// the other's index locally, avoiding cross-core cache line bouncing
-// on the hot path. Only when the local cache says "full" or "empty"
-// does the thread issue an acquire load on the remote index.
-//
-// Memory layout: 5 cache-line-aligned fields ensure zero false sharing.
-
-struct alignas(64) PTO2SpscQueue {
-    // --- Producer cache lines (orchestrator thread) ---
-    alignas(64) std::atomic<uint64_t> head_{0};
-    alignas(64) uint64_t tail_cached_{0};
-
-    // --- Consumer cache lines (scheduler thread 0) ---
-    alignas(64) std::atomic<uint64_t> tail_{0};
-    alignas(64) uint64_t head_cached_{0};
-
-    // --- Shared Cacheline (read only) with mask and data ptr (immutable after init) ---
-    alignas(64) PTO2TaskSlotState **buffer_{nullptr};
-    uint64_t mask_{0};
-
-    // Padding to exactly 5 cache lines
-    char padding[64 - sizeof(PTO2TaskSlotState **) - sizeof(uint64_t)];
-
-    // Reserve the backing buffer region on the supplied arena. Returns the
-    // region offset, to be passed to init_from_layout() after the arena is
-    // committed. Cache-line aligned: the buffer is shared between the
-    // orchestrator (push) and scheduler thread 0 (pop_batch), so its base
-    // must not false-share with neighboring regions.
-    static size_t reserve_layout(DeviceArena &arena, uint64_t capacity) {
-        return arena.reserve(capacity * sizeof(PTO2TaskSlotState *), PTO2_ALIGN_SIZE);
-    }
-
-    // Writes everything except the arena-internal `buffer_` pointer field
-    // (zeros the slot pointer array, mask/head/tail). The host pre-builds the
-    // image without storing a host address in buffer_; the AICPU wires
-    // buffer_ at boot via wire_arena_pointers().
-    bool init_data_from_layout(DeviceArena &arena, size_t buffer_off, uint64_t capacity) {
-        if (capacity == 0 || (capacity & (capacity - 1)) != 0) return false;
-        auto *buf = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
-        // calloc'd-equivalent: zero the slot pointers so spurious early pops
-        // observe nullptr.
-        for (uint64_t i = 0; i < capacity; i++)
-            buf[i] = nullptr;
-        mask_ = capacity - 1;
-        head_.store(0, std::memory_order_relaxed);
-        tail_.store(0, std::memory_order_relaxed);
-        tail_cached_ = 0;
-        head_cached_ = 0;
-        return true;
-    }
-
-    // Wire the arena-internal pointer. Called by both host (with host arena)
-    // and AICPU (with device arena attached to the prebuilt image).
-    void wire_arena_pointers(DeviceArena &arena, size_t buffer_off) {
-        buffer_ = static_cast<PTO2TaskSlotState **>(arena.region_ptr(buffer_off));
-    }
-
-    // Arena owns the buffer; here we only forget our pointer.
-    void destroy() { buffer_ = nullptr; }
-
-    // Push one item (producer only). Returns false if queue is full.
-    // Full condition: next_h - tail > mask_ (i.e. > capacity-1), so the
-    // effective usable capacity is capacity-1 (one slot is wasted as a
-    // sentinel to distinguish full from empty). uint64_t wrapping is safe
-    // since head and tail are monotonically increasing and subtraction
-    // wraps correctly.
-    bool push(PTO2TaskSlotState *item) {
-        uint64_t h = head_.load(std::memory_order_relaxed);
-        uint64_t next_h = h + 1;
-        if (next_h - tail_cached_ > mask_) {
-            tail_cached_ = tail_.load(std::memory_order_acquire);
-            if (next_h - tail_cached_ > mask_) {
-                return false;
-            }
-        }
-        buffer_[h & mask_] = item;
-        head_.store(next_h, std::memory_order_release);
-        return true;
-    }
-
-    // Pop up to max_count items (consumer only). Returns actual count.
-    int pop_batch(PTO2TaskSlotState **out, int max_count) {
-        uint64_t t = tail_.load(std::memory_order_relaxed);
-        uint64_t avail = head_cached_ - t;
-        if (avail < static_cast<uint64_t>(max_count)) {
-            head_cached_ = head_.load(std::memory_order_acquire);
-            avail = head_cached_ - t;
-            if (avail == 0) return 0;
-        }
-        int count = (avail < static_cast<uint64_t>(max_count)) ? static_cast<int>(avail) : max_count;
-        for (int i = 0; i < count; i++) {
-            out[i] = buffer_[(t + i) & mask_];
-        }
-        tail_.store(t + count, std::memory_order_release);
-        return count;
-    }
-
-    // Approximate size (used for backoff decisions, not exact).
-    uint64_t size() const {
-        uint64_t h = head_.load(std::memory_order_acquire);
-        uint64_t t = tail_.load(std::memory_order_acquire);
-        return h - t;
-    }
-};
-
-static_assert(sizeof(PTO2SpscQueue) == 5 * 64, "PTO2SpscQueue must be exactly 5 cache lines (320B)");
-// =============================================================================
-
-/**
- * Statistics returned by mixed-task completion processing
- */
-struct CompletionStats {
-    int32_t fanout_edges;       // Number of fanout edges traversed (notify consumers)
-    int32_t tasks_enqueued;     // Number of consumers that became READY
-    int32_t fanin_edges;        // Number of fanin edges traversed (release producers)
-    bool mixed_task_completed;  // True only when this callback completed a mixed task
-};
-
-/**
- * Layout descriptor produced by PTO2SchedulerState::reserve_layout(). Holds
- * the arena offsets of every sub-region the scheduler needs plus the
- * capacities used at layout time (init_from_layout reuses them).
- */
-struct PTO2SchedulerLayout {
-    size_t off_ready_queue_slots[PTO2_NUM_RESOURCE_SHAPES];
-    size_t off_dummy_ready_queue_slots;
-    size_t off_early_dispatch_queue_slots;
-    size_t off_dep_pool_entries[PTO2_MAX_RING_DEPTH];
-    size_t off_wiring_spsc_buffer;
-    uint64_t ready_queue_capacity;
-    uint64_t spsc_capacity;
-    int32_t dep_pool_capacity;
-};
-
-/**
- * Scheduler state structure
- *
- * Contains dynamic state updated during task execution.
- * Separated from shared memory for cache efficiency.
- * Hot-path methods are defined inline (implicitly inline as member functions).
- */
-struct PTO2SchedulerState {
-    // Shared memory access
-    PTO2SharedMemoryHeader *sm_header;
-
-    // Per-ring state
-    struct alignas(64) RingSchedState {
-        // --- Cache Line 0: ring pointer (read-only) + hot path (read-write) ---
-        PTO2SharedMemoryRingHeader *ring;
-        int32_t last_task_alive;
-        std::atomic<int32_t> advance_lock;  // multi-thread CAS
-
-        // --- Cache Line 1+: Thread 0 only (wiring dep_pool) ---
-        alignas(64) PTO2DepListPool dep_pool;
-#if PTO2_PROFILING
-        // Published only for scope_stats; orchestrator must not read dep_pool's non-atomic counters directly.
-        alignas(64) std::atomic<int32_t> dep_pool_snapshot_tail;
-        std::atomic<int32_t> dep_pool_snapshot_top;
-#endif
-
-        // Initialize arena-internal data + arena-external pointers; does NOT
-        // store dep_pool.base (that lives in the runtime arena and is wired
-        // by SchedulerState::wire_arena_pointers). The `ring` field stores
-        // the device address of the SM ring header — computed via offset
-        // arithmetic, no SM dereference.
-        bool init_data_from_layout(void *sm_dev_base, int32_t ring_id);
-        void destroy();
-
-        void sync_to_sm() { ring->fc.last_task_alive.store(last_task_alive, std::memory_order_release); }
-
-#if PTO2_PROFILING
-        void publish_dep_pool_snapshot() {
-            dep_pool_snapshot_tail.store(dep_pool.tail, std::memory_order_release);
-            dep_pool_snapshot_top.store(dep_pool.top, std::memory_order_release);
-        }
-
-        void read_dep_pool_snapshot(int32_t &tail, int32_t &top) const {
-            top = dep_pool_snapshot_top.load(std::memory_order_acquire);
-            tail = dep_pool_snapshot_tail.load(std::memory_order_acquire);
-            if (tail > top) tail = top;
-        }
-#endif
-
-        void advance_ring_pointers() {
-            int32_t current_task_index = ring->fc.current_task_index.load(std::memory_order_acquire);
-            int32_t old_last_task_alive = last_task_alive;
-
-            while (last_task_alive < current_task_index) {
-                PTO2TaskSlotState &slot_state = ring->get_slot_state_by_task_id(last_task_alive);
-                if (slot_state.task_state.load(std::memory_order_acquire) != PTO2_TASK_CONSUMED) {
-                    break;
-                }
-                last_task_alive++;
-            }
-
-            // Eager reset: prepare reclaimed slots for reuse while still hot in cache.
-            // Safe because last_task_alive has advanced past these slots but
-            // sync_to_sm has not yet published — the orchestrator cannot reuse
-            // them until the release store below.
-            // Skips payload, task, ring_id — immutable after RingSchedState::init().
-            for (int32_t id = old_last_task_alive; id < last_task_alive; id++) {
-                ring->get_slot_state_by_task_id(id).reset_for_reuse();
-            }
-
-            sync_to_sm();
-        }
-    } ring_sched_states[PTO2_MAX_RING_DEPTH];
-
-    // Ready queues remain global (scheduling is ring-agnostic)
-    PTO2ReadyQueue ready_queues[PTO2_NUM_RESOURCE_SHAPES];
-
-    // Dependency-only tasks (active_mask is empty, shape == DUMMY). Drained by
-    // the dispatch loop and completed inline -- never goes to AICore.
-    PTO2ReadyQueue dummy_ready_queue;
-
-    // Wiring subsystem — groups all wiring-related state for cache-line isolation.
-    //
-    // Three cache-line regions by writer:
-    //   1. batch_*  / backoff — thread 0 exclusive (local batch buffer)
-    //   2. queue    — SPSC: orchestrator push, thread 0 pop
-    //   3. orch_needs_drain — orchestrator write, thread 0 read
-    struct alignas(64) WiringState {
-        static constexpr uint64_t BATCH_SIZE = 30;
-        static constexpr int BACKOFF_LIMIT = 32;
-
-        // --- Thread 0 exclusive: local batch buffer + backoff ---
-        int batch_count = 0;
-        int batch_index = 0;
-        int backoff_counter = 0;
-        PTO2TaskSlotState *batch[BATCH_SIZE];
-
-        // --- SPSC queue: orchestrator (push) ↔ thread 0 (pop) ---
-        PTO2SpscQueue queue;
-
-        // --- Orchestrator write, thread 0 read ---
-        alignas(64) std::atomic<bool> orch_needs_drain{false};
-    } wiring;
-
-    static_assert(
-        offsetof(WiringState, queue) == 256, "WiringState: batch region must be exactly 4 cache lines before queue"
-    );
-    static_assert(sizeof(WiringState) == 640, "WiringState must be exactly 10 cache lines (640B)");
-
-    alignas(64) AsyncWaitList async_wait_list;
-
-    // Statistics (cold path, isolated from hot-path fields)
-#if PTO2_SCHED_PROFILING
-    alignas(64) std::atomic<int64_t> tasks_completed;
-    std::atomic<int64_t> tasks_consumed;
-#endif
-    // =========================================================================
-    // Inline hot-path methods
-    // =========================================================================
-
-    /**
-     * Drain wiring queue: pop submitted tasks and wire their fanout edges.
-     * Called by scheduler thread 0 each loop iteration. Sets fanin_count,
-     * acquires fanout_lock per producer, allocates dep_pool entries, and
-     * pushes ready tasks to the appropriate ready queue.
-     *
-     * @return Number of tasks wired this call.
-     */
-
-    int drain_wiring_queue(bool force_drain = false) {
-        int wired = 0;
-
-        // Refill local batch buffer when exhausted.
-        if (wiring.batch_index >= wiring.batch_count) {
-            // Backoff: defer pop when queue holds fewer than a full batch,
-            // unless force_drain, orch_needs_drain, or backoff limit reached.
-            if (!force_drain && wiring.queue.size() < WiringState::BATCH_SIZE) {
-                if (!wiring.orch_needs_drain.load(std::memory_order_acquire) &&
-                    wiring.backoff_counter < WiringState::BACKOFF_LIMIT) {
-                    wiring.backoff_counter++;
-                    return 0;
-                }
-            }
-            wiring.backoff_counter = 0;
-            wiring.batch_count = wiring.queue.pop_batch(wiring.batch, WiringState::BATCH_SIZE);
-            wiring.batch_index = 0;
-            if (wiring.batch_count == 0) return 0;
-        }
-
-        // Process tasks from local buffer in strict FIFO order.
-        while (wiring.batch_index < wiring.batch_count) {
-            PTO2TaskSlotState *ws = wiring.batch[wiring.batch_index];
-            int ring_id = ws->ring_id;
-            auto &rss = ring_sched_states[ring_id];
-            int32_t wfanin = ws->payload->fanin_actual_count;
-
-            if (wfanin > 0 && rss.dep_pool.available() < wfanin) {
-                rss.dep_pool.reclaim(*rss.ring, rss.last_task_alive);
-                if (rss.dep_pool.available() < wfanin) {
-#if PTO2_PROFILING
-                    if (is_scope_stats_enabled()) {
-                        rss.publish_dep_pool_snapshot();
-                    }
-#endif
-                    break;  // not enough dep_pool space — keep remainder for next call
-                }
-            }
-
-            wiring.batch_index++;
-            wire_task(rss, ws, wfanin);
-            wired++;
-        }
-
-        return wired;
-    }
-
-    // Route a ready slot to the right global queue. Dummy tasks (empty
-    // active_mask) live in dummy_ready_queue; everything else goes to the
-    // per-shape ready_queues[]. Used by paths that do not have a thread-local
-    // ready buffer (e.g. wiring). See push_ready_routed_local for the
-    // dispatch-time fast path.
-    void push_ready_routed(PTO2TaskSlotState *slot_state) {
-        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-        if (shape == PTO2ResourceShape::DUMMY) {
-            dummy_ready_queue.push(slot_state);
-        } else {
-            ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-        }
-    }
-
-    /**
-     * Wire fanout edges for a single task. Sets fanin_count, acquires each
-     * producer's fanout_lock, allocates dep_pool entries for live producers,
-     * pushes the task to the ready queue once its fanin refcount is satisfied.
-     */
-    void wire_task(RingSchedState &rss, PTO2TaskSlotState *ws, int32_t wfanin) {
-        PTO2TaskPayload *wp = ws->payload;
-        ws->fanin_count = wfanin + 1;
-
-        if (wfanin != 0) {
-            int32_t early_finished = 0;
-            for_each_fanin_slot_state(*wp, [&](PTO2TaskSlotState *producer) {
-                producer->lock_fanout();
-                int32_t pstate = producer->task_state.load(std::memory_order_acquire);
-                if (pstate >= PTO2_TASK_COMPLETED) {
-                    early_finished++;
-                } else {
-                    producer->fanout_head = rss.dep_pool.prepend(producer->fanout_head, ws);
-                }
-                producer->unlock_fanout();
-            });
-
-            // Seed dispatch_fanin with producers already complete at wiring
-            // time (e.g. buffer-creator tasks that finished before this
-            // consumer entered the graph). Such producers never dispatch at
-            // runtime, so they can never bump dispatch_fanin via the fanout
-            // walk; without this seed the candidate compare
-            // (dispatch_fanin == fanin_actual_count) would be unreachable
-            // whenever any producer is pre-completed. Mirrors the
-            // early_finished seed that ready_fanin gets via init_rc.
-            if (early_finished != 0) {
-                wp->dispatch_fanin.fetch_add(early_finished, std::memory_order_acq_rel);
-            }
-
-            int32_t init_rc = early_finished + 1;
-            int32_t new_rc = ws->fanin_refcount.fetch_add(init_rc, std::memory_order_acq_rel) + init_rc;
-            if (new_rc >= ws->fanin_count) {
-                push_ready_routed(ws);
-            }
-        } else {
-            ws->fanin_refcount.fetch_add(1, std::memory_order_acq_rel);
-            push_ready_routed(ws);
-        }
-
-        ws->dep_pool_mark = rss.dep_pool.top;
-#if PTO2_PROFILING
-        if (is_scope_stats_enabled()) {
-            rss.publish_dep_pool_snapshot();
-        }
-#endif
-    }
-
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state) {
-        if (slot_state.fanout_refcount.load(std::memory_order_acquire) != slot_state.fanout_count) return;
-
-        PTO2TaskState expected = PTO2_TASK_COMPLETED;
-        if (!slot_state.task_state.compare_exchange_strong(
-                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
-            )) {
-            return;
-        }
-
-#if PTO2_SCHED_PROFILING
-        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
-#endif
-
-        int32_t ring_id = slot_state.ring_id;
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
-                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
-            )) {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
-        }
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void check_and_handle_consumed(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        int32_t fc = slot_state.fanout_count;
-        int32_t rc = slot_state.fanout_refcount.load(std::memory_order_acquire);
-
-        atomic_count += 2;  // fanout_count.load + fanout_refcount.load
-
-        if (rc != fc) return;
-
-        PTO2TaskState expected = PTO2_TASK_COMPLETED;
-        if (!slot_state.task_state.compare_exchange_strong(
-                expected, PTO2_TASK_CONSUMED, std::memory_order_acq_rel, std::memory_order_acquire
-            )) {
-            atomic_count += 1;  // failed CAS
-            return;
-        }
-
-        atomic_count += 1;  // successful CAS
-
-#if PTO2_SCHED_PROFILING
-        tasks_consumed.fetch_add(1, std::memory_order_relaxed);
-#endif
-
-        int32_t ring_id = slot_state.ring_id;
-        // Try-lock — if another thread is advancing this ring, it will scan our CONSUMED task
-        int32_t expected_lock = 0;
-        if (ring_sched_states[ring_id].advance_lock.compare_exchange_strong(
-                expected_lock, 1, std::memory_order_acquire, std::memory_order_relaxed
-            )) {
-            ring_sched_states[ring_id].advance_ring_pointers();
-            ring_sched_states[ring_id].advance_lock.store(0, std::memory_order_release);
-            atomic_count += 2;  // try-lock CAS + unlock store
-        } else {
-            atomic_count += 1;  // failed try-lock CAS
-        }
-    }
-#endif
-
-    void release_producer(PTO2TaskSlotState &slot_state) {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        check_and_handle_consumed(slot_state);
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    void release_producer(PTO2TaskSlotState &slot_state, uint64_t &atomic_count) {
-        slot_state.fanout_refcount.fetch_add(1, std::memory_order_acq_rel);
-        atomic_count += 1;  // fanout_refcount.fetch_add
-        check_and_handle_consumed(slot_state, atomic_count);
-    }
-#endif
-
-    // Speculative early-dispatch release. If the now-ready task was pre-staged
-    // (gated on a core), ring its DATA_MAIN_BASE high-32 doorbell RIGHT HERE in
-    // the completion path — the moment its last producer's FIN satisfies fanin —
-    // instead of routing it through the ready queue and waiting for the dispatch
-    // pass to pop it. Returns true if the task is fully handled (caller must NOT
-    // push to the ready queue). Returns false when the caller must route C
-    // normally: either it was never pre-staged, OR it is a SPMD consumer only
-    // PARTIALLY pre-staged — the gated blocks are released by the doorbells rung
-    // here, and the remaining (next_block_idx .. logical_block_num) blocks
-    // dispatch normally off the ready queue. Lock-free claim shared with Hook 1
-    // (the stager): CAS NONE->DISPATCHED wins => not pre-staged; lose => STAGED
-    // (spin past the brief STAGING window so the mask is visible), then ring.
-
-    // Per-core speculative doorbell table. Hook 1 records each gated core's
-    // (reg_addr, dispatch token) here at stage time; the completion-path release
-    // reads it back for the cores set in the consumer's staged_core_mask. One
-    // global table indexed by core_id (not per-task): gated cores in flight are
-    // bounded by the chip's core count (no two-level pre-dispatch), so this is the
-    // natural capacity and removes the old per-task 3-doorbell cap.
-    struct SpecDoorbell {
-        uint64_t addr{0};
-        uint32_t token{0};
-    };
-    SpecDoorbell spec_doorbell_table[PTO2_SPEC_CORE_MASK_WORDS * 64]{};
-
-    // Cross-thread early-dispatch work queue (a PTO2ReadyQueue MPMC instance,
-    // arena-backed — reserved/wired in pto_runtime2_init alongside the ready queues).
-    // A consumer's SPMD blocks span cores owned by several AICPU threads, but only a
-    // thread RUNNING the consumer's producer discovers it (via the producer's
-    // fanout). When that producer is thread-local (e.g. a 16-block AIV op filling one
-    // thread's cores), the other threads never see the consumer and its blocks on
-    // their cores can't pre-stage. The first claimer pushes the partially-staged
-    // consumer here; every idle thread's early_dispatch pass pops one, stages a range onto
-    // ITS OWN cores (range-claim via next_block_idx), and re-pushes if blocks remain
-    // — exactly mirroring how a partially-dispatched SPMD task is re-pushed to the
-    // ready queue (scheduler_dispatch: pop -> claim -> re-push). A stale/released
-    // entry fails the STAGING check on pop and is dropped; a push that overflows is
-    // logged and the consumer's blocks fall back to normal dispatch.
-    PTO2ReadyQueue early_dispatch_queue;
-
-    static inline void ring_one_doorbell(uint64_t reg_addr, uint32_t token) {
-        volatile uint64_t *dmb = reinterpret_cast<volatile uint64_t *>(get_reg_ptr(reg_addr, RegId::DATA_MAIN_BASE));
-        uint64_t tk = static_cast<uint64_t>(token);
-        *dmb = (tk << 32) | tk;  // 64-bit STR: high=low=token releases the gated AICore
-    }
-
-    // auto-chain depth cap: a candidate inherits the flag only while depth < this.
-    static constexpr uint8_t PTO2_SPEC_CHAIN_MAX = 4;
-
-    // Event-driven candidate detection (the dual of fanin_refcount/ready). Call when a
-    // FLAGGED producer `p` DISPATCHES (starts running): walk its fanout and bump each
-    // consumer's dispatch_fanin. A consumer whose dispatch_fanin reaches
-    // fanin_actual_count (= every producer is either flagged-and-dispatched, or was
-    // already complete when the consumer was wired) is an early-dispatch candidate:
-    // CAS NONE->STAGING (exactly-once) and push to early_dispatch_queue for the idle drain to
-    // pre-stage. Once-guarded per producer so an SPMD producer's block-by-block
-    // dispatch propagates once. Replaces the old per-iteration pass-1 PULL scan.
-    void propagate_dispatch_fanin(PTO2TaskSlotState &p) {
-        if (!(p.payload->allow_early_resolve || p.payload->spec_chain_active.load(std::memory_order_acquire)))
-            return;  // only flagged (codegen or inherited) producers propagate
-        if (p.payload->dispatch_propagated.exchange(1, std::memory_order_acq_rel) != 0)
-            return;  // already propagated once
-        uint8_t child_depth = static_cast<uint8_t>(p.payload->spec_chain_depth + 1);
-        p.lock_fanout();
-        PTO2DepListEntry *edge = p.fanout_head;  // snapshot head, walk lock-free (fanout stable by dispatch)
-        p.unlock_fanout();
-        for (; edge != nullptr; edge = edge->next) {
-            PTO2TaskSlotState *c = edge->slot_state;
-            // Compare to fanin_actual_count (the real producer-edge count), NOT
-            // fanin_count: fanin_count = fanin_actual_count + 1 (a self/wiring +1 that
-            // ready_fanin gets but dispatch_fanin does not). dispatch_fanin starts at
-            // the wiring-time early_finished seed (producers already complete) and is
-            // bumped here by flagged producers; reaching fanin_actual_count means every
-            // producer is flagged-dispatched or was pre-completed.
-            int32_t nf = c->payload->dispatch_fanin.fetch_add(1, std::memory_order_acq_rel) + 1;
-            if (nf != c->payload->fanin_actual_count) continue;
-            if (c->active_mask.requires_sync_start()) continue;  // sync_start can't be block-by-block pre-staged
-            PTO2ResourceShape shape = c->active_mask.to_shape();
-            if (shape != PTO2ResourceShape::AIC && shape != PTO2ResourceShape::AIV && shape != PTO2ResourceShape::MIX)
-                continue;
-            uint8_t expect = PTO2_SPEC_NONE;  // exactly-once: only the CAS winner enqueues
-            if (!c->payload->spec_state.compare_exchange_strong(
-                    expect, PTO2_SPEC_STAGING, std::memory_order_seq_cst, std::memory_order_seq_cst
-                ))
-                continue;
-            if (child_depth < PTO2_SPEC_CHAIN_MAX) {  // auto-chain: C propagates to ITS consumers
-                c->payload->spec_chain_depth = child_depth;
-                c->payload->spec_chain_active.store(1, std::memory_order_release);
-            }
-            early_dispatch_queue.push(c);
-        }
-    }
-
-    // Collects consumers released via the speculative-doorbell path during a
-    // single on_task_complete fanout walk, so their dispatch_fanin
-    // propagation runs AFTER the walk — never between two siblings' doorbells.
-    struct SpecReleaseSink {
-        static constexpr int CAP = 32;
-        PTO2TaskSlotState *items[CAP];
-        int n = 0;
-        inline bool push(PTO2TaskSlotState *s) {
-            if (n >= CAP) return false;
-            items[n++] = s;
-            return true;
-        }
-    };
-
-    inline bool try_speculative_release(PTO2TaskSlotState &slot_state, SpecReleaseSink *sink = nullptr) {
-        // Never staged => CAS NONE->DISPATCHED wins => dispatch normally.
-        uint8_t expect = PTO2_SPEC_NONE;
-        if (slot_state.payload->spec_state.compare_exchange_strong(
-                expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
-            )) {
-            return false;
-        }
-        // Staged (STAGING). Flip STAGING->DISPATCHED, THEN read the mask. seq_cst
-        // gives a total order with the concurrent stagers, each of which OR-s its
-        // core into the mask and THEN loads spec_state: a stager whose bit lands
-        // before this CAS is read here and rung; a stager whose bit lands after
-        // sees DISPATCHED and rings that core itself (self-ring in
-        // stage_consumer_blocks). Either way every gated core's doorbell fires once
-        // (a double-ring is harmless — the AICore already matched). This replaces
-        // the old transient-STAGING spin: STAGING is now the stable gated state.
-        expect = PTO2_SPEC_STAGING;
-        slot_state.payload->spec_state.compare_exchange_strong(
-            expect, PTO2_SPEC_DISPATCHED, std::memory_order_seq_cst, std::memory_order_seq_cst
-        );
-        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
-            uint64_t bits = slot_state.payload->staged_core_mask[w].load(std::memory_order_seq_cst);
-            while (bits != 0) {
-                int core_id = w * 64 + __builtin_ctzll(bits);
-                bits &= bits - 1;
-                ring_one_doorbell(spec_doorbell_table[core_id].addr, spec_doorbell_table[core_id].token);
-            }
-        }
-        // This pre-staged consumer was just released by its doorbell — it starts
-        // running NOW, so propagate dispatch_fanin to ITS consumers (auto-chain,
-        // knob A). Defer it via the sink so it runs after the whole fanout walk:
-        // doing it inline here would delay the doorbells of later consumers in the
-        // same producer's fanout. Fallback to inline if no sink / sink full.
-        if (sink == nullptr || !sink->push(&slot_state)) {
-            propagate_dispatch_fanin(slot_state);
-        }
-        // No explicit removal from the cross-thread queue: a still-queued entry for
-        // this consumer is now DISPATCHED and is dropped when a peer pops it.
-        // Fully pre-staged => skip the ready queue. Partially staged SPMD consumer =>
-        // fall through so the caller pushes C; dispatch resumes from next_block_idx.
-        return slot_state.next_block_idx.load(std::memory_order_seq_cst) >= slot_state.logical_block_num;
-    }
-
-    bool release_fanin_and_check_ready(
-        PTO2TaskSlotState &slot_state, PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
-    ) {
-        // Atomically increment fanin_refcount and check if all producers are done
-        // ACQ_REL on fanin_refcount already synchronizes with the orchestrator's
-        // init release, making fanin_count visible — plain load suffices.
-        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-
-        if (new_refcount == slot_state.fanin_count) {
-            // Speculative early-dispatch: pre-staged tasks are released by doorbell
-            // here, skipping the ready-queue round-trip entirely.
-            if (try_speculative_release(slot_state, sink)) return true;
-            // Local-first: try per-CoreType thread-local buffer before global queue
-            // Route by active_mask: AIC-containing tasks → buf[0], AIV-only → buf[1]
-            // DUMMY shape is out of range for local_bufs (sized PTO2_NUM_RESOURCE_SHAPES);
-            // dummy slots bypass the local fast path and go straight to dummy_ready_queue.
-            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
-            if (shape == PTO2ResourceShape::DUMMY) {
-                dummy_ready_queue.push(&slot_state);
-            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
-                ready_queues[static_cast<int32_t>(shape)].push(&slot_state);
-            }
-            return true;
-        }
-        return false;
-    }
-
-#if PTO2_ORCH_PROFILING || PTO2_SCHED_PROFILING
-    bool release_fanin_and_check_ready(
-        PTO2TaskSlotState &slot_state, uint64_t &atomic_count, uint64_t &push_wait,
-        PTO2LocalReadyBuffer *local_bufs = nullptr, SpecReleaseSink *sink = nullptr
-    ) {
-        int32_t new_refcount = slot_state.fanin_refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
-        atomic_count += 1;  // fanin_refcount.fetch_add
-
-        if (new_refcount == slot_state.fanin_count) {
-            // Speculative early-dispatch: pre-staged tasks are released by doorbell
-            // here, skipping the ready-queue round-trip entirely.
-            if (try_speculative_release(slot_state, sink)) return true;
-            // Local-first: try per-CoreType thread-local buffer before global queue.
-            // Dummy slots bypass local_bufs (out-of-range for PTO2_NUM_RESOURCE_SHAPES)
-            // and go straight to dummy_ready_queue; use the profiling-aware push so
-            // atomic_count / push_wait stay consistent with the non-dummy path.
-            PTO2ResourceShape shape = slot_state.active_mask.to_shape();
-            if (shape == PTO2ResourceShape::DUMMY) {
-                dummy_ready_queue.push(&slot_state, atomic_count, push_wait);
-            } else if (!local_bufs || !local_bufs[static_cast<int32_t>(shape)].try_push(&slot_state)) {
-                ready_queues[static_cast<int32_t>(shape)].push(&slot_state, atomic_count, push_wait);
-            }
-            return true;
-        }
-        return false;
-    }
-#endif
-
-    int get_ready_tasks_batch(
-        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
-    ) {
-        int count = 0;
-        while (count < max_count && local_buf.count > 0) {
-            out[count++] = local_buf.slot_states[--local_buf.count];
-        }
-        int remaining = max_count - count;
-        if (remaining > 0) {
-            count += ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining);
-        }
-        return count;
-    }
-
-#if PTO2_SCHED_PROFILING
-    int get_ready_tasks_batch(
-        PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count,
-        uint64_t &atomic_count, uint64_t &wait_cycle
-    ) {
-        int count = 0;
-        while (count < max_count && local_buf.count > 0) {
-            out[count++] = local_buf.slot_states[--local_buf.count];
-        }
-        int remaining = max_count - count;
-        if (remaining > 0) {
-            count +=
-                ready_queues[static_cast<int32_t>(shape)].pop_batch(out + count, remaining, atomic_count, wait_cycle);
-        }
-        return count;
-    }
-#endif
-
-    void on_scope_end(PTO2TaskSlotState **task_slot_states, int32_t count) {
-#if PTO2_ORCH_PROFILING
-        extern uint64_t g_orch_scope_end_atomic_count;
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++) {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer(*task_slot_states[i], g_orch_scope_end_atomic_count);
-        }
-#else
-        if (count > 0) __builtin_prefetch(task_slot_states[0], 1, 0);
-        for (int32_t i = 0; i < count; i++) {
-            if (i + 1 < count) __builtin_prefetch(task_slot_states[i + 1], 1, 0);
-            release_producer(*task_slot_states[i]);
-        }
-#endif
-    }
-
-    /**
-     * Subtask completion: atomic counter model.
-     * Called when a single subtask (AIC, AIV0, or AIV1) finishes on any block.
-     * Atomically increments completed_subtasks and checks whether all subtasks
-     * across all blocks are done.
-     *
-     * @return true if this was the last subtask, completing the entire task.
-     */
-    bool on_subtask_complete(PTO2TaskSlotState &slot_state) {
-        int16_t prev = slot_state.completed_subtasks.fetch_add(1, std::memory_order_acq_rel);
-        return (prev + 1) == slot_state.total_required_subtasks;
-    }
-
-    /**
-     * Two-stage completion: second stage.
-     * Called exactly once when all subtasks of a task are done (i.e.,
-     * on_subtask_complete returned true). Walks the consumer (fanout) list,
-     * decrements each consumer's fanin, pushes newly-ready ones, and rings
-     * doorbells for speculative hits.
-     *
-     * Non-PROFILING returns the consumer-walk count (= edges traversed). The
-     * Resolve swimlane bar reads it to label the bar with how many successors
-     * actually got resolved. PROFILING returns the richer CompletionStats
-     * whose `fanout_edges` carries the same number.
-     */
-#if PTO2_SCHED_PROFILING
-    CompletionStats
-#else
-    uint32_t
-#endif
-    on_task_complete(
-        PTO2TaskSlotState &slot_state,
-#if PTO2_SCHED_PROFILING
-        int thread_idx,
-#endif
-
-        PTO2LocalReadyBuffer *local_bufs = nullptr
-    ) {
-#if PTO2_SCHED_PROFILING
-        CompletionStats stats = {0, 0, 0, true};
-#else
-        uint32_t consumer_walk_count = 0;
-#endif
-#if PTO2_SCHED_PROFILING
-        extern uint64_t g_sched_lock_cycle[], g_sched_fanout_cycle[];
-        extern uint64_t g_sched_lock_atomic_count[], g_sched_lock_wait_cycle[];
-        extern uint64_t g_sched_fanout_atomic_count[], g_sched_push_wait_cycle[];
-        uint64_t lock_atomics = 0, lock_wait = 0;
-        PTO2_SCHED_CYCLE_START();
-#endif
-
-#if PTO2_SCHED_PROFILING
-        slot_state.lock_fanout(lock_atomics, lock_wait);
-#else
-        slot_state.lock_fanout();
-#endif
-        slot_state.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-        PTO2DepListEntry *current = slot_state.fanout_head;  // Protected by fanout_lock
-        slot_state.unlock_fanout();
-
-#if PTO2_SCHED_PROFILING
-        lock_atomics += 2;  // state.store + unlock.store
-        g_sched_lock_atomic_count[thread_idx] += lock_atomics;
-        g_sched_lock_wait_cycle[thread_idx] += lock_wait;
-        PTO2_SCHED_CYCLE_LAP(g_sched_lock_cycle[thread_idx]);
-#endif
-
-        // Fanout: notify consumers. A pre-staged consumer that becomes ready has
-        // its doorbell rung INLINE (db = nullptr) the moment its node is reached,
-        // not batched to after the whole walk — so a flagged consumer near the
-        // front of the list starts immediately and overlaps the remaining
-        // release_fanin work for the other consumers, instead of waiting for the
-        // full O(fanout-degree) walk (~5us for a 50-consumer producer).
-        //
-        // Safe on silicon: the producer's slot is already COMPLETED here — every
-        // SPMD block has FIN'd AND dcci-flushed its output to HBM before
-        // on_task_complete runs — so a released consumer never reads stale
-        // producer output. (Batching used to align the released wave, but pushed
-        // every doorbell to the end of the walk, defeating the whole point of
-        // speculative early-dispatch: minimal producer-end -> consumer-start.)
-#if PTO2_SCHED_PROFILING
-        uint64_t fanout_atomics = 0, push_wait = 0;
-#endif
-        // Doorbells for released pre-staged consumers fire INLINE in the walk
-        // below; their dispatch_fanin propagation is collected here and replayed
-        // after the walk, so no consumer's doorbell waits on a sibling's propagate.
-        SpecReleaseSink rel_sink;
-        while (current != nullptr) {
-            PTO2TaskSlotState &consumer_slot = *current->slot_state;
-#if PTO2_SCHED_PROFILING
-            stats.fanout_edges++;
-            if (release_fanin_and_check_ready(consumer_slot, fanout_atomics, push_wait, local_bufs, &rel_sink)) {
-                stats.tasks_enqueued++;
-            }
-#else
-            consumer_walk_count++;
-            release_fanin_and_check_ready(consumer_slot, local_bufs, &rel_sink);
-#endif
-            current = current->next;
-        }
-        for (int i = 0; i < rel_sink.n; i++) {
-            propagate_dispatch_fanin(*rel_sink.items[i]);
-        }
-
-#if PTO2_SCHED_PROFILING
-        g_sched_fanout_atomic_count[thread_idx] += fanout_atomics;
-        g_sched_push_wait_cycle[thread_idx] += push_wait;
-        PTO2_SCHED_CYCLE_LAP(g_sched_fanout_cycle[thread_idx]);
-        return stats;
-#else
-        return consumer_walk_count;
-#endif
-    }
-
-    /**
-     * Cold path: release producers (fanin traversal) + check self for CONSUMED.
-     * Returns fanin edge count for profiling.
-     */
-
-#if PTO2_SCHED_PROFILING
-    int32_t on_task_release(PTO2TaskSlotState &slot_state, int32_t thread_idx) {
-        PTO2_SCHED_CYCLE_START();
-        extern uint64_t g_sched_fanin_cycle[], g_sched_fanin_atomic_count[];
-        extern uint64_t g_sched_self_atomic_count[];
-        extern uint64_t g_sched_self_consumed_cycle[];
-        extern uint64_t g_sched_complete_count[];
-        uint64_t fanin_atomics = 0;
-#else
-    int32_t on_task_release(PTO2TaskSlotState &slot_state) {
-#endif
-        PTO2TaskPayload *payload = slot_state.payload;
-        for_each_fanin_slot_state(*payload, [&](PTO2TaskSlotState *producer_slot_state) {
-#if PTO2_SCHED_PROFILING
-            release_producer(*producer_slot_state, fanin_atomics);
-#else
-            release_producer(*producer_slot_state);
-#endif
-        });
-#if PTO2_SCHED_PROFILING
-        g_sched_fanin_atomic_count[thread_idx] += fanin_atomics;
-        PTO2_SCHED_CYCLE_LAP(g_sched_fanin_cycle[thread_idx]);
-#endif
-
-        // Self consumed check
-#if PTO2_SCHED_PROFILING
-        uint64_t self_atomics = 0;
-        check_and_handle_consumed(slot_state, self_atomics);
-        g_sched_self_atomic_count[thread_idx] += self_atomics;
-        PTO2_SCHED_CYCLE_LAP(g_sched_self_consumed_cycle[thread_idx]);
-        g_sched_complete_count[thread_idx]++;
-#else
-        check_and_handle_consumed(slot_state);
-#endif
-        return payload->fanin_actual_count;
-    }
-
-    // === Cold-path API (defined in pto_scheduler.cpp) ===
-
-    // Phase 1: declare every sub-region (ready_queue slots, dummy queue slots,
-    // per-ring dep_pool entries, wiring SPSC buffer) on the supplied arena.
-    // Capacities are baked into the returned layout; init_data_from_layout uses
-    // the same values.
-    static PTO2SchedulerLayout reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity = PTO2_DEP_LIST_POOL_SIZE);
-
-    // Phase 3a: write everything *except* arena-internal pointer fields.
-    // `sm_dev_base` is the device address of the SM (only stored, never
-    // dereferenced here). Safe to call on a host arena that holds the
-    // prebuilt image buffer. (The orchestrator counterpart takes
-    // task_window_size for ring task_descriptors address arithmetic; the
-    // scheduler only needs the SM header / ring header base addresses,
-    // both window-size-independent.)
-    bool init_data_from_layout(const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base);
-
-    // Phase 3b: write the arena-internal pointer fields
-    // (ready_queues[].slots, dummy_ready_queue.slots, dep_pool.base for each
-    // ring, wiring.queue.buffer_). Called on both host and device sides.
-    void wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena);
-
-    // Forget per-region pointers; arena owns the backing memory.
-    void destroy();
-    void print_stats();
-    void print_queues();
-};
-
-// Scheduler cold-path API is declared as PTO2SchedulerState member functions.
-// See init()/destroy()/print_stats()/print_queues() below the struct definition.
-
-// try_inline_complete_locked: short-circuit NotDeferred completions seen during
-// drain so they don't grow entries[]. Defined here (not in pto_async_wait.h)
-// because PTO2SchedulerState's on_task_complete signature is only known
-// after its full definition above.
-//
-// When the deferred_release_slot_states[] buffer is full, drain it via
-// on_task_release before appending — mirrors the same overflow-drain idiom
-// that scheduler_completion.cpp's inline NotDeferred path uses, so high task
-// rates don't surface as ASYNC_WAIT_OVERFLOW errors.
-inline bool
-AsyncWaitList::try_inline_complete_locked(AsyncWaitList::DrainCompletionSink &sink, PTO2TaskSlotState &slot_state) {
-    // Return value (CompletionStats / consumer-walk count) discarded:
-    // async-wait drain path has no Resolve swimlane bar attached.
-#if PTO2_SCHED_PROFILING
-    (void)sink.sched->on_task_complete(slot_state, sink.thread_idx, sink.local_bufs);
-#else
-    (void)sink.sched->on_task_complete(slot_state, sink.local_bufs);
-#endif
-    if (*sink.deferred_release_count >= sink.deferred_release_capacity) {
-        while (*sink.deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-            (void)sink.sched->on_task_release(
-                *sink.deferred_release_slot_states[--(*sink.deferred_release_count)], sink.thread_idx
-            );
-#else
-            sink.sched->on_task_release(*sink.deferred_release_slot_states[--(*sink.deferred_release_count)]);
-#endif
-        }
-    }
-    sink.deferred_release_slot_states[(*sink.deferred_release_count)++] = &slot_state;
-    sink.inline_completed++;
-    return true;
-}
-
-template <bool Profiling>
-inline AsyncPollResult AsyncWaitList::poll_and_complete(
-    AICoreCompletionMailbox *aicore_mailbox, PTO2SchedulerState *sched, PTO2LocalReadyBuffer *local_bufs,
-    PTO2TaskSlotState **deferred_release_slot_states, int32_t &deferred_release_count, int32_t deferred_release_capacity
-#if PTO2_SCHED_PROFILING
-    ,
-    int thread_idx
-#endif
-) {
-    AsyncPollResult result;
-    if (!try_lock()) return result;
-
-    AsyncWaitList::DrainCompletionSink sink{};
-    sink.sched = sched;
-    sink.local_bufs = local_bufs;
-    sink.deferred_release_slot_states = deferred_release_slot_states;
-    sink.deferred_release_count = &deferred_release_count;
-    sink.deferred_release_capacity = deferred_release_capacity;
-#if PTO2_SCHED_PROFILING
-    sink.thread_idx = thread_idx;
-#endif
-
-    int32_t drain_err = PTO2_ERROR_NONE;
-    drain_aicore_completion_mailbox_locked(aicore_mailbox, sink, drain_err);
-    if (drain_err != PTO2_ERROR_NONE) {
-        result.error_code = drain_err;
-        unlock();
-        return result;
-    }
-    result.completed += sink.inline_completed;
-
-    for (int32_t i = count - 1; i >= 0; --i) {
-        AsyncWaitEntry &entry = entries[i];
-        uintptr_t last_invalidated_counter_line = static_cast<uintptr_t>(-1);
-        for (int32_t c = 0; c < entry.condition_count; c++) {
-            CompletionCondition &cond = entry.conditions[c];
-            if (cond.satisfied) continue;
-            if (cond.completion_type == COMPLETION_TYPE_COUNTER && cond.counter_addr != nullptr) {
-                uintptr_t counter_line = mailbox_cache_line(cond.counter_addr);
-                if (counter_line != last_invalidated_counter_line) {
-                    cache_invalidate_range(reinterpret_cast<const void *>(counter_line), sizeof(uint32_t));
-                    last_invalidated_counter_line = counter_line;
-                }
-            }
-            CompletionPollResult poll = cond.test();
-            if (poll.state == CompletionPollState::FAILED) {
-                result.error_code = poll.error_code;
-                result.failed_slot_state = entry.slot_state;
-                unlock();
-                return result;
-            }
-            if (poll.state == CompletionPollState::READY) {
-                cond.satisfied = true;
-                cond.retire();
-                entry.waiting_completion_count--;
-            }
-        }
-
-        if (entry.normal_done && entry.waiting_completion_count <= 0) {
-            // Return value (CompletionStats / consumer-walk count) discarded:
-            // deferred-completion drain has no Resolve swimlane bar attached.
-#if PTO2_SCHED_PROFILING
-            (void)sched->on_task_complete(*entry.slot_state, thread_idx, local_bufs);
-#else
-            (void)sched->on_task_complete(*entry.slot_state, local_bufs);
-#endif
-            // Drain deferred_release in place when the buffer fills — same
-            // overflow-drain idiom used by complete_slot_task's inline path
-            // and by try_inline_complete_locked. Without this, large bursts
-            // of completable wait_list entries in a single poll surfaced as
-            // ASYNC_WAIT_OVERFLOW under the MPSC model.
-            if (deferred_release_count >= deferred_release_capacity) {
-                while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                    (void)sched->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                    sched->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-                }
-            }
-            deferred_release_slot_states[deferred_release_count++] = entry.slot_state;
-            result.completed++;
-
-            int32_t last = count - 1;
-            if (i != last) entries[i] = entries[last];
-            count = last;
-        }
-    }
-
-    unlock();
-    return result;
-}
-
-// =============================================================================
-// Scheduler Profiling Data
-// =============================================================================
-
-#if PTO2_SCHED_PROFILING
-struct PTO2SchedProfilingData {
-    // Sub-phase cycle breakdown within on_task_complete
-    uint64_t lock_cycle;           // lock_fanout + state store + unlock
-    uint64_t fanout_cycle;         // fanout traversal
-    uint64_t fanin_cycle;          // fanin traversal
-    uint64_t self_consumed_cycle;  // self check_and_handle_consumed
-
-    // Wait times
-    uint64_t lock_wait_cycle;  // spin-wait in fanout_lock
-    uint64_t push_wait_cycle;  // CAS contention in push()
-    uint64_t pop_wait_cycle;   // CAS contention in pop()
-
-    // Atomic counts per sub-phase
-    uint64_t lock_atomic_count;
-    uint64_t fanout_atomic_count;
-    uint64_t fanin_atomic_count;
-    uint64_t self_atomic_count;
-    uint64_t pop_atomic_count;
-
-    int64_t complete_count;
-};
-
-/**
- * Get and reset scheduler profiling data for a specific thread.
- * Returns accumulated profiling data and resets counters.
- */
-PTO2SchedProfilingData scheduler_get_profiling(int thread_idx);
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
deleted file mode 100644
index e72f746ea..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ /dev/null
@@ -1,1088 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "scheduler_context.h"
-
-#include <cinttypes>
-#include <cstdio>
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/platform_regs.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-#include "common/memory_barrier.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "pto_shared_memory.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// =============================================================================
-// Cold-path helpers for the main dispatch loop (noinline to reduce hot-loop icache)
-// =============================================================================
-
-static void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code) {
-    if (header == nullptr || error_code == PTO2_ERROR_NONE) {
-        return;
-    }
-    // The first error code/thread pair wins; the bitmap cumulatively records all reporting threads.
-    int32_t expected = PTO2_ERROR_NONE;
-    if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) {
-        header->sched_error_thread.store(thread_idx, std::memory_order_release);
-    }
-    if (thread_idx >= 0 && thread_idx < 32) {
-        header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
-    }
-}
-
-LoopAction SchedulerContext::handle_orchestrator_exit(
-    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count
-) {
-    if (completed_.load(std::memory_order_acquire)) {
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
-    if (orch_err != PTO2_ERROR_NONE) {
-        LOG_ERROR(
-            "Thread %d: Fatal error (code=%d), sending EXIT_SIGNAL to all cores. "
-            "completed_tasks=%d, total_tasks=%d",
-            thread_idx, orch_err, completed_tasks_.load(std::memory_order_relaxed), total_tasks_
-        );
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
-    if (sched_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-
-    bool orch_done = orchestrator_done_;
-    if (!orch_done) return LoopAction::NONE;
-
-    task_count = total_tasks_;
-    if (task_count > 0 && completed_tasks_.load(std::memory_order_relaxed) >= task_count) {
-        completed_.store(true, std::memory_order_release);
-        LOG_INFO_V0(
-            "Thread %d: PTO2 completed tasks %d/%d", thread_idx, completed_tasks_.load(std::memory_order_relaxed),
-            task_count
-        );
-        return LoopAction::BREAK_LOOP;
-    }
-    return LoopAction::NONE;
-}
-
-LoopAction SchedulerContext::handle_core_transition(bool &cores_released) {
-    if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
-    if (!reassigned_.load(std::memory_order_acquire)) {
-        wait_reassign_.fetch_add(1, std::memory_order_release);
-        while (!reassigned_.load(std::memory_order_acquire)) {
-            if (completed_.load(std::memory_order_acquire)) {
-                return LoopAction::BREAK_LOOP;
-            }
-            SPIN_WAIT_HINT();
-        }
-    }
-    cores_released = true;
-    return LoopAction::NONE;
-}
-
-LoopAction
-SchedulerContext::check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime) {
-    if (completed_.load(std::memory_order_acquire)) {
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
-    if (orch_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Fatal error detected (code=%d), sending EXIT_SIGNAL to all cores", thread_idx, orch_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
-    if (sched_err != PTO2_ERROR_NONE) {
-        LOG_ERROR("Thread %d: Scheduler fatal error detected (code=%d)", thread_idx, sched_err);
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-        return LoopAction::BREAK_LOOP;
-    }
-    return LoopAction::NONE;
-}
-
-// =============================================================================
-// Stall diagnostic log format.
-//
-// Every line is self-contained — when scheduler threads emit concurrently and
-// device_log interleaves their output, each line still carries enough context
-// to identify which thread / iteration / object it belongs to.
-//
-// Prefix on every line:
-//   [STALL thread=N idle_iterations=K] CATEGORY ...
-//
-// All scheduler threads spinning at the same idle rate hit STALL_LOG_INTERVAL
-// together, so lines with the same idle_iterations belong to one diagnostic
-// round; grep "idle_iterations=N" groups one round's output.
-//
-// Categories (and which thread emits them):
-//   SUMMARY  — completed / total counts and scan totals               (thread 0 only)
-//   TASK     — one per non-completed task scanned from shared rings   (thread 0 only)
-//              - state=RUNNING: includes running_on=[...] cross-ref
-//              - state=READY:   fanin satisfied but no idle core yet
-//              - state=WAIT:    includes missing_deps=N
-//   CLUSTER  — one per cluster owned by this thread                   (every thread)
-//              - busy slot shows kernel + task_id + cond_reg_state;
-//                ANOMALY suffix when COND register is fin while software
-//                still has the slot marked busy.
-//
-// Reader workflow:
-//   1. grep SUMMARY                          -> overall completion status
-//   2. grep "idle_iterations=N TASK"         -> stuck RUNNING task and which
-//                                               core/thread it is on
-//   3. grep "idle_iterations=N CLUSTER.*task=<id>" -> cross-check via the
-//                                                     cluster line (or just
-//                                                     read running_on in step 2)
-// =============================================================================
-
-namespace {
-
-// Format a core's idle/busy state into a fixed buffer. Used inside CLUSTER lines.
-// Layout (idle):    coreN(idle)
-// Layout (busy):    coreN(busy kernel=K task=T cond_reg_state=ack)
-// Layout (anomaly): coreN(busy kernel=K task=T cond_reg_state=fin ANOMALY)
-//
-// Healthy busy: COND register reports ack (AICore still executing). fin means
-// AICore wrote completion but AICPU hasn't recycled the running slot yet —
-// either a completion-poll bug or the diagnostic raced the recycle.
-void format_core_status(
-    char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond
-) {
-    if (idle) {
-        snprintf(buf, buf_size, "core%d(idle)", core_id);
-        return;
-    }
-    int32_t kernel = -1;
-    int64_t task_id_raw = -1;
-    if (core_state && core_state->running_slot_state) {
-        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
-        kernel = core_state->running_slot_state->task->kernel_id[subslot];
-        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
-    }
-    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
-    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
-    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
-    if (hw_state == TASK_ACK_STATE) {
-        snprintf(
-            buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw,
-            cond_reg_state_str
-        );
-    } else {
-        snprintf(
-            buf, buf_size,
-            "core%d(busy kernel=%d task=%" PRId64
-            " cond_reg_state=%s ANOMALY cond_tok=%d running_tok=%d pending_tok=%d)",
-            core_id, kernel, task_id_raw, cond_reg_state_str, EXTRACT_TASK_ID(cond_reg),
-            core_state->running_reg_task_id, core_state->pending_reg_task_id
-        );
-    }
-}
-
-}  // namespace
-
-int32_t SchedulerContext::find_core_owner_thread(int32_t core_id) const {
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        const int32_t *ids = core_trackers_[t].core_ids();
-        int32_t n = core_trackers_[t].core_num();
-        for (int32_t i = 0; i < n; i++) {
-            if (ids[i] == core_id) return t;
-        }
-    }
-    return -1;
-}
-
-bool SchedulerContext::self_owns_running_task(int32_t thread_idx) const {
-    const int32_t *cores = core_trackers_[thread_idx].core_ids();
-    int32_t core_num = core_trackers_[thread_idx].core_num();
-    for (int32_t i = 0; i < core_num; i++) {
-        if (core_exec_states_[cores[i]].running_slot_state != nullptr) {
-            return true;
-        }
-    }
-    return false;
-}
-
-bool SchedulerContext::no_thread_owns_running_task() const {
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        if (self_owns_running_task(t)) return false;
-    }
-    return true;
-}
-
-void SchedulerContext::log_stall_diagnostics(
-    int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-
-    // T0 owns the shared-ring scan; printing it from other threads would
-    // produce identical TASK lines once per scheduler thread.
-    if (thread_idx == 0) {
-        int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
-            int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
-            submitted_in_ring += ring_task_count;
-            for (int32_t si = 0; si < ring_task_count; si++) {
-                PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
-                PTO2TaskState st = slot_state.task_state.load(std::memory_order_relaxed);
-                int32_t rc = slot_state.fanin_refcount.load(std::memory_order_relaxed);
-                int32_t fi = slot_state.fanin_count;
-                int32_t kid_aic = slot_state.task->kernel_id[0];
-                int32_t kid_aiv0 = slot_state.task->kernel_id[1];
-                int32_t kid_aiv1 = slot_state.task->kernel_id[2];
-                int64_t task_id = static_cast<int64_t>(slot_state.task->task_id.raw);
-                if (st >= PTO2_TASK_COMPLETED) continue;
-                // task_state has no intermediate ready/running value — it
-                // stays PENDING until the worker stores COMPLETED. Classify
-                // by the ground truth instead: a slot is RUNNING iff some
-                // core has it as running_slot_state. A task occupies at most
-                // 3 cores (one cluster), all under the same owner thread by
-                // construction of assign_cores_to_threads.
-                char running_on[192] = {0};
-                int32_t owner = -1;
-                int32_t pos = 0;
-                bool is_running = false;
-                for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++) {
-                    if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
-                    is_running = true;
-                    if (owner < 0) owner = find_core_owner_thread(cid);
-                    const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
-                    int32_t written = snprintf(
-                        running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname
-                    );
-                    if (written > 0) pos += written;
-                }
-
-                if (is_running) {
-                    cnt_running++;
-                    if (cnt_running > STALL_DUMP_READY_MAX) continue;
-                    LOG_INFO_V9(
-                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                        " state=RUNNING fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] "
-                        "running_on=[owner_thread=%d cores=[%s]]",
-                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, owner, running_on
-                    );
-                    continue;
-                }
-                if (rc >= fi) {
-                    cnt_ready++;
-                    if (cnt_ready > STALL_DUMP_READY_MAX) continue;
-                    LOG_INFO_V9(
-                        "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                        " state=READY   fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d]",
-                        thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1
-                    );
-                    continue;
-                }
-                cnt_waiting++;
-                if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
-                LOG_INFO_V9(
-                    "[STALL thread=%d idle_iterations=%d] TASK ring=%d task_id=%" PRId64
-                    " state=WAIT    fanin_refcount=%d/%d kernels=[aic:%d aiv0:%d aiv1:%d] missing_deps=%d",
-                    thread_idx, idle_iterations, r, task_id, rc, fi, kid_aic, kid_aiv0, kid_aiv1, fi - rc
-                );
-            }
-        }
-        int32_t effective_total = task_count > 0 ? task_count : submitted_in_ring;
-        int32_t c = completed_tasks_.load(std::memory_order_relaxed);
-        LOG_INFO_V9(
-            "[STALL thread=%d idle_iterations=%d] SUMMARY completed=%d/%d last_progress_iteration=%d "
-            "scan_ready=%d scan_waiting=%d scan_running=%d",
-            thread_idx, idle_iterations, c, effective_total, last_progress_count, cnt_ready, cnt_waiting, cnt_running
-        );
-    }
-
-    // CLUSTER lines: one per cluster this thread owns.
-    // cluster_id = local_cluster_idx * active_sched_threads_ + thread_idx, matching the
-    // round-robin assignment in assign_cores_to_threads / reassign_cores_for_all_threads.
-    int32_t ast = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
-    for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++) {
-        int32_t offset = cli * 3;
-        int32_t aic_id = tracker.get_aic_core_id(offset);
-        int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
-        int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
-        bool aic_idle = tracker.is_aic_core_idle(offset);
-        bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
-        bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
-        int32_t cluster_id = cli * ast + thread_idx;
-        char aic_buf[192], aiv0_buf[192], aiv1_buf[192];
-        format_core_status(
-            aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr
-        );
-        format_core_status(
-            aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id],
-            core_exec_states_[aiv0_id].reg_addr
-        );
-        format_core_status(
-            aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id],
-            core_exec_states_[aiv1_id].reg_addr
-        );
-        LOG_INFO_V9(
-            "[STALL thread=%d idle_iterations=%d] CLUSTER cluster_id=%d aic=%s aiv0=%s aiv1=%s", thread_idx,
-            idle_iterations, cluster_id, aic_buf, aiv0_buf, aiv1_buf
-        );
-    }
-}
-
-void SchedulerContext::log_shutdown_stall_snapshot(
-    int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
-) {
-    LOG_WARN(
-        "[SHUTDOWN_SNAPSHOT trigger_thread=%d reason=scheduler_timeout idle_iterations=%d] "
-        "dumping all scheduler threads before emergency shutdown",
-        trigger_thread_idx, trigger_idle_iterations
-    );
-    int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
-    if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) {
-        LOG_ERROR(
-            "[SHUTDOWN_SNAPSHOT trigger_thread=%d] invalid thread_count=%d, clamping to [0,%d]", trigger_thread_idx,
-            thread_count, MAX_AICPU_THREADS
-        );
-        thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
-    }
-    for (int32_t t = 0; t < thread_count; t++) {
-        log_stall_diagnostics(t, total_tasks_, trigger_idle_iterations, trigger_last_progress_count);
-    }
-}
-
-int32_t SchedulerContext::handle_timeout_exit(
-    int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
-    int32_t last_progress_count
-#if PTO2_PROFILING
-    ,
-    uint64_t sched_start_ts
-#endif
-) {
-    LOG_ERROR(
-        "[STALL thread=%d idle_iterations=%d] TIMEOUT_EXIT after_idle_iterations=%d", thread_idx, idle_iterations,
-        idle_iterations
-    );
-    latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
-    if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-        log_shutdown_stall_snapshot(thread_idx, idle_iterations, last_progress_count);
-#if PTO2_PROFILING
-        // Capture the in-flight kernels' partial output before signalling the
-        // cores to exit, so the dump reflects the live stuck state.
-        if (is_dump_args_enabled()) {
-            dump_running_task_outputs<PTO2_SUBTASK_SLOT_COUNT>(
-                thread_idx, cores_total_num_,
-                [this](int32_t cid) {
-                    return core_exec_states_[cid].running_slot_state;
-                },
-                [](ActiveMask active_mask, int raw_subtask_id) {
-                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-                },
-                [this](int32_t func_id) {
-                    return get_function_bin_addr(func_id);
-                }
-            );
-        }
-#endif
-        emergency_shutdown(runtime);
-    }
-#if PTO2_PROFILING
-    uint64_t sched_timeout_ts = get_sys_cnt_aicpu();
-    LOG_INFO_V9(
-        "Thread %d: sched_start=%" PRIu64 " sched_end(timeout)=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(sched_start_ts), static_cast<uint64_t>(sched_timeout_ts),
-        cycles_to_us(sched_timeout_ts - sched_start_ts)
-    );
-#endif
-    return -PTO2_ERROR_SCHEDULER_TIMEOUT;
-}
-
-#if PTO2_PROFILING
-void SchedulerContext::log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed) {
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-    uint64_t sched_end_ts = get_sys_cnt_aicpu();
-    LOG_INFO_V9(
-        "Thread %d: sched_start=%" PRIu64 " sched_end=%" PRIu64 " sched_cost=%.3fus", thread_idx,
-        static_cast<uint64_t>(l2_swimlane.sched_start_ts), static_cast<uint64_t>(sched_end_ts),
-        cycles_to_us(sched_end_ts - l2_swimlane.sched_start_ts)
-    );
-
-    uint64_t sched_total = l2_swimlane.sched_wiring_cycle + l2_swimlane.sched_complete_cycle +
-                           l2_swimlane.sched_dispatch_cycle + l2_swimlane.sched_idle_cycle;
-    if (sched_total == 0) sched_total = 1;
-
-#if PTO2_SCHED_PROFILING
-    {
-        PTO2SchedProfilingData sp = scheduler_get_profiling(thread_idx);
-        uint64_t otc_total = sp.lock_cycle + sp.fanout_cycle + sp.fanin_cycle + sp.self_consumed_cycle;
-        uint64_t complete_poll =
-            (l2_swimlane.sched_complete_cycle > otc_total + l2_swimlane.sched_complete_perf_cycle) ?
-                (l2_swimlane.sched_complete_cycle - otc_total - l2_swimlane.sched_complete_perf_cycle) :
-                0;
-        uint64_t dispatch_poll = (l2_swimlane.sched_dispatch_cycle >
-                                  l2_swimlane.sched_dispatch_pop_cycle + l2_swimlane.sched_dispatch_setup_cycle) ?
-                                     (l2_swimlane.sched_dispatch_cycle - l2_swimlane.sched_dispatch_pop_cycle -
-                                      l2_swimlane.sched_dispatch_setup_cycle) :
-                                     0;
-
-        LOG_INFO_V9(
-            "Thread %d: === Scheduler Phase Breakdown: total=%.3fus, %d tasks ===", thread_idx,
-            cycles_to_us(sched_total), cur_thread_completed
-        );
-
-        // fanout / fanin per-thread aggregates live in
-        // sched_overhead_analysis.compute_dag_stats_from_deps (deps.json edges
-        // × core_to_thread).
-        LOG_INFO_V9(
-            "Thread %d:   complete       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_complete_cycle),
-            l2_swimlane.sched_complete_cycle * 100.0 / sched_total
-        );
-
-        uint64_t c_parent = l2_swimlane.sched_complete_cycle > 0 ? l2_swimlane.sched_complete_cycle : 1;
-        uint64_t complete_miss_count = (l2_swimlane.complete_probe_count > l2_swimlane.complete_hit_count) ?
-                                           (l2_swimlane.complete_probe_count - l2_swimlane.complete_hit_count) :
-                                           0;
-        double complete_hit_rate = l2_swimlane.complete_probe_count > 0 ?
-                                       l2_swimlane.complete_hit_count * 100.0 / l2_swimlane.complete_probe_count :
-                                       0.0;
-        LOG_INFO_V9(
-            "Thread %d:     poll         : %.3fus (%.1f%%)  hit=%" PRIu64 ", miss=%" PRIu64 ", hit_rate=%.1f%%",
-            thread_idx, cycles_to_us(complete_poll), complete_poll * 100.0 / c_parent,
-            static_cast<uint64_t>(l2_swimlane.complete_hit_count), static_cast<uint64_t>(complete_miss_count),
-            complete_hit_rate
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_lock     : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.lock_cycle), sp.lock_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.lock_cycle - sp.lock_wait_cycle), cycles_to_us(sp.lock_wait_cycle),
-            static_cast<uint64_t>(sp.lock_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_fanout   : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanout_cycle), sp.fanout_cycle * 100.0 / c_parent,
-            cycles_to_us(sp.fanout_cycle - sp.push_wait_cycle), cycles_to_us(sp.push_wait_cycle),
-            static_cast<uint64_t>(sp.fanout_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_fanin    : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.fanin_cycle), sp.fanin_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.fanin_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     otc_self     : %.3fus (%.1f%%)  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(sp.self_consumed_cycle), sp.self_consumed_cycle * 100.0 / c_parent,
-            static_cast<uint64_t>(sp.self_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     perf         : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_swimlane.sched_complete_perf_cycle),
-            l2_swimlane.sched_complete_perf_cycle * 100.0 / c_parent
-        );
-
-        LOG_INFO_V9(
-            "Thread %d:   dispatch       : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_dispatch_cycle),
-            l2_swimlane.sched_dispatch_cycle * 100.0 / sched_total
-        );
-
-        uint64_t d_parent = l2_swimlane.sched_dispatch_cycle > 0 ? l2_swimlane.sched_dispatch_cycle : 1;
-        LOG_INFO_V9(
-            "Thread %d:     poll         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(dispatch_poll),
-            dispatch_poll * 100.0 / d_parent
-        );
-        LOG_INFO_V9(
-            "Thread %d:     pop          : %.3fus (%.1f%%)  work=%.3fus wait=%.3fus  atomics=%" PRIu64 "", thread_idx,
-            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle), l2_swimlane.sched_dispatch_pop_cycle * 100.0 / d_parent,
-            cycles_to_us(l2_swimlane.sched_dispatch_pop_cycle - sp.pop_wait_cycle), cycles_to_us(sp.pop_wait_cycle),
-            static_cast<uint64_t>(sp.pop_atomic_count)
-        );
-        LOG_INFO_V9(
-            "Thread %d:     setup        : %.3fus (%.1f%%)", thread_idx,
-            cycles_to_us(l2_swimlane.sched_dispatch_setup_cycle),
-            l2_swimlane.sched_dispatch_setup_cycle * 100.0 / d_parent
-        );
-
-#if PTO2_SCHED_PROFILING
-        LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)  tasks=%d", thread_idx,
-            cycles_to_us(l2_swimlane.sched_wiring_cycle), l2_swimlane.sched_wiring_cycle * 100.0 / sched_total,
-            l2_swimlane.phase_wiring_count
-        );
-#else
-        LOG_INFO_V9(
-            "Thread %d:   wiring         : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_wiring_cycle),
-            l2_swimlane.sched_wiring_cycle * 100.0 / sched_total
-        );
-#endif
-
-        LOG_INFO_V9(
-            "Thread %d:   idle           : %.3fus (%.1f%%)", thread_idx, cycles_to_us(l2_swimlane.sched_idle_cycle),
-            l2_swimlane.sched_idle_cycle * 100.0 / sched_total
-        );
-
-        if (cur_thread_completed > 0) {
-            LOG_INFO_V9(
-                "Thread %d:   avg/complete   : %.3fus", thread_idx,
-                cycles_to_us(l2_swimlane.sched_complete_cycle) / cur_thread_completed
-            );
-        }
-    }
-#endif
-    LOG_INFO_V9(
-        "Thread %d: Scheduler summary: total_time=%.3fus, loops=%" PRIu64 ", tasks_scheduled=%d", thread_idx,
-        cycles_to_us(sched_total), static_cast<uint64_t>(l2_swimlane.sched_loop_count), cur_thread_completed
-    );
-}
-#endif
-
-// =============================================================================
-// Shutdown: deinit AICore regs for this thread's cores (and PMU finalize if enabled).
-// Orchestrator threads have core_trackers_[thread_idx].core_num() == 0 -> no-op.
-// platform_deinit_aicore_regs is idempotent; safe to call after early completion.
-// =============================================================================
-int32_t SchedulerContext::shutdown(int32_t thread_idx) {
-    const int32_t *cores = core_trackers_[thread_idx].core_ids();
-    int32_t core_num = core_trackers_[thread_idx].core_num();
-    if (core_num == 0) return 0;
-
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_finalize(cores, core_num);
-    }
-#endif
-
-    LOG_INFO_V0("Thread %d: Shutting down %d cores", thread_idx, core_num);
-    int32_t rc = 0;
-    for (int32_t i = 0; i < core_num; i++) {
-        int32_t core_id = cores[i];
-        uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
-        if (reg_addr != 0) {
-            // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
-            if (platform_deinit_aicore_regs(reg_addr) != 0) {
-                LOG_ERROR("Thread %d: Core %d deinit timed out", thread_idx, core_id);
-                rc = -1;
-            }
-        } else {
-            LOG_ERROR("Thread %d: Core %d has invalid register address", thread_idx, core_id);
-        }
-    }
-    LOG_INFO_V0("Thread %d: Shutdown complete", thread_idx);
-    return rc;
-}
-
-// =============================================================================
-// Handshake with all AICore workers; discover core type and reg address.
-// =============================================================================
-int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    cores_total_num_ = runtime->worker_count;
-
-    // Validate cores_total_num_ before using as array index
-    if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) {
-        LOG_ERROR("Invalid cores_total_num %d (expected 1-%d)", cores_total_num_, RUNTIME_MAX_WORKER);
-        return -1;
-    }
-
-    aic_count_ = 0;
-    aiv_count_ = 0;
-
-    LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
-
-    // Step 1: Write per-core payload addresses and send handshake signal.
-    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
-    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
-        OUT_OF_ORDER_STORE_BARRIER();
-        all_handshakes[i].aicpu_ready = 1;
-    }
-    OUT_OF_ORDER_STORE_BARRIER();
-
-    // Get platform physical cores count for validation
-    uint32_t max_physical_cores_count = platform_get_physical_cores_count();
-
-    // Step 2: Wait for all cores to respond, collect core type and register addresses
-    bool handshake_failed = false;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-
-        while (hank->aicore_regs_ready == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        uint32_t physical_core_id = hank->physical_core_id;
-
-        if (physical_core_id >= max_physical_cores_count) {
-            LOG_ERROR(
-                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
-                max_physical_cores_count
-            );
-            handshake_failed = true;
-            continue;
-        }
-
-        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
-        uint64_t reg_addr = regs[physical_core_id];
-
-        // Initialize AICore registers after discovery (first round)
-        platform_init_aicore_regs(reg_addr);
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-
-        OUT_OF_ORDER_STORE_BARRIER();
-
-        while (hank->aicore_done == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        CoreType type = hank->core_type;
-
-        core_exec_states_[i].reg_addr = reg_addr;
-        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
-
-#if PTO2_PROFILING
-        // Record physical_core_id for PMU init later (CoreExecState has no room
-        // for this field under PTO2_PROFILING).
-        physical_core_ids_[i] = physical_core_id;
-#endif
-#if !PTO2_PROFILING
-        core_exec_states_[i].worker_id = i;
-        core_exec_states_[i].physical_core_id = physical_core_id;
-        core_exec_states_[i].core_type = type;
-#endif
-
-        if (type == CoreType::AIC) {
-            aic_worker_ids_[aic_count_++] = i;
-            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        } else {
-            aiv_worker_ids_[aiv_count_++] = i;
-            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        }
-    }
-
-    if (handshake_failed) {
-        emergency_shutdown(runtime);
-        return -1;
-    }
-
-    LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
-    return 0;
-}
-
-// =============================================================================
-// Assign discovered cores to scheduler threads (cluster-aligned round-robin).
-// =============================================================================
-bool SchedulerContext::assign_cores_to_threads() {
-    // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
-    // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
-    active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-    int32_t cluster_count = aic_count_;
-
-    // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
-    int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
-    int32_t thread_cores_num = max_clusters_per_thread * 3;
-
-    if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) {
-        LOG_ERROR("Can't assign more then 64 cores in per scheduler");
-        return false;
-    }
-
-    LOG_INFO_V0(
-        "Assigning cores (round-robin): %d clusters across %d sched threads (%d AIC, %d AIV)", cluster_count,
-        active_sched_threads_, aic_count_, aiv_count_
-    );
-
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // Count clusters per thread first (round-robin may distribute unevenly)
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % active_sched_threads_]++;
-    }
-    for (int32_t i = 0; i < active_sched_threads_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % active_sched_threads_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
-
-        LOG_INFO_V0("Thread %d: cluster %d (AIC=%d, AIV0=%d, AIV1=%d)", t, ci, aic_wid, aiv0_wid, aiv1_wid);
-    }
-
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        LOG_INFO_V0(
-            "Thread %d: total %d cores (%d clusters)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count()
-        );
-    }
-
-    LOG_INFO_V0(
-        "Config: threads=%d, cores=%d, cores_per_thread=%d", aicpu_thread_num_, cores_total_num_, thread_cores_num
-    );
-    return true;
-}
-
-// =============================================================================
-// Reassign all cores across all threads (sched + orchestrator) after orchestration.
-// =============================================================================
-void SchedulerContext::reassign_cores_for_all_threads() {
-    LOG_INFO_V0(
-        "Reassigning cores (cluster-aligned) for %d threads: %d AIC, %d AIV", aicpu_thread_num_, aic_count_, aiv_count_
-    );
-
-    // Collect running worker_ids from all current trackers
-    bool running_cores[RUNTIME_MAX_WORKER] = {};
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        auto all_running = core_trackers_[i].get_all_running_cores();
-        int32_t bp;
-        while ((bp = all_running.pop_first()) >= 0) {
-            running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
-        }
-    }
-
-    // Count clusters per thread (round-robin across all threads)
-    int32_t cluster_count = aic_count_;
-    int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        clusters_per_thread[ci % aicpu_thread_num_]++;
-    }
-
-    // Re-init all trackers and reset core counts
-    for (int32_t i = 0; i < aicpu_thread_num_; i++) {
-        core_trackers_[i].init(clusters_per_thread[i]);
-    }
-
-    // Assign clusters round-robin and restore running state
-    int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
-    for (int32_t ci = 0; ci < cluster_count; ci++) {
-        int32_t t = ci % aicpu_thread_num_;
-
-        int32_t aic_wid = aic_worker_ids_[ci];
-        int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
-        int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
-
-        int32_t cl_idx = cluster_idx_per_thread[t]++;
-        core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
-
-        // init() marks all idle; toggle cores that were running and restore pending_occupied
-        if (running_cores[aic_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3);
-        }
-        if (running_cores[aiv0_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 1);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
-        }
-        if (running_cores[aiv1_wid]) {
-            core_trackers_[t].change_core_state(cl_idx * 3 + 2);
-            core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
-        }
-    }
-
-    // Log final distribution
-    LOG_INFO_V0("Core reassignment complete:");
-    for (int32_t t = 0; t < aicpu_thread_num_; t++) {
-        int32_t aic_running = core_trackers_[t].get_running_count<CoreType::AIC>();
-        int32_t aiv_running = core_trackers_[t].get_running_count<CoreType::AIV>();
-        LOG_INFO_V0(
-            "  Thread %d: %d cores, %d clusters (AIC running=%d, AIV running=%d)", t, core_trackers_[t].core_num(),
-            core_trackers_[t].get_cluster_count(), aic_running, aiv_running
-        );
-    }
-    active_sched_threads_ = aicpu_thread_num_;
-}
-
-// =============================================================================
-// Emergency shutdown: broadcast exit signal to every handshake'd core and
-// deinit their AICore register blocks. Idempotent.
-// =============================================================================
-void SchedulerContext::emergency_shutdown(Runtime *runtime) {
-    LOG_WARN("Emergency shutdown: sending exit signal to all initialized cores");
-    Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
-    int32_t timeout_count = 0;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-        if (core_exec_states_[i].reg_addr != 0) {
-            if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) {
-                timeout_count++;
-            }
-        }
-    }
-    if (timeout_count > 0) {
-        LOG_ERROR("Emergency shutdown: %d cores did not acknowledge exit", timeout_count);
-    }
-    LOG_WARN("Emergency shutdown complete");
-}
-
-// =============================================================================
-// Lifecycle: init / deinit
-// =============================================================================
-int32_t SchedulerContext::init(
-    Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base
-) {
-    always_assert(runtime != nullptr);
-
-    // Zero all per-core execution state before handshake
-    memset(core_exec_states_, 0, sizeof(core_exec_states_));
-
-    // Wire thread/transition configuration that handshake/assign need to read.
-    aicpu_thread_num_ = aicpu_thread_num;
-    sched_thread_num_ = sched_thread_num;
-    orch_to_sched_ = orch_to_sched;
-    regs_ = regs_base;
-
-#if PTO2_PROFILING
-    // l2_swimlane_aicpu_init promotes g_l2_swimlane_level from the shared-memory
-    // header — must be called BEFORE caching the level, otherwise the cached
-    // value would still be 0 (only the binary enable bit has been seeded by
-    // kernel.cpp at this point). Reset the cached level on disabled runs so a
-    // prior enabled launch's level can't leak into the phase-record gates in
-    // scheduler_dispatch.
-    if (is_l2_swimlane_enabled()) {
-        l2_swimlane_aicpu_init(runtime->worker_count);
-        l2_swimlane_level_ = get_l2_swimlane_level();
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            // Sched-phase pool count: matches the dump_args_init branch in
-            // scheduler_dispatch.cpp. sched_thread_num_ <= 0 means "use all
-            // AICPU threads as scheduler threads" (see assign_cores_to_threads'
-            // active_sched_threads_ normalization at line 689). Without this
-            // normalization here, init_phase would prime zero sched pools and
-            // all sched_phase emits would silently drop.
-            const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
-            const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
-            // Orchestration is always single-threaded, so orch-phase is one pool
-            // (ordinal 0) in both modes — see record_orch_phase.
-            const int orch_phase_threads = 1;
-            l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads);
-        }
-    } else {
-        l2_swimlane_level_ = L2SwimlaneLevel::DISABLED;
-    }
-#endif
-
-    // Discover cores and assign to scheduler threads.
-    int32_t rc = handshake_all_cores(runtime);
-    if (rc != 0) {
-        LOG_ERROR("handshake_all_cores failed");
-        return rc;
-    }
-    if (!assign_cores_to_threads()) {
-        return -1;
-    }
-
-    // Initialize task counters. Task count comes from PTO2 shared memory.
-    if (runtime->get_gm_sm_ptr()) {
-        auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
-        // Read at one-time boot init, before the SM is reset for the run, so a
-        // ring not yet written holds uninitialized memory (0xbe... under ASAN's
-        // malloc-fill). Sum in int64 and only count rings whose value is a
-        // plausible task count — (0, PTO2_SCOPE_TASKS_CAP]; a ring cannot hold
-        // more than the scope cap. This rejects any garbage pattern (negative
-        // or positive), so uninitialized rings contribute 0 (the correct boot
-        // count) while valid counts still add up, with no signed overflow.
-        int64_t pto2_count = 0;
-        for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-            int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
-            if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
-        }
-        total_tasks_ = static_cast<int32_t>(pto2_count);
-    } else {
-        total_tasks_ = 0;
-    }
-    completed_tasks_.store(0, std::memory_order_release);
-
-    // Device orchestration: the orchestrator thread flips this when the graph is built.
-    orchestrator_done_ = false;
-
-    // Clear per-core dispatch payloads
-    memset(payload_per_core_, 0, sizeof(payload_per_core_));
-    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
-
-    // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
-    // This is done once at startup and never modified afterwards.
-    for (int32_t t = 0; t < sched_thread_num_; t++) {
-        CoreTracker &tracker = core_trackers_[t];
-        for (int32_t c = 0; c < tracker.get_cluster_count(); c++) {
-            int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
-            auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
-            auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
-            payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
-            payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
-            payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
-            payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
-        }
-    }
-
-    func_id_to_addr_ = runtime->func_id_to_addr_;
-
-    return 0;
-}
-
-void SchedulerContext::deinit() {
-    // Reset all per-core execution state
-    for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++) {
-        core_exec_states_[i] = {};
-        core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
-        core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
-    }
-
-    // Clear per-core dispatch payloads
-    memset(payload_per_core_, 0, sizeof(payload_per_core_));
-    memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
-
-    // Reset sync-start drain coordination — a previous run that aborted mid-drain
-    // would otherwise leave dirty pending/elected/ack state for the next reuse.
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-    drain_state_.pending_task.store(nullptr, std::memory_order_release);
-
-    // Reset task counters and orchestrator state
-    completed_tasks_.store(0, std::memory_order_release);
-    total_tasks_ = 0;
-    orchestrator_done_ = false;
-    pto2_init_done_.store(false, std::memory_order_release);
-    pto2_init_complete_.store(false, std::memory_order_release);
-
-    // Reset core transition state
-    transition_requested_.store(false, std::memory_order_release);
-    wait_reassign_.store(0, std::memory_order_release);
-    reassigned_.store(false, std::memory_order_release);
-    completed_.store(false, std::memory_order_release);
-
-    // Reset core discovery and assignment state
-    aic_count_ = 0;
-    aiv_count_ = 0;
-    cores_total_num_ = 0;
-    aicpu_thread_num_ = 0;
-    sched_thread_num_ = 0;
-    orch_to_sched_ = false;
-    active_sched_threads_ = 0;
-    for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) {
-        core_trackers_[t] = CoreTracker{};
-    }
-
-    regs_ = 0;
-    sched_ = nullptr;
-    rt_ = nullptr;
-    func_id_to_addr_ = nullptr;
-}
-
-void SchedulerContext::wait_pto2_init_complete() const {
-    while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-        SPIN_WAIT_HINT();
-    }
-}
-
-void SchedulerContext::bind_runtime(PTO2Runtime *rt) {
-    rt_ = rt;
-    sched_ = &rt->scheduler;
-}
-
-// =============================================================================
-// Post-orchestration bookkeeping. Runs on the orchestrator thread once the
-// build phase finishes; folds inline-completed tasks, flips orchestrator_done_,
-// and drives the orchestrator → scheduler core transition (or fatal shutdown).
-// =============================================================================
-void SchedulerContext::on_orchestration_done(
-    Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
-) {
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
-        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
-        // The orchestrator has no scheduler-phase pool of its own — those belong
-        // to the scheduler threads and are flushed in scheduler_dispatch.
-        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
-    }
-#endif
-
-    total_tasks_ = total_tasks;
-
-    // Fold tasks completed inline during orchestration
-    int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
-    if (inline_completed > 0) {
-        completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
-#if PTO2_SCHED_PROFILING
-        rt->scheduler.tasks_completed.fetch_add(inline_completed, std::memory_order_relaxed);
-#endif
-    }
-    orchestrator_done_ = true;
-
-    // Check for fatal error from orchestration; if so, shut down immediately.
-    int32_t orch_err = 0;
-    if (sched_->sm_header) {
-        orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
-    }
-    if (orch_err != PTO2_ERROR_NONE) {
-        if (!completed_.exchange(true, std::memory_order_acq_rel)) {
-            emergency_shutdown(runtime);
-        }
-    }
-
-    // Skip core transition on fatal error — cores already shut down above.
-    if (completed_.load(std::memory_order_acquire)) {
-        // Signal transition to unblock scheduler threads waiting at core transition
-        transition_requested_.store(true, std::memory_order_release);
-        reassigned_.store(true, std::memory_order_release);
-    } else if (orch_to_sched_) {
-        LOG_INFO_V0("Thread %d: Set orchestrator_done=true, requesting core transition", thread_idx);
-        transition_requested_.store(true, std::memory_order_release);
-
-        // Wait for scheduler threads to acknowledge transition request
-        while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_) {
-            if (completed_.load(std::memory_order_acquire)) {
-                break;
-            }
-            SPIN_WAIT_HINT();
-        }
-        if (!completed_.load(std::memory_order_acquire)) {
-            reassign_cores_for_all_threads();
-            reassigned_.store(true, std::memory_order_release);
-        }
-    }
-
-#if PTO2_PROFILING
-    // Write core-to-thread mapping AFTER reassignment so the profiling data
-    // reflects the final distribution (all active_sched_threads_, including
-    // former orchestrator threads when orch_to_sched_ is enabled).
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        l2_swimlane_aicpu_init_core_assignments(cores_total_num_);
-        for (int32_t t = 0; t < active_sched_threads_; t++) {
-            l2_swimlane_aicpu_write_core_assignments_for_thread(
-                t, core_trackers_[t].core_ids(), core_trackers_[t].core_num()
-            );
-        }
-    }
-#endif
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
deleted file mode 100644
index 774589865..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_completion.cpp
+++ /dev/null
@@ -1,614 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "scheduler_context.h"
-
-#include <algorithm>
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/platform_regs.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/memory_barrier.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// Performance profiling headers
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-
-// =============================================================================
-// Dual-slot state machine helpers
-// =============================================================================
-
-namespace {
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-}
-
-// Pure function: read register result -> SlotTransition (no side effects).
-SlotTransition SchedulerContext::decide_slot_transition(
-    int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated
-) {
-    SlotTransition t;
-    if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id) {
-        t.matched = true;
-        t.running_done = true;  // Serial execution: pending event implies running done
-        t.running_freed = true;
-        t.pending_freed = true;
-        if (reg_state == TASK_FIN_STATE) {
-            t.pending_done = true;  // Case 1: pending FIN
-        }
-        // else: Case 2: pending ACK (pending_done stays false)
-    } else if (reg_task_id == running_id) {
-        if (reg_state == TASK_FIN_STATE) {
-            if (pending_id == AICPU_TASK_INVALID) {
-                // Case 3.2: running FIN, no pending -> core goes idle
-                t.matched = true;
-                t.running_done = true;
-                t.running_freed = true;
-            } else if (pending_gated) {
-                // Case 3.3: running FIN, pending is a SPECULATIVE GATED task. The
-                // Case 3.1 "wait for the pending's ack" shortcut assumes the AICore
-                // immediately runs the pending task; a gated task instead spins on
-                // its doorbell and never acks until its producer completes — and
-                // that producer's completion depends on collecting THIS running FIN.
-                // Waiting would deadlock. Complete the running FIN now and promote
-                // the gated task (it then skip-gates until its doorbell). pending is
-                // NOT freed (it promotes, not retires) so the bitmap update keeps the
-                // core off-limits — no second gated block, no doorbell overwrite.
-                t.matched = true;
-                t.running_done = true;
-                t.running_freed = true;
-            }
-            // Case 3.1: running FIN, NON-gated pending exists -> skip (transient
-            // state). Case 1/2 (pending ack/FIN) completes running implicitly.
-        } else {
-            // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
-            t.matched = true;
-            t.pending_freed = true;
-        }
-    }
-    return t;
-}
-
-// Complete one slot's task: subtask counting, mixed completion, deferred release, profiling.
-void SchedulerContext::complete_slot_task(
-    PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, [[maybe_unused]] PTO2SubtaskSlot subslot,
-    int32_t thread_idx, int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
-    PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count, PTO2LocalReadyBuffer *local_bufs
-#if PTO2_PROFILING
-    ,
-    uint64_t dispatch_ts, uint64_t finish_ts
-#endif
-) {
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#else
-    (void)hank;
-#endif
-    // MPSC fast-path is opt-in per task: only tasks with at least one subtask
-    // that registered a deferred condition route through the mailbox. Pure
-    // non-deferred tasks complete inline on this thread (matching pre-MPSC
-    // behavior — keeps the common case parallelized across scheduler threads
-    // instead of serializing through the single consumer). The
-    // any_subtask_deferred flag on slot_state is the discriminator; it's set
-    // (release) before on_subtask_complete and read (acquire) after, so the
-    // last subtask sees flag writes from any earlier subtask of the same task.
-    AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
-    bool defer_completion_to_consumer = false;
-
-    if (slot_state.payload != nullptr) {
-        volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
-        int32_t slab_err = deferred_slab->error_code;
-        if (slab_err != PTO2_ERROR_NONE) {
-            int32_t expected = PTO2_ERROR_NONE;
-            sched_->sm_header->sched_error_code.compare_exchange_strong(
-                expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            completed_.store(true, std::memory_order_release);
-            return;
-        }
-
-        uint32_t cond_count = deferred_slab->count;
-        if (cond_count > MAX_COMPLETIONS_PER_TASK) {
-            int32_t expected = PTO2_ERROR_NONE;
-            sched_->sm_header->sched_error_code.compare_exchange_strong(
-                expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire
-            );
-            completed_.store(true, std::memory_order_release);
-            return;
-        }
-
-        if (cond_count > 0) {
-            // Publish "this task is deferred" before on_subtask_complete so the
-            // acq_rel fetch_add inside on_subtask_complete makes the flag
-            // visible to whichever subtask sees task_complete=true (which may
-            // be this thread or a later one).
-            slot_state.any_subtask_deferred.store(true, std::memory_order_release);
-
-            const PTO2TaskId token = slot_state.task->task_id;
-            for (uint32_t i = 0; i < cond_count; ++i) {
-                volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
-                while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type)) {
-                    sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
-                    SPIN_WAIT_HINT();
-                }
-            }
-        }
-    }
-
-    bool task_complete = sched_->on_subtask_complete(slot_state);
-
-#if PTO2_PROFILING
-    // Sub-block retire that did not finish the slot: record it so the poll
-    // iteration becomes visible on the scheduler lane (the SPMD harvest tail).
-    if (!task_complete && l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        l2_swimlane.phase_subretire_count++;
-    }
-#endif
-
-    if (task_complete && slot_state.payload != nullptr &&
-        slot_state.any_subtask_deferred.load(std::memory_order_acquire)) {
-        // Some subtask of this task registered conditions; finish the
-        // registration by handing the slot_state off to the consumer.
-        while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state))) {
-            sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
-            SPIN_WAIT_HINT();
-        }
-        defer_completion_to_consumer = true;
-    }
-
-    if (task_complete && !defer_completion_to_consumer) {
-#if PTO2_PROFILING
-        if (is_dump_args_enabled()) {
-            dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
-                thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
-                [](ActiveMask active_mask, int raw_subtask_id) {
-                    return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-                },
-                [this](int32_t func_id) {
-                    return get_function_bin_addr(func_id);
-                }
-            );
-        }
-#endif
-#if PTO2_PROFILING
-        // Time Resolve (walk the consumer list, decrement each consumer's
-        // fanin, push the newly-ready ones, ring doorbells for speculative
-        // hits) so it renders as a child bar nested inside this iteration's
-        // Complete bar. The 1 µs floor below filters out the ~88% of tasks
-        // with 1-2 consumers (~500 ns Resolve) so only the long broadcast /
-        // reduction walks stand out on the lane.
-        uint64_t resolve_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-        // [[maybe_unused]] silences -Werror=unused-but-set-variable on the
-        // profiling-flags-smoke build path where PTO2_PROFILING is OFF and
-        // the Resolve emit below is excluded.
-        [[maybe_unused]] uint32_t consumers_resolved = 0;
-#if PTO2_SCHED_PROFILING
-        // SCHED_PROFILING variant takes thread_idx for its per-thread atomic
-        // counter side-effects (g_sched_*_atomic_count[thread_idx], consumed
-        // by the otc_* log lines). It returns CompletionStats whose
-        // `fanout_edges` is the consumer-walk count.
-        consumers_resolved = sched_->on_task_complete(slot_state, thread_idx, local_bufs).fanout_edges;
-#else
-        consumers_resolved = sched_->on_task_complete(slot_state, local_bufs);
-#endif
-#if PTO2_PROFILING
-        if (resolve_t0 != 0) {
-            uint64_t resolve_t1 = get_sys_cnt_aicpu();
-            // Filter: drop Resolve bars under 1 µs so the lane shows only
-            // resolves that did meaningful work (high consumer counts or
-            // doorbells). 50 cycles @ 50 MHz = 1 µs (PLATFORM_PROF_SYS_CNT_FREQ
-            // is the device sys-cnt frequency).
-            constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
-            if (resolve_t1 - resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Resolve, resolve_t0, resolve_t1, l2_swimlane.sched_loop_count,
-                    consumers_resolved
-                );
-            }
-        }
-        l2_swimlane.phase_complete_count++;
-#endif
-        if (deferred_release_count < PTO2_DEFERRED_RELEASE_CAP) {
-            deferred_release_slot_states[deferred_release_count++] = &slot_state;
-        } else {
-            LOG_INFO_V9("Thread %d: release", thread_idx);
-            while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                // SCHED_PROFILING variant takes thread_idx for the per-thread
-                // atomic counter side-effects. The return value is unused.
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-            deferred_release_slot_states[deferred_release_count++] = &slot_state;
-        }
-        completed_this_turn++;
-    }
-
-#if PTO2_PROFILING
-    // Level gate: at AICORE_TIMING (level=1) the AICore record alone carries
-    // {start, end, task_token_raw}, host resolves func_id/core_type from
-    // dep_gen / per-core mapping, and AICPU has nothing to write. Only at
-    // AICPU_TIMING (level=2) and above does AICPU contribute dispatch/finish
-    // timestamps via complete_task. Bypassing here saves the per-completion
-    // hot-path cost (counter inc + ring lookup + record store + wmb + buffer
-    // rotation bookkeeping) for runs that only want AICore timing.
-    if (l2_swimlane.l2_swimlane_enabled && l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-#if PTO2_SCHED_PROFILING
-        uint64_t t_perf_start = get_sys_cnt_aicpu();
-#endif
-
-        if (l2_swimlane_aicpu_complete_task(
-                core_id, thread_idx, static_cast<uint32_t>(expected_reg_task_id), dispatch_ts, finish_ts
-            ) != 0) {
-            LOG_ERROR(
-                "Core %d: l2_swimlane_aicpu_complete_task failed for task 0x%" PRIx64, core_id,
-                static_cast<uint64_t>(slot_state.task->task_id.raw)
-            );
-        }
-#if PTO2_SCHED_PROFILING
-        l2_swimlane.sched_complete_perf_cycle += (get_sys_cnt_aicpu() - t_perf_start);
-#endif
-    }
-
-    if (is_pmu_enabled()) {
-        pmu_aicpu_record_task(
-            core_id, thread_idx, slot_state.task->task_id.raw,
-            slot_state.task->kernel_id[static_cast<int32_t>(subslot)], hank[core_id].core_type
-        );
-    }
-#endif
-}
-
-// Promote pending slot data to running slot. Clears pending fields.
-void SchedulerContext::promote_pending_to_running(CoreExecState &core) {
-    core.running_slot_state = core.pending_slot_state;
-    core.running_reg_task_id = core.pending_reg_task_id;
-    core.running_subslot = core.pending_subslot;
-#if PTO2_PROFILING
-    core.running_dispatch_timestamp = core.pending_dispatch_timestamp;
-#endif
-    core.pending_slot_state = nullptr;
-    core.pending_reg_task_id = AICPU_TASK_INVALID;
-}
-
-// Clear running slot (core becomes idle).
-void SchedulerContext::clear_running_slot(CoreExecState &core) {
-    core.running_slot_state = nullptr;
-    core.running_reg_task_id = AICPU_TASK_INVALID;
-}
-
-void SchedulerContext::check_running_cores_for_completion(
-    int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
-    bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-    PTO2LocalReadyBuffer *local_bufs
-) {
-#if PTO2_SCHED_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#endif
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    auto running_core_states = tracker.get_all_running_cores();
-    while (running_core_states.has_value()) {
-        int32_t bit_pos = running_core_states.pop_first();
-        int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
-        CoreExecState &core = core_exec_states_[core_id];
-
-        // Skip gated speculative cores. A STAGED task is parked on this core
-        // waiting for its doorbell — it physically cannot ACK/FIN yet, so
-        // reading its COND (MMIO, and the core is hot-spinning on its own SPR)
-        // every poll is pure waste that drags out the completion phase. The
-        // doorbell (try_speculative_release) flips spec_state to DISPATCHED, at
-        // which point the core becomes pollable again and its FIN is caught.
-        // Cheap cacheable load; no MMIO. Pending slot is empty while gated.
-        {
-            PTO2TaskSlotState *rs = core.running_slot_state;
-            if (rs != nullptr && rs->payload != nullptr &&
-                rs->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) {
-                continue;
-            }
-        }
-
-        // --- Judgment phase: read register, derive transition ---
-        // Use the precomputed cond_ptr (resolved once in handshake) to skip
-        // the reg_offset switch and reg_addr addition on every poll.
-        uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
-        // ARM64 allows Device-nGnRnE -> Normal-cacheable load reorder; the
-        // rmb() pins any AICore-published cacheable reads downstream of the
-        // FIN observation. Replaces the post-`__sync_synchronize` that the
-        // old read_reg() helper carried implicitly.
-        rmb();
-        int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
-        int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
-
-#if PTO2_SCHED_PROFILING
-        if (l2_swimlane.l2_swimlane_enabled) {
-            l2_swimlane.complete_probe_count++;
-        }
-#endif
-
-        // A pending task is "gated" when it is a speculative pre-stage still
-        // waiting on its doorbell (STAGED): it will not ack on the producer's FIN,
-        // so the Case 3.1 wait-for-pending-ack shortcut would deadlock. Detect it
-        // so decide_slot_transition completes the running FIN and promotes it.
-        bool pending_gated =
-            (core.pending_slot_state != nullptr && core.pending_slot_state->payload != nullptr &&
-             core.pending_slot_state->payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING);
-        SlotTransition t = decide_slot_transition(
-            reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id, pending_gated
-        );
-        if (!t.matched) continue;
-
-#if PTO2_SCHED_PROFILING
-        if (l2_swimlane.l2_swimlane_enabled && (t.running_done || t.pending_done)) {
-            l2_swimlane.complete_hit_count++;
-        }
-#endif
-
-#if PTO2_PROFILING
-        // Capture finish_ts at the FIN observation point — right after rmb()
-        // above pinned the cacheable AICore reads downstream of the register
-        // load, and BEFORE any fanin / deferred-release work. Anything later
-        // (slot transition apply, complete_slot_task fanin processing) would
-        // charge AICPU completion-processing cost to the (end → finish)
-        // span, masking the actual FIN-delivery latency.
-        uint64_t finish_ts = 0;
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING && (t.pending_done || t.running_done)) {
-            finish_ts = get_sys_cnt_aicpu();
-        }
-#endif
-
-        // --- Apply phase: execute actions based on transition ---
-
-        // 1. Complete finished tasks (capture pointers before modifying core state)
-        if (t.pending_done) {
-            complete_slot_task(
-                *core.pending_slot_state, core.pending_reg_task_id, core.pending_subslot, thread_idx, core_id, hank,
-                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
-#if PTO2_PROFILING
-                ,
-                core.pending_dispatch_timestamp, finish_ts
-#endif
-            );
-            cur_thread_completed++;
-        }
-        if (t.running_done) {
-            complete_slot_task(
-                *core.running_slot_state, core.running_reg_task_id, core.running_subslot, thread_idx, core_id, hank,
-                completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs
-#if PTO2_PROFILING
-                ,
-                core.running_dispatch_timestamp, finish_ts
-#endif
-            );
-            cur_thread_completed++;
-        }
-
-        // 2. Update slot data
-        if (t.running_freed) {
-            if (core.pending_slot_state != nullptr && !t.pending_done) {
-                promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
-            } else {
-                clear_running_slot(core);  // Case 1 or Case 3 (no pending)
-                if (t.pending_done) {
-                    // Case 1: pending FIN observed directly -- clear stale pending fields.
-                    // Without this, pending_reg_task_id retains a stale value that blocks
-                    // clear_pending_occupied and permanently degrades pipelining.
-                    core.pending_slot_state = nullptr;
-                    core.pending_reg_task_id = AICPU_TASK_INVALID;
-                }
-            }
-        }
-
-        // 3. Update tracker bitmap
-        bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
-        if (is_idle) {
-            tracker.change_core_state(bit_pos);       // Mark idle
-            tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
-        } else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID) {
-            // Case 4 (running ACK) or Case 2 (pending ACK): clear pending_occupied only
-            // when no pending task is currently held. Otherwise pending slot is occupied
-            // by a pre-loaded task and must stay protected.
-            tracker.clear_pending_occupied(bit_pos);
-        }
-
-        // 4. Progress signal (only when running task completes)
-        if (t.running_done) {
-            made_progress = true;
-        }
-    }
-}
-
-// =============================================================================
-// sync_start drain protocol
-// =============================================================================
-
-// Take ownership of slot_state and signal all threads to enter drain mode.
-// Returns true if this thread won the CAS and owns the drain slot.
-// Returns false if another thread already holds drain; caller must re-push slot_state.
-//
-// Two-phase protocol: CAS 0 -> -1 (sentinel) to claim ownership, store task and
-// reset election flag, then release-store block_num.  Other threads acquire-load
-// sync_start_pending; seeing block_num > 0 ensures all relaxed stores are visible.
-bool SchedulerContext::enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num) {
-    int32_t expected = 0;
-    if (!drain_state_.sync_start_pending.compare_exchange_strong(
-            expected, -1, std::memory_order_relaxed, std::memory_order_relaxed
-        )) {
-        return false;  // Another thread already holds the drain slot.
-    }
-    // We own the drain slot.  Store the task and reset election flag before making it visible.
-    drain_state_.pending_task.store(slot_state, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
-    // Release store: all stores above are now visible to any thread that
-    // acquire-loads sync_start_pending and sees block_num > 0.
-    drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
-    return true;
-}
-
-// Count total available resources across all scheduler threads for a given shape.
-int32_t SchedulerContext::count_global_available(PTO2ResourceShape shape, uint8_t core_mask) {
-    int32_t total = 0;
-    for (int32_t t = 0; t < active_sched_threads_; t++) {
-        if (shape == PTO2ResourceShape::MIX) {
-            total += core_trackers_[t].count_mix_running_clusters(core_mask);
-        } else {
-            total += core_trackers_[t].get_idle_core_offset_states(shape).count();
-        }
-    }
-    return total;
-}
-
-// Drain worker: dispatch all blocks in one pass across all threads' trackers.
-// Called only when global resources >= block_num, so one pass always suffices.
-// All other threads are spinning -- the drain worker has exclusive tracker access.
-void SchedulerContext::drain_worker_dispatch(int32_t block_num) {
-    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
-    if (!slot_state) {
-        drain_state_.sync_start_pending.store(0, std::memory_order_release);
-        return;
-    }
-    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-    uint8_t core_mask = slot_state->active_mask.core_mask();
-
-    for (int32_t t = 0;
-         t < active_sched_threads_ && slot_state->next_block_idx.load(std::memory_order_relaxed) < block_num; t++) {
-        auto valid = (shape == PTO2ResourceShape::MIX) ?
-                         core_trackers_[t].get_mix_running_cluster_offset_states(core_mask) :
-                         core_trackers_[t].get_idle_core_offset_states(shape);
-        int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
-        int32_t remaining = slot_state->logical_block_num - start;
-        int32_t claim = std::min(valid.count(), remaining);
-        slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int handle_count = 0;
-        for (int32_t b = 0; b < claim; b++) {
-            auto core_offset = valid.pop_first();
-            handle_count += prepare_block_for_dispatch(
-                t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]
-            );
-        }
-        wmb();
-        uint64_t dispatch_ts = 0;
-#if PTO2_PROFILING
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-            dispatch_ts = get_sys_cnt_aicpu();
-        }
-#endif
-        for (int i = 0; i < handle_count; i++) {
-            publish_subtask_to_core(handles[i], dispatch_ts);
-        }
-    }
-
-    // All blocks dispatched -- clear drain state.
-    // Release fence ensures tracker mutations are visible to threads that
-    // acquire-load sync_start_pending == 0 and resume normal operation.
-    std::atomic_thread_fence(std::memory_order_release);
-    drain_state_.pending_task.store(nullptr, std::memory_order_release);
-    drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
-    drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
-    drain_state_.sync_start_pending.store(0, std::memory_order_release);
-}
-
-// Called by each scheduler thread when drain_state_.sync_start_pending != 0.
-//
-// Protocol (single-stage ack barrier):
-//   1. Ack barrier: all threads signal they've stopped dispatch, then spin
-//      until all ack bits are set.
-//      If this thread's bit gets cleared while waiting, a reset occurred -- return.
-//   2. Election: one thread wins the CAS and becomes the drain worker.
-//      If resources are insufficient, reset ack/election fields and return --
-//      all threads resume completion polling to free running cores, then retry.
-//   3. Dispatch: elected thread dispatches all blocks (one pass, resources guaranteed).
-//      Non-elected threads spin-wait until sync_start_pending == 0.
-//      During dispatch the elected thread has exclusive tracker access.
-void SchedulerContext::handle_drain_mode(int32_t thread_idx) {
-    // Every spin in this function honors is_completed(): once the run latches
-    // completed_ (all tasks done, or a fatal error raised elsewhere), peers leave
-    // the dispatch loop and stop participating in the drain. A thread parked in a
-    // drain spin would then wait forever for acks / a gate-open that can no longer
-    // arrive -- the AICPU watchdog never fires here because these spins live
-    // outside the dispatch loop's wall-clock budget, so the hang escalates straight
-    // to the 3 s STARS op-exec timeout (507018) and poisons the device. Bailing on
-    // completed_ is always safe: any pending sync_start task is either already
-    // dispatched (a stale re-popped slot) or moot under teardown, and deinit()
-    // resets drain_state_ before the next run, so leaving it dirty is harmless.
-    // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
-    int32_t block_num;
-    do {
-        if (is_completed()) return;
-        block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
-    } while (block_num < 0);
-    if (block_num == 0) return;
-
-    uint32_t all_acked = (1u << active_sched_threads_) - 1;
-
-    // Ack barrier -- signal this thread has stopped dispatch.
-    drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
-
-    // Spin until all threads have acked.
-    // If our bit is cleared while waiting, elected reset due to insufficient resources.
-    while (true) {
-        if (is_completed()) return;
-        uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
-        if ((ack & all_acked) == all_acked) break;
-        if ((ack & (1u << thread_idx)) == 0) return;
-        SPIN_WAIT_HINT();
-    }
-
-    // Election -- exactly one thread wins the CAS.
-    int32_t expected = 0;
-    drain_state_.drain_worker_elected.compare_exchange_strong(
-        expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed
-    );
-
-    if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1) {
-        // Non-elected: spin-wait for drain completion or resource-insufficient reset.
-        while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
-            if (is_completed()) return;
-            if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
-            SPIN_WAIT_HINT();
-        }
-        return;
-    }
-
-    // Elected: check if global resources are sufficient.
-    PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
-    if (slot_state == nullptr) {
-        // pending_task is observed null only when a concurrent drain completion
-        // already cleared it (drain_worker_dispatch nulls it before reopening the
-        // gate). That drain is done and this is a stale-elected thread, so just
-        // release the election lock and return. Do NOT clear drain_ack_mask or
-        // sync_start_pending: a *new* drain run may already be active and
-        // accumulating acks, and zeroing them would corrupt it into a hang.
-        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-        return;
-    }
-    PTO2ResourceShape shape = slot_state->active_mask.to_shape();
-    int32_t available = count_global_available(shape, slot_state->active_mask.core_mask());
-
-    if (available < block_num) {
-        // Insufficient resources -- reset drain fields so threads can resume
-        // completion polling to free running cores, then retry.
-        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
-        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
-        return;
-    }
-
-    // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
-    drain_worker_dispatch(block_num);
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
deleted file mode 100644
index 3a008bbf9..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_context.h
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#ifndef SCHEDULER_CONTEXT_H
-#define SCHEDULER_CONTEXT_H
-
-#include "aicpu/platform_regs.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/unified_log.h"
-#include "scheduler_types.h"
-
-#include "scheduler/pto_scheduler.h"
-
-#include "aicore_completion_mailbox.h"
-#include "pto2_dispatch_payload.h"
-
-// These macros are defined in runtime.h, but we cannot include it here
-// (it pulls in Handshake which we only forward-declare).  Mirror the
-// authoritative values so the class layout compiles standalone.
-#ifndef RUNTIME_MAX_WORKER
-#define RUNTIME_MAX_WORKER 72
-#endif
-#ifndef RUNTIME_MAX_FUNC_ID
-#define RUNTIME_MAX_FUNC_ID 1024
-#endif
-
-// Forward declarations — avoid pulling in full headers for pointer/reference params.
-class Runtime;
-struct Handshake;
-struct PTO2Runtime;
-
-/**
- * SchedulerContext: owns all scheduler-side state and methods.
- *
- * Held as a member of AicpuExecutor (sched_ctx_).  The single public entry
- * point is resolve_and_dispatch(), called once per scheduler thread.
- *
- * All dispatch/completion/drain/cold-path logic is implemented as private
- * member methods, split across three .cpp files by responsibility:
- *   - scheduler_completion.cpp  (completion polling, drain protocol)
- *   - scheduler_cold_path.cpp   (exit checks, stall diagnostics, profiling)
- *   - scheduler_dispatch.cpp    (task dispatch loop and helpers)
- */
-class SchedulerContext {
-public:
-    // =========================================================================
-    // Lifecycle
-    // =========================================================================
-
-    // Initialize scheduler state from the given runtime and thread layout.
-    // - Discovers cores via handshake_all_cores()
-    // - Assigns cores to scheduler threads
-    // - Resets task counters, payloads, per-core GlobalContext
-    // - Binds func_id_to_addr_ / initial sched_ (if rt is already known)
-    // - Captures AICore-register base (consumed by handshake_all_cores())
-    // Returns 0 on success, negative on failure (handshake / assignment error).
-    int32_t
-    init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base);
-
-    // Reset all SchedulerContext-owned state to its post-construction defaults.
-    // Called by AicpuExecutor::deinit() during per-run teardown.
-    void deinit();
-
-    // =========================================================================
-    // Per-thread execution entry points (called by AicpuExecutor::run)
-    // =========================================================================
-
-    // Main scheduler thread entry: poll completion + dispatch ready tasks.
-    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx);
-
-    // Shutdown AICore registers for this thread's assigned cores.
-    // Also runs PMU finalize (PTO2_PROFILING) before deinit when enabled.
-    // Orchestrator threads (core_trackers_[thread_idx].core_num() == 0) are a no-op.
-    int32_t shutdown(int32_t thread_idx);
-
-    // Run all post-orchestration scheduler bookkeeping:
-    //  - publishes core assignments to the perf collector (PTO2_PROFILING)
-    //  - latches submitted task count from PTO2 shared memory
-    //  - folds inline_completed_tasks into completed_tasks_
-    //  - flips orchestrator_done_ and triggers core transition
-    //    (skipped on fatal error — emergency_shutdown runs instead)
-    // Callers must invoke rt_orchestration_done(rt) before this — that
-    // step belongs to the orchestrator lifecycle, not the scheduler.
-    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks);
-
-    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
-    // mode where rt is created by the orchestrator thread after init().
-    void bind_runtime(PTO2Runtime *rt);
-
-    // =========================================================================
-    // State queries / external synchronization points
-    // =========================================================================
-
-    int32_t aic_count() const { return aic_count_; }
-    int32_t aiv_count() const { return aiv_count_; }
-    bool is_completed() const { return completed_.load(std::memory_order_acquire); }
-    int32_t completed_tasks_count() const { return completed_tasks_.load(std::memory_order_acquire); }
-
-    // Block until the first scheduler thread has finished one-time PTO2 init.
-    // Called by the orchestrator thread in device-orch mode.
-    void wait_pto2_init_complete() const;
-
-private:
-    // =========================================================================
-    // State
-    // =========================================================================
-
-    // --- Scheduler binding & per-core runtime state ---
-    alignas(64) PTO2SchedulerState *sched_{nullptr};
-    PTO2Runtime *rt_{nullptr};
-
-    // Per-core execution state, indexed by core_id (= worker_id)
-    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
-
-    // Cluster-ordered core trackers, one per scheduler thread
-    CoreTracker core_trackers_[MAX_AICPU_THREADS];
-
-    // Per-core dispatch payload storage: dual-buffer for pipelining.
-    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
-    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
-
-    // Per-core deferred-completion software registration storage.  This has
-    // the same runtime lifetime as payload_per_core_, but is kept out of the
-    // dispatch payload so normal task dispatch layout and cache footprint stay
-    // unchanged.
-    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
-
-    // sync_start drain coordination
-    SyncStartDrainState drain_state_;
-
-#if PTO2_PROFILING
-    SchedL2SwimlaneCounters sched_l2_swimlane_[MAX_AICPU_THREADS];
-    // Cached once at init() from get_l2_swimlane_level(), AFTER
-    // l2_swimlane_aicpu_init has promoted the level from the shared-memory header.
-    L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
-#endif
-
-    // --- Task-execution tracking ---
-    std::atomic<int32_t> completed_tasks_{0};
-    int32_t total_tasks_{0};
-    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
-    // volatile prevents the compiler from hoisting the load out of spin loops.
-    volatile bool orchestrator_done_{false};
-    std::atomic<bool> completed_{false};
-    uint64_t *func_id_to_addr_{nullptr};
-
-    // --- Core-transition coordination ---
-    std::atomic<bool> transition_requested_{false};
-    std::atomic<int32_t> wait_reassign_{0};
-    std::atomic<bool> reassigned_{false};
-
-    // --- Thread/core configuration ---
-    int32_t active_sched_threads_{0};
-    int32_t sched_thread_num_{0};
-    bool orch_to_sched_{false};
-    int32_t aicpu_thread_num_{0};
-    int32_t cores_total_num_{0};
-
-    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
-    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
-    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
-    int32_t aic_count_{0};
-    int32_t aiv_count_{0};
-
-    // Platform AICore-register base array (set by AicpuExecutor before init()).
-    uint64_t regs_{0};
-
-#if PTO2_PROFILING
-    // PMU profiling: physical core IDs for PMU MMIO base resolution.
-    // Separate storage because CoreExecState's 64-byte budget has no room for
-    // physical_core_id when PTO2_PROFILING=1.
-    uint32_t physical_core_ids_[RUNTIME_MAX_WORKER]{};
-#endif
-
-    // --- One-time init coordination ---
-    std::atomic<bool> pto2_init_done_{false};
-    std::atomic<bool> pto2_init_complete_{false};
-
-    // =========================================================================
-    // Core management (scheduler_cold_path.cpp)
-    // =========================================================================
-
-    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
-    int32_t handshake_all_cores(Runtime *runtime);
-
-    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
-    bool assign_cores_to_threads();
-
-    // Re-distribute all cores across all threads after orchestration completes.
-    void reassign_cores_for_all_threads();
-
-    // Emergency shutdown: broadcast exit signal to every handshake'd core and
-    // deinit their AICore register blocks. Idempotent.
-    void emergency_shutdown(Runtime *runtime);
-
-    // =========================================================================
-    // Dispatch (scheduler_dispatch.cpp)
-    // =========================================================================
-
-    static const char *shape_name(PTO2ResourceShape shape);
-
-    // Lower-case rendering of PTO2SubtaskSlot, used by dispatch and stall logs.
-    // Kept lower-case to match the `kernels=[aic:N aiv0:N aiv1:N]` field
-    // convention already established in the stall log family.
-    static inline const char *subslot_name(PTO2SubtaskSlot s) {
-        switch (s) {
-        case PTO2SubtaskSlot::AIC:
-            return "aic";
-        case PTO2SubtaskSlot::AIV0:
-            return "aiv0";
-        case PTO2SubtaskSlot::AIV1:
-            return "aiv1";
-        }
-        return "?";
-    }
-
-    int pop_ready_tasks_batch(
-        PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out,
-        int max_count
-    );
-
-    void build_payload(
-        PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-        const AsyncCtx &async_ctx, int32_t block_idx
-    );
-
-    // Batched-dispatch primitives. prepare_* builds the payload and per-core
-    // state; publish_* issues the MMIO register write. Callers must wmb()
-    // between the prepare batch and the publish batch, then sample
-    // get_sys_cnt_aicpu() once and pass it to publish_* for every handle.
-    //
-    // dispatch_timestamp_slot points to the CoreExecState slot
-    // (pending_dispatch_timestamp / running_dispatch_timestamp) selected at
-    // prepare time, or nullptr when L2 swimlane is below AICPU_TIMING and no
-    // dispatch timestamp is being recorded.
-    struct PublishHandle {
-        uint64_t reg_addr;
-        uint32_t reg_task_id;
-        int32_t core_offset;
-        uint64_t *dispatch_timestamp_slot;
-    };
-
-    PublishHandle prepare_subtask_to_core(
-        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-        bool to_pending, int32_t block_idx
-    );
-
-    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts) {
-        if (h.dispatch_timestamp_slot != nullptr) {
-            *h.dispatch_timestamp_slot = dispatch_ts;
-        }
-        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));
-    }
-
-    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
-    // caller-supplied handles buffer. Returns the number of handles written.
-    int prepare_block_for_dispatch(
-        int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape,
-        bool to_pending, int32_t block_idx, PublishHandle *out_handles
-    );
-
-    void dispatch_shape(
-        int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
-        CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
-    );
-
-    // Speculative early-dispatch (Hook 1). After normal dispatch leaves idle
-    // cores spare, pre-stage the consumers of any RUNNING flagged producer onto
-    // those cores with not_ready=1 (gated). Touches no dependency state — the
-    // task is released by the doorbell at its normal ready-pop (Hook 2).
-    int32_t try_speculative_early_dispatch(int32_t thread_idx);
-
-    // Stage the already-claimed range [start, start+count) of consumer `c` onto
-    // thread_idx's idle (RUNNING slot) then pending (gated-pending, promote-on-FIN)
-    // cores from the provided free-core sets. The caller advances next_block_idx and
-    // re-pushes `c` BEFORE calling, so this expensive prepare+publish runs
-    // concurrently with peers (mirrors the normal SPMD dispatch path). Returns the
-    // number of blocks staged.
-    int32_t stage_consumer_blocks(
-        int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
-        CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
-    );
-
-    // One pass of "Phase 4" in the resolve_and_dispatch loop: IDLE-stage dispatch
-    // for MIX then (if no mix residual) AIC/AIV; mid-flush of local buffers; then
-    // PENDING-stage dispatch with cross-thread idle gating. MIX is strictly
-    // prioritized — when mix residual is detected after MIX-IDLE, AIC/AIV are
-    // skipped for the whole pass but MIX-PENDING still runs.
-    //
-    // Forward-progress argument for AIC/AIV: skip_aic_aiv is sticky for the
-    // current pass only. The next loop iteration re-evaluates after Phase 1
-    // completion polling and the global MIX queue draining (here or on any
-    // peer thread). AIC/AIV starvation is therefore bounded by MIX throughput,
-    // not unbounded — once mix completes on at least one cluster, the next
-    // pass either drains the residual or admits AIC/AIV.
-    void dispatch_ready_tasks(
-        int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
-        bool pmu_active, bool &made_progress, bool &try_pushed
-    );
-
-    // Returns true if any *other* scheduler thread currently has an idle core
-    // matching `shape`. Used as a scheduling hint on the PENDING dispatch path
-    // — see the implementation in scheduler_dispatch.cpp for the hint-semantics
-    // rationale and the safety argument against the drain worker.
-    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const;
-
-    // True if mix tasks remain anywhere this thread could see them: the caller's
-    // MIX local LIFO stack or the global MIX ready queue. Approximate —
-    // PTO2ReadyQueue::size() (see pto_scheduler.h) snapshots its enqueue/dequeue
-    // positions with std::memory_order_relaxed and may interleave with concurrent
-    // push/pop. Don't confuse with PTO2SpscQueue::size(), which uses acquire
-    // loads — that one isn't on this path. A stale read here causes at most one
-    // extra/missed AIC/AIV skip and self-corrects on the next loop iteration.
-    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const {
-        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
-    }
-
-    // =========================================================================
-    // Completion & drain (scheduler_completion.cpp)
-    // =========================================================================
-
-    static SlotTransition decide_slot_transition(
-        int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id, bool pending_gated = false
-    );
-
-    void complete_slot_task(
-        PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, PTO2SubtaskSlot subslot, int32_t thread_idx,
-        int32_t core_id, Handshake *hank, int32_t &completed_this_turn,
-        PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-        PTO2LocalReadyBuffer *local_bufs
-#if PTO2_PROFILING
-        ,
-        uint64_t dispatch_ts, uint64_t finish_ts
-#endif
-    );
-
-    static void promote_pending_to_running(CoreExecState &core);
-    static void clear_running_slot(CoreExecState &core);
-
-    void check_running_cores_for_completion(
-        int32_t thread_idx, Handshake *hank, int32_t &completed_this_turn, int32_t &cur_thread_completed,
-        bool &made_progress, PTO2TaskSlotState *deferred_release_slot_states[], int32_t &deferred_release_count,
-        PTO2LocalReadyBuffer *local_bufs
-    );
-
-    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num);
-    int32_t count_global_available(PTO2ResourceShape shape, uint8_t core_mask);
-    void drain_worker_dispatch(int32_t block_num);
-    void handle_drain_mode(int32_t thread_idx);
-
-    // =========================================================================
-    // Cold path: exit checks, stall diagnostics, profiling (scheduler_cold_path.cpp)
-    // =========================================================================
-
-    __attribute__((noinline, cold)) LoopAction
-    handle_orchestrator_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t &task_count);
-
-    __attribute__((noinline, cold)) LoopAction handle_core_transition(bool &cores_released);
-
-    __attribute__((noinline, cold)) LoopAction
-    check_idle_fatal_error(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime);
-
-    __attribute__((noinline, cold)) void
-    log_stall_diagnostics(int32_t thread_idx, int32_t task_count, int32_t idle_iterations, int32_t last_progress_count);
-
-    __attribute__((noinline, cold)) void log_shutdown_stall_snapshot(
-        int32_t trigger_thread_idx, int32_t trigger_idle_iterations, int32_t trigger_last_progress_count
-    );
-
-    // Reverse lookup: given a global core_id, find which scheduler thread's
-    // tracker owns it. Returns -1 if not found. Linear scan — only used on
-    // the cold diagnostic path.
-    int32_t find_core_owner_thread(int32_t core_id) const;
-
-    // Does this thread own any core with a RUNNING task (running_slot_state set)?
-    // Gates the scheduler timeout fatal latch: a thread without an owned
-    // RUNNING task has no first-hand evidence of a stuck dispatch and must
-    // not declare global fatal on its own idle observation. The thread that
-    // does own the stuck task will reach the budget on its own polls and
-    // latch with valid evidence (or recover when the COND register flips).
-    bool self_owns_running_task(int32_t thread_idx) const;
-
-    // Does *any* scheduler thread own a RUNNING task? Used as the second
-    // fatal-latch condition: if the wall-clock budget elapsed AND no thread
-    // owns RUNNING work AND tasks remain incomplete, the system is in a
-    // pre-dispatch / WAIT-only deadlock (e.g. dependency cycle) and the
-    // ownerless idle threads are the only observers — let one of them latch.
-    bool no_thread_owns_running_task() const;
-
-    __attribute__((noinline, cold)) int32_t handle_timeout_exit(
-        int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime, int32_t idle_iterations,
-        int32_t last_progress_count
-#if PTO2_PROFILING
-        ,
-        uint64_t sched_start_ts
-#endif
-    );
-
-#if PTO2_PROFILING
-    __attribute__((noinline, cold)) void log_l2_swimlane_summary(int32_t thread_idx, int32_t cur_thread_completed);
-#endif
-
-    // =========================================================================
-    // Small inline helpers
-    // =========================================================================
-
-    uint64_t get_function_bin_addr(int func_id) const {
-        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-            LOG_ERROR("func_id=%d is out of range [0, %d) or map is null", func_id, RUNTIME_MAX_FUNC_ID);
-            return 0;
-        }
-        return func_id_to_addr_[func_id];
-    }
-};
-
-#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
deleted file mode 100644
index 08a2d9020..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#include "scheduler_context.h"
-
-#include <algorithm>
-#include <cinttypes>
-#include <limits>
-
-#include "common.h"  // debug_assert
-
-#include "common/unified_log.h"
-#include "aicpu/device_time.h"
-#include "aicpu/platform_regs.h"
-#include "callable.h"
-#include "common/l2_swimlane_profiling.h"
-#include "common/memory_barrier.h"
-#include "common/platform_config.h"
-#include "pto_runtime2.h"
-#include "runtime.h"
-#include "spin_hint.h"
-
-// Performance profiling headers
-#include "aicpu/l2_swimlane_collector_aicpu.h"
-#include "aicpu/pmu_collector_aicpu.h"
-#include "aicpu/tensor_dump_aicpu.h"
-
-#ifndef unlikely
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#endif
-
-// =============================================================================
-// Dispatch helpers
-// =============================================================================
-
-namespace {
-inline constexpr int32_t PTO2_DEFERRED_RELEASE_CAP = 256;
-}
-
-// The speculative core bitmask (PTO2_SPEC_CORE_MASK_WORDS * 64 bits) must cover
-// every global core_id, and the per-core doorbell table is sized to match.
-static_assert(
-    RUNTIME_MAX_WORKER <= PTO2_SPEC_CORE_MASK_WORDS * 64, "staged_core_mask too small for RUNTIME_MAX_WORKER cores"
-);
-
-const char *SchedulerContext::shape_name(PTO2ResourceShape shape) {
-    switch (shape) {
-    case PTO2ResourceShape::AIC:
-        return "AIC";
-    case PTO2ResourceShape::AIV:
-        return "AIV";
-    case PTO2ResourceShape::MIX:
-        return "MIX";
-    case PTO2ResourceShape::DUMMY:
-        return "DUMMY";
-    }
-    return "UNKNOWN";
-}
-
-bool SchedulerContext::has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const {
-    // Cross-thread read of peer trackers without explicit synchronization. The
-    // backing `core_states_` is a naturally aligned uint64_t; aarch64 guarantees
-    // single-copy atomicity for an 8-byte aligned load, so no torn read. The
-    // value is consumed only as a scheduling *hint* — a stale read at worst
-    // causes one missed/extra pending dispatch, corrected on the next iteration.
-    // Drain-mode cross-thread writes are serialized by handle_drain_mode's ack
-    // barrier (all peers spin out of the dispatch path before any tracker
-    // mutation), so this routine is never racing the drain worker.
-    for (int32_t t = 0; t < active_sched_threads_; t++) {
-        if (t == self_thread_idx) continue;
-        if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) {
-            return true;
-        }
-    }
-    return false;
-}
-
-int SchedulerContext::pop_ready_tasks_batch(
-    PTO2ResourceShape shape, int32_t thread_idx, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count
-) {
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#if PTO2_SCHED_PROFILING
-    extern uint64_t g_sched_pop_atomic_count[], g_sched_pop_wait_cycle[];
-    uint64_t t_pop_start = get_sys_cnt_aicpu();
-    int count = sched_->get_ready_tasks_batch(
-        shape, local_buf, out, max_count, g_sched_pop_atomic_count[thread_idx], g_sched_pop_wait_cycle[thread_idx]
-    );
-    l2_swimlane.sched_dispatch_pop_cycle += (get_sys_cnt_aicpu() - t_pop_start);
-#else
-    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-#endif
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        if (count > 0) {
-            l2_swimlane.pop_hit += count;
-        } else {
-            l2_swimlane.pop_miss++;
-        }
-    }
-#else
-    (void)thread_idx;
-    int count = sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
-#endif
-    return count;
-}
-
-void SchedulerContext::build_payload(
-    PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot,
-    const AsyncCtx &async_ctx, int32_t block_idx
-) {
-    int32_t slot_idx = static_cast<int32_t>(subslot);
-    uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
-    const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
-    dispatch_payload.function_bin_addr = callable->resolved_addr();
-    auto &payload = *slot_state.payload;
-    int n = 0;
-    for (int32_t i = 0; i < payload.tensor_count; i++) {
-        dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
-    }
-    for (int32_t i = 0; i < payload.scalar_count; i++) {
-        dispatch_payload.args[n++] = payload.scalars[i];
-    }
-    dispatch_payload.local_context.block_idx = block_idx;
-    dispatch_payload.local_context.block_num = slot_state.logical_block_num;
-    dispatch_payload.local_context.async_ctx = async_ctx;
-    dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
-    dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
-    // Speculative early-dispatch: a task being staged (Hook 1 set spec_state to
-    // STAGING before this call) is gated — the AICore must wait for the
-    // DATA_MAIN_BASE high-32 doorbell. All other dispatches run on pickup.
-    dispatch_payload.not_ready =
-        (slot_state.payload->spec_state.load(std::memory_order_relaxed) == PTO2_SPEC_STAGING) ? 1 : 0;
-}
-
-SchedulerContext::PublishHandle SchedulerContext::prepare_subtask_to_core(
-    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending,
-    int32_t block_idx
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    auto core_id = tracker.get_core_id_by_offset(core_offset);
-    CoreExecState &core_exec_state = core_exec_states_[core_id];
-
-    core_exec_state.dispatch_seq++;
-    uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
-    static_assert(
-        (TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity"
-    );
-    if (reg_task_id >= AICORE_EXIT_SIGNAL) {
-        core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
-        reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
-    }
-
-    uint32_t buf_idx = reg_task_id & 1u;
-    PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
-    DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
-    deferred_slab->count = 0;
-    deferred_slab->error_code = PTO2_ERROR_NONE;
-    AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
-    build_payload(payload, slot_state, subslot, async_ctx, block_idx);
-
-    if (to_pending) {
-        core_exec_state.pending_subslot = subslot;
-        core_exec_state.pending_slot_state = &slot_state;
-        core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
-    } else {
-        core_exec_state.running_subslot = subslot;
-        core_exec_state.running_slot_state = &slot_state;
-        core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
-        tracker.change_core_state(core_offset);
-    }
-    tracker.set_pending_occupied(core_offset);
-
-    LOG_DEBUG(
-        "Thread %d: Dispatched %s %s task %" PRId64 " kernel_id=[%d,%d,%d] block_idx=%d/total_blocks=%d to"
-        " core_offset=%d core_id=%d reg_task_id=%u",
-        thread_idx, to_pending ? "pending" : "idle", subslot_name(subslot),
-        static_cast<int64_t>(slot_state.task->task_id.raw), slot_state.task->kernel_id[0],
-        slot_state.task->kernel_id[1], slot_state.task->kernel_id[2], block_idx, slot_state.logical_block_num,
-        core_offset, core_id, reg_task_id
-    );
-
-    // AICore buffer rotation lives on the dispatch path: count this dispatch
-    // and rotate before write_reg when we're about to cross a BUFFER_SIZE
-    // boundary. The completion-before-dispatch invariant makes this race-free
-    // (all prior tasks on this core have FIN'd, so AICore has dcci'd their
-    // records out of the old buffer). Gated on the same enable bit as flush
-    // so level=1 (AICORE_TIMING-only) participates without needing complete_task.
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED) {
-        l2_swimlane_aicpu_on_aicore_dispatch(core_id, thread_idx);
-    }
-#endif
-
-    uint64_t *dispatch_timestamp_slot = nullptr;
-#if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-        dispatch_timestamp_slot =
-            to_pending ? &core_exec_state.pending_dispatch_timestamp : &core_exec_state.running_dispatch_timestamp;
-    }
-#endif
-
-    return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
-}
-
-int SchedulerContext::prepare_block_for_dispatch(
-    int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending,
-    int32_t block_idx, PublishHandle *out_handles
-) {
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        dump_args_for_task<PTO2_SUBTASK_SLOT_COUNT>(
-            thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
-            [](ActiveMask active_mask, int raw_subtask_id) {
-                return active_mask.subtask_active(static_cast<PTO2SubtaskSlot>(raw_subtask_id));
-            },
-            [this](int32_t func_id) {
-                return get_function_bin_addr(func_id);
-            }
-        );
-    }
-#endif
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    if (shape == PTO2ResourceShape::MIX) {
-        uint8_t cmask = slot_state.active_mask.core_mask();
-        int n = 0;
-        if (cmask & PTO2_SUBTASK_MASK_AIC) {
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, to_pending,
-                block_idx
-            );
-        }
-        if (cmask & PTO2_SUBTASK_MASK_AIV0) {
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, to_pending,
-                block_idx
-            );
-        }
-        if (cmask & PTO2_SUBTASK_MASK_AIV1) {
-            out_handles[n++] = prepare_subtask_to_core(
-                thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, to_pending,
-                block_idx
-            );
-        }
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += __builtin_popcount(cmask);
-#endif
-        return n;
-    } else if (shape == PTO2ResourceShape::AIC) {
-        out_handles[0] =
-            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
-#endif
-        return 1;
-    } else {
-        out_handles[0] =
-            prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
-#if PTO2_PROFILING
-        sched_l2_swimlane_[thread_idx].phase_dispatch_count += 1;
-#endif
-        return 1;
-    }
-}
-
-void SchedulerContext::dispatch_shape(
-    int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf,
-    CoreTracker &tracker, bool &entered_drain, bool &made_progress, bool &try_pushed
-) {
-#if PTO2_SCHED_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-#endif
-    if (entered_drain) return;
-
-    bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
-    bool is_mix = (shape == PTO2ResourceShape::MIX);
-    auto cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
-    if (!cores.has_value()) return;
-
-    while (cores.has_value() && !entered_drain) {
-        int want = cores.count();
-        PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
-        int got = pop_ready_tasks_batch(shape, thread_idx, local_buf, batch, want);
-        if (got == 0) break;
-
-        // sync_start exclusion gate.
-        //
-        // When the popped batch contains a sync_start task we MUST publish each
-        // prior task with its own wmb so AICore receives them with time
-        // separation. The drain coordinator's `count_global_available()` check
-        // reads the per-thread CoreTracker, and although `prepare_block_for_dispatch`
-        // marks cores occupied synchronously, the head-start between successive
-        // tasks is what lets the surrounding completion loop catch up on FINs in
-        // the retry window when the sync_start task hits insufficient resources.
-        // Bursting all prior tasks at the end of the pop (cross-task batching)
-        // collapses that head-start and causes spmd_sync_start_stress to time
-        // out via 507018 on ~40% of runs — see
-        // docs/investigations/2026-06-cross-task-batched-publish.md.
-        //
-        // When the batch carries no sync_start task, no drain entry can happen
-        // in this pop, so we hoist `handles[]`, `wmb()`, and the publish loop
-        // out of the per-task body. One wmb amortizes across all tasks and one
-        // dispatch_ts is shared, which restores ~60 ns first-to-last AICore
-        // start span for single-block decode kernels (out_proj, q_proj, ...).
-        // Detection is a single mask check per task — cheap relative to even
-        // one register write.
-        bool any_sync_start = false;
-        for (int bi = 0; bi < got; bi++) {
-            if (batch[bi]->active_mask.requires_sync_start()) {
-                any_sync_start = true;
-                break;
-            }
-        }
-
-        // handles[] is sized for the MIX worst case: total claims across the
-        // pop bounded by `cores.count() ≤ MAX_CLUSTERS`, and each block
-        // contributes ≤ 3 subtasks for MIX.
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int handle_count = 0;
-        bool dispatched_any = false;
-        // Slots dispatched this pop whose dispatch_fanin must be propagated to
-        // consumers. Deferred until AFTER publish (below) so a flagged producer's
-        // fanout walk never sits between claiming cores and publishing its own
-        // blocks — doing it inline delays this thread's blocks while peer threads
-        // co-dispatching the same SPMD task publish immediately, misaligning the
-        // task's block starts. Bounded by cores.count() ≤ MAX_CLUSTERS dispatches.
-        PTO2TaskSlotState *prop_list[CoreTracker::MAX_CLUSTERS];
-        int prop_n = 0;
-#if PTO2_SCHED_PROFILING
-        uint64_t t_setup_start = get_sys_cnt_aicpu();
-#endif
-
-        // Flush prepared-but-unpublished handles. Required before
-        // `enter_drain_mode` so the drain coordinator sees cores as occupied,
-        // and at the per-task boundary when `any_sync_start` is true.
-        auto flush_publish = [&]() {
-            if (handle_count == 0) return;
-            wmb();
-            uint64_t dispatch_ts = 0;
-#if PTO2_PROFILING
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::AICPU_TIMING) {
-                dispatch_ts = get_sys_cnt_aicpu();
-            }
-#endif
-            for (int i = 0; i < handle_count; i++) {
-                publish_subtask_to_core(handles[i], dispatch_ts);
-            }
-            handle_count = 0;
-            made_progress = true;
-        };
-
-        for (int bi = 0; bi < got; bi++) {
-            PTO2TaskSlotState *slot_state = batch[bi];
-            CoreTracker::BitStates selected_mix_clusters(0ULL);
-
-            if (is_mix) {
-                auto candidates = cores;
-                uint8_t cmask = slot_state->active_mask.core_mask();
-                auto wanted = is_pending ? CoreTracker::MixPlacement::PENDING : CoreTracker::MixPlacement::RUNNING;
-                while (candidates.has_value()) {
-                    int32_t cluster_offset = candidates.pop_first();
-                    if (tracker.classify_mix_cluster(cluster_offset, cmask) == wanted) {
-                        selected_mix_clusters |= CoreTracker::BitStates(1ULL << cluster_offset);
-                    }
-                }
-                if (!selected_mix_clusters.has_value()) {
-                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    continue;
-                }
-            }
-
-            // (Speculative pre-staged tasks never reach this ready-pop: they are
-            // released by their doorbell in release_fanin_and_check_ready the
-            // instant their last producer completes — see try_speculative_release.)
-
-            if (slot_state->active_mask.requires_sync_start()) {
-                if (is_pending) {
-                    sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    continue;
-                }
-                int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
-                if (available < slot_state->logical_block_num) {
-                    flush_publish();
-                    if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) {
-                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-                    }
-                    for (int rem = bi + 1; rem < got; rem++) {
-                        sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
-                    }
-                    entered_drain = true;
-                    break;
-                }
-            }
-
-            if (!cores.has_value()) {
-                flush_publish();
-                sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
-                break;
-            }
-
-            dispatched_any = true;
-            try_pushed = true;
-            // Record for deferred dispatch_fanin propagation after this pop's
-            // blocks are published (see after the loop). propagate's own guard
-            // filters non-flagged slots, so recording unconditionally is cheap.
-            if (prop_n < static_cast<int>(sizeof(prop_list) / sizeof(prop_list[0]))) {
-                prop_list[prop_n++] = slot_state;
-            }
-            // Claim a contiguous range of blocks, hand the slot back to the
-            // ready queue immediately, then perform the expensive dispatches.
-            // This lets other schedulers concurrently claim and dispatch the
-            // remaining blocks of the same SPMD task instead of spinning while
-            // this thread fills all its own cores. Only local `start + b` is
-            // read after the push — `next_block_idx` may already be advanced
-            // by another scheduler that popped the slot.
-            int32_t start = slot_state->next_block_idx.load(std::memory_order_relaxed);
-            int32_t remaining = slot_state->logical_block_num - start;
-            int32_t available = is_mix ? selected_mix_clusters.count() : cores.count();
-            int32_t claim = std::min(available, remaining);
-            slot_state->next_block_idx.store(static_cast<int16_t>(start + claim), std::memory_order_relaxed);
-
-            if (start + claim < slot_state->logical_block_num) {
-                sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
-            }
-
-            for (int32_t b = 0; b < claim; b++) {
-                auto core_offset = is_mix ? selected_mix_clusters.pop_first() : cores.pop_first();
-                if (is_mix) {
-                    cores.clear_bit(core_offset);
-                }
-                handle_count += prepare_block_for_dispatch(
-                    thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]
-                );
-            }
-
-            // Sync_start exclusion: flush per task so prior tasks have head-
-            // start time before any sync_start drain check. Normal batches
-            // fall through and accumulate for one cross-task flush at the
-            // end of the pop.
-            if (any_sync_start) {
-                flush_publish();
-            }
-        }
-
-        flush_publish();
-        // Blocks are published; now propagate dispatch_fanin for any flagged
-        // producers dispatched above (knob A: producer is running). Off the
-        // pre-publish path so it cannot delay or misalign their blocks.
-        for (int i = 0; i < prop_n; i++) {
-            sched_->propagate_dispatch_fanin(*prop_list[i]);
-        }
-#if PTO2_SCHED_PROFILING
-        l2_swimlane.sched_dispatch_setup_cycle += (get_sys_cnt_aicpu() - t_setup_start);
-#endif
-
-        if (!dispatched_any) break;
-
-        if (!cores.has_value()) {
-            cores = is_mix ? tracker.get_cluster_offset_states() : tracker.get_dispatchable_cores(shape, phase);
-        }
-    }
-}
-
-void SchedulerContext::dispatch_ready_tasks(
-    int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES],
-    bool pmu_active, bool &made_progress, bool &try_pushed
-) {
-    using Phase = CoreTracker::DispatchPhase;
-    constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
-
-    // MIX is handled explicitly at the top of each stage; only AIC/AIV cycle
-    // through this 2-elem array, with order toggled by thread parity for
-    // shape-level load balancing across threads.
-    static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
-        {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
-        {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
-    };
-    const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
-
-    // Spill overflow from local_bufs to the shared ready queue BEFORE we start
-    // dispatching. release_fanin's fast path packs all newly-ready consumers
-    // into the producing thread's local_bufs (zero atomic, peer-invisible). For
-    // batch releases (e.g. attn_fence → 50 out_proj consumers) that
-    // overshoots this thread's slot budget so peers are starving while we
-    // hoard. The cross-thread invisibility window between "complete pushes 50
-    // to local" and "IDLE-AIC's mid-phase flush exposes overflow to shared"
-    // is what shows up in the swimlane as the multi-microsecond inter-thread
-    // stagger on out_proj's first wave.
-    //
-    // Gate conditions:
-    //   (a) local count exceeds this thread's per-shape block budget — we
-    //       can't dispatch them all even with both RUNNING+PENDING slots;
-    //   (b) at least one peer has idle cores in this shape — they want work.
-    // Both must hold to avoid wasting a CAS push when we could profitably
-    // self-dispatch the overflow. Condition (b) reads peer CoreTracker
-    // (plain 8-byte load on a rarely-contended cache line, ~5 ns) — we
-    // deliberately avoid ready_queues[s].size() here, which is two atomic
-    // loads on lines pushers + poppers actively bounce.
-    //
-    // Capacity derives from how cores are partitioned across sched threads:
-    //   per-shape budget = (PLATFORM_MAX_BLOCKDIM / active_sched_threads_)
-    //                       × cores_per_blockdim_for_that_shape
-    //   MIX is 1 cluster per block dim, so its budget equals the block-dim
-    //   share without multiplying.
-    //
-    // Push the trailing `excess` slot pointers — O(1) count decrement, no
-    // memmove. push_batch is one CAS for the whole excess; peers see the
-    // batch immediately and can race for them.
-    const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;
-    const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {
-        /*AIC=*/bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
-        /*AIV=*/bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
-        /*MIX=*/bd_per_thread,
-    };
-    for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-        auto &lb = local_bufs[s];
-        int32_t excess = lb.count - thread_capacity[s];
-        if (excess <= 0) continue;
-        if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;
-        sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
-        lb.count -= excess;
-    }
-
-    auto flush_local_bufs = [&]() {
-        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-            auto &lb = local_bufs[s];
-            if (lb.count > 0) {
-                sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
-                lb.count = 0;
-            }
-        }
-    };
-    // Every return path below must flush; wrap in RAII so we cannot forget.
-    // The mid-function flush between IDLE and PENDING is still called
-    // explicitly — guard only covers exit.
-    struct FlushGuard {
-        decltype(flush_local_bufs) &flush_fn;
-        ~FlushGuard() { flush_fn(); }
-    } flush_guard{flush_local_bufs};
-
-    bool entered_drain = false;
-
-    // ===== IDLE stage =====
-    dispatch_shape(
-        thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress,
-        try_pushed
-    );
-    if (entered_drain) return;
-
-    // MIX-IDLE residual: AIC/AIV (both IDLE and PENDING) yield for this pass.
-    // MIX-PENDING below still runs — that is the core of "mix strict priority":
-    // pending slots are spent on mix before AIC/AIV get any chance.
-    bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
-
-    if (!skip_aic_aiv) {
-        for (int i = 0; i < 2; i++) {
-            PTO2ResourceShape s = aic_aiv[i];
-            dispatch_shape(
-                thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
-                try_pushed
-            );
-            if (entered_drain) return;
-        }
-    }
-
-    // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
-    // peer-thread reads see the IDLE-stage release_fanin output.
-    flush_local_bufs();
-
-    if (pmu_active) return;
-
-    // ===== PENDING stage =====
-    // MIX-PENDING gate: skip when a peer has an idle MIX-capable cluster — that
-    // peer's next IDLE-MIX iteration will pull the mix task from the global
-    // queue (already flushed above) at lower latency than us pre-loading a
-    // pending slot here. Forward progress for MIX is preserved: at least one
-    // thread will run MIX-IDLE next pass and consume the residual.
-    //
-    // The gate is NOT subject to skip_aic_aiv — residual mix continues to drain
-    // via pending slots on this thread when no peer is idle.
-    if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX)) {
-        dispatch_shape(
-            thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain,
-            made_progress, try_pushed
-        );
-        if (entered_drain) return;
-    }
-
-    // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
-    // it set; otherwise, escalate iff PENDING-MIX left residual.
-    if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) {
-        skip_aic_aiv = true;
-    }
-
-    // PENDING-MIX may have re-populated AIC/AIV local_bufs via release_fanin
-    // during in-flight completions; flush_guard ensures these don't carry
-    // across to the next iteration's IDLE stage.
-    if (skip_aic_aiv) return;
-
-    // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
-    // will pull from the global queue on its next IDLE pass.
-    for (int i = 0; i < 2; i++) {
-        PTO2ResourceShape s = aic_aiv[i];
-        if (has_idle_in_other_threads(thread_idx, s)) continue;
-        dispatch_shape(
-            thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress,
-            try_pushed
-        );
-        if (entered_drain) return;
-    }
-}
-
-// Stage the ALREADY-CLAIMED range [start, start+count) of consumer `c` onto
-// thread_idx's idle then pending cores. The caller (the queue drain) has advanced
-// next_block_idx by `count` under pop-exclusivity AND re-pushed `c` for peers
-// BEFORE calling this — so this, the expensive prepare+publish, runs CONCURRENTLY
-// with peers staging other ranges of the same consumer. This mirrors the normal
-// SPMD dispatch path (claim range -> store next_block_idx -> re-push -> dispatch).
-// `idle`/`pend` are this thread's free-core sets, sized so idle.count+pend.count >=
-// count (the caller clamped the claim to them), so all `count` blocks get a core.
-//
-// Rule 1: idle cores -> gated task in the RUNNING slot. Rule 2: PENDING slot of
-// cores running a real task -> promoted in when that task FINs (gated-pending Case
-// 3.3 in decide_slot_transition completes the running FIN + promotes instead of
-// waiting for an ack the gated task never sends). Each staged core stays
-// pending_occupied while gated, so no second gated block stacks on it.
-//
-// Self-ring: release flips STAGING->DISPATCHED then rings the mask. A block staged
-// after that flip isn't in the mask release read, so this thread rings it here. The
-// seq_cst order between "OR mask then load spec_state" (here) and "store DISPATCHED
-// then read mask" (release) guarantees every gated core's doorbell fires.
-int32_t SchedulerContext::stage_consumer_blocks(
-    int32_t thread_idx, PTO2TaskSlotState *c, PTO2ResourceShape shape, int32_t start, int32_t count,
-    CoreTracker::BitStates &idle, CoreTracker::BitStates &pend
-) {
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    // Stamp the real pre-stage time (NOT 0) so the swimlane shows these blocks
-    // dispatched during the producer's run, not at trace start.
-    uint64_t early_dispatch_ts = get_sys_cnt_aicpu();
-    uint64_t my_cores[PTO2_SPEC_CORE_MASK_WORDS] = {0};  // cores this thread gated (for self-ring)
-    int32_t staged = 0;
-    int32_t block = start;
-    auto stage_from = [&](CoreTracker::BitStates &avail, bool to_pending) {
-        // Mirror the normal flush_publish (scheduler_dispatch.cpp wmb()+publish loop):
-        // prepare all claimed blocks' payloads, one wmb(), then publish. The wmb
-        // guarantees the not_ready gate + args are globally visible before any
-        // DATA_MAIN_BASE token — without it a gated core can pick up the token and
-        // dcci a stale payload (the doorbell/release path mirrors normal dispatch).
-        PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
-        int n = 0;
-        while (count > 0 && avail.has_value()) {
-            int32_t core_offset = avail.pop_first();
-            n += prepare_block_for_dispatch(thread_idx, core_offset, *c, shape, to_pending, block, &handles[n]);
-            block++;
-            count--;
-            staged++;
-        }
-        if (n == 0) return;
-        wmb();
-        for (int i = 0; i < n; i++) {
-            publish_subtask_to_core(handles[i], early_dispatch_ts);
-            int32_t cid = tracker.get_core_id_by_offset(handles[i].core_offset);
-            sched_->spec_doorbell_table[cid].addr = handles[i].reg_addr;
-            sched_->spec_doorbell_table[cid].token = handles[i].reg_task_id;
-            my_cores[cid >> 6] |= (1ULL << (cid & 63));
-        }
-    };
-    if (idle.has_value()) stage_from(idle, /*to_pending=*/false);
-    if (pend.has_value()) stage_from(pend, /*to_pending=*/true);
-    // Publish all this thread's gated cores into the shared mask in one OR per word
-    // (vs one per subtask) so release sees them; seq_cst keeps the self-ring order.
-    for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
-        if (my_cores[w] != 0) c->payload->staged_core_mask[w].fetch_or(my_cores[w], std::memory_order_seq_cst);
-
-    // If release already flipped DISPATCHED, it may have read the mask before our
-    // bits landed — ring our own cores so none is left gated forever.
-    if (staged > 0 && c->payload->spec_state.load(std::memory_order_seq_cst) == PTO2_SPEC_DISPATCHED) {
-        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++) {
-            uint64_t bits = my_cores[w];
-            while (bits != 0) {
-                int cid = w * 64 + __builtin_ctzll(bits);
-                bits &= bits - 1;
-                PTO2SchedulerState::ring_one_doorbell(
-                    sched_->spec_doorbell_table[cid].addr, sched_->spec_doorbell_table[cid].token
-                );
-            }
-        }
-    }
-    return staged;
-}
-
-// Early-dispatch drain (idle pass). Candidates are pushed to early_dispatch_queue
-// EVENT-DRIVEN by propagate_dispatch_fanin (a flagged producer's dispatch bumps its
-// consumers' dispatch_fanin; reaching fanin_count enqueues the consumer) — there is
-// no per-iteration PULL scan here anymore. This pass only DRAINS the queue.
-// Returns the number of blocks staged this pass (for the EarlyDispatch swimlane bar).
-int32_t SchedulerContext::try_speculative_early_dispatch(int32_t thread_idx) {
-    constexpr int PTO2_EARLY_DISPATCH_DRAIN_MAX = 8;  // bounded pops per pass
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    int32_t total_staged = 0;
-
-    // Drain the queue — mirrors the normal SPMD dispatch path. Pop a consumer,
-    // CLAIM a range sized to THIS thread's free cores by advancing next_block_idx with
-    // a CAS (atomic — next_block_idx is shared with normal dispatch, which also claims
-    // it if release routes the consumer to the ready queue, so a plain store could
-    // double-dispatch), RE-PUSH it for peers, THEN do the expensive prepare+publish.
-    // Re-pushing before staging lets peers claim the next range and stage CONCURRENTLY
-    // — a wide consumer (online_softmax, 48 blocks) is filled by all idle threads in
-    // parallel instead of a serial winner-then-peer daisy chain. Bounded pops/pass.
-    for (int n = 0; n < PTO2_EARLY_DISPATCH_DRAIN_MAX; n++) {
-        PTO2TaskSlotState *c = sched_->early_dispatch_queue.pop();
-        if (c == nullptr) break;
-        if (c->payload->spec_state.load(std::memory_order_acquire) != PTO2_SPEC_STAGING) continue;  // released
-        PTO2ResourceShape shape = c->active_mask.to_shape();
-        auto idle = tracker.get_idle_core_offset_states(shape);
-        auto pend = tracker.get_pending_core_offset_states(shape);
-        int32_t freecores = (idle.has_value() ? idle.count() : 0) + (pend.has_value() ? pend.count() : 0);
-        if (freecores == 0) {  // no free cores of this shape — give it back for peers and stop
-            sched_->early_dispatch_queue.push(c);
-            break;
-        }
-        // CAS-claim a contiguous range [start, start+claim) sized to this thread's
-        // free cores; CAS keeps it atomic against peers AND normal dispatch.
-        int32_t start = 0, claim = 0;
-        while (true) {
-            int16_t cur = c->next_block_idx.load(std::memory_order_relaxed);
-            if (cur >= c->logical_block_num) break;  // fully claimed
-            int32_t cnt = c->logical_block_num - cur;
-            if (cnt > freecores) cnt = freecores;
-            if (c->next_block_idx.compare_exchange_weak(
-                    cur, static_cast<int16_t>(cur + cnt), std::memory_order_seq_cst, std::memory_order_relaxed
-                )) {
-                start = cur;
-                claim = cnt;
-                break;
-            }
-        }
-        if (claim == 0) continue;  // nothing left to claim -> drop (no re-push)
-        // Re-push for concurrent peers BEFORE the expensive staging.
-        if (start + claim < c->logical_block_num) {
-            if (!sched_->early_dispatch_queue.push(c))
-                LOG_INFO_V9(
-                    "[SPEC] queue full on re-push, consumer=%" PRId64, static_cast<int64_t>(c->task->task_id.raw)
-                );
-        }
-        total_staged += stage_consumer_blocks(thread_idx, c, shape, start, claim, idle, pend);
-    }
-    return total_staged;
-}
-
-// =============================================================================
-// Main scheduler dispatch loop
-// =============================================================================
-
-int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_idx) {
-    always_assert(sched_ != nullptr);
-    CoreTracker &tracker = core_trackers_[thread_idx];
-    LOG_INFO_V0("Thread %d: resolve_and_dispatch entry", thread_idx);
-
-    PTO2SharedMemoryHeader *header = sched_->sm_header;
-    if (!header) {
-        LOG_ERROR("PTO2 dispatch: header is null");
-        return -1;
-    }
-    LOG_INFO_V0(
-        "Thread %d: header=%p, task_desc_offset[0]=%lu, window_size=%lu", thread_idx, static_cast<void *>(header),
-        static_cast<uint64_t>(header->rings[0].task_descriptors_offset),
-        static_cast<uint64_t>(header->rings[0].task_window_size)
-    );
-
-    Handshake *hank = static_cast<Handshake *>(runtime->workers);
-    LOG_INFO_V0(
-        "Thread %d: hank=%p, window_size=%lu", thread_idx, static_cast<void *>(hank),
-        static_cast<uint64_t>(header->rings[0].task_window_size)
-    );
-
-    // One-time init: assign perf buffers (one thread does it; others wait)
-    if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) {
-        LOG_INFO_V0("Thread %d: doing one-time init", thread_idx);
-
-#if PTO2_PROFILING
-        if (is_dump_args_enabled()) {
-            dump_args_init(orch_to_sched_ ? aicpu_thread_num_ : sched_thread_num_);
-        }
-#endif
-
-#if PTO2_PROFILING
-        // Initialize PMU: program events, start counters, and pop initial buffers
-        if (is_pmu_enabled()) {
-            pmu_aicpu_init(physical_core_ids_, cores_total_num_);
-            LOG_INFO_V0("PMU profiling started on %d cores", cores_total_num_);
-        }
-#endif
-
-        LOG_INFO_V0("Thread %d: one-time init done", thread_idx);
-        pto2_init_complete_.store(true, std::memory_order_release);
-    } else {
-        while (!pto2_init_complete_.load(std::memory_order_acquire)) {
-            SPIN_WAIT_HINT();
-        }
-    }
-
-    LOG_INFO_V0("Thread %d: PTO2 dispatch starting with %d cores", thread_idx, core_trackers_[thread_idx].core_num());
-    int32_t cur_thread_completed = 0;
-    // Non-zero once a scheduler-hang timeout latches; returned in place of the
-    // completed count so the caller still sees the negative error rc while the
-    // shared end-of-loop flush below runs.
-    int32_t timeout_rc = 0;
-    int32_t idle_iterations = 0;
-    int32_t last_progress_count = 0;
-#if PTO2_PROFILING
-    auto &l2_swimlane = sched_l2_swimlane_[thread_idx];
-    l2_swimlane.reset();
-    l2_swimlane.l2_swimlane_enabled = (l2_swimlane_level_ != L2SwimlaneLevel::DISABLED);
-#endif
-
-    constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
-    PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
-    PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
-    for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
-    }
-    PTO2TaskSlotState *deferred_release_slot_states[PTO2_DEFERRED_RELEASE_CAP];
-    int32_t deferred_release_count = 0;
-
-    bool cores_released = false;
-
-    // PMU runs require single-issue dispatch — overlapping in-flight tasks
-    // pollute per-task PMU counters, so skip the PENDING pre-load phase.
-    // Cached at function scope: is_pmu_enabled() is extern "C" and the
-    // compiler cannot hoist it across the dispatch loop on its own.
-    const bool pmu_active = is_pmu_enabled();
-
-#if PTO2_PROFILING
-    l2_swimlane.sched_start_ts = get_sys_cnt_aicpu();
-#endif
-
-#if PTO2_PROFILING
-    // Queue-depth snapshot carried across the iteration boundary: each phase
-    // emit consumes (phase_start_*) and refreshes them with its own end snapshot
-    // so the next phase's "at_start" equals the previous phase's "at_end".
-    //
-    // L2SWIMLANE_NUM_QUEUE_SHAPES (3) matches PTO2_NUM_RESOURCE_SHAPES: AIC/AIV/MIX.
-    //
-    // **Hot-path cost discipline.** Local depth (this thread's PTO2LocalReadyBuffer)
-    // is a single int read on a register-cached stack — free. Shared depth
-    // (PTO2ReadyQueue::size) is two atomic relaxed loads against cache lines
-    // that all peer sched threads also write to (enqueue_pos and dequeue_pos
-    // bounce on every flush_local_bufs + every pop). With both phases emitting
-    // per iter that's 12 cross-core loads × thousands of iters per run, a
-    // measurable AICPU slowdown. Mitigation: lazy + per-iter cached shared
-    // snapshot, refreshed at most once per iteration. The complete-emit and
-    // dispatch-emit in the same iter both reuse the same shared sample; the
-    // big transitions (local→shared flush) still show up across iter boundaries.
-    static_assert(
-        L2SWIMLANE_NUM_QUEUE_SHAPES == PTO2_NUM_RESOURCE_SHAPES,
-        "queue snapshot width must match runtime resource shape count"
-    );
-    int16_t phase_start_local[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    int16_t phase_start_shared[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    int16_t iter_shared_snapshot[L2SWIMLANE_NUM_QUEUE_SHAPES] = {0};
-    bool iter_shared_sampled = false;
-    auto capture_local_snapshot = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
-        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-            local_out[s] = static_cast<int16_t>(local_bufs[s].count);
-        }
-    };
-    auto get_or_sample_shared = [&]() -> const int16_t * {
-        if (!iter_shared_sampled) {
-            // Clamp to int16_t max before narrowing. PTO2_PROF_READYQUEUE_SIZE
-            // is in the low thousands today but could grow with platform
-            // scaling — without clamp, sizes above 32767 wrap to negatives
-            // and silently corrupt the snapshot.
-            constexpr size_t kMax = static_cast<size_t>(std::numeric_limits<int16_t>::max());
-            for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                const size_t qsize = sched_->ready_queues[s].size();
-                iter_shared_snapshot[s] = static_cast<int16_t>(std::min(qsize, kMax));
-            }
-            iter_shared_sampled = true;
-        }
-        return iter_shared_snapshot;
-    };
-    auto capture_phase_end = [&](int16_t local_out[L2SWIMLANE_NUM_QUEUE_SHAPES],
-                                 int16_t shared_out[L2SWIMLANE_NUM_QUEUE_SHAPES]) {
-        capture_local_snapshot(local_out);
-        const int16_t *shared_cached = get_or_sample_shared();
-        for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++)
-            shared_out[s] = shared_cached[s];
-    };
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        capture_phase_end(phase_start_local, phase_start_shared);
-    }
-#endif
-
-    // Wall-clock timestamp of the last completed task on this thread.
-    // Updated on made_progress; consulted to decide whether the wall-clock
-    // budget for declaring a scheduler hang has elapsed. Initialized to
-    // "now" so the first budget cycle starts when this thread does, not at
-    // an undefined value.
-    uint64_t last_progress_ts = get_sys_cnt_aicpu();
-
-    while (true) {
-        if (completed_.load(std::memory_order_acquire)) {
-            break;
-        }
-        bool made_progress = false;
-#if PTO2_PROFILING
-        CYCLE_COUNT_START();
-        l2_swimlane.sched_loop_count++;
-        uint64_t _t0_phase = _t0;
-        // Release is the only "no Complete/Dispatch bar" attribution we keep —
-        // emitted with its own span in the idle branch below. Iterations that
-        // only scan/poll show as blank gaps; the per-loop Poll/Scan bars (PR
-        // #1079 debug overlay) were removed since "scheduler is polling when
-        // there's nothing to do" carries no actionable signal.
-        // Per-iter lazy shared-queue snapshot: first phase emit in this iter
-        // pays the atomic-load cost, subsequent emits in the same iter reuse
-        // the cached value. Reset here so we re-sample exactly once per iter
-        // (or skip entirely on iters with no phase emit).
-        iter_shared_sampled = false;
-#endif
-        int32_t task_count = 0;
-        if (!tracker.has_any_running_cores()) {
-            LoopAction action = handle_orchestrator_exit(thread_idx, header, runtime, task_count);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
-        if (!cores_released && orch_to_sched_) {
-            LoopAction action = handle_core_transition(cores_released);
-            if (action == LoopAction::BREAK_LOOP) break;
-        }
-
-#if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-#endif
-
-        // Phase 1: Check running cores for completion
-        int32_t completed_this_turn = 0;
-
-        bool try_completed = tracker.has_any_running_cores();
-        if (try_completed) {
-            check_running_cores_for_completion(
-                thread_idx, hank, completed_this_turn, cur_thread_completed, made_progress,
-                deferred_release_slot_states, deferred_release_count, local_bufs
-            );
-        }
-        if (completed_this_turn > 0) {
-#if PTO2_SCHED_PROFILING
-            sched_->tasks_completed.fetch_add(completed_this_turn, std::memory_order_relaxed);
-#endif
-            int32_t prev = completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
-            int32_t new_total = prev + completed_this_turn;
-            last_progress_count = new_total;
-            if (thread_idx == 0 && task_count > 0) {
-                if (new_total <= PROGRESS_VERBOSE_THRESHOLD ||
-                    new_total / PROGRESS_LOG_INTERVAL != prev / PROGRESS_LOG_INTERVAL || new_total >= task_count) {
-                    LOG_INFO_V9(
-                        "PTO2 progress: completed=%d total=%d (%.1f%%)", new_total, task_count,
-                        100.0 * new_total / task_count
-                    );
-                }
-            }
-        }
-
-        if (rt_ != nullptr && rt_->aicore_mailbox != nullptr &&
-            (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending())) {
-            AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(
-                rt_->aicore_mailbox, sched_, local_bufs, deferred_release_slot_states, deferred_release_count,
-                PTO2_DEFERRED_RELEASE_CAP
-#if PTO2_SCHED_PROFILING
-                ,
-                thread_idx
-#endif
-            );
-            if (poll_result.error_code != PTO2_ERROR_NONE) {
-                int32_t expected = PTO2_ERROR_NONE;
-                header->sched_error_code.compare_exchange_strong(
-                    expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire
-                );
-                completed_.store(true, std::memory_order_release);
-                break;
-            }
-            if (poll_result.completed > 0) {
-#if PTO2_SCHED_PROFILING
-                sched_->tasks_completed.fetch_add(poll_result.completed, std::memory_order_relaxed);
-#endif
-                int32_t prev = completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
-                int32_t new_total = prev + poll_result.completed;
-                last_progress_count = new_total;
-                made_progress = true;
-            }
-        }
-
-#if PTO2_PROFILING
-        if (!try_completed) {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-        } else {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_complete_cycle);
-            // Emit on any completion work this iteration — a finished slot OR
-            // sub-block retires that did not finish a slot. The latter makes the
-            // SPMD harvest tail visible (count field = blocks processed this
-            // iteration; on a pure-retire iteration phase_complete_count is 0).
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES &&
-                (l2_swimlane.phase_complete_count > 0 || l2_swimlane.phase_subretire_count > 0)) {
-                // Local depth is cheap (this thread's own buffer counter).
-                // Shared depth is NOT sampled here: complete's release_fanin
-                // pushes to local_bufs in the fast path (try_push succeeds
-                // until cap=64). Shared only changes on dispatch's flush
-                // path. Carrying phase_start_shared forward as end_shared
-                // is the right answer 99% of the time AND skips three
-                // contended atomic loads per emit.
-                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                capture_local_snapshot(phase_end_local);
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Complete, _t0_phase, _t1, l2_swimlane.sched_loop_count,
-                    l2_swimlane.phase_complete_count + l2_swimlane.phase_subretire_count, /*pop_hit=*/0,
-                    /*pop_miss=*/0, phase_start_local, phase_start_shared, phase_end_local, phase_start_shared
-                );
-                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                    phase_start_local[s] = phase_end_local[s];
-                    // phase_start_shared unchanged — carried forward
-                }
-                _t0_phase = _t1;
-                l2_swimlane.phase_complete_count = 0;
-                l2_swimlane.phase_subretire_count = 0;
-            }
-        }
-#endif
-
-        bool try_pushed = false;
-
-        // Phase 2 drain check
-        if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0) {
-            handle_drain_mode(thread_idx);
-            continue;
-        }
-
-        // Phase 3: Drain wiring queue (thread 0 only)
-        if (thread_idx == 0) {
-            int wired = sched_->drain_wiring_queue(orchestrator_done_);
-            if (wired > 0) {
-                made_progress = true;
-#if PTO2_SCHED_PROFILING
-                l2_swimlane.phase_wiring_count += wired;
-#endif
-            }
-        }
-#if PTO2_PROFILING
-        CYCLE_COUNT_LAP(l2_swimlane.sched_wiring_cycle);
-#endif
-
-        // Phase 3b: Drain dummy ready queue (thread 0 only).
-        //
-        // Dependency-only tasks bypass AICore dispatch: they go through the
-        // scheduler so fanin/fanout edges stay consistent, but completion is
-        // signalled inline here. Pinned to thread 0 to avoid cross-thread
-        // races and to keep cache hot near the wiring drain above.
-        if (thread_idx == 0) {
-            constexpr int DUMMY_DRAIN_BATCH = 16;
-            PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
-            int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
-            for (int di = 0; di < dummy_got; di++) {
-                PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
-
-                // ----- DummyTask phase: dummy "task" identity marker. --------
-                // The dummy has no AICore presence — start ≈ end (1 cycle
-                // wide, just "we identified it"). Converter renders this on
-                // Worker View's DUMMY_T{thread} lane so the DAG node is
-                // visually present. tasks_processed = task_token low 32 bits
-                // (= local_id within ring) so deps.json flow arrows can land.
-                // The Resolve work that follows is emitted separately below.
-#if PTO2_PROFILING
-                if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-                    uint64_t dummy_marker_t = get_sys_cnt_aicpu();
-                    uint32_t dummy_id_low32 = static_cast<uint32_t>(dummy_slot.task->task_id.raw & 0xFFFFFFFFu);
-                    l2_swimlane_aicpu_record_sched_phase(
-                        thread_idx, L2SwimlaneSchedPhaseKind::DummyTask, dummy_marker_t, dummy_marker_t,
-                        sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_id_low32
-                    );
-                }
-#endif
-
-                // ----- Resolve work: walk this dummy's consumer list. ------
-                // Same 1 µs filter as the main-path Resolve emit suppresses
-                // dummies whose consumer release runs sub-microsecond.
-#if PTO2_PROFILING
-                uint64_t dummy_resolve_t0 =
-                    (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) ? get_sys_cnt_aicpu() : 0;
-#endif
-                // [[maybe_unused]] silences -Werror=unused-but-set-variable on
-                // the profiling-flags-smoke build path where PTO2_PROFILING is
-                // OFF and the Resolve emit below is excluded.
-                [[maybe_unused]] uint32_t dummy_consumers = 0;
-#if PTO2_SCHED_PROFILING
-                dummy_consumers = sched_->on_task_complete(dummy_slot, thread_idx, local_bufs).fanout_edges;
-#else
-                dummy_consumers = sched_->on_task_complete(dummy_slot, local_bufs);
-#endif
-#if PTO2_PROFILING
-                if (dummy_resolve_t0 != 0) {
-                    uint64_t dummy_resolve_t1 = get_sys_cnt_aicpu();
-                    constexpr uint64_t RESOLVE_EMIT_MIN_CYCLES = PLATFORM_PROF_SYS_CNT_FREQ / 1'000'000;  // 1 µs
-                    if (dummy_resolve_t1 - dummy_resolve_t0 >= RESOLVE_EMIT_MIN_CYCLES) {
-                        l2_swimlane_aicpu_record_sched_phase(
-                            thread_idx, L2SwimlaneSchedPhaseKind::Resolve, dummy_resolve_t0, dummy_resolve_t1,
-                            sched_l2_swimlane_[thread_idx].sched_loop_count, dummy_consumers
-                        );
-                    }
-                }
-#endif
-                // Dummy tasks have no subtasks to retire and no fanout pre-conditions
-                // beyond their own producers; release self-reference so the slot can
-                // reach CONSUMED once all consumers drain.
-                deferred_release_slot_states[deferred_release_count++] = &dummy_slot;
-                if (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP) {
-                    while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                        (void)sched_->on_task_release(
-                            *deferred_release_slot_states[--deferred_release_count], thread_idx
-                        );
-#else
-                        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-                    }
-                }
-                int32_t prev = completed_tasks_.fetch_add(1, std::memory_order_relaxed);
-                last_progress_count = prev + 1;
-                cur_thread_completed++;
-            }
-            if (dummy_got > 0) {
-                made_progress = true;
-            }
-        }
-
-        // Phase 4: MIX-strict-priority dispatch with phase-split and
-        // cross-thread idle gating. See dispatch_ready_tasks for the policy.
-        dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress, try_pushed);
-
-        // Phase 4b: early-dispatch onto spare cores, but ONLY when this thread is
-        // otherwise idle — nothing was dispatched this iteration AND no ready work is
-        // queued for any shape. Early-dispatch competes with normal dispatch for
-        // pending slots, so gating on "no ready work" keeps it from delaying a real
-        // ready task; skipping the producer-fanout scan when busy also removes its
-        // per-iteration cost (the discovery walk only runs on genuinely idle passes).
-        bool any_ready_work = try_pushed;
-        for (int s = 0; !any_ready_work && s < PTO2_NUM_RESOURCE_SHAPES; s++) {
-            if (sched_->ready_queues[s].size() > 0 || local_bufs[s].count > 0) any_ready_work = true;
-        }
-#if PTO2_PROFILING
-        bool early_dispatch_record = l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES;
-        uint64_t early_dispatch_t0 = early_dispatch_record ? get_sys_cnt_aicpu() : 0;
-#endif
-        // Skip speculative early-dispatch under PMU: dispatch_ready_tasks already
-        // withholds PENDING dispatch when pmu_active to preserve single-issue PMU
-        // windows, and staging gated work into idle/pending slots would perturb the
-        // same windows.
-        [[maybe_unused]] int32_t staged_count =
-            (pmu_active || any_ready_work) ? 0 : try_speculative_early_dispatch(thread_idx);
-#if PTO2_PROFILING
-        // Emit an EarlyDispatch bar so a staging-dominated iteration is attributed
-        // to early-dispatch rather than disappearing into a blank gap.
-        if (early_dispatch_record && staged_count > 0) {
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::EarlyDispatch, early_dispatch_t0, get_sys_cnt_aicpu(),
-                sched_l2_swimlane_[thread_idx].sched_loop_count, static_cast<uint32_t>(staged_count)
-            );
-            // prepare_block_for_dispatch bumped phase_dispatch_count while staging;
-            // those blocks belong to this EarlyDispatch bar, so clear the counter
-            // before it leaks into the next Dispatch bar.
-            sched_l2_swimlane_[thread_idx].phase_dispatch_count = 0;
-        }
-#endif
-
-        // Second completion poll. dispatch_ready_tasks + try_speculative_early_dispatch
-        // above can take several us in a busy window; a producer block that FINs
-        // during them would otherwise wait for the NEXT iteration's top-of-loop
-        // Phase-1 poll (the ~7us detection latency that delays a flagged
-        // producer's doorbell). Re-polling here observes those FINs immediately,
-        // so the doorbell fires this iteration. Idempotent (the poll is a poll);
-        // we drain deferred releases eagerly to keep the buffer from growing.
-        if (tracker.has_any_running_cores()) {
-            int32_t completed_2nd = 0;
-            check_running_cores_for_completion(
-                thread_idx, hank, completed_2nd, cur_thread_completed, made_progress, deferred_release_slot_states,
-                deferred_release_count, local_bufs
-            );
-            if (completed_2nd > 0) {
-#if PTO2_SCHED_PROFILING
-                sched_->tasks_completed.fetch_add(completed_2nd, std::memory_order_relaxed);
-#endif
-                completed_tasks_.fetch_add(completed_2nd, std::memory_order_relaxed);
-                last_progress_count = completed_tasks_.load(std::memory_order_relaxed);
-            }
-            // Eager drain so the second poll can't push deferred_release toward
-            // its cap between idle iterations.
-            while (deferred_release_count >= PTO2_DEFERRED_RELEASE_CAP - 96) {
-#if PTO2_SCHED_PROFILING
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-        }
-
-#if PTO2_PROFILING
-        if (!try_pushed) {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-        } else {
-            CYCLE_COUNT_LAP(l2_swimlane.sched_dispatch_cycle);
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && l2_swimlane.phase_dispatch_count > 0) {
-                // Final-drain at loop end emits the trailing-idle tail so
-                // sum-of-deltas == run-cumulative.
-                uint64_t pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
-                uint64_t pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
-                // L2SwimlaneAicpuSchedPhaseRecord's pop_hit / pop_miss are uint32 — a delta that overflows means
-                // an emit was missed for ~4 billion pops, which is well outside any
-                // realistic dispatch cadence and silently truncates without this guard.
-                debug_assert(pop_hit_delta < (1ULL << 32));
-                debug_assert(pop_miss_delta < (1ULL << 32));
-                int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
-                capture_phase_end(phase_end_local, phase_end_shared);
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, _t0_phase, _t1, l2_swimlane.sched_loop_count,
-                    l2_swimlane.phase_dispatch_count, static_cast<uint32_t>(pop_hit_delta),
-                    static_cast<uint32_t>(pop_miss_delta), phase_start_local, phase_start_shared, phase_end_local,
-                    phase_end_shared
-                );
-                for (int s = 0; s < L2SWIMLANE_NUM_QUEUE_SHAPES; s++) {
-                    phase_start_local[s] = phase_end_local[s];
-                    phase_start_shared[s] = phase_end_shared[s];
-                }
-                _t0_phase = _t1;
-                l2_swimlane.phase_dispatch_count = 0;
-                l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
-                l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
-            }
-        }
-#endif
-
-#if !PTO2_PROFILING
-        (void)try_completed;
-        (void)try_pushed;
-#endif
-
-        if (made_progress) {
-            idle_iterations = 0;
-            last_progress_ts = get_sys_cnt_aicpu();
-        } else {
-#if PTO2_PROFILING
-            uint64_t rel_t0 = (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES && deferred_release_count > 0) ?
-                                  get_sys_cnt_aicpu() :
-                                  0;
-#endif
-            while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-                (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-                sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-            }
-#if PTO2_PROFILING
-            // Release is a distinct operation from the poll scan — emit it with
-            // its own span (Perfetto nests it inside the surrounding poll/idle
-            // run by time-containment) rather than competing with poll for one
-            // per-iteration label.
-            if (rel_t0 != 0) {
-                l2_swimlane_aicpu_record_sched_phase(
-                    thread_idx, L2SwimlaneSchedPhaseKind::Release, rel_t0, get_sys_cnt_aicpu(),
-                    l2_swimlane.sched_loop_count, /*tasks_processed=*/0
-                );
-            }
-#endif
-            idle_iterations++;
-
-            if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0) {
-                LoopAction action = check_idle_fatal_error(thread_idx, header, runtime);
-                if (action == LoopAction::BREAK_LOOP) break;
-            }
-
-            if (idle_iterations % STALL_LOG_INTERVAL == 0) {
-                log_stall_diagnostics(thread_idx, total_tasks_, idle_iterations, last_progress_count);
-            }
-            // Wall-clock budget gate, with two fatal-latch branches:
-            //
-            // 1. Self owns a RUNNING task — first-hand evidence the
-            //    dispatch is stuck. Latch.
-            // 2. No thread anywhere owns a RUNNING task AND tasks remain
-            //    unfinished — the system is in a pre-dispatch / WAIT-only
-            //    deadlock (e.g. dependency cycle). Ownerless idle threads
-            //    are the only observers; let this one latch on the global
-            //    evidence (`completed_tasks_ < total_tasks_` and
-            //    `no_thread_owns_running_task()`).
-            //
-            // Otherwise: a sibling thread owns a RUNNING task but hasn't
-            // hit its own budget yet (typical distributed startup-skew
-            // case) — refresh last_progress_ts and keep spinning. The
-            // STALL diagnostic above still fires periodically so
-            // observability is preserved.
-            if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES) {
-                bool self_owns = self_owns_running_task(thread_idx);
-                bool global_stuck = !self_owns && total_tasks_ > 0 &&
-                                    completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ &&
-                                    no_thread_owns_running_task();
-                if (self_owns || global_stuck) {
-                    // Latch the error + emergency_shutdown, then break to the
-                    // shared end-of-loop cleanup so the diagnostic buffers get
-                    // flushed to the host. An early return here would strand the
-                    // stuck task's already-dumped inputs and every completed
-                    // task's in/out records in the unflushed per-thread dump
-                    // buffer — exactly the state we need to triage the hang.
-                    timeout_rc = handle_timeout_exit(
-                        thread_idx, header, runtime, idle_iterations, last_progress_count
-#if PTO2_PROFILING
-                        ,
-                        l2_swimlane.sched_start_ts
-#endif
-                    );
-                    break;
-                }
-                last_progress_ts = get_sys_cnt_aicpu();
-            }
-            SPIN_WAIT_HINT();
-#if PTO2_PROFILING
-            CYCLE_COUNT_LAP(l2_swimlane.sched_idle_cycle);
-            // _t0_phase advances through idle laps so the next emitted
-            // COMPLETE/DISPATCH bar starts at the iter it actually ran in, not
-            // at the start of the preceding idle stretch. The idle/poll time
-            // itself is attributed by the activity-fill below — no blanks.
-            if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-                _t0_phase = _t1;
-            }
-#endif
-        }
-    }
-
-    // Drain any entries left in the deferred-release batch. The in-loop flush
-    // only fires on idle iterations and on buffer-full; a loop exit while the
-    // last iteration made progress can leave entries un-released. Drop them
-    // here so every consumed producer slot completes its on_task_release
-    // regardless of which loop-exit path fired.
-    while (deferred_release_count > 0) {
-#if PTO2_SCHED_PROFILING
-        (void)sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count], thread_idx);
-#else
-        sched_->on_task_release(*deferred_release_slot_states[--deferred_release_count]);
-#endif
-    }
-
-#if PTO2_PROFILING
-    // Final-drain: emit any pop_hit / pop_miss accrued since the last
-    // dispatch emit (typically the trailing idle loops while waiting for
-    // orchestrator_done_) as a zero-duration synthetic dispatch record so
-    // sum(record.pop_*) reconciles with the run-cumulative counter.
-    // Gate on SCHED_PHASES — at lower levels the phase buffer is never
-    // flushed (see below), so writing this record would be wasted work.
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        uint64_t final_pop_hit_delta = l2_swimlane.pop_hit - l2_swimlane.pop_hit_at_last_emit;
-        uint64_t final_pop_miss_delta = l2_swimlane.pop_miss - l2_swimlane.pop_miss_at_last_emit;
-        debug_assert(final_pop_hit_delta < (1ULL << 32));
-        debug_assert(final_pop_miss_delta < (1ULL << 32));
-        if (final_pop_hit_delta != 0 || final_pop_miss_delta != 0) {
-            uint64_t t_now = get_sys_cnt_aicpu();
-            int16_t phase_end_local[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            int16_t phase_end_shared[L2SWIMLANE_NUM_QUEUE_SHAPES];
-            capture_phase_end(phase_end_local, phase_end_shared);
-            l2_swimlane_aicpu_record_sched_phase(
-                thread_idx, L2SwimlaneSchedPhaseKind::Dispatch, t_now, t_now, l2_swimlane.sched_loop_count, 0,
-                static_cast<uint32_t>(final_pop_hit_delta), static_cast<uint32_t>(final_pop_miss_delta),
-                phase_end_local, phase_end_shared, phase_end_local, phase_end_shared
-            );
-            l2_swimlane.pop_hit_at_last_emit = l2_swimlane.pop_hit;
-            l2_swimlane.pop_miss_at_last_emit = l2_swimlane.pop_miss;
-        }
-    }
-    log_l2_swimlane_summary(thread_idx, cur_thread_completed);
-#endif
-
-#if PTO2_PROFILING
-    if (l2_swimlane.l2_swimlane_enabled) {
-        l2_swimlane_aicpu_flush(
-            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
-        );
-        if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
-        }
-    }
-#endif
-#if PTO2_PROFILING
-    if (is_dump_args_enabled()) {
-        dump_args_flush(thread_idx);
-    }
-#endif
-#if PTO2_PROFILING
-    if (is_pmu_enabled()) {
-        pmu_aicpu_flush_buffers(
-            thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
-        );
-    }
-#endif
-
-    return timeout_rc != 0 ? timeout_rc : cur_thread_completed;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
deleted file mode 100644
index f1dc5d7f8..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_types.h
+++ /dev/null
@@ -1,468 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-#ifndef SCHEDULER_TYPES_H
-#define SCHEDULER_TYPES_H
-
-#include <atomic>
-#include <cstdint>
-
-#include "common/core_type.h"
-#include "common/platform_config.h"
-#include "pto_runtime2_types.h"
-#include "spin_hint.h"
-
-// =============================================================================
-// Profiling macros (compile-time gated)
-// =============================================================================
-
-#if PTO2_PROFILING
-#include "aicpu/device_time.h"
-// Accumulated nanoseconds per sub-step
-#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-#else
-#define CYCLE_COUNT_START()
-#define CYCLE_COUNT_LAP(acc)
-#endif
-
-// =============================================================================
-// Scheduler constants
-// =============================================================================
-
-constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
-
-// Periodic cadence (in idle iterations) for emitting the per-thread STALL
-// diagnostic while no progress is being made. Purely an observability knob,
-// independent of the wall-clock timeout below: small enough to fire a few times
-// before the budget expires, large enough not to flood device_log.
-constexpr int32_t STALL_LOG_INTERVAL = 480000;
-constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
-
-// Wall-clock budget for declaring "no progress = scheduler timeout". Replaces
-// the per-thread iteration-count cap that once lived here as MAX_IDLE_ITERATIONS
-// for the fatal-latch decision; STALL_LOG_INTERVAL above keeps the per-thread
-// diagnostic cadence.
-//
-// Using wall-clock here is load-bearing for distributed runs: with per-thread
-// iteration counts, a pure-idle thread spinning ~115 ns/iter hits the cap in
-// ~92 ms while a sibling thread polling a RUNNING task takes ~200 ms for the
-// same iteration count. The fast spinner racing ahead and latching fatal
-// kills the slower-but-correct poller mid-poll — see the distributed
-// startup-skew scenario in issue #897.
-//
-// The budget is platform-defined (PLATFORM_SCHEDULER_TIMEOUT_MS in spin_hint.h)
-// because the safe value differs per variant: onboard trims it to 2 s so the
-// AICPU detects a hang and flushes its diagnostics (tensor dump, in-flight
-// partial output) before STARS reaps the op and poisons the context (chain:
-// this < op-exec < host stream-sync, platform_config.h); sim has no STARS to
-// race and keeps the full 5 s #897 headroom. See spin_hint.h for the per-variant
-// rationale.
-constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
-constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES =
-    static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
-constexpr int32_t STALL_DUMP_READY_MAX = 8;
-constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
-constexpr int32_t STALL_DUMP_CORE_MAX = 8;
-constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
-constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
-
-// =============================================================================
-// Control flow signal from cold-path helpers back to the main dispatch loop.
-// =============================================================================
-
-enum class LoopAction : int8_t {
-    NONE,        // cold path did not trigger; proceed normally
-    BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
-};
-
-// =============================================================================
-// Per-core state: one cache line per core to eliminate false sharing
-// and co-locate all hot-path fields for minimal cache misses.
-// Dual-slot layout: running (currently executing) + pending (pre-loaded, awaiting hardware pickup).
-// =============================================================================
-
-struct alignas(64) CoreExecState {
-    // --- Hot fields (completion + dispatch, every iteration) ---
-    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
-    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
-    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
-    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
-    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
-    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
-    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
-    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
-    uint8_t pad0_[2];                       // offset 38: alignment padding
-    // Precomputed COND register pointer; resolved once in handshake so the
-    // hot completion poll does a single volatile load instead of recomputing
-    // reg_base + reg_offset(COND) on every iteration.
-    volatile uint32_t *cond_ptr;  // offset 40: precomputed pointer to COND register
-#if PTO2_PROFILING
-    // --- Profiling fields (dispatch path, compile-time gated) ---
-    uint64_t running_dispatch_timestamp;  // offset 48: AICPU dispatch timestamp for running task
-    uint64_t pending_dispatch_timestamp;  // offset 56: AICPU dispatch timestamp for pending task
-#else
-    // --- Cold fields (init/diagnostics only, never in hot path) ---
-    int32_t worker_id;          // offset 48: index in runtime.workers[]
-    uint32_t physical_core_id;  // offset 52: hardware physical core ID
-    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
-    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
-#endif
-};
-static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
-
-// =============================================================================
-// CoreTracker: cluster-based bitmask tracker for idle/running core state.
-//
-// core_states_ encodes per-cluster core idle/running in 3 bits per cluster:
-//   bit i*3   = AIC of cluster i   (1 = idle, 0 = running)
-//   bit i*3+1 = AIV0 of cluster i
-//   bit i*3+2 = AIV1 of cluster i
-// Max 21 clusters per tracker (63 bits in uint64_t).
-// =============================================================================
-
-class alignas(64) CoreTracker {
-public:
-    static inline int32_t MAX_CORE_PER_THREAD = 63;
-    static constexpr int32_t MAX_CLUSTERS = 63 / 3;
-
-public:
-    CoreTracker() = default;
-
-    class BitStates {
-    public:
-        BitStates() = default;
-
-        explicit BitStates(uint64_t states) :
-            states_(states) {}
-        void init() { states_ = 0; }
-
-        BitStates operator~() const { return BitStates(~states_); }
-        BitStates operator&(const BitStates &other) const { return BitStates(states_ & other.states_); }
-        BitStates operator|(const BitStates &other) const { return BitStates(states_ | other.states_); }
-        BitStates operator^(const BitStates &other) const { return BitStates(states_ ^ other.states_); }
-        BitStates operator>>(int32_t offset) const { return BitStates(states_ >> offset); }
-        BitStates operator<<(int32_t offset) const { return BitStates(states_ << offset); }
-        void operator&=(const BitStates &other) { states_ &= other.states_; }
-        void operator|=(const BitStates &other) { states_ |= other.states_; }
-        void operator^=(const BitStates &other) { states_ ^= other.states_; }
-
-        bool has_value() const { return states_ > 0; }
-        int32_t count() const { return __builtin_popcountll(states_); }
-        void clear_bit(int32_t offset) { states_ &= ~(1ULL << offset); }
-
-        // Extract the lowest set bit from mask, clear it, and return its position.
-        // Returns -1 if mask is empty.
-        int32_t pop_first() {
-            if (states_ == 0) return -1;
-            int32_t pos = __builtin_ctzll(states_);
-            states_ &= states_ - 1;
-            return pos;
-        }
-
-    private:
-        uint64_t states_{0};
-    };
-
-public:
-    void init(int32_t cluster_count) {
-        cluster_count_ = cluster_count;
-        aic_mask_.init();
-        aiv_mask_.init();
-        pending_occupied_.init();
-        for (int32_t i = 0; i < cluster_count; i++) {
-            aic_mask_ |= BitStates(1ULL << (i * 3));
-            aiv_mask_ |= BitStates(6ULL << (i * 3));
-        }
-        core_states_ = aic_mask_ | aiv_mask_;
-    }
-
-    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid) {
-        core_id_map_[cluster_idx * 3] = aic_wid;
-        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
-        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
-    }
-
-    int32_t get_cluster_count() const { return cluster_count_; }
-
-    // --- Running core queries ---
-
-    template <CoreType CT>
-    bool has_running_cores() const {
-        if constexpr (CT == CoreType::AIC) {
-            return ((~core_states_) & aic_mask_).has_value();
-        } else {
-            return ((~core_states_) & aiv_mask_).has_value();
-        }
-    }
-
-    bool has_any_running_cores() const { return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value(); }
-
-    template <CoreType CT>
-    int32_t get_running_count() const {
-        if constexpr (CT == CoreType::AIC) {
-            return ((~core_states_) & aic_mask_).count();
-        } else {
-            return ((~core_states_) & aiv_mask_).count();
-        }
-    }
-
-    // Return an opaque bitmask for iterating running cores of a given type.
-    // Use pop_first() to extract core bit offsets one at a time.
-    template <CoreType CT>
-    BitStates get_running_cores() const {
-        if constexpr (CT == CoreType::AIC) {
-            return (~core_states_) & aic_mask_;
-        } else {
-            return (~core_states_) & aiv_mask_;
-        }
-    }
-
-    BitStates get_all_running_cores() const { return (~core_states_) & (aic_mask_ | aiv_mask_); }
-    BitStates get_cluster_offset_states() const { return aic_mask_; }
-
-    // --- Cluster matching ---
-
-    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const {
-        switch (shape) {
-        case PTO2ResourceShape::AIC:
-            return core_states_ & aic_mask_;
-        case PTO2ResourceShape::AIV:
-            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
-        case PTO2ResourceShape::MIX:
-            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
-        case PTO2ResourceShape::DUMMY:
-            // DUMMY tasks never reach the core-tracker dispatch path; they are
-            // completed inline by resolve_and_dispatch via dummy_ready_queue.
-            return BitStates(0ULL);
-        }
-        return BitStates(0ULL);
-    }
-
-    int32_t get_aic_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset]; }
-    int32_t get_aiv0_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 1]; }
-    int32_t get_aiv1_core_id(int32_t cluster_offset) const { return core_id_map_[cluster_offset + 2]; }
-
-    int32_t get_aic_core_offset(int32_t cluster_offset) const { return cluster_offset; }
-    int32_t get_aiv0_core_offset(int32_t cluster_offset) const { return cluster_offset + 1; }
-    int32_t get_aiv1_core_offset(int32_t cluster_offset) const { return cluster_offset + 2; }
-
-    bool is_aic_core_idle(int32_t cluster_offset) const {
-        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
-    }
-    bool is_aiv0_core_idle(int32_t cluster_offset) const {
-        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
-    }
-    bool is_aiv1_core_idle(int32_t cluster_offset) const {
-        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
-    }
-
-    // --- State mutation ---
-
-    // Toggle bit at the given bit offset (running <-> idle)
-    void change_core_state(int32_t bit_offset) { core_states_ ^= BitStates(1ULL << bit_offset); }
-
-    // --- Pending-occupied tracking ---
-    // Tracks whether a core's pending payload slot is occupied (awaiting hardware ACK).
-    // SET on dispatch (both running-first and pending), CLEAR on idle or pending_freed.
-
-    void set_pending_occupied(int32_t bit_offset) { pending_occupied_ |= BitStates(1ULL << bit_offset); }
-    void clear_pending_occupied(int32_t bit_offset) {
-        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
-    }
-
-    // --- Two-phase dispatch queries ---
-
-    // Idle dispatch: returns bit offsets of idle cores for the given shape.
-    // For AIC: 1 bit per cluster (core offset == cluster offset).
-    // For AIV: 1 bit per AIV core (2 bits per cluster at aiv_mask_ positions).
-    // Only AIC needs pending_occupied filtering: by invariant, idle cores (core_states_ bit=1)
-    // always have pending_occupied=0, so AIV/MIX need no extra filtering.
-    // Skipping the AIC-centric filter also fixes a latent bug where a running+pending AIC core
-    // would incorrectly block AIV idle dispatch on the same cluster.
-    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const {
-        if (shape == PTO2ResourceShape::AIC) {
-            return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
-        }
-        if (shape == PTO2ResourceShape::AIV) {
-            return core_states_ & aiv_mask_;
-        }
-        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
-    }
-
-    // Pending dispatch: returns bit offsets of cores eligible for pending-slot dispatch.
-    // AIC: 1 bit per cluster (aic_mask_ positions). AIV: 1 bit per AIV core (aiv_mask_ positions).
-    // Runtime MIX dispatch uses classify_mix_cluster() so the decision follows the task's active_mask.
-    enum class MixPlacement : uint8_t { RUNNING, PENDING, REJECT };
-
-    // A MIX block must place all cores named by active_mask the same way:
-    // all idle means running placement, all running means pending placement,
-    // and any mixed state is retried later.
-    MixPlacement classify_mix_cluster(int32_t cluster_offset, uint8_t core_mask) const {
-        BitStates used(0ULL);
-        if (core_mask & PTO2_SUBTASK_MASK_AIC) {
-            used |= BitStates(1ULL << cluster_offset);
-        }
-        if (core_mask & PTO2_SUBTASK_MASK_AIV0) {
-            used |= BitStates(1ULL << (cluster_offset + 1));
-        }
-        if (core_mask & PTO2_SUBTASK_MASK_AIV1) {
-            used |= BitStates(1ULL << (cluster_offset + 2));
-        }
-        if (!used.has_value() || (pending_occupied_ & used).has_value()) {
-            return MixPlacement::REJECT;
-        }
-
-        BitStates idle = core_states_ & used;
-        if (idle.count() == used.count()) {
-            return MixPlacement::RUNNING;
-        }
-        if (!idle.has_value()) {
-            return MixPlacement::PENDING;
-        }
-        return MixPlacement::REJECT;
-    }
-
-    BitStates get_mix_running_cluster_offset_states(uint8_t core_mask) const {
-        BitStates result(0ULL);
-        BitStates candidates = get_cluster_offset_states();
-        while (candidates.has_value()) {
-            int32_t cluster_offset = candidates.pop_first();
-            if (classify_mix_cluster(cluster_offset, core_mask) == MixPlacement::RUNNING) {
-                result |= BitStates(1ULL << cluster_offset);
-            }
-        }
-        return result;
-    }
-
-    int32_t count_mix_running_clusters(uint8_t core_mask) const {
-        return get_mix_running_cluster_offset_states(core_mask).count();
-    }
-
-    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const {
-        if (shape == PTO2ResourceShape::MIX) {
-            // Shape-level query kept conservative for legacy callers/tests.
-            // The real MIX dispatch path applies active_mask in classify_mix_cluster().
-            // Any core without a pending payload can accept a dispatch (idle or running).
-            BitStates available = ~pending_occupied_;
-            BitStates mix_available =
-                (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
-            // Pending MIX can only reuse a fully-running cluster. Partially-running clusters
-            // could split one MIX block across immediate and pending placement.
-            BitStates running = ~core_states_;
-            BitStates cluster_all_running =
-                (running & aic_mask_) & ((running >> 1) & aic_mask_) & ((running >> 2) & aic_mask_);
-            return mix_available & cluster_all_running;
-        }
-        if (shape == PTO2ResourceShape::AIC) {
-            return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
-        }
-        // AIV
-        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
-    }
-
-    // --- Two-phase dispatch unified query ---
-
-    enum class DispatchPhase : uint8_t { IDLE, PENDING };
-
-    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const {
-        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) :
-                                                get_pending_core_offset_states(shape);
-    }
-
-    // --- Bit offset <-> worker_id mapping ---
-
-    int32_t get_core_id_by_offset(int32_t offset) const { return core_id_map_[offset]; }
-
-    const int32_t *core_ids() const { return core_id_map_; }
-    int32_t core_num() const { return cluster_count_ * 3; }
-
-private:
-    int32_t cluster_count_;
-    BitStates aic_mask_;
-    BitStates aiv_mask_;
-    BitStates core_states_;
-    BitStates pending_occupied_;
-    int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
-};
-
-// =============================================================================
-// SlotTransition: pure event signals from a single register poll.
-// true = event occurred, false = no-op (maintain current state).
-// =============================================================================
-
-struct SlotTransition {
-    bool running_done = false;   // running task completed
-    bool pending_done = false;   // pending task completed
-    bool running_freed = false;  // running slot data should be released
-    bool pending_freed = false;  // pending_occupied can be cleared
-    bool matched = false;        // some case was hit (otherwise skip apply)
-};
-
-// =============================================================================
-// Profiling counters (compile-time gated)
-// =============================================================================
-
-#if PTO2_PROFILING
-struct alignas(64) SchedL2SwimlaneCounters {
-    bool l2_swimlane_enabled{false};
-    uint64_t sched_start_ts{0};
-    uint64_t sched_complete_cycle{0};
-    uint64_t sched_dispatch_cycle{0};
-    uint64_t sched_wiring_cycle{0};
-    uint64_t sched_idle_cycle{0};
-    uint64_t sched_loop_count{0};
-    uint32_t phase_complete_count{0};
-    // Sub-block retires that did NOT finish a slot (SPMD blocks of a multi-block
-    // task retiring one at a time). Counted separately so the Complete-phase
-    // emit can fire on poll iterations that only retired sub-blocks — otherwise
-    // the serial-harvest tail of an SPMD slot is invisible (no slot completes
-    // until the last block, leaving the scheduler lane blank for that window).
-    uint32_t phase_subretire_count{0};
-    uint32_t phase_dispatch_count{0};
-    // Per-emit delta is (current - *_at_last_emit). Accumulated only when
-    // l2_swimlane_level_ >= SCHED_PHASES.
-    uint64_t pop_hit{0};
-    uint64_t pop_miss{0};
-    uint64_t pop_hit_at_last_emit{0};
-    uint64_t pop_miss_at_last_emit{0};
-#if PTO2_SCHED_PROFILING
-    uint32_t phase_wiring_count{0};
-    uint64_t complete_probe_count{0};
-    uint64_t complete_hit_count{0};
-    uint64_t sched_complete_perf_cycle{0};
-    uint64_t sched_dispatch_pop_cycle{0};
-    uint64_t sched_dispatch_setup_cycle{0};
-#endif
-    void reset() { *this = SchedL2SwimlaneCounters{}; }
-};
-#endif
-
-// =============================================================================
-// sync_start drain coordination
-// =============================================================================
-
-// When sync_start_pending != 0, all scheduler threads skip dispatch
-// (only process completions) until the drain worker finishes launching all blocks.
-struct alignas(64) SyncStartDrainState {
-    std::atomic<int32_t> sync_start_pending{0};    // 0=normal; -1=initializing; >0=active (value=block_num)
-    std::atomic<int32_t> drain_worker_elected{0};  // 0=none; >0: elected thread's (thread_idx+1)
-    std::atomic<uint32_t> drain_ack_mask{0};       // bit per thread; all-set = all threads reached ack barrier
-    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
-    int32_t _pad[10];
-};
-static_assert(sizeof(SyncStartDrainState) == 64);
-
-#endif  // SCHEDULER_TYPES_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
new file mode 100644
index 000000000..f0f33ff20
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_context.h
@@ -0,0 +1,1572 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_CONTEXT_H
+#define SCHEDULER_CONTEXT_H
+
+#include "aicpu/platform_regs.h"
+#include "common/l2_swimlane_profiling.h"
+#include "scheduler_types.h"
+
+#include "pto_scheduler.h"
+
+#include "aicore_completion_mailbox.h"
+#include "pto2_dispatch_payload.h"
+
+#include <cinttypes>
+#include <cstdio>
+#include "runtime.h"
+#include "pto_runtime2.h"
+#include "pto_shared_memory.h"
+#include "aicpu/device_time.h"
+#include "aicpu/pmu_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
+#include "common/memory_barrier.h"
+#include "common/platform_config.h"
+#include "common/unified_log.h"
+#include "spin_hint.h"
+// SchedulerThreadProfile is defined in scheduler_types.h (above) so the
+// drain_wiring_queue method in pto_scheduler.h can take a pointer to it.
+
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+inline void latch_scheduler_error(PTO2SharedMemoryHeader *header, int32_t thread_idx, int32_t error_code)
+{
+    if (header == nullptr || error_code == PTO2_ERROR_NONE) return;
+    int32_t expected = PTO2_ERROR_NONE;
+    if (header->sched_error_code.compare_exchange_strong(expected, error_code, std::memory_order_acq_rel)) header->sched_error_thread.store(thread_idx, std::memory_order_release);
+    if (thread_idx >= 0 && thread_idx < 32) header->sched_error_bitmap.fetch_or(1U << static_cast<uint32_t>(thread_idx), std::memory_order_acq_rel);
+}
+
+inline void format_core_status(char *buf, size_t buf_size, int32_t core_id, bool idle, const CoreExecState *core_state, uint64_t reg_addr_for_cond)
+{
+    if (idle)
+    {
+        snprintf(buf, buf_size, "core%d(idle)", core_id);
+        return;
+    }
+    int32_t kernel = -1;
+    int64_t task_id_raw = -1;
+    if (core_state && core_state->running_slot_state)
+    {
+        int32_t subslot = static_cast<int32_t>(core_state->running_subslot);
+        kernel = core_state->running_slot_state->task->kernel_id[subslot];
+        task_id_raw = static_cast<int64_t>(core_state->running_slot_state->task->task_id.raw);
+    }
+    uint64_t cond_reg = read_reg(reg_addr_for_cond, RegId::COND);
+    int32_t hw_state = EXTRACT_TASK_STATE(cond_reg);
+    const char *cond_reg_state_str = (hw_state == TASK_ACK_STATE) ? "ack" : "fin";
+    if (hw_state == TASK_ACK_STATE) snprintf(buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s)", core_id, kernel, task_id_raw, cond_reg_state_str);
+    else snprintf(buf, buf_size, "core%d(busy kernel=%d task=%" PRId64 " cond_reg_state=%s ANOMALY)", core_id, kernel, task_id_raw, cond_reg_state_str);
+}
+
+#ifndef RUNTIME_MAX_WORKER
+#define RUNTIME_MAX_WORKER 72
+#endif
+#ifndef RUNTIME_MAX_FUNC_ID
+#define RUNTIME_MAX_FUNC_ID 1024
+#endif
+
+// Forward declarations — avoid pulling in full headers for pointer/reference params.
+class Runtime;
+struct Handshake;
+struct PTO2Runtime;
+
+class SchedulerContext
+{
+public:
+    int32_t init(Runtime *runtime, int32_t aicpu_thread_num, int32_t sched_thread_num, bool orch_to_sched, uint64_t regs_base)
+    {
+        always_assert(runtime != nullptr);
+
+        // Zero all per-core execution state before handshake
+        memset(core_exec_states_, 0, sizeof(core_exec_states_));
+
+        // Wire thread/transition configuration that handshake/assign need to read.
+        aicpu_thread_num_ = aicpu_thread_num;
+        sched_thread_num_ = sched_thread_num;
+        orch_to_sched_ = orch_to_sched;
+        regs_ = regs_base;
+
+        // Discover cores and assign to scheduler threads.
+        int32_t rc = handshake_all_cores(runtime);
+        if (rc != 0) return rc;
+        if (!assign_cores_to_threads()) return -1;
+
+        // Initialize task counters. Task count comes from PTO2 shared memory.
+        if (runtime->get_gm_sm_ptr())
+        {
+            auto *header = static_cast<PTO2SharedMemoryHeader *>(runtime->get_gm_sm_ptr());
+            int64_t pto2_count = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                int32_t ring_tasks = header->rings[r].fc.current_task_index.load(std::memory_order_acquire);
+                if (ring_tasks > 0 && ring_tasks <= PTO2_SCOPE_TASKS_CAP) pto2_count += ring_tasks;
+            }
+            total_tasks_ = static_cast<int32_t>(pto2_count);
+        }
+        else
+        {
+            total_tasks_ = 0;
+        }
+        completed_tasks_.store(0, std::memory_order_release);
+
+        // Device orchestration: the orchestrator thread flips this when the graph is built.
+        orchestrator_done_ = false;
+
+        // Clear per-core dispatch payloads
+        memset(payload_per_core_, 0, sizeof(payload_per_core_));
+        memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+        // Initialize per-core GlobalContext (sub_block_id) based on cluster position.
+        // This is done once at startup and never modified afterwards.
+        for (int32_t t = 0; t < sched_thread_num_; t++)
+        {
+            CoreTracker &tracker = core_trackers_[t];
+            for (int32_t c = 0; c < tracker.get_cluster_count(); c++)
+            {
+                int32_t cluster_offset = c * 3;  // Each cluster = 1 AIC + 2 AIV
+                auto aiv0_id = tracker.get_core_id_by_offset(tracker.get_aiv0_core_offset(cluster_offset));
+                auto aiv1_id = tracker.get_core_id_by_offset(tracker.get_aiv1_core_offset(cluster_offset));
+                payload_per_core_[aiv0_id][0].global_context.sub_block_id = 0;
+                payload_per_core_[aiv0_id][1].global_context.sub_block_id = 0;
+                payload_per_core_[aiv1_id][0].global_context.sub_block_id = 1;
+                payload_per_core_[aiv1_id][1].global_context.sub_block_id = 1;
+            }
+        }
+
+        func_id_to_addr_ = runtime->func_id_to_addr_;
+
+        return 0;
+    }
+
+    // Reset all SchedulerContext-owned state to its post-construction defaults.
+    // Called by AicpuExecutor::deinit() during per-run teardown.
+    void deinit()
+    {
+        // Reset all per-core execution state
+        for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++)
+        {
+            core_exec_states_[i] = {};
+            core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+            core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+        }
+
+        // Clear per-core dispatch payloads
+        memset(payload_per_core_, 0, sizeof(payload_per_core_));
+        memset(deferred_slab_per_core_, 0, sizeof(deferred_slab_per_core_));
+
+        // Reset sync-start drain coordination — a previous run that aborted mid-drain
+        // would otherwise leave dirty pending/elected/ack state for the next reuse.
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+        drain_state_.pending_task.store(nullptr, std::memory_order_release);
+
+        // Reset task counters and orchestrator state
+        completed_tasks_.store(0, std::memory_order_release);
+        total_tasks_ = 0;
+        orchestrator_done_ = false;
+        pto2_init_done_.store(false, std::memory_order_release);
+        pto2_init_complete_.store(false, std::memory_order_release);
+
+        // Reset core transition state
+        transition_requested_.store(false, std::memory_order_release);
+        wait_reassign_.store(0, std::memory_order_release);
+        reassigned_.store(false, std::memory_order_release);
+        completed_.store(false, std::memory_order_release);
+
+        // Reset core discovery and assignment state
+        aic_count_ = 0;
+        aiv_count_ = 0;
+        cores_total_num_ = 0;
+        aicpu_thread_num_ = 0;
+        sched_thread_num_ = 0;
+        orch_to_sched_ = false;
+        active_sched_threads_ = 0;
+        for (int32_t t = 0; t < MAX_AICPU_THREADS; t++) core_trackers_[t] = CoreTracker{};
+
+        regs_ = 0;
+        sched_ = nullptr;
+        rt_ = nullptr;
+        func_id_to_addr_ = nullptr;
+    }
+
+    // Main scheduler thread entry: poll completion + dispatch ready tasks.
+    int32_t resolve_and_dispatch(Runtime *runtime, int32_t thread_idx)
+    {
+        always_assert(sched_ != nullptr);
+        CoreTracker &tracker = core_trackers_[thread_idx];
+
+        PTO2SharedMemoryHeader *header = sched_->sm_header;
+        if (!header) return -1;
+
+        // One-time init: assign perf buffers (one thread does it; others wait)
+        if (!pto2_init_done_.exchange(true, std::memory_order_acq_rel)) pto2_init_complete_.store(true, std::memory_order_release);
+        else
+            while (!pto2_init_complete_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+
+        int32_t cur_thread_completed = 0;
+        int32_t idle_iterations = 0;
+
+        constexpr int LOCAL_READY_CAP_PER_TYPE = 64;
+        PTO2TaskSlotState *local_ptrs[PTO2_NUM_RESOURCE_SHAPES][LOCAL_READY_CAP_PER_TYPE];
+        PTO2LocalReadyBuffer local_bufs[PTO2_NUM_RESOURCE_SHAPES];
+        for (int32_t i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) local_bufs[i].reset(local_ptrs[i], LOCAL_READY_CAP_PER_TYPE);
+
+        bool cores_released = false;
+
+        const bool pmu_active = is_pmu_enabled();
+
+        uint64_t last_progress_ts = get_sys_cnt_aicpu();
+
+        // Profile reset + total-cycle start. Reset here so each
+        // resolve_and_dispatch call (≈ one kernel launch) records its own
+        // breakdown. The dump happens at loop exit, well outside the hot path.
+        SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
+        profile.reset();
+        const uint64_t profile_loop_start = get_sys_cnt_aicpu();
+
+        while (true)
+        {
+            if (completed_.load(std::memory_order_acquire)) break;
+            bool made_progress = false;
+            profile.total_iters++;
+            if (!tracker.has_any_running_cores())
+            {
+                LoopAction action = handle_orchestrator_exit(header, runtime);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            if (!cores_released && orch_to_sched_)
+            {
+                LoopAction action = handle_core_transition(cores_released);
+                if (action == LoopAction::BREAK_LOOP) break;
+            }
+
+            // Phase 1: Check running cores for completion
+            int32_t completed_this_turn = 0;
+
+            if (tracker.has_any_running_cores())
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                check_running_cores_for_completion(thread_idx, completed_this_turn, cur_thread_completed, made_progress);
+                profile.completion_cycles += get_sys_cnt_aicpu() - t0;
+                profile.completion_iters++;
+            }
+            if (completed_this_turn > 0)
+            {
+                completed_tasks_.fetch_add(completed_this_turn, std::memory_order_relaxed);
+            }
+
+            uint64_t t0_async = 0;
+            if (rt_ != nullptr && rt_->aicore_mailbox != nullptr && (sched_->async_wait_list.count > 0 || rt_->aicore_mailbox->has_pending()))
+            {
+                t0_async = get_sys_cnt_aicpu();
+                AsyncPollResult poll_result = sched_->async_wait_list.poll_and_complete<false>(rt_->aicore_mailbox, sched_);
+                if (poll_result.error_code != PTO2_ERROR_NONE)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    header->sched_error_code.compare_exchange_strong(expected, poll_result.error_code, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    break;
+                }
+                if (poll_result.completed > 0)
+                {
+                    completed_tasks_.fetch_add(poll_result.completed, std::memory_order_relaxed);
+                    made_progress = true;
+                }
+                profile.async_wait_cycles += get_sys_cnt_aicpu() - t0_async;
+                profile.async_wait_iters++;
+            }
+
+            // Phase 2 drain check
+            if (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
+            {
+                handle_drain_mode(thread_idx);
+                continue;
+            }
+
+            // Phase 3: Drain wiring queue (thread 0 only). Pass cumulative
+            // sub-phase counters (SPSC drain stage 1 / pending-FIFO poll
+            // stage 2) so drain_wiring_queue accumulates into them.
+            if (thread_idx == 0)
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                int wired = sched_->drain_wiring_queue(orchestrator_done_,
+                    &profile.spsc_drain_cycles, &profile.spsc_drain_iters,
+                    &profile.pending_poll_cycles, &profile.pending_poll_iters);
+                if (wired > 0) made_progress = true;
+                profile.drain_wiring_cycles += get_sys_cnt_aicpu() - t0;
+                profile.drain_wiring_iters++;
+            }
+
+            if (thread_idx == 0)
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                constexpr int DUMMY_DRAIN_BATCH = 16;
+                PTO2TaskSlotState *dummy_batch[DUMMY_DRAIN_BATCH];
+                int dummy_got = sched_->dummy_ready_queue.pop_batch(dummy_batch, DUMMY_DRAIN_BATCH);
+                for (int di = 0; di < dummy_got; di++)
+                {
+                    PTO2TaskSlotState &dummy_slot = *dummy_batch[di];
+                    sched_->on_mixed_task_complete(dummy_slot);
+                    completed_tasks_.fetch_add(1, std::memory_order_relaxed);
+                    cur_thread_completed++;
+                }
+                if (dummy_got > 0) made_progress = true;
+                profile.dummy_drain_cycles += get_sys_cnt_aicpu() - t0;
+                profile.dummy_drain_iters++;
+            }
+
+            // Phase 4: MIX-strict-priority dispatch with phase-split and
+            // cross-thread idle gating. See dispatch_ready_tasks for the policy.
+            {
+                uint64_t t0 = get_sys_cnt_aicpu();
+                dispatch_ready_tasks(thread_idx, tracker, local_bufs, pmu_active, made_progress);
+                profile.dispatch_cycles += get_sys_cnt_aicpu() - t0;
+                profile.dispatch_iters++;
+            }
+
+            if (made_progress)
+            {
+                idle_iterations = 0;
+                last_progress_ts = get_sys_cnt_aicpu();
+            }
+            else
+            {
+                uint64_t t0_idle = get_sys_cnt_aicpu();
+                idle_iterations++;
+
+                if (idle_iterations % FATAL_ERROR_CHECK_INTERVAL == 0)
+                {
+                    LoopAction action = check_idle_fatal_error(header, runtime);
+                    if (action == LoopAction::BREAK_LOOP) break;
+                }
+
+                if (idle_iterations % STALL_LOG_INTERVAL == 0) log_stall_diagnostics(thread_idx);
+                if (get_sys_cnt_aicpu() - last_progress_ts > SCHEDULER_TIMEOUT_CYCLES)
+                {
+                    bool self_owns = self_owns_running_task(thread_idx);
+                    bool global_stuck = !self_owns && total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) < total_tasks_ && no_thread_owns_running_task();
+                    if (self_owns || global_stuck) return handle_timeout_exit(thread_idx, header, runtime);
+                    last_progress_ts = get_sys_cnt_aicpu();
+                }
+                SPIN_WAIT_HINT();
+                profile.idle_spin_cycles += get_sys_cnt_aicpu() - t0_idle;
+                profile.idle_iters++;
+            }
+        }
+
+        // Dump profile breakdown for this thread. Logged AFTER the hot loop
+        // exits, so this adds no overhead to the measured phases.
+        profile.total_cycles = get_sys_cnt_aicpu() - profile_loop_start;
+        LOG_INFO_V9(
+            "CLAUDE_PROFILING thread=%d total_cyc=%lu iters=%lu compl_cyc=%lu compl_n=%lu ctask_cyc=%lu ctask_n=%lu cores_scan=%lu async_cyc=%lu async_n=%lu drain_cyc=%lu drain_n=%lu spsc_cyc=%lu spsc_n=%lu poll_cyc=%lu poll_n=%lu poll_skipped=%lu dummy_cyc=%lu dummy_n=%lu dispatch_cyc=%lu dispatch_n=%lu idle_cyc=%lu idle_n=%lu",
+            (int)thread_idx,
+            (unsigned long)profile.total_cycles, (unsigned long)profile.total_iters,
+            (unsigned long)profile.completion_cycles, (unsigned long)profile.completion_iters,
+            (unsigned long)profile.complete_task_cycles, (unsigned long)profile.complete_task_calls,
+            (unsigned long)profile.cores_scanned,
+            (unsigned long)profile.async_wait_cycles, (unsigned long)profile.async_wait_iters,
+            (unsigned long)profile.drain_wiring_cycles, (unsigned long)profile.drain_wiring_iters,
+            (unsigned long)profile.spsc_drain_cycles, (unsigned long)profile.spsc_drain_iters,
+            (unsigned long)profile.pending_poll_cycles, (unsigned long)profile.pending_poll_iters,
+            (unsigned long)profile.pending_poll_skipped,
+            (unsigned long)profile.dummy_drain_cycles, (unsigned long)profile.dummy_drain_iters,
+            (unsigned long)profile.dispatch_cycles, (unsigned long)profile.dispatch_iters,
+            (unsigned long)profile.idle_spin_cycles, (unsigned long)profile.idle_iters);
+
+        return cur_thread_completed;
+    }
+
+    int32_t shutdown(int32_t thread_idx)
+    {
+        const int32_t *cores = core_trackers_[thread_idx].core_ids();
+        int32_t core_num = core_trackers_[thread_idx].core_num();
+        if (core_num == 0) return 0;
+
+        int32_t rc = 0;
+        for (int32_t i = 0; i < core_num; i++)
+        {
+            int32_t core_id = cores[i];
+            uint64_t reg_addr = core_exec_states_[core_id].reg_addr;
+            if (reg_addr != 0)
+            {
+                // Timeout means AICore is unresponsive. Log and continue deiniting remaining cores.
+                if (platform_deinit_aicore_regs(reg_addr) != 0) rc = -1;
+            }
+            else
+            {}
+        }
+        return rc;
+    }
+
+    void on_orchestration_done(Runtime *runtime, PTO2Runtime *rt, int32_t total_tasks)
+    {
+        total_tasks_ = total_tasks;
+
+        // Fold tasks completed inline during orchestration
+        int32_t inline_completed = static_cast<int32_t>(rt->orchestrator.inline_completed_tasks);
+        if (inline_completed > 0) completed_tasks_.fetch_add(inline_completed, std::memory_order_relaxed);
+        orchestrator_done_ = true;
+
+        // Check for fatal error from orchestration; if so, shut down immediately.
+        int32_t orch_err = 0;
+        if (sched_->sm_header) orch_err = sched_->sm_header->orch_error_code.load(std::memory_order_relaxed);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+        }
+
+        // Skip core transition on fatal error — cores already shut down above.
+        if (completed_.load(std::memory_order_acquire))
+        {
+            // Signal transition to unblock scheduler threads waiting at core transition
+            transition_requested_.store(true, std::memory_order_release);
+            reassigned_.store(true, std::memory_order_release);
+        }
+        else if (orch_to_sched_)
+        {
+            transition_requested_.store(true, std::memory_order_release);
+
+            // Wait for scheduler threads to acknowledge transition request
+            while (wait_reassign_.load(std::memory_order_acquire) != sched_thread_num_)
+            {
+                if (completed_.load(std::memory_order_acquire)) break;
+                SPIN_WAIT_HINT();
+            }
+            if (!completed_.load(std::memory_order_acquire))
+            {
+                reassign_cores_for_all_threads();
+                reassigned_.store(true, std::memory_order_release);
+            }
+        }
+    }
+
+    // Bind the PTO2Runtime scheduler pointer. Required in device-orchestration
+    // mode where rt is created by the orchestrator thread after init().
+    void bind_runtime(PTO2Runtime *rt)
+    {
+        rt_ = rt;
+        sched_ = &rt->scheduler;
+    }
+
+    int32_t aic_count() const
+    {
+        return aic_count_;
+    }
+    int32_t aiv_count() const
+    {
+        return aiv_count_;
+    }
+    bool is_completed() const
+    {
+        return completed_.load(std::memory_order_acquire);
+    }
+    int32_t completed_tasks_count() const
+    {
+        return completed_tasks_.load(std::memory_order_acquire);
+    }
+
+    // Block until the first scheduler thread has finished one-time PTO2 init.
+    // Called by the orchestrator thread in device-orch mode.
+    void wait_pto2_init_complete() const
+    {
+        while (!pto2_init_complete_.load(std::memory_order_acquire)) SPIN_WAIT_HINT();
+    }
+
+private:
+    // --- Scheduler binding & per-core runtime state ---
+    alignas(64) PTO2SchedulerState *sched_{nullptr};
+    PTO2Runtime *rt_{nullptr};
+
+    // Per-core execution state, indexed by core_id (= worker_id)
+    CoreExecState core_exec_states_[RUNTIME_MAX_WORKER];
+
+    // Cluster-ordered core trackers, one per scheduler thread
+    CoreTracker core_trackers_[MAX_AICPU_THREADS];
+    SchedulerThreadProfile thread_profiles_[MAX_AICPU_THREADS];
+
+    // Per-core dispatch payload storage: dual-buffer for pipelining.
+    // buf_idx = reg_task_id & 1; adjacent dispatches alternate automatically.
+    PTO2DispatchPayload payload_per_core_[RUNTIME_MAX_WORKER][2];
+
+    DeferredCompletionSlab deferred_slab_per_core_[RUNTIME_MAX_WORKER][2];
+
+    // sync_start drain coordination
+    SyncStartDrainState drain_state_;
+
+    // --- Task-execution tracking ---
+    std::atomic<int32_t> completed_tasks_{0};
+    int32_t total_tasks_{0};
+    // Device orchestration: set by last orchestrator when graph is built; schedulers poll it.
+    // volatile prevents the compiler from hoisting the load out of spin loops.
+    volatile bool orchestrator_done_{false};
+    std::atomic<bool> completed_{false};
+    uint64_t *func_id_to_addr_{nullptr};
+
+    // --- Core-transition coordination ---
+    std::atomic<bool> transition_requested_{false};
+    std::atomic<int32_t> wait_reassign_{0};
+    std::atomic<bool> reassigned_{false};
+
+    // --- Thread/core configuration ---
+    int32_t active_sched_threads_{0};
+    int32_t sched_thread_num_{0};
+    bool orch_to_sched_{false};
+    int32_t aicpu_thread_num_{0};
+    int32_t cores_total_num_{0};
+
+    // Cluster-ordered worker_id lists, populated by handshake_all_cores().
+    int32_t aic_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aiv_worker_ids_[RUNTIME_MAX_WORKER]{};
+    int32_t aic_count_{0};
+    int32_t aiv_count_{0};
+
+    // Platform AICore-register base array (set by AicpuExecutor before init()).
+    uint64_t regs_{0};
+
+    // --- One-time init coordination ---
+    std::atomic<bool> pto2_init_done_{false};
+    std::atomic<bool> pto2_init_complete_{false};
+
+    // Handshake with all AICore workers; populates core_exec_states_, worker id lists.
+    int32_t handshake_all_cores(Runtime *runtime)
+    {
+        Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+        cores_total_num_ = runtime->worker_count;
+
+        // Validate cores_total_num_ before using as array index
+        if (cores_total_num_ == 0 || cores_total_num_ > RUNTIME_MAX_WORKER) return -1;
+
+        aic_count_ = 0;
+        aiv_count_ = 0;
+
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
+            OUT_OF_ORDER_STORE_BARRIER();
+            all_handshakes[i].aicpu_ready = 1;
+        }
+        OUT_OF_ORDER_STORE_BARRIER();
+
+        // Get platform physical cores count for validation
+        uint32_t max_physical_cores_count = platform_get_physical_cores_count();
+
+        // Step 2: Wait for all cores to respond, collect core type and register addresses
+        bool handshake_failed = false;
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            Handshake *hank = &all_handshakes[i];
+
+            while (hank->aicore_regs_ready == 0) SPIN_WAIT_HINT();
+
+            uint32_t physical_core_id = hank->physical_core_id;
+
+            if (physical_core_id >= max_physical_cores_count)
+            {
+                handshake_failed = true;
+                continue;
+            }
+
+            uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+            uint64_t reg_addr = regs[physical_core_id];
+
+            // Initialize AICore registers after discovery (first round)
+            platform_init_aicore_regs(reg_addr);
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
+
+            OUT_OF_ORDER_STORE_BARRIER();
+
+            while (hank->aicore_done == 0) SPIN_WAIT_HINT();
+
+            CoreType type = hank->core_type;
+
+            core_exec_states_[i].reg_addr = reg_addr;
+            core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+
+            core_exec_states_[i].worker_id = i;
+            core_exec_states_[i].physical_core_id = physical_core_id;
+            core_exec_states_[i].core_type = type;
+
+            if (type == CoreType::AIC) aic_worker_ids_[aic_count_++] = i;
+            else aiv_worker_ids_[aiv_count_++] = i;
+        }
+
+        if (handshake_failed)
+        {
+            emergency_shutdown(runtime);
+            return -1;
+        }
+
+        return 0;
+    }
+
+    // Assign discovered cores (cluster = 1 AIC + 2 AIV) round-robin across scheduler threads.
+    bool assign_cores_to_threads()
+    {
+        // Cluster-aligned round-robin assignment: cluster ci -> sched thread ci % active_sched_threads_.
+        // Each cluster = 1 AIC + 2 adjacent AIV; the triple is always kept together.
+        active_sched_threads_ = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
+        int32_t cluster_count = aic_count_;
+
+        // Max clusters any single sched thread can hold: ceil(cluster_count / active_sched_threads_).
+        int32_t max_clusters_per_thread = (cluster_count + active_sched_threads_ - 1) / active_sched_threads_;
+        int32_t thread_cores_num = max_clusters_per_thread * 3;
+
+        if (thread_cores_num > CoreTracker::MAX_CORE_PER_THREAD) return false;
+
+        for (int32_t i = 0; i < RUNTIME_MAX_WORKER; i++)
+        {
+            core_exec_states_[i].running_reg_task_id = AICPU_TASK_INVALID;
+            core_exec_states_[i].pending_reg_task_id = AICPU_TASK_INVALID;
+        }
+
+        // Count clusters per thread first (round-robin may distribute unevenly)
+        int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++) clusters_per_thread[ci % active_sched_threads_]++;
+        for (int32_t i = 0; i < active_sched_threads_; i++) core_trackers_[i].init(clusters_per_thread[i]);
+
+        int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+
+        for (int32_t ci = 0; ci < cluster_count; ci++)
+        {
+            int32_t t = ci % active_sched_threads_;
+
+            int32_t aic_wid = aic_worker_ids_[ci];
+            int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+            int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+            core_trackers_[t].set_cluster(cluster_idx_per_thread[t]++, aic_wid, aiv0_wid, aiv1_wid);
+        }
+
+        for (int32_t t = 0; t < aicpu_thread_num_; t++)
+        {}
+
+        return true;
+    }
+
+    // Re-distribute all cores across all threads after orchestration completes.
+    void reassign_cores_for_all_threads()
+    {
+        // Collect running worker_ids from all current trackers
+        bool running_cores[RUNTIME_MAX_WORKER] = {};
+        for (int32_t i = 0; i < aicpu_thread_num_; i++)
+        {
+            auto all_running = core_trackers_[i].get_all_running_cores();
+            int32_t bp;
+            while ((bp = all_running.pop_first()) >= 0) running_cores[core_trackers_[i].get_core_id_by_offset(bp)] = true;
+        }
+
+        // Count clusters per thread (round-robin across all threads)
+        int32_t cluster_count = aic_count_;
+        int32_t clusters_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++) clusters_per_thread[ci % aicpu_thread_num_]++;
+
+        // Re-init all trackers and reset core counts
+        for (int32_t i = 0; i < aicpu_thread_num_; i++) core_trackers_[i].init(clusters_per_thread[i]);
+
+        // Assign clusters round-robin and restore running state
+        int32_t cluster_idx_per_thread[MAX_AICPU_THREADS] = {};
+        for (int32_t ci = 0; ci < cluster_count; ci++)
+        {
+            int32_t t = ci % aicpu_thread_num_;
+
+            int32_t aic_wid = aic_worker_ids_[ci];
+            int32_t aiv0_wid = aiv_worker_ids_[2 * ci];
+            int32_t aiv1_wid = aiv_worker_ids_[2 * ci + 1];
+
+            int32_t cl_idx = cluster_idx_per_thread[t]++;
+            core_trackers_[t].set_cluster(cl_idx, aic_wid, aiv0_wid, aiv1_wid);
+
+            // init() marks all idle; toggle cores that were running and restore pending_occupied
+            if (running_cores[aic_wid])
+            {
+                core_trackers_[t].change_core_state(cl_idx * 3);
+                core_trackers_[t].set_pending_occupied(cl_idx * 3);
+            }
+            if (running_cores[aiv0_wid])
+            {
+                core_trackers_[t].change_core_state(cl_idx * 3 + 1);
+                core_trackers_[t].set_pending_occupied(cl_idx * 3 + 1);
+            }
+            if (running_cores[aiv1_wid])
+            {
+                core_trackers_[t].change_core_state(cl_idx * 3 + 2);
+                core_trackers_[t].set_pending_occupied(cl_idx * 3 + 2);
+            }
+        }
+
+        active_sched_threads_ = aicpu_thread_num_;
+    }
+
+    // Emergency shutdown: broadcast exit signal to every handshake'd core and
+    // deinit their AICore register blocks. Idempotent.
+    void emergency_shutdown(Runtime *runtime)
+    {
+        Handshake *all_handshakes = reinterpret_cast<Handshake *>(runtime->workers);
+        int32_t timeout_count = 0;
+        for (int32_t i = 0; i < cores_total_num_; i++)
+        {
+            Handshake *hank = &all_handshakes[i];
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
+            if (core_exec_states_[i].reg_addr != 0)
+            {
+                if (platform_deinit_aicore_regs(core_exec_states_[i].reg_addr) != 0) timeout_count++;
+            }
+        }
+        if (timeout_count > 0)
+        {}
+    }
+
+    static const char *shape_name(PTO2ResourceShape shape)
+    {
+        switch (shape)
+        {
+        case PTO2ResourceShape::AIC:
+            return "AIC";
+        case PTO2ResourceShape::AIV:
+            return "AIV";
+        case PTO2ResourceShape::MIX:
+            return "MIX";
+        case PTO2ResourceShape::DUMMY:
+            return "DUMMY";
+        }
+        return "UNKNOWN";
+    }
+
+    static inline const char *subslot_name(PTO2SubtaskSlot s)
+    {
+        switch (s)
+        {
+        case PTO2SubtaskSlot::AIC:
+            return "aic";
+        case PTO2SubtaskSlot::AIV0:
+            return "aiv0";
+        case PTO2SubtaskSlot::AIV1:
+            return "aiv1";
+        }
+        return "?";
+    }
+
+    int pop_ready_tasks_batch(PTO2ResourceShape shape, PTO2LocalReadyBuffer &local_buf, PTO2TaskSlotState **out, int max_count)
+    {
+        return sched_->get_ready_tasks_batch(shape, local_buf, out, max_count);
+    }
+
+    void build_payload(PTO2DispatchPayload &dispatch_payload, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, const AsyncCtx &async_ctx, int32_t block_idx)
+    {
+        int32_t slot_idx = static_cast<int32_t>(subslot);
+        uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+        const CoreCallable *callable = reinterpret_cast<const CoreCallable *>(callable_addr);
+        dispatch_payload.function_bin_addr = callable->resolved_addr();
+        auto &payload = *slot_state.payload;
+        int n = 0;
+        for (int32_t i = 0; i < payload.tensor_count; i++) dispatch_payload.args[n++] = reinterpret_cast<uint64_t>(&payload.tensors[i]);
+        for (int32_t i = 0; i < payload.scalar_count; i++) dispatch_payload.args[n++] = payload.scalars[i];
+        dispatch_payload.local_context.block_idx = block_idx;
+        dispatch_payload.local_context.block_num = slot_state.logical_block_num;
+        dispatch_payload.local_context.async_ctx = async_ctx;
+        dispatch_payload.args[PAYLOAD_LOCAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.local_context);
+        dispatch_payload.args[PAYLOAD_GLOBAL_CONTEXT_INDEX] = reinterpret_cast<uint64_t>(&dispatch_payload.global_context);
+    }
+
+    struct PublishHandle
+    {
+        uint64_t reg_addr;
+        uint32_t reg_task_id;
+        int32_t core_offset;
+        uint64_t *dispatch_timestamp_slot;
+    };
+
+    SchedulerContext::PublishHandle prepare_subtask_to_core(int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2SubtaskSlot subslot, bool to_pending, int32_t block_idx)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        auto core_id = tracker.get_core_id_by_offset(core_offset);
+        CoreExecState &core_exec_state = core_exec_states_[core_id];
+
+        core_exec_state.dispatch_seq++;
+        uint32_t reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+        static_assert((TASK_ID_MASK - AICORE_EXIT_SIGNAL + 1) % 2 == 0, "Sentinel skip must be even to preserve dual-buffer parity");
+        if (reg_task_id >= AICORE_EXIT_SIGNAL)
+        {
+            core_exec_state.dispatch_seq += (TASK_ID_MASK - reg_task_id + 1);
+            reg_task_id = core_exec_state.dispatch_seq & TASK_ID_MASK;
+        }
+
+        uint32_t buf_idx = reg_task_id & 1u;
+        PTO2DispatchPayload &payload = payload_per_core_[core_id][buf_idx];
+        DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][buf_idx];
+        deferred_slab->count = 0;
+        deferred_slab->error_code = PTO2_ERROR_NONE;
+        AsyncCtx async_ctx = AsyncCtx::make(slot_state.task->task_id, deferred_slab);
+        build_payload(payload, slot_state, subslot, async_ctx, block_idx);
+
+        if (to_pending)
+        {
+            core_exec_state.pending_subslot = subslot;
+            core_exec_state.pending_slot_state = &slot_state;
+            core_exec_state.pending_reg_task_id = static_cast<int32_t>(reg_task_id);
+        }
+        else
+        {
+            core_exec_state.running_subslot = subslot;
+            core_exec_state.running_slot_state = &slot_state;
+            core_exec_state.running_reg_task_id = static_cast<int32_t>(reg_task_id);
+            tracker.change_core_state(core_offset);
+        }
+        tracker.set_pending_occupied(core_offset);
+
+        uint64_t *dispatch_timestamp_slot = nullptr;
+
+        return PublishHandle{core_exec_state.reg_addr, reg_task_id, core_offset, dispatch_timestamp_slot};
+    }
+
+    inline void publish_subtask_to_core(const PublishHandle &h, uint64_t dispatch_ts)
+    {
+        if (h.dispatch_timestamp_slot != nullptr) *h.dispatch_timestamp_slot = dispatch_ts;
+        write_reg(h.reg_addr, RegId::DATA_MAIN_BASE, static_cast<uint64_t>(h.reg_task_id));
+    }
+
+    // Fan out one block's subtasks (1 for AIC/AIV, 1-3 for MIX) into the
+    // caller-supplied handles buffer. Returns the number of handles written.
+    int prepare_block_for_dispatch(int32_t thread_idx, int32_t core_offset, PTO2TaskSlotState &slot_state, PTO2ResourceShape shape, bool to_pending, int32_t block_idx, PublishHandle *out_handles)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        if (shape == PTO2ResourceShape::MIX)
+        {
+            uint8_t cmask = slot_state.active_mask.core_mask();
+            int n = 0;
+            if (cmask & PTO2_SUBTASK_MASK_AIC)
+            {
+                bool p = to_pending && !tracker.is_aic_core_idle(core_offset);
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aic_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIC, p, block_idx);
+            }
+            if (cmask & PTO2_SUBTASK_MASK_AIV0)
+            {
+                bool p = to_pending && !tracker.is_aiv0_core_idle(core_offset);
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aiv0_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV0, p, block_idx);
+            }
+            if (cmask & PTO2_SUBTASK_MASK_AIV1)
+            {
+                bool p = to_pending && !tracker.is_aiv1_core_idle(core_offset);
+                out_handles[n++] = prepare_subtask_to_core(thread_idx, tracker.get_aiv1_core_offset(core_offset), slot_state, PTO2SubtaskSlot::AIV1, p, block_idx);
+            }
+            return n;
+        }
+        else if (shape == PTO2ResourceShape::AIC)
+        {
+            out_handles[0] = prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIC, to_pending, block_idx);
+            return 1;
+        }
+        else
+        {
+            out_handles[0] = prepare_subtask_to_core(thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, to_pending, block_idx);
+            return 1;
+        }
+    }
+
+    void dispatch_shape(int32_t thread_idx, PTO2ResourceShape shape, CoreTracker::DispatchPhase phase, PTO2LocalReadyBuffer &local_buf, CoreTracker &tracker, bool &entered_drain, bool &made_progress)
+    {
+        if (entered_drain) return;
+
+        bool is_pending = (phase == CoreTracker::DispatchPhase::PENDING);
+        auto cores = tracker.get_dispatchable_cores(shape, phase);
+        if (!cores.has_value()) return;
+
+        while (cores.has_value() && !entered_drain)
+        {
+            int want = cores.count();
+            PTO2TaskSlotState *batch[CoreTracker::MAX_CLUSTERS * 3];
+            int got = pop_ready_tasks_batch(shape, local_buf, batch, want);
+            if (got == 0) break;
+
+            bool any_sync_start = false;
+            for (int bi = 0; bi < got; bi++)
+            {
+                if (batch[bi]->active_mask.requires_sync_start())
+                {
+                    any_sync_start = true;
+                    break;
+                }
+            }
+
+            PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+            int handle_count = 0;
+            bool dispatched_any = false;
+
+            auto flush_publish = [&]() {
+                if (handle_count == 0) return;
+                wmb();
+                uint64_t dispatch_ts = 0;
+                for (int i = 0; i < handle_count; i++) publish_subtask_to_core(handles[i], dispatch_ts);
+                handle_count = 0;
+                made_progress = true;
+            };
+
+            for (int bi = 0; bi < got; bi++)
+            {
+                PTO2TaskSlotState *slot_state = batch[bi];
+
+                if (slot_state->active_mask.requires_sync_start())
+                {
+                    if (is_pending)
+                    {
+                        sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                        continue;
+                    }
+                    int32_t available = cores.count();
+                    if (available < slot_state->logical_block_num)
+                    {
+                        flush_publish();
+                        if (!enter_drain_mode(slot_state, slot_state->logical_block_num)) sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+                        for (int rem = bi + 1; rem < got; rem++) sched_->ready_queues[static_cast<int32_t>(shape)].push(batch[rem]);
+                        entered_drain = true;
+                        break;
+                    }
+                }
+
+                if (!cores.has_value())
+                {
+                    flush_publish();
+                    sched_->ready_queues[static_cast<int32_t>(shape)].push_batch(&batch[bi], got - bi);
+                    break;
+                }
+
+                dispatched_any = true;
+                int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
+                int32_t claim = std::min(cores.count(), remaining);
+                int32_t start = slot_state->next_block_idx;
+                slot_state->next_block_idx += claim;
+
+                if (slot_state->next_block_idx < slot_state->logical_block_num) sched_->ready_queues[static_cast<int32_t>(shape)].push(slot_state);
+
+                for (int32_t b = 0; b < claim; b++)
+                {
+                    auto core_offset = cores.pop_first();
+                    handle_count += prepare_block_for_dispatch(thread_idx, core_offset, *slot_state, shape, is_pending, start + b, &handles[handle_count]);
+                }
+
+                if (any_sync_start) flush_publish();
+            }
+
+            flush_publish();
+
+            if (!dispatched_any) break;
+
+            if (!cores.has_value()) cores = tracker.get_dispatchable_cores(shape, phase);
+        }
+    }
+
+    void dispatch_ready_tasks(int32_t thread_idx, CoreTracker &tracker, PTO2LocalReadyBuffer (&local_bufs)[PTO2_NUM_RESOURCE_SHAPES], bool pmu_active, bool &made_progress)
+    {
+        using Phase = CoreTracker::DispatchPhase;
+        constexpr int32_t MIX_I = static_cast<int32_t>(PTO2ResourceShape::MIX);
+
+        static constexpr PTO2ResourceShape kAicAivOrder[2][2] = {
+            {PTO2ResourceShape::AIC, PTO2ResourceShape::AIV},
+            {PTO2ResourceShape::AIV, PTO2ResourceShape::AIC},
+        };
+        const PTO2ResourceShape *aic_aiv = kAicAivOrder[thread_idx & 1];
+
+        const int32_t bd_per_thread = PLATFORM_MAX_BLOCKDIM / active_sched_threads_;
+        const int32_t thread_capacity[PTO2_NUM_RESOURCE_SHAPES] = {
+            bd_per_thread * PLATFORM_AIC_CORES_PER_BLOCKDIM,
+            bd_per_thread * PLATFORM_AIV_CORES_PER_BLOCKDIM,
+            bd_per_thread,
+        };
+        for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++)
+        {
+            auto &lb = local_bufs[s];
+            int32_t excess = lb.count - thread_capacity[s];
+            if (excess <= 0) continue;
+            if (!has_idle_in_other_threads(thread_idx, static_cast<PTO2ResourceShape>(s))) continue;
+            sched_->ready_queues[s].push_batch(&lb.slot_states[lb.count - excess], excess);
+            lb.count -= excess;
+        }
+
+        auto flush_local_bufs = [&]() {
+            for (int32_t s = 0; s < PTO2_NUM_RESOURCE_SHAPES; s++)
+            {
+                auto &lb = local_bufs[s];
+                if (lb.count > 0)
+                {
+                    sched_->ready_queues[s].push_batch(lb.slot_states, lb.count);
+                    lb.count = 0;
+                }
+            }
+        };
+        struct FlushGuard
+        {
+            decltype(flush_local_bufs) &flush_fn;
+            ~FlushGuard()
+            {
+                flush_fn();
+            }
+        } flush_guard{flush_local_bufs};
+
+        bool entered_drain = false;
+
+        // ===== IDLE stage =====
+        dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::IDLE, local_bufs[MIX_I], tracker, entered_drain, made_progress);
+        if (entered_drain) return;
+
+        bool skip_aic_aiv = has_residual_mix(local_bufs[MIX_I]);
+
+        if (!skip_aic_aiv)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                PTO2ResourceShape s = aic_aiv[i];
+                dispatch_shape(thread_idx, s, Phase::IDLE, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress);
+                if (entered_drain) return;
+            }
+        }
+
+        // Flush between IDLE and PENDING so PENDING-stage queue-size checks and any
+        // peer-thread reads see the IDLE-stage release_fanin output.
+        flush_local_bufs();
+
+        if (pmu_active) return;
+
+        if (!has_idle_in_other_threads(thread_idx, PTO2ResourceShape::MIX))
+        {
+            dispatch_shape(thread_idx, PTO2ResourceShape::MIX, Phase::PENDING, local_bufs[MIX_I], tracker, entered_drain, made_progress);
+            if (entered_drain) return;
+        }
+
+        // Re-check after MIX-PENDING. If MIX-IDLE already set skip_aic_aiv, leave
+        // it set; otherwise, escalate iff PENDING-MIX left residual.
+        if (!skip_aic_aiv && has_residual_mix(local_bufs[MIX_I])) skip_aic_aiv = true;
+
+        if (skip_aic_aiv) return;
+
+        // AIC/AIV-PENDING gate: a peer-idle skip is a delay, not a loss — the peer
+        // will pull from the global queue on its next IDLE pass.
+        for (int i = 0; i < 2; i++)
+        {
+            PTO2ResourceShape s = aic_aiv[i];
+            if (has_idle_in_other_threads(thread_idx, s)) continue;
+            dispatch_shape(thread_idx, s, Phase::PENDING, local_bufs[static_cast<int32_t>(s)], tracker, entered_drain, made_progress);
+            if (entered_drain) return;
+        }
+    }
+
+    bool has_idle_in_other_threads(int32_t self_thread_idx, PTO2ResourceShape shape) const
+    {
+        for (int32_t t = 0; t < active_sched_threads_; t++)
+        {
+            if (t == self_thread_idx) continue;
+            if (core_trackers_[t].get_idle_core_offset_states(shape).has_value()) return true;
+        }
+        return false;
+    }
+
+    bool has_residual_mix(const PTO2LocalReadyBuffer &mix_local_buf) const
+    {
+        return mix_local_buf.count > 0 || sched_->ready_queues[static_cast<int32_t>(PTO2ResourceShape::MIX)].size() > 0;
+    }
+
+    static SlotTransition decide_slot_transition(int32_t reg_task_id, int32_t reg_state, int32_t running_id, int32_t pending_id)
+    {
+        SlotTransition t;
+        if (pending_id != AICPU_TASK_INVALID && reg_task_id == pending_id)
+        {
+            t.matched = true;
+            t.running_done = true;  // Serial execution: pending event implies running done
+            t.running_freed = true;
+            t.pending_freed = true;
+            if (reg_state == TASK_FIN_STATE) t.pending_done = true;  // Case 1: pending FIN
+            // else: Case 2: pending ACK (pending_done stays false)
+        }
+        else if (reg_task_id == running_id)
+        {
+            if (reg_state == TASK_FIN_STATE)
+            {
+                if (pending_id == AICPU_TASK_INVALID)
+                {
+                    // Case 3.2: running FIN, no pending -> core goes idle
+                    t.matched = true;
+                    t.running_done = true;
+                    t.running_freed = true;
+                }
+                // Case 3.1: running FIN, pending exists -> skip (transient state).
+                // Case 1/2 (pending ACK/FIN) will complete running implicitly via running_done=true.
+            }
+            else
+            {
+                // Case 4: running ACK -- only pending_freed (slot now hardware-latched)
+                t.matched = true;
+                t.pending_freed = true;
+            }
+        }
+        return t;
+    }
+
+    void complete_slot_task(PTO2TaskSlotState &slot_state, int32_t expected_reg_task_id, int32_t core_id, int32_t &completed_this_turn)
+    {
+        AICoreCompletionMailbox *mailbox = rt_ != nullptr ? rt_->aicore_mailbox : nullptr;
+        bool defer_completion_to_consumer = false;
+
+        if (slot_state.payload != nullptr)
+        {
+            volatile DeferredCompletionSlab *deferred_slab = &deferred_slab_per_core_[core_id][expected_reg_task_id & 1];
+            // (q) Read count first. AICore only writes error_code as part of a
+            // condition-registration attempt that also increments count, so
+            // count == 0 ⇒ no error and no conditions to forward. This is the
+            // common path for kernels that don't use async waits (paged
+            // attention, GEMM, etc.) and saves an L1 load + branch per call.
+            uint32_t cond_count = deferred_slab->count;
+            if (cond_count != 0)
+            {
+                int32_t slab_err = deferred_slab->error_code;
+                if (slab_err != PTO2_ERROR_NONE)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    sched_->sm_header->sched_error_code.compare_exchange_strong(expected, slab_err, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    return;
+                }
+                if (cond_count > MAX_COMPLETIONS_PER_TASK)
+                {
+                    int32_t expected = PTO2_ERROR_NONE;
+                    sched_->sm_header->sched_error_code.compare_exchange_strong(expected, PTO2_ERROR_ASYNC_REGISTRATION_FAILED, std::memory_order_acq_rel, std::memory_order_acquire);
+                    completed_.store(true, std::memory_order_release);
+                    return;
+                }
+
+                slot_state.any_subtask_deferred.store(true, std::memory_order_release);
+
+                const PTO2TaskId token = slot_state.task->task_id;
+                for (uint32_t i = 0; i < cond_count; ++i)
+                {
+                    volatile DeferredCompletionEntry *e = &deferred_slab->entries[i];
+                    while (!mailbox->try_push_condition(token, e->addr, e->expected_value, e->engine, e->completion_type))
+                    {
+                        sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                        SPIN_WAIT_HINT();
+                    }
+                }
+            }
+        }
+
+        bool mixed_complete = sched_->on_subtask_complete(slot_state);
+
+        if (mixed_complete && slot_state.payload != nullptr && slot_state.any_subtask_deferred.load(std::memory_order_acquire))
+        {
+            // Some subtask of this task registered conditions; finish the
+            // registration by handing the slot_state off to the consumer.
+            while (!mailbox->try_push_normal_done(slot_state.task->task_id, reinterpret_cast<uint64_t>(&slot_state)))
+            {
+                sched_->async_wait_list.mpsc_skipped_count.fetch_add(1, std::memory_order_relaxed);
+                SPIN_WAIT_HINT();
+            }
+            defer_completion_to_consumer = true;
+        }
+
+        if (mixed_complete && !defer_completion_to_consumer)
+        {
+            sched_->on_mixed_task_complete(slot_state);
+            completed_this_turn++;
+        }
+    }
+
+    static void promote_pending_to_running(CoreExecState &core)
+    {
+        core.running_slot_state = core.pending_slot_state;
+        core.running_reg_task_id = core.pending_reg_task_id;
+        core.running_subslot = core.pending_subslot;
+        core.pending_slot_state = nullptr;
+        core.pending_reg_task_id = AICPU_TASK_INVALID;
+    }
+    static void clear_running_slot(CoreExecState &core)
+    {
+        core.running_slot_state = nullptr;
+        core.running_reg_task_id = AICPU_TASK_INVALID;
+    }
+
+    void check_running_cores_for_completion(int32_t thread_idx, int32_t &completed_this_turn, int32_t &cur_thread_completed, bool &made_progress)
+    {
+        SchedulerThreadProfile &profile = thread_profiles_[thread_idx];
+        CoreTracker &tracker = core_trackers_[thread_idx];
+        auto running_core_states = tracker.get_all_running_cores();
+        while (running_core_states.has_value())
+        {
+            int32_t bit_pos = running_core_states.pop_first();
+            int32_t core_id = tracker.get_core_id_by_offset(bit_pos);
+            CoreExecState &core = core_exec_states_[core_id];
+            profile.cores_scanned++;
+
+            uint64_t reg_val = static_cast<uint64_t>(*core.cond_ptr);
+            rmb();
+            int32_t reg_task_id = EXTRACT_TASK_ID(reg_val);
+            int32_t reg_state = EXTRACT_TASK_STATE(reg_val);
+
+            SlotTransition t = decide_slot_transition(reg_task_id, reg_state, core.running_reg_task_id, core.pending_reg_task_id);
+            if (!t.matched) continue;
+
+            // --- Apply phase: execute actions based on transition ---
+
+            // 1. Complete finished tasks (capture pointers before modifying core state)
+            if (t.pending_done)
+            {
+                uint64_t tc0 = get_sys_cnt_aicpu();
+                complete_slot_task(*core.pending_slot_state, core.pending_reg_task_id, core_id, completed_this_turn);
+                profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
+                profile.complete_task_calls++;
+                cur_thread_completed++;
+            }
+            if (t.running_done)
+            {
+                uint64_t tc0 = get_sys_cnt_aicpu();
+                complete_slot_task(*core.running_slot_state, core.running_reg_task_id, core_id, completed_this_turn);
+                profile.complete_task_cycles += get_sys_cnt_aicpu() - tc0;
+                profile.complete_task_calls++;
+                cur_thread_completed++;
+            }
+
+            // 2. Update slot data
+            if (t.running_freed)
+            {
+                if (core.pending_slot_state != nullptr && !t.pending_done)
+                {
+                    promote_pending_to_running(core);  // Case 2 or Case 3 (with pending)
+                }
+                else
+                {
+                    clear_running_slot(core);  // Case 1 or Case 3 (no pending)
+                    if (t.pending_done)
+                    {
+                        core.pending_slot_state = nullptr;
+                        core.pending_reg_task_id = AICPU_TASK_INVALID;
+                    }
+                }
+            }
+
+            // 3. Update tracker bitmap
+            bool is_idle = (core.running_reg_task_id == AICPU_TASK_INVALID);
+            if (is_idle)
+            {
+                tracker.change_core_state(bit_pos);       // Mark idle
+                tracker.clear_pending_occupied(bit_pos);  // Idle safeguard: no payload to protect
+            }
+            else if (t.pending_freed && core.pending_reg_task_id == AICPU_TASK_INVALID)
+            {
+                tracker.clear_pending_occupied(bit_pos);
+            }
+
+            // 4. Progress signal (only when running task completes)
+            if (t.running_done) made_progress = true;
+        }
+    }
+
+    bool enter_drain_mode(PTO2TaskSlotState *slot_state, int32_t block_num)
+    {
+        int32_t expected = 0;
+        if (!drain_state_.sync_start_pending.compare_exchange_strong(expected, -1, std::memory_order_relaxed, std::memory_order_relaxed)) return false;  // Another thread already holds the drain slot.
+        // We own the drain slot.  Store the task and reset election flag before making it visible.
+        drain_state_.pending_task.store(slot_state, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        // Release store: all stores above are now visible to any thread that
+        // acquire-loads sync_start_pending and sees block_num > 0.
+        drain_state_.sync_start_pending.store(block_num, std::memory_order_release);
+        return true;
+    }
+    int32_t count_global_available(PTO2ResourceShape shape)
+    {
+        int32_t total = 0;
+        for (int32_t t = 0; t < active_sched_threads_; t++) total += core_trackers_[t].get_idle_core_offset_states(shape).count();
+        return total;
+    }
+    void drain_worker_dispatch(int32_t block_num)
+    {
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+        if (!slot_state)
+        {
+            drain_state_.sync_start_pending.store(0, std::memory_order_release);
+            return;
+        }
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+
+        for (int32_t t = 0; t < active_sched_threads_ && slot_state->next_block_idx < block_num; t++)
+        {
+            auto valid = core_trackers_[t].get_idle_core_offset_states(shape);
+            int32_t remaining = slot_state->logical_block_num - slot_state->next_block_idx;
+            int32_t claim = std::min(valid.count(), remaining);
+            int32_t start = slot_state->next_block_idx;
+            slot_state->next_block_idx += claim;
+            PublishHandle handles[CoreTracker::MAX_CLUSTERS * 3];
+            int handle_count = 0;
+            for (int32_t b = 0; b < claim; b++)
+            {
+                auto core_offset = valid.pop_first();
+                handle_count += prepare_block_for_dispatch(t, core_offset, *slot_state, shape, false, start + b, &handles[handle_count]);
+            }
+            wmb();
+            uint64_t dispatch_ts = 0;
+            for (int i = 0; i < handle_count; i++) publish_subtask_to_core(handles[i], dispatch_ts);
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        drain_state_.pending_task.store(nullptr, std::memory_order_release);
+        drain_state_.drain_ack_mask.store(0, std::memory_order_relaxed);
+        drain_state_.drain_worker_elected.store(0, std::memory_order_relaxed);
+        drain_state_.sync_start_pending.store(0, std::memory_order_release);
+    }
+    void handle_drain_mode(int32_t thread_idx)
+    {
+        // Spin until drain is fully initialized (sentinel -1 -> block_num > 0).
+        int32_t block_num;
+        do {
+            block_num = drain_state_.sync_start_pending.load(std::memory_order_acquire);
+        } while (block_num < 0);
+        if (block_num == 0) return;
+
+        uint32_t all_acked = (1u << active_sched_threads_) - 1;
+
+        // Ack barrier -- signal this thread has stopped dispatch.
+        drain_state_.drain_ack_mask.fetch_or(1u << thread_idx, std::memory_order_release);
+
+        // Spin until all threads have acked.
+        // If our bit is cleared while waiting, elected reset due to insufficient resources.
+        while (true)
+        {
+            uint32_t ack = drain_state_.drain_ack_mask.load(std::memory_order_acquire);
+            if ((ack & all_acked) == all_acked) break;
+            if ((ack & (1u << thread_idx)) == 0) return;
+            SPIN_WAIT_HINT();
+        }
+
+        // Election -- exactly one thread wins the CAS.
+        int32_t expected = 0;
+        drain_state_.drain_worker_elected.compare_exchange_strong(expected, thread_idx + 1, std::memory_order_acquire, std::memory_order_relaxed);
+
+        if (drain_state_.drain_worker_elected.load(std::memory_order_relaxed) != thread_idx + 1)
+        {
+            // Non-elected: spin-wait for drain completion or resource-insufficient reset.
+            while (drain_state_.sync_start_pending.load(std::memory_order_acquire) != 0)
+            {
+                if (drain_state_.drain_worker_elected.load(std::memory_order_acquire) == 0) return;
+                SPIN_WAIT_HINT();
+            }
+            return;
+        }
+
+        // Elected: check if global resources are sufficient.
+        PTO2TaskSlotState *slot_state = drain_state_.pending_task.load(std::memory_order_acquire);
+        if (slot_state == nullptr)
+        {
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+        PTO2ResourceShape shape = slot_state->active_mask.to_shape();
+        int32_t available = count_global_available(shape);
+
+        if (available < block_num)
+        {
+            // Insufficient resources -- reset drain fields so threads can resume
+            // completion polling to free running cores, then retry.
+            drain_state_.drain_ack_mask.store(0, std::memory_order_release);
+            drain_state_.drain_worker_elected.store(0, std::memory_order_release);
+            return;
+        }
+
+        // Dispatch -- all other threads are spinning, elected thread has exclusive tracker access.
+        drain_worker_dispatch(block_num);
+    }
+
+    LoopAction handle_orchestrator_exit(PTO2SharedMemoryHeader *header, Runtime *runtime)
+    {
+        if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+        int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+        if (sched_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+
+        if (!orchestrator_done_) return LoopAction::NONE;
+
+        if (total_tasks_ > 0 && completed_tasks_.load(std::memory_order_relaxed) >= total_tasks_)
+        {
+            completed_.store(true, std::memory_order_release);
+            return LoopAction::BREAK_LOOP;
+        }
+        return LoopAction::NONE;
+    }
+
+    LoopAction handle_core_transition(bool &cores_released)
+    {
+        if (!transition_requested_.load(std::memory_order_acquire)) return LoopAction::NONE;
+        if (!reassigned_.load(std::memory_order_acquire))
+        {
+            wait_reassign_.fetch_add(1, std::memory_order_release);
+            while (!reassigned_.load(std::memory_order_acquire))
+            {
+                if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+                SPIN_WAIT_HINT();
+            }
+        }
+        cores_released = true;
+        return LoopAction::NONE;
+    }
+
+    LoopAction check_idle_fatal_error(PTO2SharedMemoryHeader *header, Runtime *runtime)
+    {
+        if (completed_.load(std::memory_order_acquire)) return LoopAction::BREAK_LOOP;
+        int32_t orch_err = header->orch_error_code.load(std::memory_order_acquire);
+        if (orch_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        int32_t sched_err = header->sched_error_code.load(std::memory_order_acquire);
+        if (sched_err != PTO2_ERROR_NONE)
+        {
+            if (!completed_.exchange(true, std::memory_order_acq_rel)) emergency_shutdown(runtime);
+            return LoopAction::BREAK_LOOP;
+        }
+        return LoopAction::NONE;
+    }
+
+    void log_stall_diagnostics(int32_t thread_idx)
+    {
+        CoreTracker &tracker = core_trackers_[thread_idx];
+
+        // T0 owns the shared-ring scan; printing it from other threads would
+        // produce identical TASK lines once per scheduler thread.
+        if (thread_idx == 0)
+        {
+            int32_t cnt_ready = 0, cnt_waiting = 0, cnt_running = 0, submitted_in_ring = 0;
+            for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
+            {
+                PTO2SharedMemoryRingHeader &ring = *sched_->ring_sched_states[r].ring;
+                int32_t ring_task_count = ring.fc.current_task_index.load(std::memory_order_relaxed);
+                submitted_in_ring += ring_task_count;
+                for (int32_t si = 0; si < ring_task_count; si++)
+                {
+                    PTO2TaskSlotState &slot_state = ring.get_slot_state_by_task_id(si);
+                    // (m) task_state retired; use completion_flags directly.
+                    bool fanin_ready = sched_->fanin_satisfied(&slot_state);
+                    if (ring.completion_flags[si & ring.task_window_mask].load(std::memory_order_relaxed) != 0) continue;
+                    char running_on[192] = {0};
+                    int32_t owner = -1;
+                    int32_t pos = 0;
+                    bool is_running = false;
+                    for (int32_t cid = 0; cid < cores_total_num_ && pos + 32 < (int32_t)sizeof(running_on); cid++)
+                    {
+                        if (core_exec_states_[cid].running_slot_state != &slot_state) continue;
+                        is_running = true;
+                        if (owner < 0) owner = find_core_owner_thread(cid);
+                        const char *sname = subslot_name(core_exec_states_[cid].running_subslot);
+                        int32_t written = snprintf(running_on + pos, sizeof(running_on) - pos, "%score=%d(%s)", pos == 0 ? "" : " ", cid, sname);
+                        if (written > 0) pos += written;
+                    }
+
+                    if (is_running)
+                    {
+                        cnt_running++;
+                        if (cnt_running > STALL_DUMP_READY_MAX) continue;
+                        continue;
+                    }
+                    if (fanin_ready)
+                    {
+                        cnt_ready++;
+                        if (cnt_ready > STALL_DUMP_READY_MAX) continue;
+                        continue;
+                    }
+                    cnt_waiting++;
+                    if (cnt_waiting > STALL_DUMP_WAIT_MAX) continue;
+                }
+            }
+        }
+
+        for (int32_t cli = 0; cli < tracker.get_cluster_count() && cli < STALL_DUMP_CORE_MAX; cli++)
+        {
+            int32_t offset = cli * 3;
+            int32_t aic_id = tracker.get_aic_core_id(offset);
+            int32_t aiv0_id = tracker.get_aiv0_core_id(offset);
+            int32_t aiv1_id = tracker.get_aiv1_core_id(offset);
+            bool aic_idle = tracker.is_aic_core_idle(offset);
+            bool aiv0_idle = tracker.is_aiv0_core_idle(offset);
+            bool aiv1_idle = tracker.is_aiv1_core_idle(offset);
+            char aic_buf[128], aiv0_buf[128], aiv1_buf[128];
+            format_core_status(aic_buf, sizeof(aic_buf), aic_id, aic_idle, &core_exec_states_[aic_id], core_exec_states_[aic_id].reg_addr);
+            format_core_status(aiv0_buf, sizeof(aiv0_buf), aiv0_id, aiv0_idle, &core_exec_states_[aiv0_id], core_exec_states_[aiv0_id].reg_addr);
+            format_core_status(aiv1_buf, sizeof(aiv1_buf), aiv1_id, aiv1_idle, &core_exec_states_[aiv1_id], core_exec_states_[aiv1_id].reg_addr);
+        }
+    }
+
+    void log_shutdown_stall_snapshot()
+    {
+        int32_t thread_count = active_sched_threads_ > 0 ? active_sched_threads_ : aicpu_thread_num_;
+        if (thread_count < 0 || thread_count > MAX_AICPU_THREADS) thread_count = thread_count < 0 ? 0 : MAX_AICPU_THREADS;
+        for (int32_t t = 0; t < thread_count; t++) log_stall_diagnostics(t);
+    }
+
+    int32_t find_core_owner_thread(int32_t core_id) const
+    {
+        for (int32_t t = 0; t < aicpu_thread_num_; t++)
+        {
+            const int32_t *ids = core_trackers_[t].core_ids();
+            int32_t n = core_trackers_[t].core_num();
+            for (int32_t i = 0; i < n; i++)
+                if (ids[i] == core_id) return t;
+        }
+        return -1;
+    }
+
+    bool self_owns_running_task(int32_t thread_idx) const
+    {
+        const int32_t *cores = core_trackers_[thread_idx].core_ids();
+        int32_t core_num = core_trackers_[thread_idx].core_num();
+        for (int32_t i = 0; i < core_num; i++)
+            if (core_exec_states_[cores[i]].running_slot_state != nullptr) return true;
+        return false;
+    }
+
+    bool no_thread_owns_running_task() const
+    {
+        for (int32_t t = 0; t < aicpu_thread_num_; t++)
+            if (self_owns_running_task(t)) return false;
+        return true;
+    }
+
+    int32_t handle_timeout_exit(int32_t thread_idx, PTO2SharedMemoryHeader *header, Runtime *runtime)
+    {
+        latch_scheduler_error(header, thread_idx, PTO2_ERROR_SCHEDULER_TIMEOUT);
+        if (!completed_.exchange(true, std::memory_order_acq_rel))
+        {
+            log_shutdown_stall_snapshot();
+            emergency_shutdown(runtime);
+        }
+        return -PTO2_ERROR_SCHEDULER_TIMEOUT;
+    }
+
+    uint64_t get_function_bin_addr(int func_id) const
+    {
+        if (!func_id_to_addr_ || func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
+        return func_id_to_addr_[func_id];
+    }
+};
+
+#endif  // SCHEDULER_CONTEXT_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
new file mode 100644
index 000000000..68718affd
--- /dev/null
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler_types.h
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+#ifndef SCHEDULER_TYPES_H
+#define SCHEDULER_TYPES_H
+
+#include <atomic>
+#include <cstdint>
+
+#include "common/core_type.h"
+#include "common/platform_config.h"
+#include "pto_runtime2_types.h"
+#include "spin_hint.h"
+
+constexpr int32_t MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS;
+
+// PLATFORM_MAX_IDLE_ITERATIONS was removed upstream; fixed cadence matches a5's
+// equivalent (used only for per-thread diagnostic logging, not for the fatal-
+// timeout path which uses wall-clock).
+constexpr int32_t STALL_LOG_INTERVAL = 480000;
+constexpr int32_t FATAL_ERROR_CHECK_INTERVAL = 1024;  // Check orchestrator error every N idle iters
+
+constexpr int32_t SCHEDULER_TIMEOUT_MS = PLATFORM_SCHEDULER_TIMEOUT_MS;
+constexpr uint64_t SCHEDULER_TIMEOUT_CYCLES = static_cast<uint64_t>(SCHEDULER_TIMEOUT_MS) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
+constexpr int32_t STALL_DUMP_READY_MAX = 8;
+constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
+constexpr int32_t STALL_DUMP_CORE_MAX = 8;
+constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
+constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+
+enum class LoopAction : int8_t
+{
+    NONE,        // cold path did not trigger; proceed normally
+    BREAK_LOOP,  // equivalent to 'break' from the while(true) loop
+};
+
+// Per-thread phase profiling. Accumulates cumulative cycle counts and entry
+// counts for each phase of resolve_and_dispatch's main loop. Dumped once at
+// loop exit via LOG_INFO_V9 — the hot path only does cycle counter math.
+struct alignas(64) SchedulerThreadProfile
+{
+    uint64_t total_cycles{0};
+    uint64_t completion_cycles{0};
+    // Sub-phase of completion: time spent INSIDE complete_slot_task, and
+    // count of times it ran (one per subtask completion observed).
+    uint64_t complete_task_cycles{0};
+    uint64_t complete_task_calls{0};
+    // Sub-phase of completion: count of cores scanned per iter (proxy for
+    // cond_ptr read cost; aggregate / completion_iters = avg cores/iter).
+    uint64_t cores_scanned{0};
+    uint64_t async_wait_cycles{0};
+    uint64_t drain_wiring_cycles{0};
+    uint64_t spsc_drain_cycles{0};    // sub-phase of drain_wiring: SPSC → pending FIFO
+    uint64_t pending_poll_cycles{0};  // sub-phase of drain_wiring: pending FIFO → ready
+    uint64_t dummy_drain_cycles{0};
+    uint64_t dispatch_cycles{0};
+    uint64_t idle_spin_cycles{0};
+    uint64_t completion_iters{0};
+    uint64_t async_wait_iters{0};
+    uint64_t drain_wiring_iters{0};
+    uint64_t spsc_drain_iters{0};
+    uint64_t pending_poll_iters{0};
+    uint64_t pending_poll_skipped{0};  // (a) gate hits: poll calls skipped due to no new completions
+    uint64_t dummy_drain_iters{0};
+    uint64_t dispatch_iters{0};
+    uint64_t idle_iters{0};
+    uint64_t total_iters{0};
+
+    void reset() { *this = SchedulerThreadProfile{}; }
+};
+
+struct alignas(64) CoreExecState
+{
+    // --- Hot fields (completion + dispatch, every iteration) ---
+    uint64_t reg_addr;                      // offset  0: register base address (set once in handshake)
+    PTO2TaskSlotState *running_slot_state;  // offset  8: slot state for running task (nullptr = empty)
+    PTO2TaskSlotState *pending_slot_state;  // offset 16: slot state for pending task (nullptr = empty)
+    int32_t running_reg_task_id;            // offset 24: register task ID (AICPU_TASK_INVALID = idle)
+    int32_t pending_reg_task_id;            // offset 28: pending register task ID (AICPU_TASK_INVALID = none)
+    uint32_t dispatch_seq;                  // offset 32: monotonic dispatch counter
+    PTO2SubtaskSlot running_subslot;        // offset 36: which subtask slot is running
+    PTO2SubtaskSlot pending_subslot;        // offset 37: which subtask slot is pending
+    uint8_t pad0_[2];                       // offset 38: alignment padding
+    volatile uint32_t *cond_ptr;            // offset 40: precomputed pointer to COND register
+    // --- Cold fields (init/diagnostics only, never in hot path) ---
+    int32_t worker_id;          // offset 48: index in runtime.workers[]
+    uint32_t physical_core_id;  // offset 52: hardware physical core ID
+    CoreType core_type;         // offset 56: AIC or AIV (enum class : int32_t)
+    uint8_t pad2_[4];           // offset 60: pad to 64 bytes
+};
+static_assert(sizeof(CoreExecState) == 64, "CoreExecState must occupy exactly one cache line");
+
+class alignas(64) CoreTracker
+{
+public:
+    static inline int32_t MAX_CORE_PER_THREAD = 63;
+    static constexpr int32_t MAX_CLUSTERS = 63 / 3;
+
+public:
+    CoreTracker() = default;
+
+    class BitStates
+    {
+    public:
+        BitStates() = default;
+
+        explicit BitStates(uint64_t states) :
+            states_(states)
+        {}
+        void init()
+        {
+            states_ = 0;
+        }
+
+        BitStates operator~() const
+        {
+            return BitStates(~states_);
+        }
+        BitStates operator&(const BitStates &other) const
+        {
+            return BitStates(states_ & other.states_);
+        }
+        BitStates operator|(const BitStates &other) const
+        {
+            return BitStates(states_ | other.states_);
+        }
+        BitStates operator^(const BitStates &other) const
+        {
+            return BitStates(states_ ^ other.states_);
+        }
+        BitStates operator>>(int32_t offset) const
+        {
+            return BitStates(states_ >> offset);
+        }
+        BitStates operator<<(int32_t offset) const
+        {
+            return BitStates(states_ << offset);
+        }
+        void operator&=(const BitStates &other)
+        {
+            states_ &= other.states_;
+        }
+        void operator|=(const BitStates &other)
+        {
+            states_ |= other.states_;
+        }
+        void operator^=(const BitStates &other)
+        {
+            states_ ^= other.states_;
+        }
+
+        bool has_value() const
+        {
+            return states_ > 0;
+        }
+        int32_t count() const
+        {
+            return __builtin_popcountll(states_);
+        }
+
+        // Extract the lowest set bit from mask, clear it, and return its position.
+        // Returns -1 if mask is empty.
+        int32_t pop_first()
+        {
+            if (states_ == 0) return -1;
+            int32_t pos = __builtin_ctzll(states_);
+            states_ &= states_ - 1;
+            return pos;
+        }
+
+    private:
+        uint64_t states_{0};
+    };
+
+public:
+    void init(int32_t cluster_count)
+    {
+        cluster_count_ = cluster_count;
+        aic_mask_.init();
+        aiv_mask_.init();
+        pending_occupied_.init();
+        for (int32_t i = 0; i < cluster_count; i++)
+        {
+            aic_mask_ |= BitStates(1ULL << (i * 3));
+            aiv_mask_ |= BitStates(6ULL << (i * 3));
+        }
+        core_states_ = aic_mask_ | aiv_mask_;
+    }
+
+    void set_cluster(int32_t cluster_idx, int32_t aic_wid, int32_t aiv0_wid, int32_t aiv1_wid)
+    {
+        core_id_map_[cluster_idx * 3] = aic_wid;
+        core_id_map_[cluster_idx * 3 + 1] = aiv0_wid;
+        core_id_map_[cluster_idx * 3 + 2] = aiv1_wid;
+    }
+
+    int32_t get_cluster_count() const
+    {
+        return cluster_count_;
+    }
+
+    // --- Running core queries ---
+
+    template <CoreType CT>
+    bool has_running_cores() const
+    {
+        if constexpr (CT == CoreType::AIC) return ((~core_states_) & aic_mask_).has_value();
+        else return ((~core_states_) & aiv_mask_).has_value();
+    }
+
+    bool has_any_running_cores() const
+    {
+        return ((~core_states_) & (aic_mask_ | aiv_mask_)).has_value();
+    }
+
+    template <CoreType CT>
+    int32_t get_running_count() const
+    {
+        if constexpr (CT == CoreType::AIC) return ((~core_states_) & aic_mask_).count();
+        else return ((~core_states_) & aiv_mask_).count();
+    }
+
+    // Return an opaque bitmask for iterating running cores of a given type.
+    // Use pop_first() to extract core bit offsets one at a time.
+    template <CoreType CT>
+    BitStates get_running_cores() const
+    {
+        if constexpr (CT == CoreType::AIC) return (~core_states_) & aic_mask_;
+        else return (~core_states_) & aiv_mask_;
+    }
+
+    BitStates get_all_running_cores() const
+    {
+        return (~core_states_) & (aic_mask_ | aiv_mask_);
+    }
+
+    // --- Cluster matching ---
+
+    BitStates get_valid_cluster_offset_states(PTO2ResourceShape shape) const
+    {
+        switch (shape)
+        {
+        case PTO2ResourceShape::AIC:
+            return core_states_ & aic_mask_;
+        case PTO2ResourceShape::AIV:
+            return ((core_states_ >> 1) | (core_states_ >> 2)) & aic_mask_;
+        case PTO2ResourceShape::MIX:
+            return (core_states_ >> 1) & (core_states_ >> 2) & core_states_ & aic_mask_;
+        case PTO2ResourceShape::DUMMY:
+            // DUMMY tasks never reach the core-tracker dispatch path; they are
+            // completed inline by resolve_and_dispatch via dummy_ready_queue.
+            return BitStates(0ULL);
+        }
+        return BitStates(0ULL);
+    }
+
+    int32_t get_aic_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset];
+    }
+    int32_t get_aiv0_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset + 1];
+    }
+    int32_t get_aiv1_core_id(int32_t cluster_offset) const
+    {
+        return core_id_map_[cluster_offset + 2];
+    }
+
+    int32_t get_aic_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset;
+    }
+    int32_t get_aiv0_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset + 1;
+    }
+    int32_t get_aiv1_core_offset(int32_t cluster_offset) const
+    {
+        return cluster_offset + 2;
+    }
+
+    bool is_aic_core_idle(int32_t cluster_offset) const
+    {
+        return ((core_states_ >> cluster_offset) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv0_core_idle(int32_t cluster_offset) const
+    {
+        return ((core_states_ >> (cluster_offset + 1)) & BitStates(1ULL)).has_value();
+    }
+    bool is_aiv1_core_idle(int32_t cluster_offset) const
+    {
+        return ((core_states_ >> (cluster_offset + 2)) & BitStates(1ULL)).has_value();
+    }
+
+    // --- State mutation ---
+
+    // Toggle bit at the given bit offset (running <-> idle)
+    void change_core_state(int32_t bit_offset)
+    {
+        core_states_ ^= BitStates(1ULL << bit_offset);
+    }
+
+    void set_pending_occupied(int32_t bit_offset)
+    {
+        pending_occupied_ |= BitStates(1ULL << bit_offset);
+    }
+    void clear_pending_occupied(int32_t bit_offset)
+    {
+        pending_occupied_ ^= (pending_occupied_ & BitStates(1ULL << bit_offset));
+    }
+
+    // --- Two-phase dispatch queries ---
+
+    BitStates get_idle_core_offset_states(PTO2ResourceShape shape) const
+    {
+        if (shape == PTO2ResourceShape::AIC) return get_valid_cluster_offset_states(shape) & ~(pending_occupied_ & aic_mask_);
+        if (shape == PTO2ResourceShape::AIV) return core_states_ & aiv_mask_;
+        return get_valid_cluster_offset_states(shape);  // MIX: cluster-level
+    }
+
+    BitStates get_pending_core_offset_states(PTO2ResourceShape shape) const
+    {
+        if (shape == PTO2ResourceShape::MIX)
+        {
+            // Any core without a pending payload can accept a dispatch (idle or running).
+            BitStates available = ~pending_occupied_;
+            BitStates mix_available = (available & aic_mask_) & ((available >> 1) & aic_mask_) & ((available >> 2) & aic_mask_);
+            // Exclude fully-idle clusters (handled by IDLE phase) to prevent double-dispatch.
+            BitStates running = ~core_states_;
+            BitStates cluster_has_running = (running & aic_mask_) | ((running >> 1) & aic_mask_) | ((running >> 2) & aic_mask_);
+            return mix_available & cluster_has_running;
+        }
+        if (shape == PTO2ResourceShape::AIC) return (~core_states_) & aic_mask_ & ~(pending_occupied_ & aic_mask_);
+        // AIV
+        return (~core_states_) & aiv_mask_ & ~pending_occupied_;
+    }
+
+    // --- Two-phase dispatch unified query ---
+
+    enum class DispatchPhase : uint8_t
+    {
+        IDLE,
+        PENDING
+    };
+
+    BitStates get_dispatchable_cores(PTO2ResourceShape shape, DispatchPhase phase) const
+    {
+        return (phase == DispatchPhase::IDLE) ? get_idle_core_offset_states(shape) : get_pending_core_offset_states(shape);
+    }
+
+    // --- Bit offset <-> worker_id mapping ---
+
+    int32_t get_core_id_by_offset(int32_t offset) const
+    {
+        return core_id_map_[offset];
+    }
+
+    const int32_t *core_ids() const
+    {
+        return core_id_map_;
+    }
+    int32_t core_num() const
+    {
+        return cluster_count_ * 3;
+    }
+
+private:
+    int32_t cluster_count_;
+    BitStates aic_mask_;
+    BitStates aiv_mask_;
+    BitStates core_states_;
+    BitStates pending_occupied_;
+    int32_t core_id_map_[63];  // bit_position -> worker_id, max 21 clusters * 3
+};
+
+struct SlotTransition
+{
+    bool running_done = false;   // running task completed
+    bool pending_done = false;   // pending task completed
+    bool running_freed = false;  // running slot data should be released
+    bool pending_freed = false;  // pending_occupied can be cleared
+    bool matched = false;        // some case was hit (otherwise skip apply)
+};
+
+// When sync_start_pending != 0, all scheduler threads skip dispatch
+// (only process completions) until the drain worker finishes launching all blocks.
+struct alignas(64) SyncStartDrainState
+{
+    std::atomic<int32_t> sync_start_pending{0};              // 0=normal; -1=initializing; >0=active (value=block_num)
+    std::atomic<int32_t> drain_worker_elected{0};            // 0=none; >0: elected thread's (thread_idx+1)
+    std::atomic<uint32_t> drain_ack_mask{0};                 // bit per thread; all-set = all threads reached ack barrier
+    std::atomic<PTO2TaskSlotState *> pending_task{nullptr};  // held task (not re-queued)
+    int32_t _pad[10];
+};
+static_assert(sizeof(SyncStartDrainState) == 64);
+
+#endif  // SCHEDULER_TYPES_H
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
deleted file mode 100644
index 951dec2c8..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_runtime2_init.cpp
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Host/AICPU shared runtime-arena layout, init_data and wire implementations.
- *
- * Lives under runtime/shared/ so it is included in both the host_runtime.so
- * build (host pre-populates the prebuilt arena image) and the aicpu_runtime
- * build (AICPU runs wire_arena_pointers + destroy after attach). The
- * device-only parts of pto_runtime2.cpp / pto_orchestrator.cpp / pto_scheduler.cpp
- * (ops table, scope/submit/dispatch business logic, profiling) stay in their
- * original files and the aicpu build only.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "pto_orchestrator.h"
-#include "pto_runtime2.h"
-#include "pto_ring_buffer.h"
-#include "pto_shared_memory.h"
-#include "pto_tensormap.h"
-#include "scheduler/pto_scheduler.h"
-
-// =============================================================================
-// Ready queue
-// =============================================================================
-
-size_t ready_queue_reserve_layout(DeviceArena &arena, uint64_t capacity) {
-    // Align the slots[] base to a full cache line so MPMC CAS traffic on the
-    // first slot cannot false-share with whatever region sits in front of us
-    // (e.g. orchestrator tensormap heads written by the orch thread).
-    return arena.reserve(capacity * sizeof(PTO2ReadyQueueSlot), PTO2_ALIGN_SIZE);
-}
-
-bool ready_queue_init_data_from_layout(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off, uint64_t capacity) {
-    // Address the slots region for data writes without storing the pointer in
-    // queue->slots — that field is set by ready_queue_wire_arena_pointers.
-    auto *slots_arena = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-    queue->capacity = capacity;
-    queue->mask = capacity - 1;
-    queue->enqueue_pos.store(0, std::memory_order_relaxed);
-    queue->dequeue_pos.store(0, std::memory_order_relaxed);
-
-    for (uint64_t i = 0; i < capacity; i++) {
-        slots_arena[i].sequence.store((int64_t)i, std::memory_order_relaxed);
-        slots_arena[i].slot_state = nullptr;
-    }
-
-    return true;
-}
-
-void ready_queue_wire_arena_pointers(PTO2ReadyQueue *queue, DeviceArena &arena, size_t slots_off) {
-    queue->slots = static_cast<PTO2ReadyQueueSlot *>(arena.region_ptr(slots_off));
-}
-
-void ready_queue_destroy(PTO2ReadyQueue *queue) {
-    // Arena owns the slots[] buffer; just forget the pointer.
-    queue->slots = nullptr;
-}
-
-// =============================================================================
-// Scheduler
-// =============================================================================
-
-bool PTO2SchedulerState::RingSchedState::init_data_from_layout(void *sm_dev_base, int32_t ring_id) {
-    // ring stores the device address of the SM ring header — pure offset
-    // arithmetic, no SM load.
-    ring = pto2_sm_layout::ring_header_addr(sm_dev_base, ring_id);
-    last_task_alive = 0;
-    advance_lock.store(0, std::memory_order_relaxed);
-#if PTO2_PROFILING
-    dep_pool_snapshot_tail.store(1, std::memory_order_relaxed);
-    dep_pool_snapshot_top.store(1, std::memory_order_relaxed);
-#endif
-
-    // Per-slot SM-side initialization (bind_ring + reset_for_reuse +
-    // fanin_count/active_mask zero) lives in PTO2SharedMemoryHandle::
-    // init_header_per_ring so the AICPU performs it during SM reset; host
-    // prebuilt-arena init skips SM access here.
-
-    return true;
-}
-
-void PTO2SchedulerState::RingSchedState::destroy() { ring = nullptr; }
-
-PTO2SchedulerLayout PTO2SchedulerState::reserve_layout(DeviceArena &arena, int32_t dep_pool_capacity) {
-    PTO2SchedulerLayout layout{};
-    layout.ready_queue_capacity = PTO2_READY_QUEUE_SIZE;
-    layout.spsc_capacity = PTO2_WRIRING_QUEUE_SIZE;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        layout.off_ready_queue_slots[i] = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    }
-    layout.off_dummy_ready_queue_slots = ready_queue_reserve_layout(arena, PTO2_READY_QUEUE_SIZE);
-    layout.off_early_dispatch_queue_slots = ready_queue_reserve_layout(arena, PTO2_EARLY_DISPATCH_QUEUE_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        // Force a cache-line base so writes from scheduler thread 0 (sole
-        // writer of this ring's dep_pool) do not invalidate adjacent
-        // multi-threaded regions like ready_queue.slots.
-        layout.off_dep_pool_entries[r] =
-            arena.reserve(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2DepListEntry), PTO2_ALIGN_SIZE);
-    }
-    layout.off_wiring_spsc_buffer = PTO2SpscQueue::reserve_layout(arena, PTO2_WRIRING_QUEUE_SIZE);
-    return layout;
-}
-
-bool PTO2SchedulerState::init_data_from_layout(
-    const PTO2SchedulerLayout &layout, DeviceArena &arena, void *sm_dev_base
-) {
-    PTO2SchedulerState *sched = this;
-    sched->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-#if PTO2_SCHED_PROFILING
-    sched->tasks_completed.store(0, std::memory_order_relaxed);
-    sched->tasks_consumed.store(0, std::memory_order_relaxed);
-#endif
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!sched->ring_sched_states[r].init_data_from_layout(sm_dev_base, r)) {
-            return false;
-        }
-    }
-
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        if (!ready_queue_init_data_from_layout(
-                &sched->ready_queues[i], arena, layout.off_ready_queue_slots[i], layout.ready_queue_capacity
-            )) {
-            return false;
-        }
-    }
-    if (!ready_queue_init_data_from_layout(
-            &sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots, layout.ready_queue_capacity
-        )) {
-        return false;
-    }
-    if (!ready_queue_init_data_from_layout(
-            &sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots, PTO2_EARLY_DISPATCH_QUEUE_SIZE
-        )) {
-        return false;
-    }
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *dep_entries = static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-        memset(dep_entries, 0, static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2DepListEntry));
-        sched->ring_sched_states[r].dep_pool.init(dep_entries, layout.dep_pool_capacity, orch_err);
-    }
-
-    if (!sched->wiring.queue.init_data_from_layout(arena, layout.off_wiring_spsc_buffer, layout.spsc_capacity)) {
-        return false;
-    }
-    sched->wiring.batch_count = 0;
-    sched->wiring.batch_index = 0;
-    sched->wiring.backoff_counter = 0;
-
-    return true;
-}
-
-void PTO2SchedulerState::wire_arena_pointers(const PTO2SchedulerLayout &layout, DeviceArena &arena) {
-    PTO2SchedulerState *sched = this;
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_wire_arena_pointers(&sched->ready_queues[i], arena, layout.off_ready_queue_slots[i]);
-    }
-    ready_queue_wire_arena_pointers(&sched->dummy_ready_queue, arena, layout.off_dummy_ready_queue_slots);
-    ready_queue_wire_arena_pointers(&sched->early_dispatch_queue, arena, layout.off_early_dispatch_queue_slots);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].dep_pool.base =
-            static_cast<PTO2DepListEntry *>(arena.region_ptr(layout.off_dep_pool_entries[r]));
-    }
-    sched->wiring.queue.wire_arena_pointers(arena, layout.off_wiring_spsc_buffer);
-}
-
-void PTO2SchedulerState::destroy() {
-    PTO2SchedulerState *sched = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        sched->ring_sched_states[r].destroy();
-        sched->ring_sched_states[r].dep_pool.base = nullptr;
-    }
-    sched->wiring.queue.destroy();
-    for (int i = 0; i < PTO2_NUM_RESOURCE_SHAPES; i++) {
-        ready_queue_destroy(&sched->ready_queues[i]);
-    }
-    ready_queue_destroy(&sched->dummy_ready_queue);
-    ready_queue_destroy(&sched->early_dispatch_queue);
-}
-
-// =============================================================================
-// Orchestrator
-// =============================================================================
-
-PTO2OrchestratorLayout PTO2OrchestratorState::reserve_layout(
-    DeviceArena &arena, const int32_t task_window_sizes[PTO2_MAX_RING_DEPTH], int32_t dep_pool_capacity
-) {
-    PTO2OrchestratorLayout layout{};
-    layout.dep_pool_capacity = dep_pool_capacity;
-    layout.scope_tasks_cap = PTO2_SCOPE_TASKS_CAP;
-    layout.scope_stack_capacity = PTO2_MAX_SCOPE_DEPTH;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        layout.off_fanin_pool[r] = arena.reserve(fanin_pool_bytes, PTO2_ALIGN_SIZE);
-
-        always_assert(task_window_sizes[r] > 0 && (task_window_sizes[r] & (task_window_sizes[r] - 1)) == 0);
-        const size_t seen_epoch_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE);
-        layout.off_fanin_seen_epoch[r] = arena.reserve(seen_epoch_bytes, PTO2_ALIGN_SIZE);
-    }
-    layout.off_scope_tasks = arena.reserve(
-        static_cast<size_t>(layout.scope_tasks_cap) * sizeof(PTO2TaskSlotState *), alignof(PTO2TaskSlotState *)
-    );
-    layout.off_scope_begins =
-        arena.reserve(static_cast<size_t>(layout.scope_stack_capacity) * sizeof(int32_t), alignof(int32_t));
-    layout.tensor_map = PTO2TensorMap::reserve_layout_default(arena, task_window_sizes);
-    return layout;
-}
-
-bool PTO2OrchestratorState::init_data_from_layout(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, void *sm_dev_base, void *gm_heap, uint64_t heap_size,
-    uint64_t task_window_size
-) {
-    auto *orch = this;
-    *orch = PTO2OrchestratorState{};
-
-    orch->sm_header = reinterpret_cast<PTO2SharedMemoryHeader *>(sm_dev_base);
-    orch->gm_heap_base = gm_heap;
-    orch->gm_heap_size = heap_size * PTO2_MAX_RING_DEPTH;
-    orch->fatal = false;
-
-    // Mirror the SM API's per-ring window-size shape so a future per-ring
-    // SM layout cannot silently disagree with the addresses we compute here.
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++)
-        task_window_sizes[r] = task_window_size;
-
-    auto *orch_err = pto2_sm_layout::orch_error_code_addr(sm_dev_base);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        void *ring_heap_base = reinterpret_cast<char *>(gm_heap) + r * heap_size;
-        auto *task_descs_dev = pto2_sm_layout::ring_task_descriptors_addr(sm_dev_base, task_window_sizes, r);
-        auto *cur_idx_dev = pto2_sm_layout::ring_current_task_index_addr(sm_dev_base, r);
-        auto *last_alive_dev = pto2_sm_layout::ring_last_task_alive_addr(sm_dev_base, r);
-
-        orch->rings[r].task_allocator.init(
-            task_descs_dev, static_cast<int32_t>(task_window_size), cur_idx_dev, last_alive_dev, ring_heap_base,
-            heap_size, orch_err
-        );
-
-        const size_t fanin_pool_bytes =
-            PTO2_ALIGN_UP(static_cast<size_t>(layout.dep_pool_capacity) * sizeof(PTO2FaninSpillEntry), PTO2_ALIGN_SIZE);
-        auto *fanin_entries = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        memset(fanin_entries, 0, fanin_pool_bytes);
-        orch->rings[r].fanin_pool.init(fanin_entries, layout.dep_pool_capacity, orch_err);
-
-        const size_t seen_epoch_bytes = PTO2_ALIGN_UP(
-            static_cast<size_t>(layout.tensor_map.task_window_sizes[r]) * sizeof(uint32_t), PTO2_ALIGN_SIZE
-        );
-        auto *seen_epoch = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
-        memset(seen_epoch, 0, seen_epoch_bytes);
-        orch->fanin_seen_epoch[r] = seen_epoch;
-    }
-
-    if (!orch->tensor_map.init_data_from_layout(layout.tensor_map, arena)) {
-        return false;
-    }
-
-    orch->scope_tasks_size = 0;
-    orch->scope_tasks_capacity = layout.scope_tasks_cap;
-    orch->scope_stack_top = -1;
-    orch->scope_stack_capacity = layout.scope_stack_capacity;
-    orch->manual_begin_depth = PTO2_MAX_SCOPE_DEPTH;
-
-    return true;
-}
-
-void PTO2OrchestratorState::wire_arena_pointers(
-    const PTO2OrchestratorLayout &layout, DeviceArena &arena, PTO2SchedulerState *scheduler_arg
-) {
-    auto *orch = this;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = static_cast<PTO2FaninSpillEntry *>(arena.region_ptr(layout.off_fanin_pool[r]));
-        orch->fanin_seen_epoch[r] = static_cast<uint32_t *>(arena.region_ptr(layout.off_fanin_seen_epoch[r]));
-    }
-    orch->tensor_map.wire_arena_pointers(layout.tensor_map, arena);
-    orch->scope_tasks = static_cast<PTO2TaskSlotState **>(arena.region_ptr(layout.off_scope_tasks));
-    orch->scope_begins = static_cast<int32_t *>(arena.region_ptr(layout.off_scope_begins));
-    orch->scheduler = scheduler_arg;
-}
-
-void PTO2OrchestratorState::destroy() {
-    auto *orch = this;
-    orch->tensor_map.destroy();
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        orch->rings[r].fanin_pool.base = nullptr;
-        orch->fanin_seen_epoch[r] = nullptr;
-    }
-    orch->scope_tasks = nullptr;
-    orch->scope_begins = nullptr;
-}
-
-void PTO2OrchestratorState::set_scheduler(PTO2SchedulerState *scheduler) { this->scheduler = scheduler; }
-
-// =============================================================================
-// Top-level runtime arena
-// =============================================================================
-
-PTO2RuntimeArenaLayout
-runtime_reserve_layout(DeviceArena &arena, uint64_t task_window_size, int32_t dep_pool_capacity) {
-    PTO2RuntimeArenaLayout layout{};
-    layout.task_window_size = task_window_size;
-    layout.dep_pool_capacity = dep_pool_capacity;
-
-    int32_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = static_cast<int32_t>(task_window_size);
-    }
-
-    layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes, dep_pool_capacity);
-    layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacity);
-    layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
-    layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
-
-    layout.arena_size = arena.total_size();
-    return layout;
-}
-
-PTO2Runtime *runtime_init_data_from_layout(
-    DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
-    uint64_t /*sm_size*/, void *gm_heap_dev_base, uint64_t heap_size
-) {
-    PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
-    memset(rt, 0, sizeof(*rt));
-
-    auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
-    memset(sm_wrap, 0, sizeof(*sm_wrap));
-
-    // rt->ops is filled by the AICPU at boot.
-    rt->mode = mode;
-    rt->gm_heap = gm_heap_dev_base;
-    rt->gm_heap_size = heap_size > 0 ? heap_size * PTO2_MAX_RING_DEPTH : 0;
-    rt->gm_heap_owned = false;
-    rt->total_cycles = 0;
-
-    if (!rt->orchestrator.init_data_from_layout(
-            layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_size, layout.task_window_size
-        )) {
-        return nullptr;
-    }
-    if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
-        return nullptr;
-    }
-
-    auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
-    memset(mailbox, 0, sizeof(*mailbox));
-
-    return rt;
-}
-
-void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
-    rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
-    rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
-    rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
-    rt->scheduler.wire_arena_pointers(layout.sched, arena);
-}
-
-void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
-    // Arena buffer is pooled across runs by DeviceRunner — never freed here.
-    if (!rt) return;
-    rt->scheduler.destroy();
-    rt->orchestrator.destroy();
-    rt->aicore_mailbox = nullptr;
-    rt->sm_handle = nullptr;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
deleted file mode 100644
index 1e1edff92..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_shared_memory.cpp
+++ /dev/null
@@ -1,255 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - Shared Memory Implementation
- *
- * Implements shared memory allocation, initialization, and management
- * for Orchestrator-Scheduler communication.
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_shared_memory.h"
-#include <inttypes.h>
-#include <stdlib.h>
-#include <string.h>
-#include "common/unified_log.h"
-
-// =============================================================================
-// Size Calculation
-// =============================================================================
-
-uint64_t PTO2SharedMemoryHandle::calculate_size(uint64_t task_window_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-    }
-    return calculate_size_per_ring(task_window_sizes);
-}
-
-uint64_t PTO2SharedMemoryHandle::calculate_size_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    uint64_t size = 0;
-
-    // Header (aligned to cache line)
-    size += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-
-    // Per-ring task descriptors and payloads
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        size += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-
-    return size;
-}
-
-// =============================================================================
-// Creation and Destruction
-// =============================================================================
-
-void PTO2SharedMemoryHandle::setup_pointers_per_ring(const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    char *ptr = (char *)sm_base;
-
-    // Header
-    header = (PTO2SharedMemoryHeader *)ptr;
-    ptr += PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-
-    // Per-ring task descriptors, payloads, and slot states
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto &ring = header->rings[r];
-        ring.task_descriptors = (PTO2TaskDescriptor *)ptr;
-        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-
-        ring.task_payloads = (PTO2TaskPayload *)ptr;
-        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-
-        ring.slot_states = (PTO2TaskSlotState *)ptr;
-        ptr += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-}
-
-void PTO2SharedMemoryHandle::setup_pointers(uint64_t task_window_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-    }
-    setup_pointers_per_ring(task_window_sizes);
-}
-
-bool PTO2SharedMemoryHandle::init(
-    void *sm_base_arg, uint64_t sm_size_arg, uint64_t task_window_size, uint64_t heap_size
-) {
-    if (!sm_base_arg || sm_size_arg == 0) return false;
-    if (sm_size_arg < calculate_size(task_window_size)) return false;
-
-    sm_base = sm_base_arg;
-    sm_size = sm_size_arg;
-    is_owner = false;
-    setup_pointers(task_window_size);
-    init_header(task_window_size, heap_size);
-    return true;
-}
-
-PTO2SharedMemoryHandle *PTO2SharedMemoryHandle::create_and_init_default(DeviceArena &arena) {
-    const uint64_t buffer_size = calculate_size(PTO2_TASK_WINDOW_SIZE);
-    const size_t off_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
-    const size_t off_buffer = arena.reserve(static_cast<size_t>(buffer_size), PTO2_ALIGN_SIZE);
-    if (arena.commit() == nullptr) return nullptr;
-
-    auto *handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(off_handle));
-    memset(handle, 0, sizeof(*handle));
-    void *buffer = arena.region_ptr(off_buffer);
-    memset(buffer, 0, static_cast<size_t>(buffer_size));
-    if (!handle->init(buffer, buffer_size, PTO2_TASK_WINDOW_SIZE, PTO2_HEAP_SIZE)) return nullptr;
-    return handle;
-}
-
-void PTO2SharedMemoryHandle::destroy() {
-    // Arena-owned wrappers (is_owner == false) are reclaimed by arena.release();
-    // calling destroy on them is a no-op so existing callers stay safe.
-    if (is_owner && sm_base) {
-        free(sm_base);
-        free(this);
-    }
-}
-
-// =============================================================================
-// Initialization
-// =============================================================================
-//
-// no need init data in pool, init pool data when used
-void PTO2SharedMemoryHandle::init_header(uint64_t task_window_size, uint64_t heap_size) {
-    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_window_sizes[r] = task_window_size;
-        heap_sizes[r] = heap_size;
-    }
-    init_header_per_ring(task_window_sizes, heap_sizes);
-}
-
-void PTO2SharedMemoryHandle::init_header_per_ring(
-    const uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH], const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    // Per-ring flow control (start at 0)
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        header->rings[r].fc.init();
-    }
-
-    header->orchestrator_done.store(0, std::memory_order_relaxed);
-
-    // Per-ring layout info
-    uint64_t offset = PTO2_ALIGN_UP(sizeof(PTO2SharedMemoryHeader), PTO2_ALIGN_SIZE);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        header->rings[r].task_window_size = task_window_sizes[r];
-        header->rings[r].task_window_mask = static_cast<int32_t>(task_window_sizes[r] - 1);
-        header->rings[r].heap_size = heap_sizes[r];
-        header->rings[r].task_descriptors_offset = offset;
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskDescriptor), PTO2_ALIGN_SIZE);
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskPayload), PTO2_ALIGN_SIZE);
-        offset += PTO2_ALIGN_UP(task_window_sizes[r] * sizeof(PTO2TaskSlotState), PTO2_ALIGN_SIZE);
-    }
-
-    header->total_size = sm_size;
-    header->graph_output_ptr.store(0, std::memory_order_relaxed);
-    header->graph_output_size.store(0, std::memory_order_relaxed);
-
-    // Error reporting
-    header->orch_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
-    header->sched_error_bitmap.store(0, std::memory_order_relaxed);
-    header->sched_error_code.store(PTO2_ERROR_NONE, std::memory_order_relaxed);
-    header->sched_error_thread.store(-1, std::memory_order_relaxed);
-
-    // Per-ring slot_states reset. Previously lived in
-    // PTO2SchedulerState::RingSchedState::init(), but it writes into
-    // ring->slot_states[] which is SM-side storage — keeping it here lets
-    // host-side prebuilt-arena init skip all SM dereferences.
-    // bind_ring() pins the ring_id (slot-invariant after this point);
-    // reset_for_reuse() prepares dynamic fanout/refcount fields so the first
-    // submit doesn't need an explicit reset.
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto &ring = header->rings[r];
-        for (uint64_t i = 0; i < task_window_sizes[r]; i++) {
-            ring.slot_states[i].bind_ring(static_cast<uint8_t>(r));
-            ring.slot_states[i].reset_for_reuse();
-            ring.slot_states[i].fanin_count = 0;
-            ring.slot_states[i].active_mask = ActiveMask{};
-        }
-    }
-}
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2SharedMemoryHandle::print_layout() {
-    if (!header) return;
-
-    PTO2SharedMemoryHeader *h = header;
-
-    LOG_INFO_V0("=== PTO2 Shared Memory Layout ===");
-    LOG_INFO_V0("Base address:       %p", sm_base);
-    LOG_INFO_V0("Total size:         %" PRIu64 " bytes", h->total_size);
-    LOG_INFO_V0("Ring depth:         %d", PTO2_MAX_RING_DEPTH);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        LOG_INFO_V0("Ring %d:", r);
-        LOG_INFO_V0("  task_window_size: %" PRIu64, h->rings[r].task_window_size);
-        LOG_INFO_V0("  heap_size:        %" PRIu64 " bytes", h->rings[r].heap_size);
-        LOG_INFO_V0(
-            "  descriptors_off:  %" PRIu64 " (0x%" PRIx64 ")", h->rings[r].task_descriptors_offset,
-            h->rings[r].task_descriptors_offset
-        );
-        LOG_INFO_V0("  current_task_idx: %d", h->rings[r].fc.current_task_index.load(std::memory_order_acquire));
-        LOG_INFO_V0("  last_task_alive:  %d", h->rings[r].fc.last_task_alive.load(std::memory_order_acquire));
-    }
-    LOG_INFO_V0("orchestrator_done:  %d", h->orchestrator_done.load(std::memory_order_acquire));
-    LOG_INFO_V0("Error state:");
-    LOG_INFO_V0("  orch_error_code:    %d", h->orch_error_code.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_bitmap: 0x%x", h->sched_error_bitmap.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_code:   %d", h->sched_error_code.load(std::memory_order_relaxed));
-    LOG_INFO_V0("  sched_error_thread: %d", h->sched_error_thread.load(std::memory_order_relaxed));
-    LOG_INFO_V0("================================");
-}
-
-bool PTO2SharedMemoryHandle::validate() {
-    if (!sm_base) return false;
-    if (!header) return false;
-
-    PTO2SharedMemoryHeader *h = header;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (!h->rings[r].fc.validate(this, r)) return false;
-    }
-
-    return true;
-}
-
-bool PTO2RingFlowControl::validate(PTO2SharedMemoryHandle *handle, int32_t ring_id) const {
-    if (!handle) return false;
-    if (!handle->header) return false;
-    if (ring_id < 0 || ring_id >= PTO2_MAX_RING_DEPTH) return false;
-
-    const PTO2SharedMemoryHeader *h = handle->header;
-
-    // Check that offsets are within bounds
-    if (h->rings[ring_id].task_descriptors_offset >= h->total_size) return false;
-
-    // Check pointer alignment
-    if ((uintptr_t)h->rings[ring_id].task_descriptors % PTO2_ALIGN_SIZE != 0) return false;
-
-    // Check flow control pointer sanity
-    int32_t current = current_task_index.load(std::memory_order_acquire);
-    int32_t last_alive = last_task_alive.load(std::memory_order_acquire);
-    if (current < 0) return false;
-    if (last_alive < 0) return false;
-
-    return true;
-}
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
deleted file mode 100644
index b99c67233..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/pto_tensormap.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * PTO Runtime2 - TensorMap Implementation
- *
- * Implements TensorMap with ring buffer pool, lazy invalidation,
- * and chain truncation optimization.
- *
- * Key features:
- * 1. O(1) insert at bucket head
- * 2. O(valid_entries) lookup with chain truncation
- * 3. Automatic stale entry cleanup during lookup
- * 4. Periodic explicit cleanup for long chains
- *
- * Based on: docs/RUNTIME_LOGIC.md
- */
-
-#include "pto_tensormap.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "common.h"
-#include "common/unified_log.h"
-
-// =============================================================================
-// TensorMap Lookup Chain Length Statistics (compile-time toggle)
-// =============================================================================
-#if PTO2_TENSORMAP_PROFILING
-uint64_t g_lookup_chain_total = 0;
-uint64_t g_lookup_count = 0;
-int32_t g_lookup_chain_max = 0;
-uint64_t g_lookup_overlap_checks = 0;
-uint64_t g_lookup_overlap_hits = 0;
-uint64_t g_insert_count = 0;
-#endif
-
-// =============================================================================
-// Initialization and Destruction
-// =============================================================================
-
-PTO2TensorMapLayout PTO2TensorMap::reserve_layout(
-    DeviceArena &arena, int32_t new_num_buckets, int32_t new_pool_size,
-    const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]
-) {
-    // num_buckets must be a power of two for the hash truncation to work.
-    always_assert((new_num_buckets & (new_num_buckets - 1)) == 0);
-
-    PTO2TensorMapLayout layout{};
-    layout.num_buckets = new_num_buckets;
-    layout.pool_size = new_pool_size;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.task_window_sizes[r] = new_task_window_sizes[r];
-    }
-
-    layout.off_buckets = arena.reserve(
-        static_cast<size_t>(new_num_buckets) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
-    );
-    layout.off_entry_pool =
-        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry), alignof(PTO2TensorMapEntry));
-    layout.off_free_entry_list =
-        arena.reserve(static_cast<size_t>(new_pool_size) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *));
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        layout.off_task_entry_heads[r] = arena.reserve(
-            static_cast<size_t>(new_task_window_sizes[r]) * sizeof(PTO2TensorMapEntry *), alignof(PTO2TensorMapEntry *)
-        );
-    }
-    return layout;
-}
-
-PTO2TensorMapLayout
-PTO2TensorMap::reserve_layout_default(DeviceArena &arena, const int32_t new_task_window_sizes[PTO2_MAX_RING_DEPTH]) {
-    return reserve_layout(arena, PTO2_TENSORMAP_NUM_BUCKETS, PTO2_TENSORMAP_POOL_SIZE, new_task_window_sizes);
-}
-
-bool PTO2TensorMap::init_data_from_layout(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
-    num_buckets = layout.num_buckets;
-    pool_size = layout.pool_size;
-
-    // Address arena regions for data writes; do not store these in struct
-    // fields (wire_arena_pointers does that).
-    auto *buckets_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    auto *entry_pool_arena = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    auto *free_list_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-
-    // buckets[]: empty == nullptr.
-    for (int32_t i = 0; i < num_buckets; i++) {
-        buckets_arena[i] = nullptr;
-    }
-
-    // entry_pool: zero-init equivalent to the previous calloc(entry_pool, ...).
-    // The pool's persistent invariant after init is "bucket_index == -1 means
-    // not linked", set explicitly below.
-    memset(entry_pool_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry));
-    for (int32_t i = 0; i < pool_size; i++) {
-        entry_pool_arena[i].bucket_index = -1;
-        entry_pool_arena[i].next_in_bucket = nullptr;
-        entry_pool_arena[i].prev_in_bucket = nullptr;
-        entry_pool_arena[i].next_in_task = nullptr;
-        entry_pool_arena[i].prev_in_task = nullptr;
-        entry_pool_arena[i].producer_task_id = PTO2TaskId{};
-    }
-
-    // free_entry_list: zeroed (was calloc'd before); contents become meaningful
-    // only after entries are freed back, so the body of the array stays as 0.
-    memset(free_list_arena, 0, static_cast<size_t>(pool_size) * sizeof(PTO2TensorMapEntry *));
-
-    next_entry_idx = 0;
-    free_num = 0;
-
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        auto *heads_arena = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-        for (int32_t i = 0; i < layout.task_window_sizes[r]; i++) {
-            heads_arena[i] = nullptr;
-        }
-        task_window_sizes[r] = layout.task_window_sizes[r];
-        last_task_alives[r] = 0;
-        last_cleanup[r] = 0;
-    }
-
-    return true;
-}
-
-void PTO2TensorMap::wire_arena_pointers(const PTO2TensorMapLayout &layout, DeviceArena &arena) {
-    buckets = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_buckets));
-    entry_pool = static_cast<PTO2TensorMapEntry *>(arena.region_ptr(layout.off_entry_pool));
-    free_entry_list = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_free_entry_list));
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = static_cast<PTO2TensorMapEntry **>(arena.region_ptr(layout.off_task_entry_heads[r]));
-    }
-}
-
-void PTO2TensorMap::destroy() {
-    // Arena owns the backing memory; here we only forget our pointers so any
-    // stray post-destroy access trips a nullptr dereference instead of reading
-    // a recycled allocation.
-    buckets = nullptr;
-    entry_pool = nullptr;
-    free_entry_list = nullptr;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        task_entry_heads[r] = nullptr;
-    }
-}
-
-// =============================================================================
-// Debug Utilities
-// =============================================================================
-
-void PTO2TensorMap::print_stats() {
-    int32_t valid = 0;
-    int32_t stale = 0;
-    int32_t empty_buckets = 0;
-    int32_t max_chain = 0;
-    int64_t total_chain = 0;
-    int32_t non_empty_buckets = 0;
-
-    // Count entries
-    for (int32_t i = 0; i < pool_size; i++) {
-        if (entry_pool[i].bucket_index != -1) {
-            if (entry_valid(entry_pool[i])) {
-                valid++;
-            } else {
-                stale++;
-            }
-        }
-    }
-
-    // Count bucket stats
-    for (int32_t b = 0; b < num_buckets; b++) {
-        int32_t chain_len = 0;
-        auto cur_entry = buckets[b];
-
-        while (cur_entry != nullptr) {
-            chain_len++;
-            cur_entry = cur_entry->next_in_bucket;
-        }
-
-        if (chain_len == 0) {
-            empty_buckets++;
-        } else {
-            non_empty_buckets++;
-            total_chain += chain_len;
-            if (chain_len > max_chain) {
-                max_chain = chain_len;
-            }
-        }
-    }
-
-    LOG_INFO_V0("=== TensorMap Statistics ===");
-    LOG_INFO_V0("Pool size:           %d", pool_size);
-    LOG_INFO_V0("Pool next entry idx: %d", next_entry_idx);
-    LOG_INFO_V0("Pool free_num:       %d", free_num);
-    LOG_INFO_V0("Num buckets:         %d", num_buckets);
-    LOG_INFO_V0("Valid entries:       %d", valid);
-    LOG_INFO_V0("Stale entries:       %d", stale);
-    LOG_INFO_V0("Empty buckets:       %d", empty_buckets);
-    LOG_INFO_V0("Max chain len:       %d", max_chain);
-    LOG_INFO_V0("Avg chain len:       %.2f", non_empty_buckets > 0 ? (float)total_chain / non_empty_buckets : 0);
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        LOG_INFO_V0("Last task alive[%d]: %d", r, last_task_alives[r]);
-    }
-    LOG_INFO_V0("============================");
-}
-
-int32_t PTO2TensorMap::valid_count() {
-    int32_t count = 0;
-
-    for (int32_t i = 0; i < pool_size; i++) {
-        if (entry_pool[i].bucket_index != -1 && entry_valid(entry_pool[i])) {
-            count++;
-        }
-    }
-
-    return count;
-}
-
-void PTO2TensorMap::sync_tensormap(PTO2TaskId task_id, int32_t sm_last_task_alive) {
-    auto ring_id = task_id.ring();
-    auto local_id = task_id.local();
-    sync_validity(ring_id, sm_last_task_alive);
-
-    // Only attempt cleanup when last_task_alive has actually advanced;
-    // otherwise cleanup_retired would empty-loop and we'd spin forever.
-    auto overlap = get_task_local_id_slot(ring_id, local_id) == get_task_local_id_slot(ring_id, last_cleanup[ring_id]);
-    if (sm_last_task_alive - last_cleanup[ring_id] >= PTO2_TENSORMAP_CLEANUP_INTERVAL || overlap) {
-        cleanup_retired(ring_id, last_cleanup[ring_id], sm_last_task_alive);
-        last_cleanup[ring_id] = sm_last_task_alive;
-    }
-}
-
-// =============================================================================
-// TensorMap Lookup Profiling
-// =============================================================================
-#if PTO2_TENSORMAP_PROFILING
-PTO2TensorMapProfilingData pto2_tensormap_get_profiling() {
-    PTO2TensorMapProfilingData d;
-    d.lookup_chain_total = g_lookup_chain_total;
-    d.lookup_count = g_lookup_count;
-    d.lookup_chain_max = g_lookup_chain_max;
-    d.overlap_checks = g_lookup_overlap_checks;
-    d.overlap_hits = g_lookup_overlap_hits;
-    d.insert_count = g_insert_count;
-
-    // Reset
-    g_lookup_chain_total = 0;
-    g_lookup_count = 0;
-    g_lookup_chain_max = 0;
-    g_lookup_overlap_checks = 0;
-    g_lookup_overlap_hits = 0;
-    g_insert_count = 0;
-    return d;
-}
-#endif
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
deleted file mode 100644
index b3347b53c..000000000
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Runtime Class - Implementation
- *
- * Device execution and handshake control.
- * Task graph construction is handled by PTO2Runtime.
- */
-
-#include "runtime.h"
-
-#include "common/unified_log.h"
-#include "pto_runtime2_types.h"
-#include "pto_shared_memory.h"
-
-// =============================================================================
-// Constructor
-// =============================================================================
-
-Runtime::Runtime() {
-    // NOTE: host_api is initialized in InitRuntime() (host-only code)
-    // because the CApi functions don't exist when compiled for device.
-
-    // Initialize handshake buffers
-    memset(workers, 0, sizeof(workers));
-    worker_count = 0;
-    aicpu_thread_num = 1;
-    ready_queue_shards = RUNTIME_DEFAULT_READY_QUEUE_SHARDS;
-    task_window_size = 0;
-    heap_size = 0;
-    dep_pool_size = 0;
-    orch_to_sched = false;
-
-    // Initialize device orchestration state
-    gm_sm_ptr_ = nullptr;
-    gm_heap_ptr_ = nullptr;
-    slot_states_ptr_ = nullptr;
-    orch_args_storage_.clear();
-    prebuilt_arena_base_ = nullptr;
-    prebuilt_runtime_offset_ = 0;
-
-    // Initialize device orchestration SO binary
-    dev_orch_so_addr_ = 0;
-    dev_orch_so_size_ = 0;
-    active_callable_id_ = -1;
-    register_new_callable_id_ = false;
-    device_orch_func_name_[0] = '\0';
-    device_orch_config_name_[0] = '\0';
-
-    // Initialize kernel binary tracking
-    registered_kernel_count_ = 0;
-
-    // Initialize function address mapping
-    for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) {
-        func_id_to_addr_[i] = 0;
-    }
-}
-
-// =============================================================================
-// Device orchestration
-// =============================================================================
-
-void *Runtime::get_gm_sm_ptr() const { return gm_sm_ptr_; }
-void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; }
-const ChipStorageTaskArgs &Runtime::get_orch_args() const { return orch_args_storage_; }
-void Runtime::set_gm_sm_ptr(void *p) { gm_sm_ptr_ = p; }
-void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; }
-void Runtime::set_slot_states_ptr(void *p) { slot_states_ptr_ = p; }
-void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { orch_args_storage_ = args; }
-
-void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {
-    prebuilt_arena_base_ = arena_base;
-    prebuilt_runtime_offset_ = runtime_off;
-}
-void *Runtime::get_prebuilt_arena_base() const { return prebuilt_arena_base_; }
-size_t Runtime::get_prebuilt_runtime_offset() const { return prebuilt_runtime_offset_; }
-
-// Device orchestration SO metadata (bytes live in a separate device buffer
-// owned by DeviceRunner; only the address/size travels in Runtime).
-void Runtime::set_dev_orch_so(uint64_t dev_addr, uint64_t size) {
-    dev_orch_so_addr_ = dev_addr;
-    dev_orch_so_size_ = size;
-}
-
-uint64_t Runtime::get_dev_orch_so_addr() const { return dev_orch_so_addr_; }
-
-uint64_t Runtime::get_dev_orch_so_size() const { return dev_orch_so_size_; }
-
-void Runtime::set_active_callable_id(int32_t callable_id, bool is_new) {
-    active_callable_id_ = callable_id;
-    register_new_callable_id_ = is_new;
-}
-
-int32_t Runtime::get_active_callable_id() const { return active_callable_id_; }
-
-bool Runtime::register_new_callable_id() const { return register_new_callable_id_; }
-
-void Runtime::set_device_orch_func_name(const char *name) {
-    if (name == nullptr) {
-        device_orch_func_name_[0] = '\0';
-        return;
-    }
-    std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
-    device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
-}
-
-const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; }
-
-void Runtime::set_device_orch_config_name(const char *name) {
-    if (name == nullptr) {
-        device_orch_config_name_[0] = '\0';
-        return;
-    }
-    std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);
-    device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';
-}
-
-const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; }
-
-uint64_t Runtime::get_function_bin_addr(int func_id) const {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
-    return func_id_to_addr_[func_id];
-}
-
-void Runtime::set_function_bin_addr(int func_id, uint64_t addr) {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-        return;
-    }
-    if (addr != 0 && func_id_to_addr_[func_id] == 0) {
-        if (registered_kernel_count_ < RUNTIME_MAX_FUNC_ID) {
-            registered_kernel_func_ids_[registered_kernel_count_++] = func_id;
-        } else {
-            LOG_ERROR(
-                "[Runtime] Registration limit reached (%d). Cannot track func_id=%d for cleanup.", RUNTIME_MAX_FUNC_ID,
-                func_id
-            );
-        }
-    }
-    func_id_to_addr_[func_id] = addr;
-}
-
-void Runtime::replay_function_bin_addr(int func_id, uint64_t addr) {
-    if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) {
-        LOG_ERROR("[Runtime] func_id=%d is out of range [0, %d)", func_id, RUNTIME_MAX_FUNC_ID);
-        return;
-    }
-    func_id_to_addr_[func_id] = addr;
-}
-
-int Runtime::get_registered_kernel_count() const { return registered_kernel_count_; }
-
-int Runtime::get_registered_kernel_func_id(int index) const {
-    if (index < 0 || index >= registered_kernel_count_) return -1;
-    return registered_kernel_func_ids_[index];
-}
-
-void Runtime::clear_registered_kernels() { registered_kernel_count_ = 0; }
diff --git a/src/common/task_interface/pto_task_id.h b/src/common/task_interface/pto_task_id.h
index 0996ce5d8..f3040998c 100644
--- a/src/common/task_interface/pto_task_id.h
+++ b/src/common/task_interface/pto_task_id.h
@@ -9,43 +9,49 @@
  * -----------------------------------------------------------------------------------------------------------
  */
 
-/**
- * PTO2TaskId — minimal standalone header.
- *
- * Factored out of pto_runtime2_types.h so that tensor.h can include it
- * without pulling in scheduler-internal constants (heap sizes, timeouts, etc.).
- */
-
 #pragma once
 
 #include <cstdint>
 
-/**
- * TaskId: 64-bit encoding used across Runtime2.
- *
- * raw encoding: (ring_id << 32) | local_id
- *
- * ring_id:  which ring layer (0..PTO2_MAX_RING_DEPTH-1)
- * local_id: per-ring monotonic counter
- *
- * Invalid sentinel: raw == UINT64_MAX (no valid task has this encoding).
- */
-struct PTO2TaskId {
+struct PTO2TaskId
+{
     uint64_t raw;
 
-    static constexpr PTO2TaskId make(uint8_t ring_id, uint32_t local_id) {
+    static constexpr PTO2TaskId make(uint8_t ring_id, uint32_t local_id)
+    {
         return PTO2TaskId{(static_cast<uint64_t>(ring_id) << 32) | static_cast<uint64_t>(local_id)};
     }
 
-    static constexpr PTO2TaskId invalid() { return PTO2TaskId{UINT64_MAX}; }
+    static constexpr PTO2TaskId invalid()
+    {
+        return PTO2TaskId{UINT64_MAX};
+    }
 
-    constexpr uint8_t ring() const { return static_cast<uint8_t>(raw >> 32); }
-    constexpr uint32_t local() const { return static_cast<uint32_t>(raw & 0xFFFFFFFFu); }
-    constexpr bool is_valid() const { return raw != UINT64_MAX; }
-    constexpr bool is_invalid() const { return raw == UINT64_MAX; }
+    constexpr uint8_t ring() const
+    {
+        return static_cast<uint8_t>(raw >> 32);
+    }
+    constexpr uint32_t local() const
+    {
+        return static_cast<uint32_t>(raw & 0xFFFFFFFFu);
+    }
+    constexpr bool is_valid() const
+    {
+        return raw != UINT64_MAX;
+    }
+    constexpr bool is_invalid() const
+    {
+        return raw == UINT64_MAX;
+    }
 
-    constexpr bool operator==(const PTO2TaskId &other) const { return raw == other.raw; }
-    constexpr bool operator!=(const PTO2TaskId &other) const { return raw != other.raw; }
+    constexpr bool operator==(const PTO2TaskId &other) const
+    {
+        return raw == other.raw;
+    }
+    constexpr bool operator!=(const PTO2TaskId &other) const
+    {
+        return raw != other.raw;
+    }
 };
 
 static_assert(sizeof(PTO2TaskId) == 8, "PTO2TaskId must stay 8 bytes (shared memory ABI)");
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 3c70ee135..25ebeb655 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -418,7 +418,6 @@ add_a2a3_test(test_a2a3_fatal a2a3/test_a2a3_fatal.cpp)
 add_a2a3_runtime_test(test_task_allocator   a2a3/test_task_allocator.cpp)
 add_a2a3_runtime_test(test_dep_list_pool    a2a3/test_dep_list_pool.cpp)
 add_a2a3_runtime_test(test_scheduler_state  a2a3/test_scheduler_state.cpp)
-add_a2a3_runtime_test(test_task_state       a2a3/test_task_state.cpp)
 add_a2a3_runtime_test(test_ready_queue      a2a3/test_ready_queue.cpp)
 add_a2a3_runtime_test(test_shared_memory    a2a3/test_shared_memory.cpp)
 add_a2a3_runtime_test(test_a2a3_tensormap   a2a3/test_tensormap.cpp)
diff --git a/tests/ut/cpp/a2a3/test_task_state.cpp b/tests/ut/cpp/a2a3/test_task_state.cpp
deleted file mode 100644
index 916d9144f..000000000
--- a/tests/ut/cpp/a2a3/test_task_state.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Unit tests for PTO2TaskSlotState lifecycle through PTO2SchedulerState API.
- *
- * These tests drive state transitions via src methods (release_fanin,
- * on_subtask_complete, check_and_handle_consumed) rather than manually
- * operating atomic fields.  For concurrent exactly-once semantics of
- * fanin/subtask/fanout, see test_scheduler_state.cpp which already
- * covers those paths via the same API.
- *
- * This file focuses on:
- * - Full lifecycle through src API
- * - Ready-path behavior (task_state stays PENDING through dispatch)
- * - Double subtask completion (counter-model weakness)
- */
-
-#include <gtest/gtest.h>
-#include <atomic>
-#include <cstring>
-#include <thread>
-#include <vector>
-#include "utils/device_arena.h"
-#include "scheduler/pto_scheduler.h"
-
-class TaskStateTest : public ::testing::Test {
-protected:
-    PTO2SchedulerState sched;
-    PTO2SharedMemoryHandle *sm_handle = nullptr;
-    DeviceArena sm_arena;
-    DeviceArena sched_arena;
-
-    // Each init_slot()'d slot gets a distinct zeroed payload from this pool,
-    // mirroring orch::prepare_task's bind_buffers: every production slot has a
-    // payload, and the scheduler's release/propagate paths dereference it.
-    static constexpr int kSlotPayloadPoolSize = 16;
-    PTO2TaskPayload slot_payload_pool_[kSlotPayloadPoolSize];
-    int slot_payload_pool_idx_ = 0;
-
-    void SetUp() override {
-        sm_handle = PTO2SharedMemoryHandle::create_and_init_default(sm_arena);
-        ASSERT_NE(sm_handle, nullptr);
-        auto layout = PTO2SchedulerState::reserve_layout(sched_arena);
-        ASSERT_NE(sched_arena.commit(), nullptr);
-        ASSERT_TRUE(sched.init_data_from_layout(layout, sched_arena, sm_handle->header));
-        sched.wire_arena_pointers(layout, sched_arena);
-    }
-
-    void TearDown() override {
-        sched.destroy();
-        sched_arena.release();
-        sm_arena.release();
-    }
-
-    void init_slot(PTO2TaskSlotState &slot, PTO2TaskState state, int32_t fanin_count, int32_t fanout_count) {
-        memset(&slot, 0, sizeof(slot));
-        slot.task_state.store(state);
-        slot.fanin_count = fanin_count;
-        slot.fanin_refcount.store(0);
-        slot.fanout_count = fanout_count;
-        slot.fanout_refcount.store(0);
-        slot.fanout_lock.store(0);
-        slot.fanout_head = nullptr;
-        slot.ring_id = 0;
-        slot.active_mask = ActiveMask(PTO2_SUBTASK_MASK_AIC);
-        slot.completed_subtasks.store(0);
-        slot.total_required_subtasks = 1;
-        slot.logical_block_num = 1;
-        PTO2TaskPayload &slot_pl = slot_payload_pool_[slot_payload_pool_idx_++ % kSlotPayloadPoolSize];
-        memset(&slot_pl, 0, sizeof(slot_pl));
-        slot.payload = &slot_pl;
-    }
-};
-
-// =============================================================================
-// Full lifecycle through src API: PENDING -> (fanin) -> (queued + dispatched)
-// -> (subtask) -> COMPLETED -> (fanout) -> CONSUMED
-// =============================================================================
-TEST_F(TaskStateTest, FullLifecycleThroughAPI) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-    slot.total_required_subtasks = 1;
-    slot.completed_subtasks.store(0);
-
-    // Fanin satisfied -> task becomes ready
-    bool ready = sched.release_fanin_and_check_ready(slot);
-    EXPECT_TRUE(ready);
-
-    // Subtask completes -> task done
-    bool done = sched.on_subtask_complete(slot);
-    EXPECT_TRUE(done);
-
-    // Manually transition to COMPLETED (normally done by scheduler dispatch loop)
-    slot.task_state.store(PTO2_TASK_COMPLETED, std::memory_order_release);
-
-    // Fanout released -> CONSUMED
-    sched.release_producer(slot);
-    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_CONSUMED);
-}
-
-// =============================================================================
-// release_fanin does not write task_state.
-//
-// Readiness is determined solely by fanin_refcount reaching fanin_count.
-// task_state stays PENDING from submit through "queued in ready_queue" and
-// "dispatched to a worker" until the worker stores COMPLETED.
-// =============================================================================
-TEST_F(TaskStateTest, ReadyPathStaysPending) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-
-    bool ready = sched.release_fanin_and_check_ready(slot);
-    ASSERT_TRUE(ready) << "Task should be detected as ready via refcount";
-
-    // task_state remains PENDING -- there is no intermediate ready/running state.
-    EXPECT_EQ(slot.task_state.load(), PTO2_TASK_PENDING) << "release_fanin_and_check_ready must not write task_state";
-}
-
-// =============================================================================
-// Multi-fanin: partial release does not trigger ready
-// =============================================================================
-TEST_F(TaskStateTest, MultiFaninPartialNotReady) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 3, 1);
-
-    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
-    EXPECT_FALSE(sched.release_fanin_and_check_ready(slot));
-    EXPECT_TRUE(sched.release_fanin_and_check_ready(slot));
-}
-
-// =============================================================================
-// Concurrent fanin: exactly one thread detects ready (via src API)
-// =============================================================================
-TEST_F(TaskStateTest, ConcurrentFaninExactlyOneReady) {
-    constexpr int ROUNDS = 500;
-
-    for (int round = 0; round < ROUNDS; round++) {
-        alignas(64) PTO2TaskSlotState slot;
-        init_slot(slot, PTO2_TASK_PENDING, 3, 1);
-        std::atomic<int> ready_count{0};
-
-        auto release = [&]() {
-            if (sched.release_fanin_and_check_ready(slot)) {
-                ready_count.fetch_add(1);
-            }
-        };
-
-        std::thread t1(release), t2(release), t3(release);
-        t1.join();
-        t2.join();
-        t3.join();
-
-        EXPECT_EQ(ready_count.load(), 1) << "Round " << round;
-    }
-}
-
-// =============================================================================
-// Concurrent subtask completion: exactly one thread sees done (via src API)
-// =============================================================================
-TEST_F(TaskStateTest, ConcurrentSubtaskCompletion) {
-    constexpr int ROUNDS = 500;
-
-    for (int round = 0; round < ROUNDS; round++) {
-        alignas(64) PTO2TaskSlotState slot;
-        init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-        slot.total_required_subtasks = 3;
-        slot.completed_subtasks.store(0);
-        std::atomic<int> done_count{0};
-
-        auto complete = [&]() {
-            if (sched.on_subtask_complete(slot)) {
-                done_count.fetch_add(1);
-            }
-        };
-
-        std::thread t1(complete), t2(complete), t3(complete);
-        t1.join();
-        t2.join();
-        t3.join();
-
-        EXPECT_EQ(done_count.load(), 1) << "Round " << round;
-        EXPECT_EQ(slot.completed_subtasks.load(), 3);
-    }
-}
-
-// =============================================================================
-// Double subtask completion (counter-model weakness).
-// With the counter model, double-completing the same subtask increments
-// completed_subtasks twice, potentially reaching total prematurely.
-// Unlike the old bitmask model, the counter cannot detect duplicates.
-// =============================================================================
-TEST_F(TaskStateTest, DoubleSubtaskCompletionCounterWeakness) {
-    alignas(64) PTO2TaskSlotState slot;
-    init_slot(slot, PTO2_TASK_PENDING, 1, 1);
-    slot.total_required_subtasks = 2;
-    slot.completed_subtasks.store(0);
-
-    // First subtask completion
-    bool done1 = sched.on_subtask_complete(slot);
-    EXPECT_FALSE(done1) << "Single completion doesn't complete the task";
-
-    // Same subtask completes AGAIN (logic error at caller level)
-    bool done2 = sched.on_subtask_complete(slot);
-    EXPECT_TRUE(done2) << "Counter model: double-completion falsely triggers done";
-}

From 771675646e71ad4a2bf382eaf0e57d9780e6526d Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Mon, 22 Jun 2026 13:42:34 +0200
Subject: [PATCH 13/14] Make squashed rebase compile + run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes after the rebase commit:

1. pto_runtime2_types.h: the PTO2TaskPayload compatibility layer for
   upstream spec-dispatch references PTO2FaninPool and
   PTO2_FANIN_INLINE_CAP. Upstream defines them in this same header
   but the merge dropped the lines. Restore: #define
   PTO2_FANIN_INLINE_CAP 64 and forward-declare struct PTO2FaninPool
   alongside PTO2_MAX_FANIN.

2. orchestration/common.cpp: assert_impl + AssertionError + the
   addr2line / backtrace machinery used to live inline in
   wireless2's runtime/common.h. Upstream moved the declarations to
   src/common/task_interface/assert_compat.h and expects the runtime
   target to provide the definitions in orchestration/common.cpp
   (a5 does so). Port a5's common.cpp into the a2a3 orchestration
   path. Sidestep the LOG_ERROR vs LOG_INFO_V macro conflict by not
   pulling common/unified_log.h (would re-#define LOG_INFO_V0..V9
   already supplied by pto_orchestration_api.h) and using a local
   stderr-printing LOG_ERROR for the assert path.

paged_attention Case4 passes (1389 µs, 10 rounds). Case1 trimmed
device avg = 30587 µs over 100 rounds — works but ~11% slower than
the same wireless2 stack on the c4b0aac2 baseline (27451 µs). The
extra cost is likely overhead from coexisting with upstream's
additions (spec-dispatch storage, profiling fields, etc.) that the
wireless poller never reads but the orchestrator still populates.
Investigation + tightening of the coexistence layer is a follow-up.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../orchestration/common.cpp                  | 171 +++++++++++++++++-
 .../runtime/pto_runtime2_types.h              |   4 +
 2 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
index 13b4af4fb..dad04f73e 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/orchestration/common.cpp
@@ -11,20 +11,181 @@
 #include "common.h"
 #include "pto_orchestration_api.h"
 
+// LOG_ERROR can't be pulled from common/unified_log.h here because that header
+// would re-#define LOG_INFO_V0..V9 already provided by pto_orchestration_api.h
+// (orchestration routes them through the runtime ops table). For the limited
+// use inside this file, write directly to stderr.
+#include <cstdio>
+#define LOG_ERROR(fmt, ...) std::fprintf(stderr, "[ERROR] " fmt "\n", ##__VA_ARGS__)
+
+#ifdef __linux__
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <execinfo.h>
+#include <unistd.h>
+
+#include <array>
+#include <cstring>
+#include <vector>
+#endif
+
 struct PTO2Runtime;
 
 namespace {
+// Plain global (not thread_local) to avoid glibc TLSDESC stale-resolution
+// crash (BZ #32412) when the orchestration SO is dlclose'd/re-dlopen'd
+// between execution rounds.  All orchestrator threads bind the same rt
+// value, so per-thread storage is unnecessary.
 PTO2Runtime *g_current_runtime = nullptr;
 }  // namespace
 
-extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt)
-{
+extern "C" __attribute__((visibility("default"))) void framework_bind_runtime(PTO2Runtime *rt) {
     g_current_runtime = rt;
 }
 
 // Keep current_runtime local to this .so so orchestration helpers do not
 // accidentally bind to the AICPU binary's same-named symbol.
-extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime()
-{
-    return g_current_runtime;
+extern "C" __attribute__((visibility("hidden"))) PTO2Runtime *framework_current_runtime() { return g_current_runtime; }
+
+/**
+ * Use addr2line to convert an address to file:line information.
+ * Uses the -i flag to expand inlines; returns the first line (innermost actual code location).
+ * If inlining is present, also returns the outer call chain via inline_chain.
+ */
+#ifdef __linux__
+static std::string addr_to_line(const char *executable, void *addr, std::string *inline_chain = nullptr) {
+    char cmd[512];
+    snprintf(cmd, sizeof(cmd), "addr2line -e %s -f -C -p -i %p 2>/dev/null", executable, addr);
+
+    std::array<char, 256> buffer;
+    std::string raw_output;
+
+    FILE *pipe = popen(cmd, "r");
+    if (pipe) {
+        while (fgets(buffer.data(), buffer.size(), pipe) != nullptr) {
+            raw_output += buffer.data();
+        }
+        pclose(pipe);
+    }
+
+    if (raw_output.empty() || raw_output.find("??") != std::string::npos) {
+        return "";
+    }
+
+    // Split by lines
+    std::vector<std::string> lines;
+    size_t pos = 0;
+    while (pos < raw_output.size()) {
+        size_t nl = raw_output.find('\n', pos);
+        if (nl == std::string::npos) nl = raw_output.size();
+        std::string line = raw_output.substr(pos, nl - pos);
+        while (!line.empty() && line.back() == '\r')
+            line.pop_back();
+        if (!line.empty()) lines.push_back(line);
+        pos = nl + 1;
+    }
+
+    if (lines.empty()) return "";
+
+    // First line is the innermost actual code location; subsequent lines are outer inline callers
+    if (inline_chain && lines.size() > 1) {
+        *inline_chain = "";
+        for (size_t j = 1; j < lines.size(); j++) {
+            *inline_chain += "    [inlined by] " + lines[j] + "\n";
+        }
+    }
+
+    return lines.front();
+}
+#endif
+
+/**
+ * Get current stack trace information (including file paths and line numbers).
+ * Uses dladdr to locate the shared library for each stack frame, then calls addr2line with relative addresses.
+ */
+std::string get_stacktrace(int skip_frames) {
+    (void)skip_frames;  // May be unused on non-Linux platforms
+    std::string result;
+#ifdef __linux__
+    const int max_frames = 64;
+    void *buffer[max_frames];
+    int nframes = backtrace(buffer, max_frames);
+    char **symbols = backtrace_symbols(buffer, nframes);
+
+    if (symbols) {
+        result = "Stack trace:\n";
+        for (int i = skip_frames; i < nframes; i++) {
+            std::string frame_info;
+
+            void *addr = (void *)((char *)buffer[i] - 1);
+
+            Dl_info dl_info;
+            std::string inline_chain;
+            if (dladdr(addr, &dl_info) && dl_info.dli_fname) {
+                void *rel_addr = (void *)((char *)addr - (char *)dl_info.dli_fbase);
+                std::string addr2line_result = addr_to_line(dl_info.dli_fname, rel_addr, &inline_chain);
+
+                if (addr2line_result.empty()) {
+                    addr2line_result = addr_to_line(dl_info.dli_fname, addr, &inline_chain);
+                }
+
+                if (!addr2line_result.empty()) {
+                    frame_info = std::string(dl_info.dli_fname) + ": " + addr2line_result;
+                }
+            }
+
+            if (frame_info.empty()) {
+                std::string frame(symbols[i]);
+
+                size_t start = frame.find('(');
+                size_t end = frame.find('+', start);
+                if (start != std::string::npos && end != std::string::npos) {
+                    std::string mangled = frame.substr(start + 1, end - start - 1);
+                    int status;
+                    char *demangled = abi::__cxa_demangle(mangled.c_str(), nullptr, nullptr, &status);
+                    if (status == 0 && demangled) {
+                        frame = frame.substr(0, start + 1) + demangled + frame.substr(end);
+                        free(demangled);
+                    }
+                }
+                frame_info = frame;
+            }
+
+            char buf[16];
+            snprintf(buf, sizeof(buf), "  #%d ", i - skip_frames);
+            result += buf + frame_info + "\n";
+            if (!inline_chain.empty()) {
+                result += inline_chain;
+            }
+        }
+        free(symbols);
+    }
+#else
+    result = "(Stack trace is only available on Linux)\n";
+#endif
+    return result;
+}
+
+// AssertionError constructor
+static std::string build_assert_message(const char *condition, const char *file, int line) {
+    std::string msg = "Assertion failed: " + std::string(condition) + "\n";
+    msg += "  Location: " + std::string(file) + ":" + std::to_string(line) + "\n";
+    msg += get_stacktrace(3);
+    return msg;
+}
+
+AssertionError::AssertionError(const char *condition, const char *file, int line) :
+    std::runtime_error(build_assert_message(condition, file, line)),
+    condition_(condition),
+    file_(file),
+    line_(line) {}
+
+[[noreturn]] void assert_impl(const char *condition, const char *file, int line) {
+    LOG_ERROR("\n========================================");
+    LOG_ERROR("Assertion failed: %s", condition);
+    LOG_ERROR("Location: %s:%d", file, line);
+    LOG_ERROR("%s", get_stacktrace(2).c_str());
+    LOG_ERROR("========================================\n");
+
+    throw AssertionError(condition, file, line);
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 602abf83e..758c85086 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -72,6 +72,10 @@
 // Fanin storage — absolute max number of unique fanin dependencies per task.
 #define PTO2_MAX_FANIN 16
 
+// Upstream spec-dispatch compatibility: inline fanin cap + spill pool fwd decl.
+#define PTO2_FANIN_INLINE_CAP 64
+struct PTO2FaninPool;  // Forward declaration (defined by upstream spec-dispatch path)
+
 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
 #define PTO2_DEP_POOL_CLEANUP_INTERVAL 64   // Cleanup every N retired tasks

From 98cdbb658afceeaa6c6bf3e6fb9fae2dc38d10cc Mon Sep 17 00:00:00 2001
From: s00831018 <sergio.miguel.martin@huawei.com>
Date: Mon, 22 Jun 2026 14:03:48 +0200
Subject: [PATCH 14/14] Drop dead spec-dispatch compatibility fields from
 PTO2TaskPayload
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of the +11% Case1 / pa_manual_scope regression I measured
on wireless3 yesterday.

When I merged wireless2 onto upstream/main I added a "compatibility
layer" to PTO2TaskPayload: kept upstream's
  fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP]   // 512 B
  fanin_actual_count, fanin_spill_start
  fanin_spill_pool*
  staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]      // 16 B
  dispatch_fanin, allow_early_resolve, spec_state,
  dispatch_propagated, spec_chain_active, spec_chain_depth
alongside the wireless model's flat fanin_local_ids[]. The intent was
to give spec-dispatch's release path something to link against. But
the spec-dispatch implementation lived in scheduler/* and
pto_orchestrator.cpp / pto_runtime2.cpp — files we deleted as part
of the wireless directory collapse. After the merge nothing in the
tree actually reads/writes any of those fields (verified by grep).

So: ~560 bytes of dead per-payload storage. With 65K tasks per
Case1 round that's ~36 MB of cache thrash per round even though
the wireless poller never touches the bytes. Bench confirmed: the
regression was workload-size-correlated and only hit the
biggest workloads (Case1, pa_manual_scope Case1/2).

Remove:
- fanin_inline_slot_states, fanin_spill_pool, fanin_*_count|start
- staged_core_mask, dispatch_fanin, allow_early_resolve, spec_state,
  dispatch_propagated, spec_chain_active, spec_chain_depth
- PTO2SpecState enum and PTO2_SPEC_CORE_MASK_WORDS constant
- PTO2_FANIN_INLINE_CAP define and PTO2FaninPool fwd decl
- The init() block that zeroed those fields
- The +512 prefetch in prefetch() that targeted them
- A reset_for_reuse comment referring to them

Bench post-fix (wireless3 vs wireless2 on bench_baseline):
  paged_attention Case1                 27919  vs 27692  (+0.8% wash)
  paged_attention Case4                  1134  vs  1382  (−18%)
  paged_attention CaseSmall1              302  vs   650  (−54%)
  pa_unroll_manual_scope Case1           1626  vs  1883  (−14%)
  pa_unroll_manual_scope Case2           1016  vs  1272  (−20%)
  paged_attention_manual_scope Case1    25249  vs 24933  (+1.3% wash)
  paged_attention_manual_scope Case2    13382  vs 13109  (+2.1% wash)
  benchmark_bgemm Case0                  1038  vs  1274  (−19%)

The three heavy cases are within run-to-run noise of wireless2;
every other case is significantly faster (smaller workloads benefit
from upstream's improvements between c4b0aac2 and current main).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../runtime/pto_runtime2_types.h              | 68 +------------------
 1 file changed, 2 insertions(+), 66 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index 758c85086..d504afe0b 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -72,10 +72,6 @@
 // Fanin storage — absolute max number of unique fanin dependencies per task.
 #define PTO2_MAX_FANIN 16
 
-// Upstream spec-dispatch compatibility: inline fanin cap + spill pool fwd decl.
-#define PTO2_FANIN_INLINE_CAP 64
-struct PTO2FaninPool;  // Forward declaration (defined by upstream spec-dispatch path)
-
 // TensorMap cleanup interval
 #define PTO2_TENSORMAP_CLEANUP_INTERVAL 64  // Cleanup every N retired tasks
 #define PTO2_DEP_POOL_CLEANUP_INTERVAL 64   // Cleanup every N retired tasks
@@ -134,26 +130,9 @@ struct PTO2TaskDescriptor
 /**
  * Task payload data (cold path - only accessed during orchestration and dispatch)
  *
- * Layout: metadata + inline fanin packed in the first 9 cache lines, followed
- * by bulk tensor and scalar data. Small fanins stay fully inline; larger
- * fanins spill into a per-ring ring buffer slice.
+ * Layout: metadata + flat fanin_local_ids[] in the first 2 cache lines,
+ * followed by bulk tensor and scalar data.
  */
-// Speculative early-dispatch claim states for PTO2TaskPayload::spec_state.
-enum PTO2SpecState : uint8_t {
-    PTO2_SPEC_NONE = 0,       // not pre-staged
-    PTO2_SPEC_STAGING = 1,    // Hook 1 claimed it; staging in progress
-    PTO2_SPEC_STAGED = 2,     // staged on a core, gated; staged_* fields valid
-    PTO2_SPEC_DISPATCHED = 3  // routed via the normal dispatch path (no pre-stage)
-};
-
-// A pre-staged consumer occupies one core per gated subtask block. WHICH cores
-// it occupies is recorded as a bitmask (staged_core_mask, 1 bit per global
-// core_id); the completion-path release iterates the set bits and rings each
-// core's doorbell from the scheduler's per-core doorbell table. Bounded by the
-// chip's core count (RUNTIME_MAX_WORKER = 72; no two-level pre-dispatch means
-// gated cores in flight <= core count), NOT by block_num — so a wide SPMD
-// consumer can pre-stage all its idle cores. 2 words = 128 bits >= 72.
-inline constexpr int PTO2_SPEC_CORE_MASK_WORDS = 2;
 
 struct PTO2TaskPayload {
     // === Cache lines 0-2 (192B) — metadata + fanin (wireless model) ===
@@ -165,22 +144,6 @@ struct PTO2TaskPayload {
     // slot_state.
     int32_t fanin_count{0};
     int32_t fanin_local_ids[PTO2_MAX_FANIN];
-    // ---- Upstream spec-dispatch coexistence (compatibility layer) ----
-    // Speculative early-dispatch (#1079) was built on a fanin_refcount /
-    // fanin_slot_states model. The wireless poller doesn't read these
-    // fields, but the spec-dispatch code paths still do — keep the storage
-    // so that code links. Populated alongside fanin_local_ids[].
-    int32_t fanin_actual_count{0};
-    int32_t fanin_spill_start{0};
-    PTO2FaninPool *fanin_spill_pool{nullptr};
-    PTO2TaskSlotState *fanin_inline_slot_states[PTO2_FANIN_INLINE_CAP];
-    std::atomic<uint64_t> staged_core_mask[PTO2_SPEC_CORE_MASK_WORDS]{};
-    std::atomic<int32_t> dispatch_fanin{0};
-    bool allow_early_resolve{false};
-    std::atomic<uint8_t> spec_state{0};
-    std::atomic<uint8_t> dispatch_propagated{0};
-    std::atomic<uint8_t> spec_chain_active{0};
-    uint8_t spec_chain_depth{0};
     // === Tensors (Tensor is alignas(64); array is naturally aligned) ===
     Tensor tensors[MAX_TENSOR_ARGS];
     // === Scalars ===
@@ -207,7 +170,6 @@ struct PTO2TaskPayload {
         __builtin_prefetch(this, 1, 3);
         __builtin_prefetch(reinterpret_cast<const char *>(this) + 64, 1, 3);
         __builtin_prefetch(reinterpret_cast<const char *>(this) + 128, 1, 3);
-        __builtin_prefetch(reinterpret_cast<const char *>(this) + 512, 1, 3);  // spec fields (cache line 8)
     }
 
     /**
@@ -243,27 +205,6 @@ struct PTO2TaskPayload {
         // Round up to cache line boundary. Both arrays are 128B so no overrun.
         // Eliminates branches; extra bytes within the same CL have zero additional cost.
         memcpy(scalars, args.scalars(), PTO2_ALIGN_UP(args.scalar_count() * sizeof(uint64_t), 64));
-
-        // Speculative early-dispatch metadata — the single init point for these
-        // fields. reset_for_reuse MUST NOT touch the payload (it runs on the
-        // scheduler's advance-ring path and would pull this cold cache line across
-        // structures); prepare_task only allocates/binds. prefetch() warms this
-        // line (offset 512) so these writes land in warm cache.
-        //
-        // spec_state / staged_core_mask / dispatch_fanin / spec_chain_* are all
-        // CONSUMER-side: a task with allow_early_resolve == false still has them
-        // touched when one of ITS producers is flagged (propagate_dispatch_fanin
-        // bumps dispatch_fanin and may CAS spec_state / set the auto-chain flag on
-        // any consumer, independent of the consumer's own hint). So they MUST be
-        // zeroed here unconditionally — no per-task allow_early_resolve gating.
-        allow_early_resolve = args.allow_early_resolve();
-        spec_state.store(PTO2_SPEC_NONE, std::memory_order_relaxed);
-        for (int w = 0; w < PTO2_SPEC_CORE_MASK_WORDS; w++)
-            staged_core_mask[w].store(0, std::memory_order_relaxed);
-        dispatch_fanin.store(0, std::memory_order_relaxed);
-        dispatch_propagated.store(0, std::memory_order_relaxed);
-        spec_chain_active.store(0, std::memory_order_relaxed);
-        spec_chain_depth = 0;
     }
 };
 
@@ -326,11 +267,6 @@ struct alignas(64) PTO2TaskSlotState
         completed_subtasks.store(0, std::memory_order_relaxed);
         next_block_idx.store(0, std::memory_order_relaxed);
         any_subtask_deferred.store(false, std::memory_order_relaxed);
-        // Note: payload spec fields (spec_state / staged_core_mask / dispatch_fanin /
-        // spec_chain_*) are NOT reset here — this method skips the payload by
-        // contract. They are (re)initialized in PTO2TaskPayload::init on every
-        // submit, before the slot becomes visible to the scheduler.
-
         // (e) Wake list: clear for the next incarnation. Previous incarnation
         // left it at WAKE_LIST_SENTINEL (set by its on_mixed_task_complete).
         wake_list_head.store(nullptr, std::memory_order_relaxed);