diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
index 1ed9b5209..3dd01ffb4 100644
--- a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
+++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h
@@ -217,13 +217,22 @@ void l2_swimlane_aicpu_init_core_assignments(int total_cores);
 void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num);
 
 /**
- * Flush remaining phase records for a thread
+ * Flush the remaining scheduler-phase records for a scheduler thread.
  *
- * Marks the current WRITING phase buffer as READY and enqueues it
- * for host collection. Called at thread exit (analogous to l2_swimlane_aicpu_flush).
+ * Marks the thread's current WRITING sched-phase buffer as READY and enqueues
+ * it for host collection. Called at scheduler-thread exit.
  *
- * @param thread_idx Thread index (scheduler thread or orchestrator)
+ * @param thread_idx Scheduler thread index (also its sched-phase pool index and ready-queue index)
  */
-void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx);
+void l2_swimlane_aicpu_flush_sched_phase_buffer(int thread_idx);
+
+/**
+ * Flush the remaining orchestrator-phase records (single orch instance, pool
+ * ordinal 0). Called once by the orchestrator thread at orchestration end.
+ *
+ * @param thread_idx Calling (orchestrator) AICPU thread index — selects the
+ *                   ready queue to enqueue into. The pool/lane tag is ordinal 0.
+ */
+void l2_swimlane_aicpu_flush_orch_phase_buffer(int thread_idx);
 
 #endif  // PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_
diff --git a/src/a2a3/platform/include/common/l2_swimlane_profiling.h b/src/a2a3/platform/include/common/l2_swimlane_profiling.h
index 5477b3df5..aac96c8c5 100644
--- a/src/a2a3/platform/include/common/l2_swimlane_profiling.h
+++ b/src/a2a3/platform/include/common/l2_swimlane_profiling.h
@@ -379,11 +379,13 @@ struct L2SwimlaneDataHeader {
                                  // at init; AICPU reads in l2_swimlane_aicpu_init.
 
     // Phase profiling metadata (AICPU writes in l2_swimlane_aicpu_init_phase;
-    // Host reads at drain time). Both thread counts == 0 means phase
-    // profiling was not initialized. Gated by l2_swimlane_level >=
-    // SCHED_PHASES at write time. Sched and orch pools are sized
-    // independently — typically num_orch_phase_threads == 1, but in
-    // orch_to_sched mode both equal num_aicpu_threads.
+    // Host reads at drain time). Both counts == 0 means phase profiling was not
+    // initialized. Gated by l2_swimlane_level >= SCHED_PHASES at write time.
+    // num_sched_phase_threads counts the active scheduler threads (sched-phase
+    // pools are per scheduler thread, indexed by thread id). Orchestration is
+    // single-threaded, so orch-phase is a single instance: num_orch_phase_threads
+    // == 1 and records land in orch pool ordinal 0 (dep_gen / scope_stats style),
+    // regardless of which AICPU thread the orchestrator runs on.
     uint32_t num_sched_phase_threads;           // Number of sched-phase pools the AICPU initialized
     uint32_t num_orch_phase_threads;            // Number of orch-phase pools the AICPU initialized
     uint32_t num_phase_cores;                   // Number of valid entries in core_to_thread (0 = unset)
diff --git a/src/a2a3/platform/include/host/l2_swimlane_collector.h b/src/a2a3/platform/include/host/l2_swimlane_collector.h
index a2fbc00c8..7ab3b3acf 100644
--- a/src/a2a3/platform/include/host/l2_swimlane_collector.h
+++ b/src/a2a3/platform/include/host/l2_swimlane_collector.h
@@ -322,8 +322,9 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
      * @return 0 on success, error code on failure
      */
     int initialize(
-        int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb,
-        L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
+        int num_aicore, int aicpu_thread_num, int device_id, L2SwimlaneLevel l2_swimlane_level,
+        const L2SwimlaneAllocCallback &alloc_cb, L2SwimlaneRegisterCallback register_cb,
+        const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
     );
 
     /**
@@ -413,6 +414,11 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
     void *aicore_ring_addr_table_dev_{nullptr};
 
     int num_aicore_{0};
+    // Total AICPU threads launched this run. The dedicated orchestrator runs on
+    // the last one (aicpu_thread_num_ - 1); used to report its thread number in
+    // the phase-metadata log (orch-phase is a single pool, so its index alone
+    // does not encode the AICPU thread).
+    int aicpu_thread_num_{0};
     L2SwimlaneLevel l2_swimlane_level_{L2SwimlaneLevel::DISABLED};
 
     // Per-task output directory captured at initialize() time. Consumed by
diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp
index dd0e2ac33..929a07c34 100644
--- a/src/a2a3/platform/onboard/host/device_runner.cpp
+++ b/src/a2a3/platform/onboard/host/device_runner.cpp
@@ -253,7 +253,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
 
     // Initialize per-subsystem shared memory.
     if (enable_l2_swimlane_) {
-        rc = init_l2_swimlane(num_aicore, device_id_);
+        rc = init_l2_swimlane(num_aicore, runtime.aicpu_thread_num, device_id_);
         if (rc != 0) {
             LOG_ERROR("init_l2_swimlane failed: %d", rc);
             return rc;
@@ -457,7 +457,7 @@ int DeviceRunner::finalize() {
 
 // `launch_aicpu_kernel` and `launch_aicore_kernel` live on `DeviceRunnerBase`.
 
-int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
+int DeviceRunner::init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id) {
     auto alloc_cb = [this](size_t size) -> void * {
         return mem_alloc_.alloc(size);
     };
@@ -480,7 +480,7 @@ int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
     };
 
     int rc = l2_swimlane_collector_.initialize(
-        num_aicore, device_id, l2_swimlane_level_, alloc_cb, register_cb, free_cb, output_prefix_
+        num_aicore, aicpu_thread_num, device_id, l2_swimlane_level_, alloc_cb, register_cb, free_cb, output_prefix_
     );
     if (rc != 0) {
         return rc;
diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h
index cf27afffc..795bee313 100644
--- a/src/a2a3/platform/onboard/host/device_runner.h
+++ b/src/a2a3/platform/onboard/host/device_runner.h
@@ -222,7 +222,7 @@ class DeviceRunner : public DeviceRunnerBase {
      * @param device_id Device ID for host registration
      * @return 0 on success, error code on failure
      */
-    int init_l2_swimlane(int num_aicore, int device_id);
+    int init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id);
 
     /**
      * Initialize tensor dump shared memory and collector.
diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
index dbe921c06..76bb27d0e 100644
--- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
+++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -105,8 +105,8 @@ L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; }
  * Enqueue ready buffer to per-thread queue
  *
  * @param header L2SwimlaneDataHeader pointer
- * @param thread_idx Thread index
- * @param core_index Core index (or thread_idx for phase entries)
+ * @param thread_idx AICPU thread index (selects the per-thread ready queue)
+ * @param core_index Core index for task entries, or pool ordinal for phase entries
  * @param buffer_ptr Device pointer to the full buffer
  * @param buffer_seq Sequence number for ordering
  * @param kind Buffer kind discriminator (see L2SwimlaneBufferKind)
@@ -684,10 +684,16 @@ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads,
 // ready queue under `kind`, then pop a fresh buffer from free_queue. Sets
 // `*current_buf_out` to nullptr if no free buffer is available — subsequent
 // records on that thread will drop until the host catches up.
+// `thread_idx` is the AICPU thread doing the enqueue (always the caller); it
+// selects that thread's own SPSC ready queue, which it must own exclusively.
+// `pool_idx` is the pool ordinal the host uses to file records and recycle the
+// buffer to that pool (the same ordinal indexes the output lane). For sched
+// pools the two coincide (thread t → queue t, pool t); for the single orch
+// instance they differ (orchestrator's thread, but pool ordinal 0).
 template <typename Buffer>
 static void switch_phase_buffer_kind(
-    int thread_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out, L2SwimlaneBufferKind kind,
-    const char *kind_label
+    int thread_idx, uint32_t pool_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out,
+    L2SwimlaneBufferKind kind, const char *kind_label
 ) {
     Buffer *full_buf = *current_buf_out;
     if (state == nullptr || full_buf == nullptr) return;
@@ -695,7 +701,7 @@ static void switch_phase_buffer_kind(
     LOG_INFO_V0("Thread %d: %s phase buffer is full (count=%u)", thread_idx, kind_label, full_buf->count);
 
     uint32_t seq = state->head.current_buf_seq;
-    int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, thread_idx, state->head.current_buf_ptr, seq, kind);
+    int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, pool_idx, state->head.current_buf_ptr, seq, kind);
     if (rc != 0) {
         LOG_ERROR(
             "Thread %d: failed to enqueue %s phase buffer (queue full), %u records lost!", thread_idx, kind_label,
@@ -741,8 +747,8 @@ static void switch_phase_buffer_kind(
 // callers should bump `dropped_record_count` and return when nullptr.
 template <typename Buffer, typename Record>
 static Record *acquire_phase_slot(
-    int thread_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out, L2SwimlaneBufferKind kind,
-    const char *kind_label
+    int thread_idx, uint32_t pool_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out,
+    L2SwimlaneBufferKind kind, const char *kind_label
 ) {
     Buffer *buf = *current_buf_out;
     if (buf == nullptr) {
@@ -766,7 +772,7 @@ static Record *acquire_phase_slot(
 
     uint32_t idx = buf->count;
     if (idx >= PLATFORM_PHASE_RECORDS_PER_THREAD) {
-        switch_phase_buffer_kind(thread_idx, state, current_buf_out, kind, kind_label);
+        switch_phase_buffer_kind(thread_idx, pool_idx, state, current_buf_out, kind, kind_label);
         buf = *current_buf_out;
         if (buf == nullptr) return nullptr;
         idx = buf->count;
@@ -788,7 +794,8 @@ void l2_swimlane_aicpu_record_sched_phase(
     state->head.total_record_count += 1;
 
     auto *record = acquire_phase_slot<L2SwimlaneAicpuSchedPhaseBuffer, L2SwimlaneAicpuSchedPhaseRecord>(
-        thread_idx, state, &s_current_sched_phase_buffers[thread_idx], L2SwimlaneBufferKind::AicpuSchedPhase, "sched"
+        /*thread_idx=*/thread_idx, /*pool_idx=*/static_cast<uint32_t>(thread_idx), state,
+        &s_current_sched_phase_buffers[thread_idx], L2SwimlaneBufferKind::AicpuSchedPhase, "sched"
     );
     if (record == nullptr) {
         state->head.dropped_record_count += 1;
@@ -809,13 +816,18 @@ void l2_swimlane_aicpu_record_orch_phase(
     uint64_t start_time, uint64_t end_time, uint64_t task_id, uint32_t submit_idx
 ) {
     if (s_orch_thread_idx < 0 || !s_phase_initialized) return;
-    auto *state = s_orch_phase_pools[s_orch_thread_idx];
+    // Single orch instance (dep_gen / scope_stats style): all orch records
+    // funnel into pool ordinal 0, regardless of which AICPU thread the
+    // orchestrator runs on. s_orch_thread_idx is the orchestrator's AICPU
+    // thread index — used only to pick its own ready queue (SPSC owner); the
+    // entry is tagged with pool ordinal 0 so the host files it into orch lane 0.
+    auto *state = s_orch_phase_pools[0];
     if (state == nullptr) return;
 
     state->head.total_record_count += 1;
 
     auto *record = acquire_phase_slot<L2SwimlaneAicpuOrchPhaseBuffer, L2SwimlaneAicpuOrchPhaseRecord>(
-        s_orch_thread_idx, state, &s_current_orch_phase_buffers[s_orch_thread_idx],
+        /*thread_idx=*/s_orch_thread_idx, /*pool_idx=*/0, state, &s_current_orch_phase_buffers[0],
         L2SwimlaneBufferKind::AicpuOrchPhase, "orch"
     );
     if (record == nullptr) {
@@ -828,39 +840,56 @@ void l2_swimlane_aicpu_record_orch_phase(
     record->submit_idx = submit_idx;
 }
 
-// Final-drain flush for both phase pools owned by this thread (sched + orch).
-// Called once per AICPU thread at end-of-run.
-void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) {
-    if (!s_phase_initialized || s_l2_swimlane_header == nullptr) return;
-
-    auto flush_one = [&](L2SwimlaneAicpuTaskPool *state, L2SwimlaneBufferKind kind, const char *kind_label) {
-        if (state == nullptr) return;
-        rmb();
-        uint64_t buf_ptr = state->head.current_buf_ptr;
-        if (buf_ptr == 0) return;
-        // Reuse TypedBuffer's count layout — same offset regardless of payload type.
-        auto *buf = reinterpret_cast<L2SwimlaneAicpuSchedPhaseBuffer *>(buf_ptr);
-        if (buf->count == 0) return;
-        uint32_t seq = state->head.current_buf_seq;
-        int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, thread_idx, buf_ptr, seq, kind);
-        if (rc == 0) {
-            LOG_INFO_V0("Thread %d: flushed %s phase buffer with %u records", thread_idx, kind_label, buf->count);
-        } else {
-            LOG_ERROR(
-                "Thread %d: failed to enqueue %s phase buffer (queue full), %u records lost!", thread_idx, kind_label,
-                buf->count
-            );
-            state->head.dropped_record_count += buf->count;
-            buf->count = 0;
-        }
-        state->head.current_buf_ptr = 0;
-        wmb();
-    };
+// Final-drain flush of one phase pool: a non-empty active buffer is enqueued on `thread_idx`'s ready queue
+// tagged with `pool_idx` (as in switch_phase_buffer_kind); if the enqueue fails its records count as dropped.
+static void flush_phase_pool(
+    int thread_idx, uint32_t pool_idx, L2SwimlaneAicpuTaskPool *state, L2SwimlaneBufferKind kind, const char *kind_label
+) {
+    if (state == nullptr) return;
+    rmb();
+    uint64_t buf_ptr = state->head.current_buf_ptr;
+    if (buf_ptr == 0) return;  // no active buffer to flush
+    // `count` sits AFTER the records[] array in TypedBuffer, so its byte offset
+    // is N * sizeof(Record) — different for sched (40B) vs orch (32B) records.
+    // Read/write it through the matching buffer type; a single fixed cast reads
+    // past the orch buffer, sees 0, and silently skips the orch flush.
+    volatile uint32_t *count_ptr = (kind == L2SwimlaneBufferKind::AicpuOrchPhase) ?
+                                       &reinterpret_cast<L2SwimlaneAicpuOrchPhaseBuffer *>(buf_ptr)->count :
+                                       &reinterpret_cast<L2SwimlaneAicpuSchedPhaseBuffer *>(buf_ptr)->count;
+    if (*count_ptr == 0) return;  // empty buffer: nothing to hand to the host
+    uint32_t seq = state->head.current_buf_seq;
+    int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, pool_idx, buf_ptr, seq, kind);
+    if (rc == 0) {
+        LOG_INFO_V0("Thread %d: flushed %s phase buffer with %u records", thread_idx, kind_label, *count_ptr);
+    } else {
+        LOG_ERROR(
+            "Thread %d: failed to enqueue %s phase buffer (queue full), %u records lost!", thread_idx, kind_label,
+            *count_ptr
+        );
+        state->head.dropped_record_count += *count_ptr;
+        *count_ptr = 0;
+    }
+    state->head.current_buf_ptr = 0;  // pool no longer has an active buffer
+    wmb();
+}
 
-    flush_one(s_sched_phase_pools[thread_idx], L2SwimlaneBufferKind::AicpuSchedPhase, "sched");
+// Final-drain flush of the sched-phase pool owned by scheduler thread `thread_idx` (queue index == pool index).
+void l2_swimlane_aicpu_flush_sched_phase_buffer(int thread_idx) {
+    if (!s_phase_initialized || s_l2_swimlane_header == nullptr) return;
+    flush_phase_pool(
+        thread_idx, static_cast<uint32_t>(thread_idx), s_sched_phase_pools[thread_idx],
+        L2SwimlaneBufferKind::AicpuSchedPhase, "sched"
+    );
     s_current_sched_phase_buffers[thread_idx] = nullptr;
-    flush_one(s_orch_phase_pools[thread_idx], L2SwimlaneBufferKind::AicpuOrchPhase, "orch");
-    s_current_orch_phase_buffers[thread_idx] = nullptr;
+}
+
+// Final-drain flush of the single orchestrator's orch-phase pool (ordinal 0).
+// Called once by the orchestrator thread at orchestration end; see
+// l2_swimlane_aicpu_record_orch_phase for the pool-0 / own-ready-queue tagging.
+void l2_swimlane_aicpu_flush_orch_phase_buffer(int thread_idx) {
+    if (!s_phase_initialized || s_l2_swimlane_header == nullptr) return;
+    flush_phase_pool(thread_idx, /*pool_idx=*/0, s_orch_phase_pools[0], L2SwimlaneBufferKind::AicpuOrchPhase, "orch");
+    s_current_orch_phase_buffers[0] = nullptr;
 }
 
 void l2_swimlane_aicpu_init_core_assignments(int total_cores) {
diff --git a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp
index 4a7348543..9846d4fa0 100644
--- a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp
+++ b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp
@@ -77,8 +77,9 @@ void *L2SwimlaneCollector::alloc_single_buffer(size_t size, void **host_ptr_out)
 }
 
 int L2SwimlaneCollector::initialize(
-    int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb,
-    L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
+    int num_aicore, int aicpu_thread_num, int device_id, L2SwimlaneLevel l2_swimlane_level,
+    const L2SwimlaneAllocCallback &alloc_cb, L2SwimlaneRegisterCallback register_cb,
+    const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix
 ) {
     if (shm_host_ != nullptr) {
         LOG_ERROR("L2SwimlaneCollector already initialized");
@@ -93,6 +94,7 @@ int L2SwimlaneCollector::initialize(
     }
 
     num_aicore_ = num_aicore;
+    aicpu_thread_num_ = aicpu_thread_num;
     l2_swimlane_level_ = l2_swimlane_level;
     output_prefix_ = output_prefix;
     total_perf_collected_ = 0;
@@ -271,13 +273,20 @@ int L2SwimlaneCollector::initialize(
     // — sched and orch buffers have DIFFERENT sizes (40B vs 32B records),
     // so a single cast type for both would land the count store past the end
     // of the orch allocation and corrupt the heap.
+    // All `state_count` pool states are zeroed, so the host's reconcile /
+    // iteration over [0, PLATFORM_MAX) reads count=0 for unused slots; buffers
+    // are allocated only for the first `buffer_count` pools. For sched the two
+    // are equal; orch is a single instance (pool 0), so it zeroes all slots but
+    // allocates buffers for just pool 0 — no buffers wasted on unused slots.
     auto init_phase_pools = [&](auto buffer_tag, L2SwimlaneAicpuTaskPool *(*get_state)(void *, int, int),
-                                int thread_count, ProfBufferType recycle_kind, const char *kind_label) -> int {
+                                int state_count, int buffer_count, ProfBufferType recycle_kind,
+                                const char *kind_label) -> int {
         using Buffer = typename decltype(buffer_tag)::type;
         constexpr size_t buffer_bytes = sizeof(Buffer);
-        for (int t = 0; t < thread_count; t++) {
+        for (int t = 0; t < state_count; t++) {
             auto *state = get_state(perf_host_ptr, num_aicore, t);
             memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool));
+            if (t >= buffer_count) continue;  // zeroed state only; no buffers (unused slot)
             for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) {
                 void *host_buf_ptr = nullptr;
                 void *dev_buf_ptr = alloc_single_buffer(buffer_bytes, &host_buf_ptr);
@@ -312,20 +321,27 @@ int L2SwimlaneCollector::initialize(
         using type = L2SwimlaneAicpuOrchPhaseBuffer;
     };
 
+    // Sched: actual scheduler-thread count is unknown at host-alloc time, so
+    // size buffers to the platform max. Orch: a single instance (pool 0), so
+    // allocate buffers for just one pool while still zeroing all MAX states.
     if (init_phase_pools(
-            SchedTag{}, get_sched_phase_buffer_state, num_phase_threads, ProfBufferType::AICPU_SCHED_PHASE, "sched"
+            SchedTag{}, get_sched_phase_buffer_state, /*state_count=*/num_phase_threads,
+            /*buffer_count=*/num_phase_threads, ProfBufferType::AICPU_SCHED_PHASE, "sched"
         ) != 0) {
         return -1;
     }
     auto orch_get_state = [](void *base, int n_cores, int t) {
         return get_orch_phase_buffer_state(base, n_cores, t);
     };
-    if (init_phase_pools(OrchTag{}, orch_get_state, num_phase_threads, ProfBufferType::AICPU_ORCH_PHASE, "orch") != 0) {
+    if (init_phase_pools(
+            OrchTag{}, orch_get_state, /*state_count=*/num_phase_threads, /*buffer_count=*/1,
+            ProfBufferType::AICPU_ORCH_PHASE, "orch"
+        ) != 0) {
         return -1;
     }
     LOG_DEBUG(
-        "Initialized %d sched + %d orch PhaseBufferStates: 1 buffer/thread, %d in recycled pool each",
-        num_phase_threads, num_phase_threads, PLATFORM_PROF_BUFFERS_PER_THREAD - 1
+        "Initialized %d sched (+1 orch) PhaseBufferStates: 1 buffer/thread, %d in recycled pool each",
+        num_phase_threads, PLATFORM_PROF_BUFFERS_PER_THREAD - 1
     );
 
     wmb();
@@ -625,7 +641,17 @@ void L2SwimlaneCollector::read_phase_header_metadata() {
         );
         return;
     }
-    LOG_INFO_V0("Collecting phase metadata: %d sched threads, %d orch threads", num_sched, num_orch);
+    // Scheduler threads occupy AICPU threads [0, num_sched); the dedicated
+    // orchestrator runs on the last AICPU thread (aicpu_thread_num_ - 1). The
+    // orch-phase pool is a single instance, so its pool index does not encode
+    // the AICPU thread — derive the thread number from aicpu_thread_num_.
+    // aicpu_thread_num_ is >= 1 (DeviceRunner::run validates launch_aicpu_num in
+    // [1, PLATFORM_MAX_AICPU_THREADS] before initialize()), so the subtraction
+    // can't go negative. This is a log-only display value, never an index.
+    const int orch_thread = aicpu_thread_num_ - 1;
+    LOG_INFO_V0(
+        "Collecting phase metadata: scheduler threads 0-%d, orchestrator thread %d", num_sched - 1, orch_thread
+    );
 
     for (size_t t = 0; t < collected_sched_phase_records_.size(); t++) {
         if (!collected_sched_phase_records_[t].empty()) {
@@ -634,7 +660,7 @@ void L2SwimlaneCollector::read_phase_header_metadata() {
     }
     for (size_t t = 0; t < collected_orch_phase_records_.size(); t++) {
         if (!collected_orch_phase_records_[t].empty()) {
-            LOG_INFO_V0("  Orch thread %zu: %zu records", t, collected_orch_phase_records_[t].size());
+            LOG_INFO_V0("  Orch thread %d: %zu records", orch_thread, collected_orch_phase_records_[t].size());
         }
     }
 
@@ -752,6 +778,13 @@ void L2SwimlaneCollector::join_aicore_records() {
 }
 
 int L2SwimlaneCollector::export_swimlane_json() {
+    // shm_host_ is read once (phase-header metadata) below; guard it up front
+    // like the other collector methods so a never-initialized / post-finalize
+    // call returns instead of dereferencing null.
+    if (shm_host_ == nullptr) {
+        return -1;
+    }
+
     // Step 0: Join AICore-emitted start/end/task_id records into the AICPU
     // record stream (AICore-as-producer design).
     join_aicore_records();
@@ -962,8 +995,15 @@ int L2SwimlaneCollector::export_swimlane_json() {
             }
         }
         if (has_orch_phases) {
+            // Orch is a single instance (pool ordinal 0): emit only the actual
+            // orch lane count (num_orch_phase_threads), not the full MAX-sized
+            // vector, so the trace shows one orchestrator lane with no empties.
+            size_t orch_lanes = static_cast<size_t>(get_l2_swimlane_header(shm_host_)->num_orch_phase_threads);
+            if (orch_lanes == 0 || orch_lanes > collected_orch_phase_records_.size()) {
+                orch_lanes = collected_orch_phase_records_.size();
+            }
             outfile << ",\n  \"aicpu_orchestrator_phases\": [\n";
-            for (size_t t = 0; t < collected_orch_phase_records_.size(); t++) {
+            for (size_t t = 0; t < orch_lanes; t++) {
                 outfile << "    [\n";
                 bool first = true;
                 for (const auto &pr : collected_orch_phase_records_[t]) {
@@ -980,7 +1020,7 @@ int L2SwimlaneCollector::export_swimlane_json() {
                 }
                 if (!first) outfile << "\n";
                 outfile << "    ]";
-                if (t < collected_orch_phase_records_.size() - 1) outfile << ",";
+                if (t < orch_lanes - 1) outfile << ",";
                 outfile << "\n";
             }
             outfile << "  ]";
diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp
index 5a141f141..907ab694f 100644
--- a/src/a2a3/platform/sim/host/device_runner.cpp
+++ b/src/a2a3/platform/sim/host/device_runner.cpp
@@ -260,7 +260,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) {
     last_runtime_ = &runtime;
 
     if (enable_l2_swimlane_) {
-        rc = init_l2_swimlane(num_aicore, device_id_);
+        rc = init_l2_swimlane(num_aicore, runtime.aicpu_thread_num, device_id_);
         if (rc != 0) {
             LOG_ERROR("init_l2_swimlane failed: %d", rc);
             return rc;
@@ -618,7 +618,7 @@ int DeviceRunner::finalize() {
 // Performance Profiling Implementation
 // =============================================================================
 
-int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
+int DeviceRunner::init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id) {
     auto alloc_cb = [this](size_t size) -> void * {
         return mem_alloc_.alloc(size);
     };
@@ -627,7 +627,7 @@ int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) {
     };
 
     int rc = l2_swimlane_collector_.initialize(
-        num_aicore, device_id, l2_swimlane_level_, alloc_cb, nullptr, free_cb, output_prefix_
+        num_aicore, aicpu_thread_num, device_id, l2_swimlane_level_, alloc_cb, nullptr, free_cb, output_prefix_
     );
     if (rc != 0) {
         return rc;
diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h
index 711934a6c..ed9615a19 100644
--- a/src/a2a3/platform/sim/host/device_runner.h
+++ b/src/a2a3/platform/sim/host/device_runner.h
@@ -41,7 +41,7 @@ class DeviceRunner : public SimDeviceRunnerBase {
     int ensure_binaries_loaded() override;
     void unload_executor_binaries();
 
-    int init_l2_swimlane(int num_aicore, int device_id);
+    int init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id);
     int init_tensor_dump(Runtime &runtime, int device_id);
     int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id);
     int init_dep_gen(int num_threads, int device_id);
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 30c94ccfc..fa354e855 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -867,17 +867,16 @@ int32_t SchedulerContext::init(
         l2_swimlane_level_ = get_l2_swimlane_level();
         if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
             // Sched-phase pool count: matches the dump_tensor_init branch in
-            // scheduler_dispatch.cpp. Orch-phase pool count: typically 1 (one
-            // orch thread), but in orch_to_sched mode all scheduler threads
-            // can write orch records, so we size both pools to aicpu_thread_num_.
-            // sched_thread_num_ <= 0 means "use all AICPU threads as scheduler
-            // threads" (see assign_cores_to_threads' active_sched_threads_
-            // normalization at line 689). Without this normalization here,
-            // init_phase would prime zero sched pools and all sched_phase
-            // emits would silently drop.
+            // scheduler_dispatch.cpp. sched_thread_num_ <= 0 means "use all
+            // AICPU threads as scheduler threads" (see assign_cores_to_threads'
+            // active_sched_threads_ normalization at line 689). Without this
+            // normalization here, init_phase would prime zero sched pools and
+            // all sched_phase emits would silently drop.
             const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_;
             const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched;
-            const int orch_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : 1;
+            // Orchestration is always single-threaded, so orch-phase is one pool
+            // (ordinal 0) in both modes — see l2_swimlane_aicpu_record_orch_phase.
+            const int orch_phase_threads = 1;
             l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads);
         }
     } else {
@@ -1005,9 +1004,11 @@ void SchedulerContext::on_orchestration_done(
     Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks
 ) {
 #if PTO2_PROFILING
-    if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-        // Flush orchestrator's phase record buffer
-        l2_swimlane_aicpu_flush_phase_buffers(thread_idx);
+    if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) {
+        // Flush the orchestrator's orch-phase buffer (single instance, pool 0).
+        // The orchestrator has no scheduler-phase pool of its own — those belong
+        // to the scheduler threads and are flushed in scheduler_dispatch.
+        l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx);
     }
 #endif
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
index 2d235f21d..e691ddb2f 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp
@@ -855,7 +855,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
             thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num()
         );
         if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) {
-            l2_swimlane_aicpu_flush_phase_buffers(thread_idx);
+            l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx);
         }
     }
 #endif