diff --git a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h index 1ed9b5209..3dd01ffb4 100644 --- a/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h +++ b/src/a2a3/platform/include/aicpu/l2_swimlane_collector_aicpu.h @@ -217,13 +217,22 @@ void l2_swimlane_aicpu_init_core_assignments(int total_cores); void l2_swimlane_aicpu_write_core_assignments_for_thread(int thread_idx, const int *core_ids, int core_num); /** - * Flush remaining phase records for a thread + * Flush the remaining scheduler-phase records for a scheduler thread. * - * Marks the current WRITING phase buffer as READY and enqueues it - * for host collection. Called at thread exit (analogous to l2_swimlane_aicpu_flush). + * Marks the thread's current WRITING sched-phase buffer as READY and enqueues + * it for host collection. Called at scheduler-thread exit. * - * @param thread_idx Thread index (scheduler thread or orchestrator) + * @param thread_idx Scheduler thread index (= sched pool index = ready queue) */ -void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx); +void l2_swimlane_aicpu_flush_sched_phase_buffer(int thread_idx); + +/** + * Flush the remaining orchestrator-phase records (single orch instance, pool + * ordinal 0). Called once by the orchestrator thread at orchestration end. + * + * @param thread_idx Calling (orchestrator) AICPU thread index — selects the + * ready queue to enqueue into. The pool/lane tag is ordinal 0. 
+ */ +void l2_swimlane_aicpu_flush_orch_phase_buffer(int thread_idx); #endif // PLATFORM_AICPU_L2_SWIMLANE_COLLECTOR_AICPU_H_ diff --git a/src/a2a3/platform/include/common/l2_swimlane_profiling.h b/src/a2a3/platform/include/common/l2_swimlane_profiling.h index 5477b3df5..aac96c8c5 100644 --- a/src/a2a3/platform/include/common/l2_swimlane_profiling.h +++ b/src/a2a3/platform/include/common/l2_swimlane_profiling.h @@ -379,11 +379,13 @@ struct L2SwimlaneDataHeader { // at init; AICPU reads in l2_swimlane_aicpu_init. // Phase profiling metadata (AICPU writes in l2_swimlane_aicpu_init_phase; - // Host reads at drain time). Both thread counts == 0 means phase - // profiling was not initialized. Gated by l2_swimlane_level >= - // SCHED_PHASES at write time. Sched and orch pools are sized - // independently — typically num_orch_phase_threads == 1, but in - // orch_to_sched mode both equal num_aicpu_threads. + // Host reads at drain time). Both counts == 0 means phase profiling was not + // initialized. Gated by l2_swimlane_level >= SCHED_PHASES at write time. + // num_sched_phase_threads counts the active scheduler threads (sched-phase + // pools are per scheduler thread, indexed by thread id). Orchestration is + // single-threaded, so orch-phase is a single instance: num_orch_phase_threads + // == 1 and records land in orch pool ordinal 0 (dep_gen / scope_stats style), + // regardless of which AICPU thread the orchestrator runs on. 
uint32_t num_sched_phase_threads; // Number of sched-phase pools the AICPU initialized uint32_t num_orch_phase_threads; // Number of orch-phase pools the AICPU initialized uint32_t num_phase_cores; // Number of valid entries in core_to_thread (0 = unset) diff --git a/src/a2a3/platform/include/host/l2_swimlane_collector.h b/src/a2a3/platform/include/host/l2_swimlane_collector.h index a2fbc00c8..7ab3b3acf 100644 --- a/src/a2a3/platform/include/host/l2_swimlane_collector.h +++ b/src/a2a3/platform/include/host/l2_swimlane_collector.h @@ -322,8 +322,9 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase void * { return mem_alloc_.alloc(size); }; @@ -480,7 +480,7 @@ int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { }; int rc = l2_swimlane_collector_.initialize( - num_aicore, device_id, l2_swimlane_level_, alloc_cb, register_cb, free_cb, output_prefix_ + num_aicore, aicpu_thread_num, device_id, l2_swimlane_level_, alloc_cb, register_cb, free_cb, output_prefix_ ); if (rc != 0) { return rc; diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index cf27afffc..795bee313 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -222,7 +222,7 @@ class DeviceRunner : public DeviceRunnerBase { * @param device_id Device ID for host registration * @return 0 on success, error code on failure */ - int init_l2_swimlane(int num_aicore, int device_id); + int init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id); /** * Initialize tensor dump shared memory and collector. 
diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index dbe921c06..76bb27d0e 100644 --- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -105,8 +105,8 @@ L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; } * Enqueue ready buffer to per-thread queue * * @param header L2SwimlaneDataHeader pointer - * @param thread_idx Thread index - * @param core_index Core index (or thread_idx for phase entries) + * @param thread_idx AICPU thread index (selects the per-thread ready queue) + * @param core_index Core index for task entries, or pool ordinal for phase entries * @param buffer_ptr Device pointer to the full buffer * @param buffer_seq Sequence number for ordering * @param kind Buffer kind discriminator (see L2SwimlaneBufferKind) @@ -684,10 +684,16 @@ void l2_swimlane_aicpu_init_phase(int worker_count, int num_sched_phase_threads, // ready queue under `kind`, then pop a fresh buffer from free_queue. Sets // `*current_buf_out` to nullptr if no free buffer is available — subsequent // records on that thread will drop until the host catches up. +// `thread_idx` is the AICPU thread doing the enqueue (always the caller); it +// selects that thread's own SPSC ready queue, which it must own exclusively. +// `pool_idx` is the pool ordinal the host uses to file records and recycle the +// buffer to that pool (the same ordinal indexes the output lane). For sched +// pools the two coincide (thread t → queue t, pool t); for the single orch +// instance they differ (orchestrator's thread, but pool ordinal 0). 
template static void switch_phase_buffer_kind( - int thread_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out, L2SwimlaneBufferKind kind, - const char *kind_label + int thread_idx, uint32_t pool_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out, + L2SwimlaneBufferKind kind, const char *kind_label ) { Buffer *full_buf = *current_buf_out; if (state == nullptr || full_buf == nullptr) return; @@ -695,7 +701,7 @@ static void switch_phase_buffer_kind( LOG_INFO_V0("Thread %d: %s phase buffer is full (count=%u)", thread_idx, kind_label, full_buf->count); uint32_t seq = state->head.current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, thread_idx, state->head.current_buf_ptr, seq, kind); + int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, pool_idx, state->head.current_buf_ptr, seq, kind); if (rc != 0) { LOG_ERROR( "Thread %d: failed to enqueue %s phase buffer (queue full), %u records lost!", thread_idx, kind_label, @@ -741,8 +747,8 @@ static void switch_phase_buffer_kind( // callers should bump `dropped_record_count` and return when nullptr. 
template static Record *acquire_phase_slot( - int thread_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out, L2SwimlaneBufferKind kind, - const char *kind_label + int thread_idx, uint32_t pool_idx, L2SwimlaneAicpuTaskPool *state, Buffer **current_buf_out, + L2SwimlaneBufferKind kind, const char *kind_label ) { Buffer *buf = *current_buf_out; if (buf == nullptr) { @@ -766,7 +772,7 @@ static Record *acquire_phase_slot( uint32_t idx = buf->count; if (idx >= PLATFORM_PHASE_RECORDS_PER_THREAD) { - switch_phase_buffer_kind(thread_idx, state, current_buf_out, kind, kind_label); + switch_phase_buffer_kind(thread_idx, pool_idx, state, current_buf_out, kind, kind_label); buf = *current_buf_out; if (buf == nullptr) return nullptr; idx = buf->count; @@ -788,7 +794,8 @@ void l2_swimlane_aicpu_record_sched_phase( state->head.total_record_count += 1; auto *record = acquire_phase_slot( - thread_idx, state, &s_current_sched_phase_buffers[thread_idx], L2SwimlaneBufferKind::AicpuSchedPhase, "sched" + /*thread_idx=*/thread_idx, /*pool_idx=*/static_cast(thread_idx), state, + &s_current_sched_phase_buffers[thread_idx], L2SwimlaneBufferKind::AicpuSchedPhase, "sched" ); if (record == nullptr) { state->head.dropped_record_count += 1; @@ -809,13 +816,18 @@ void l2_swimlane_aicpu_record_orch_phase( uint64_t start_time, uint64_t end_time, uint64_t task_id, uint32_t submit_idx ) { if (s_orch_thread_idx < 0 || !s_phase_initialized) return; - auto *state = s_orch_phase_pools[s_orch_thread_idx]; + // Single orch instance (dep_gen / scope_stats style): all orch records + // funnel into pool ordinal 0, regardless of which AICPU thread the + // orchestrator runs on. s_orch_thread_idx is the orchestrator's AICPU + // thread index — used only to pick its own ready queue (SPSC owner); the + // entry is tagged with pool ordinal 0 so the host files it into orch lane 0. 
+ auto *state = s_orch_phase_pools[0]; if (state == nullptr) return; state->head.total_record_count += 1; auto *record = acquire_phase_slot( - s_orch_thread_idx, state, &s_current_orch_phase_buffers[s_orch_thread_idx], + /*thread_idx=*/s_orch_thread_idx, /*pool_idx=*/0, state, &s_current_orch_phase_buffers[0], L2SwimlaneBufferKind::AicpuOrchPhase, "orch" ); if (record == nullptr) { @@ -828,39 +840,56 @@ void l2_swimlane_aicpu_record_orch_phase( record->submit_idx = submit_idx; } -// Final-drain flush for both phase pools owned by this thread (sched + orch). -// Called once per AICPU thread at end-of-run. -void l2_swimlane_aicpu_flush_phase_buffers(int thread_idx) { - if (!s_phase_initialized || s_l2_swimlane_header == nullptr) return; - - auto flush_one = [&](L2SwimlaneAicpuTaskPool *state, L2SwimlaneBufferKind kind, const char *kind_label) { - if (state == nullptr) return; - rmb(); - uint64_t buf_ptr = state->head.current_buf_ptr; - if (buf_ptr == 0) return; - // Reuse TypedBuffer's count layout — same offset regardless of payload type. - auto *buf = reinterpret_cast(buf_ptr); - if (buf->count == 0) return; - uint32_t seq = state->head.current_buf_seq; - int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, thread_idx, buf_ptr, seq, kind); - if (rc == 0) { - LOG_INFO_V0("Thread %d: flushed %s phase buffer with %u records", thread_idx, kind_label, buf->count); - } else { - LOG_ERROR( - "Thread %d: failed to enqueue %s phase buffer (queue full), %u records lost!", thread_idx, kind_label, - buf->count - ); - state->head.dropped_record_count += buf->count; - buf->count = 0; - } - state->head.current_buf_ptr = 0; - wmb(); - }; +// Final-drain flush of one phase pool's active buffer. `thread_idx` / `pool_idx` +// as in switch_phase_buffer_kind. 
+static void flush_phase_pool( + int thread_idx, uint32_t pool_idx, L2SwimlaneAicpuTaskPool *state, L2SwimlaneBufferKind kind, const char *kind_label +) { + if (state == nullptr) return; + rmb(); + uint64_t buf_ptr = state->head.current_buf_ptr; + if (buf_ptr == 0) return; + // `count` sits AFTER the records[] array in TypedBuffer, so its byte offset + // is N * sizeof(Record) — different for sched (40B) vs orch (32B) records. + // Read/write it through the matching buffer type; a single fixed cast reads + // past the orch buffer, sees 0, and silently skips the orch flush. + volatile uint32_t *count_ptr = (kind == L2SwimlaneBufferKind::AicpuOrchPhase) ? + &reinterpret_cast(buf_ptr)->count : + &reinterpret_cast(buf_ptr)->count; + if (*count_ptr == 0) return; + uint32_t seq = state->head.current_buf_seq; + int rc = enqueue_ready_buffer(s_l2_swimlane_header, thread_idx, pool_idx, buf_ptr, seq, kind); + if (rc == 0) { + LOG_INFO_V0("Thread %d: flushed %s phase buffer with %u records", thread_idx, kind_label, *count_ptr); + } else { + LOG_ERROR( + "Thread %d: failed to enqueue %s phase buffer (queue full), %u records lost!", thread_idx, kind_label, + *count_ptr + ); + state->head.dropped_record_count += *count_ptr; + *count_ptr = 0; + } + state->head.current_buf_ptr = 0; + wmb(); +} - flush_one(s_sched_phase_pools[thread_idx], L2SwimlaneBufferKind::AicpuSchedPhase, "sched"); +// Final-drain flush of the scheduler-phase pool owned by this scheduler thread. 
+void l2_swimlane_aicpu_flush_sched_phase_buffer(int thread_idx) { + if (!s_phase_initialized || s_l2_swimlane_header == nullptr) return; + flush_phase_pool( + thread_idx, static_cast(thread_idx), s_sched_phase_pools[thread_idx], + L2SwimlaneBufferKind::AicpuSchedPhase, "sched" + ); s_current_sched_phase_buffers[thread_idx] = nullptr; - flush_one(s_orch_phase_pools[thread_idx], L2SwimlaneBufferKind::AicpuOrchPhase, "orch"); - s_current_orch_phase_buffers[thread_idx] = nullptr; +} + +// Final-drain flush of the single orchestrator's orch-phase pool (ordinal 0). +// Called once by the orchestrator thread at orchestration end; see +// record_orch_phase for the pool-0 / own-ready-queue tagging. +void l2_swimlane_aicpu_flush_orch_phase_buffer(int thread_idx) { + if (!s_phase_initialized || s_l2_swimlane_header == nullptr) return; + flush_phase_pool(thread_idx, /*pool_idx=*/0, s_orch_phase_pools[0], L2SwimlaneBufferKind::AicpuOrchPhase, "orch"); + s_current_orch_phase_buffers[0] = nullptr; } void l2_swimlane_aicpu_init_core_assignments(int total_cores) { diff --git a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp index 4a7348543..9846d4fa0 100644 --- a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp +++ b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp @@ -77,8 +77,9 @@ void *L2SwimlaneCollector::alloc_single_buffer(size_t size, void **host_ptr_out) } int L2SwimlaneCollector::initialize( - int num_aicore, int device_id, L2SwimlaneLevel l2_swimlane_level, const L2SwimlaneAllocCallback &alloc_cb, - L2SwimlaneRegisterCallback register_cb, const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix + int num_aicore, int aicpu_thread_num, int device_id, L2SwimlaneLevel l2_swimlane_level, + const L2SwimlaneAllocCallback &alloc_cb, L2SwimlaneRegisterCallback register_cb, + const L2SwimlaneFreeCallback &free_cb, const std::string &output_prefix ) { if (shm_host_ != nullptr) { 
LOG_ERROR("L2SwimlaneCollector already initialized"); @@ -93,6 +94,7 @@ int L2SwimlaneCollector::initialize( } num_aicore_ = num_aicore; + aicpu_thread_num_ = aicpu_thread_num; l2_swimlane_level_ = l2_swimlane_level; output_prefix_ = output_prefix; total_perf_collected_ = 0; @@ -271,13 +273,20 @@ int L2SwimlaneCollector::initialize( // — sched and orch buffers have DIFFERENT sizes (40B vs 32B records), // so a single cast type for both would land the count store past the end // of the orch allocation and corrupt the heap. + // state_count pool states are zeroed (so the host's [0, PLATFORM_MAX) + // reconcile/iteration reads count=0 for unused slots); buffers are + // allocated only for the first buffer_count pools. For sched the two are + // equal; orch is a single instance (pool 0), so it zeroes all slots but + // allocates buffers for just pool 0 — no buffers wasted on unused slots. auto init_phase_pools = [&](auto buffer_tag, L2SwimlaneAicpuTaskPool *(*get_state)(void *, int, int), - int thread_count, ProfBufferType recycle_kind, const char *kind_label) -> int { + int state_count, int buffer_count, ProfBufferType recycle_kind, + const char *kind_label) -> int { using Buffer = typename decltype(buffer_tag)::type; constexpr size_t buffer_bytes = sizeof(Buffer); - for (int t = 0; t < thread_count; t++) { + for (int t = 0; t < state_count; t++) { auto *state = get_state(perf_host_ptr, num_aicore, t); memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool)); + if (t >= buffer_count) continue; // zeroed state only; no buffers (unused slot) for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { void *host_buf_ptr = nullptr; void *dev_buf_ptr = alloc_single_buffer(buffer_bytes, &host_buf_ptr); @@ -312,20 +321,27 @@ int L2SwimlaneCollector::initialize( using type = L2SwimlaneAicpuOrchPhaseBuffer; }; + // Sched: actual scheduler-thread count is unknown at host-alloc time, so + // size buffers to the platform max. 
Orch: a single instance (pool 0), so + // allocate buffers for just one pool while still zeroing all MAX states. if (init_phase_pools( - SchedTag{}, get_sched_phase_buffer_state, num_phase_threads, ProfBufferType::AICPU_SCHED_PHASE, "sched" + SchedTag{}, get_sched_phase_buffer_state, /*state_count=*/num_phase_threads, + /*buffer_count=*/num_phase_threads, ProfBufferType::AICPU_SCHED_PHASE, "sched" ) != 0) { return -1; } auto orch_get_state = [](void *base, int n_cores, int t) { return get_orch_phase_buffer_state(base, n_cores, t); }; - if (init_phase_pools(OrchTag{}, orch_get_state, num_phase_threads, ProfBufferType::AICPU_ORCH_PHASE, "orch") != 0) { + if (init_phase_pools( + OrchTag{}, orch_get_state, /*state_count=*/num_phase_threads, /*buffer_count=*/1, + ProfBufferType::AICPU_ORCH_PHASE, "orch" + ) != 0) { return -1; } LOG_DEBUG( - "Initialized %d sched + %d orch PhaseBufferStates: 1 buffer/thread, %d in recycled pool each", - num_phase_threads, num_phase_threads, PLATFORM_PROF_BUFFERS_PER_THREAD - 1 + "Initialized %d sched (+1 orch) PhaseBufferStates: 1 buffer/thread, %d in recycled pool each", + num_phase_threads, PLATFORM_PROF_BUFFERS_PER_THREAD - 1 ); wmb(); @@ -625,7 +641,17 @@ void L2SwimlaneCollector::read_phase_header_metadata() { ); return; } - LOG_INFO_V0("Collecting phase metadata: %d sched threads, %d orch threads", num_sched, num_orch); + // Scheduler threads occupy AICPU threads [0, num_sched); the dedicated + // orchestrator runs on the last AICPU thread (aicpu_thread_num_ - 1). The + // orch-phase pool is a single instance, so its pool index does not encode + // the AICPU thread — derive the thread number from aicpu_thread_num_. + // aicpu_thread_num_ is >= 1 (DeviceRunner::run validates launch_aicpu_num in + // [1, PLATFORM_MAX_AICPU_THREADS] before initialize()), so the subtraction + // can't go negative. This is a log-only display value, never an index. 
+ const int orch_thread = aicpu_thread_num_ - 1; + LOG_INFO_V0( + "Collecting phase metadata: scheduler threads 0-%d, orchestrator thread %d", num_sched - 1, orch_thread + ); for (size_t t = 0; t < collected_sched_phase_records_.size(); t++) { if (!collected_sched_phase_records_[t].empty()) { @@ -634,7 +660,7 @@ void L2SwimlaneCollector::read_phase_header_metadata() { } for (size_t t = 0; t < collected_orch_phase_records_.size(); t++) { if (!collected_orch_phase_records_[t].empty()) { - LOG_INFO_V0(" Orch thread %zu: %zu records", t, collected_orch_phase_records_[t].size()); + LOG_INFO_V0(" Orch thread %d: %zu records", orch_thread, collected_orch_phase_records_[t].size()); } } @@ -752,6 +778,13 @@ void L2SwimlaneCollector::join_aicore_records() { } int L2SwimlaneCollector::export_swimlane_json() { + // shm_host_ is read once (phase-header metadata) below; guard it up front + // like the other collector methods so a never-initialized / post-finalize + // call returns instead of dereferencing null. + if (shm_host_ == nullptr) { + return -1; + } + // Step 0: Join AICore-emitted start/end/task_id records into the AICPU // record stream (AICore-as-producer design). join_aicore_records(); @@ -962,8 +995,15 @@ int L2SwimlaneCollector::export_swimlane_json() { } } if (has_orch_phases) { + // Orch is a single instance (pool ordinal 0): emit only the actual + // orch lane count (num_orch_phase_threads), not the full MAX-sized + // vector, so the trace shows one orchestrator lane with no empties. 
+ size_t orch_lanes = static_cast(get_l2_swimlane_header(shm_host_)->num_orch_phase_threads); + if (orch_lanes == 0 || orch_lanes > collected_orch_phase_records_.size()) { + orch_lanes = collected_orch_phase_records_.size(); + } outfile << ",\n \"aicpu_orchestrator_phases\": [\n"; - for (size_t t = 0; t < collected_orch_phase_records_.size(); t++) { + for (size_t t = 0; t < orch_lanes; t++) { outfile << " [\n"; bool first = true; for (const auto &pr : collected_orch_phase_records_[t]) { @@ -980,7 +1020,7 @@ int L2SwimlaneCollector::export_swimlane_json() { } if (!first) outfile << "\n"; outfile << " ]"; - if (t < collected_orch_phase_records_.size() - 1) outfile << ","; + if (t < orch_lanes - 1) outfile << ","; outfile << "\n"; } outfile << " ]"; diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index 5a141f141..907ab694f 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -260,7 +260,7 @@ int DeviceRunner::run(Runtime &runtime, int block_dim, int launch_aicpu_num) { last_runtime_ = &runtime; if (enable_l2_swimlane_) { - rc = init_l2_swimlane(num_aicore, device_id_); + rc = init_l2_swimlane(num_aicore, runtime.aicpu_thread_num, device_id_); if (rc != 0) { LOG_ERROR("init_l2_swimlane failed: %d", rc); return rc; @@ -618,7 +618,7 @@ int DeviceRunner::finalize() { // Performance Profiling Implementation // ============================================================================= -int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { +int DeviceRunner::init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id) { auto alloc_cb = [this](size_t size) -> void * { return mem_alloc_.alloc(size); }; @@ -627,7 +627,7 @@ int DeviceRunner::init_l2_swimlane(int num_aicore, int device_id) { }; int rc = l2_swimlane_collector_.initialize( - num_aicore, device_id, l2_swimlane_level_, alloc_cb, nullptr, free_cb, output_prefix_ + num_aicore, 
aicpu_thread_num, device_id, l2_swimlane_level_, alloc_cb, nullptr, free_cb, output_prefix_ ); if (rc != 0) { return rc; diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index 711934a6c..ed9615a19 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -41,7 +41,7 @@ class DeviceRunner : public SimDeviceRunnerBase { int ensure_binaries_loaded() override; void unload_executor_binaries(); - int init_l2_swimlane(int num_aicore, int device_id); + int init_l2_swimlane(int num_aicore, int aicpu_thread_num, int device_id); int init_tensor_dump(Runtime &runtime, int device_id); int init_pmu(int num_cores, int num_threads, const std::string &csv_path, PmuEventType event_type, int device_id); int init_dep_gen(int num_threads, int device_id); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 30c94ccfc..fa354e855 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -867,17 +867,16 @@ int32_t SchedulerContext::init( l2_swimlane_level_ = get_l2_swimlane_level(); if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { // Sched-phase pool count: matches the dump_tensor_init branch in - // scheduler_dispatch.cpp. Orch-phase pool count: typically 1 (one - // orch thread), but in orch_to_sched mode all scheduler threads - // can write orch records, so we size both pools to aicpu_thread_num_. - // sched_thread_num_ <= 0 means "use all AICPU threads as scheduler - // threads" (see assign_cores_to_threads' active_sched_threads_ - // normalization at line 689). Without this normalization here, - // init_phase would prime zero sched pools and all sched_phase - // emits would silently drop. 
+ // scheduler_dispatch.cpp. sched_thread_num_ <= 0 means "use all + // AICPU threads as scheduler threads" (see assign_cores_to_threads' + // active_sched_threads_ normalization). Without this + // normalization here, init_phase would prime zero sched pools and + // all sched_phase emits would silently drop. const int active_sched = (sched_thread_num_ > 0) ? sched_thread_num_ : aicpu_thread_num_; const int sched_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : active_sched; - const int orch_phase_threads = orch_to_sched_ ? aicpu_thread_num_ : 1; + // Orchestration is always single-threaded, so orch-phase is one pool + // (ordinal 0) in both modes — see l2_swimlane_aicpu_record_orch_phase. + const int orch_phase_threads = 1; l2_swimlane_aicpu_init_phase(runtime->worker_count, sched_phase_threads, orch_phase_threads); } } else { @@ -1005,9 +1004,11 @@ void SchedulerContext::on_orchestration_done( Runtime *runtime, PTO2Runtime *rt, int32_t thread_idx, int32_t total_tasks ) { #if PTO2_PROFILING - if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { - // Flush orchestrator's phase record buffer - l2_swimlane_aicpu_flush_phase_buffers(thread_idx); + if (l2_swimlane_level_ >= L2SwimlaneLevel::ORCH_PHASES) { + // Flush the orchestrator's orch-phase buffer (single instance, pool 0). + // The orchestrator has no scheduler-phase pool of its own — those belong + // to the scheduler threads and are flushed in scheduler_dispatch. 
+ l2_swimlane_aicpu_flush_orch_phase_buffer(thread_idx); } #endif diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp index 2d235f21d..e691ddb2f 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_dispatch.cpp @@ -855,7 +855,7 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_ thread_idx, core_trackers_[thread_idx].core_ids(), core_trackers_[thread_idx].core_num() ); if (l2_swimlane_level_ >= L2SwimlaneLevel::SCHED_PHASES) { - l2_swimlane_aicpu_flush_phase_buffers(thread_idx); + l2_swimlane_aicpu_flush_sched_phase_buffer(thread_idx); } } #endif